In [153]:
import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
from scipy import sparse
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score


In [154]:
review_df= pd.read_csv('review_stars.tsv', delimiter ="\t", encoding = "utf-8")

In [155]:
business_df= pd.read_csv('business.tsv', delimiter ="\t")

In [156]:
review_agg_df = review_df.groupby('business_id')['text'].sum()

In [5]:
df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})

In [6]:
merge_df = pd.merge(business_df, df_ready_for_sklearn, on='business_id')

In [7]:
merge_df.insert(3,'normalized_count',((merge_df['review_count'] - merge_df['review_count'].min()) / (merge_df['review_count'].max() - merge_df['review_count'].min())).astype(float))
merge_df.drop('review_count', axis=1, inplace=True)

In [117]:
merge_df[0:10]

Unnamed: 0,business_id,categories,stars,normalized_count,all_reviews,encoded_stars
0,DR30lzIHVTF6xhyMI-3IlQ,"Thrift Stores, Shopping, Used, Vintage & Consi...",3.5,0.001758,"b""If this place was on on fire i wouldn't reac...",5
1,YIez_A3WOt9J2SXN7OMa2Q,"Caribbean, Food, Bakeries, Restaurants",4.0,0.012806,b'Love the jerk chicken sandwich and jerk chic...,6
2,Gc8R7b3I3CTwAiWv7MjtSg,"Body Shops, Auto Repair, Automotive",4.5,0.002637,b'My experience was excellent. They expedited ...,7
3,pIzuXtFdkj8fHuzJfYiwqw,"Restaurants, Event Planning & Services, Italia...",4.5,0.0,"b""I'm visiting Calgary from Toronto for a few ...",7
4,5T6kFKFycym_GkhgOiysIw,"Poutineries, Restaurants, Diners",4.0,0.196108,"b'This place is amazing. I mean, you really ca...",6
5,OyJDaAAMr220qkZsovCARQ,"Food, Coffee & Tea",3.0,0.005775,"b""My favorite Starbucks. Extremely friendly st...",4
6,YkAIlxYZ1guSqbbowU9X4g,"Restaurants, Chinese, Dim Sum, Breakfast & Brunch",3.5,0.021092,b'Came here for a lovely dinner with husband ...,5
7,ZQ-7uFQk21NHoOzJfhEjBw,"Coffee & Tea, Food",3.0,0.007031,"b""Wish I had positive things to say. I ordered...",4
8,2ktKjN5z8EcqmUv6EDiDgA,"Fashion, Department Stores, Automotive, Shoppi...",3.5,0.014815,b'Got $1000 worth of tires today. They told me...,5
9,ohYgabP6PqkNsF0vnZUxeg,"Arts & Entertainment, Coffee & Tea, Bars, Food...",4.5,0.01833,"b""Just. Yes.\n\nCoffee: 9.2/10\nBeer: 10/10 \n...",7


In [10]:
vectorizer = sk_text.TfidfVectorizer(stop_words='english', min_df=5)
matrix = vectorizer.fit_transform(merge_df['all_reviews'])
matrix = matrix.toarray()

In [11]:
x_matrix=np.column_stack((matrix, merge_df['normalized_count']))

In [12]:
#train test data for linear regression

x_train, x_test, y_train, y_test = train_test_split(x_matrix, merge_df['stars'] , test_size=0.2, random_state=42)

In [15]:
# linear regression

lin_reg_model = LinearRegression()

lin_reg_model.fit(x_train, y_train)

y_pred = lin_reg_model.predict(x_test)

y_pred

array([ 3.85001718e+11, -3.37730225e+11,  1.36488369e+11, ...,
       -4.88460965e+11,  6.69145911e+11, -1.74935424e+11])

In [16]:
# RMS value

print('Coefficients: \n', lin_reg_model.coef_)
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [ 2.75832223e+11 -4.25755416e+11  5.76897146e+11 ...  8.96147921e+08
  6.24317449e+10  2.02216797e+01]
Mean squared error: 412092116133070646542336.00
Variance score: -676164422869387518672896.00


In [37]:
# label encoding data

label_encoder = preprocessing.LabelEncoder()

merge_df['encoded_stars'] = label_encoder.fit_transform(merge_df['stars'])

#merge_df['encoded_stars'][0:10]

0    5
1    6
2    7
3    7
4    6
5    4
6    5
7    4
8    5
9    7
Name: encoded_stars, dtype: int64

In [39]:
#merge_df.dtypes

In [41]:
# train test dat afor other models

x_train1, x_test1, y_train1, y_test1 = train_test_split(x_matrix, merge_df['encoded_stars'] , test_size=0.2, random_state=42)

In [19]:
# logistic Regression

Log_reg_model = LogisticRegression()

Log_reg_model.fit(x_train1, y_train1)

y_pred1 = Log_reg_model.predict(x_test1)

y_pred1

array([6, 6, 5, ..., 6, 6, 6])

In [21]:
# RMs for logistic

print('Coefficients: \n', Log_reg_model.coef_)
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred1))
print('Variance score: %.2f' % r2_score(y_test1, y_pred1))

Coefficients: 
 [[ 1.78576151e-01  2.72323337e-01 -6.27871042e-03 ... -2.84030886e-03
  -1.37751765e-02 -7.68510013e-01]
 [-1.80635278e-01  1.00244123e-01 -1.21031837e-02 ... -4.16838632e-03
  -2.17052944e-02 -7.07345827e-01]
 [ 7.26476908e-01  6.58735427e-01 -2.48279961e-02 ... -1.29580970e-02
  -5.44202418e-02 -1.67830594e+00]
 ...
 [-9.15579704e-01 -1.36934235e-01 -3.61101132e-02 ...  2.09971238e-01
   9.90884249e-03  3.16869154e+00]
 [ 8.92149049e-01 -2.53092027e-01 -5.15928223e-02 ... -6.68271074e-02
  -3.39188596e-01  2.39635410e+00]
 [-6.66767872e-01  3.22331299e-02 -3.66290082e-02 ... -2.22718800e-02
  -6.45914139e-02 -2.58290173e+00]]
Mean squared error: 5.42
Variance score: 0.05


In [42]:
# MNB

mnb_model = MultinomialNB()

mnb_model.fit(x_train1, y_train1)

y_pred1 = mnb_model.predict(x_test1)

y_pred1

array([6, 6, 6, ..., 6, 6, 6])

In [43]:
print("accuracy: %.2f"
      % accuracy_score(y_test1, y_pred1)) 

accuracy: 0.30


In [118]:
# SVM

svm_model = SVC(kernel="linear")

svm_model.fit(x_train1, y_train1)

y_pred1 = svm_model.predict(x_test1)

y_pred1

array([7, 6, 5, ..., 6, 6, 6])

In [119]:
print("accuracy: %.2f"
      % accuracy_score(y_test1, y_pred1))  

accuracy: 0.32


In [30]:
# KNN

knn_model = KNeighborsClassifier(n_neighbors=1)

knn_model.fit(x_train1, y_train1)

y_pred1 = knn_model.predict(x_test1)

y_pred1

array([4, 3, 6, ..., 6, 3, 6])

In [31]:
print("accuracy: %.2f"
      % accuracy_score(y_test1, y_pred1)) 

accuracy: 0.24


In [84]:
print(y_pred1[0])
# print(y_test1[1176])

6
6


In [87]:
# list  the business with the stars and prediction

for i in range(0,len(y_pred1)):
    idx=y_test1.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test1[idx], y_pred1[i]))

business id - 2GIuCQJ9vfMY7rERB-SkUA actual stars label - 6 predicted - 6
business id - xvrkd0c5E7sl9KuY-IMHIA actual stars label - 3 predicted - 6
business id - xdphjWZymq-3dqOWmA_wwA actual stars label - 5 predicted - 6
business id - aUWc_viqJs2SAYSFXwvXDg actual stars label - 8 predicted - 7
business id - tNxGH0dYx_JinTxnwwjZUA actual stars label - 6 predicted - 6
business id - iK1Srj0VMvjVfYoVA_2lLA actual stars label - 3 predicted - 6
business id - iFfSUhyv0yRIiBt8y0A_xA actual stars label - 5 predicted - 6
business id - ivAe-BA1y3DOyRUKHdPnQA actual stars label - 6 predicted - 6
business id - i3o52-kSqR3s_HmJn02WHw actual stars label - 7 predicted - 6
business id - mAhu38hHUQyOatamyQsOww actual stars label - 4 predicted - 5
business id - cLeYiKG14AGURvrcaCPytw actual stars label - 7 predicted - 6
business id - 2RsLaIEnNUIzjQyCvPCNWA actual stars label - 7 predicted - 6
business id - rNqSjfRM09T-VOjRTAF66g actual stars label - 5 predicted - 5
business id - 8XoDXN2DKmQGJHZhkQbJ9A a

In [116]:
idx = merge_df.index[merge_df['business_id']=='2GIuCQJ9vfMY7rERB-SkUA']
#print(idx)
print(y_test1[idx].values)
#print(y_test1.index)
#print(y_test1.iloc[0])

idx1 = np.where(y_test1.index == idx)

#print(y_pred1[idx])

[6]


ValueError: Lengths must match to compare

In [90]:
merge_df['business_id'][1176]

'2GIuCQJ9vfMY7rERB-SkUA'

In [127]:
outfile = open("business_nei.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','categories', 'stars', 'review_count','postal code'])
with open('yelp_dataset/yelp_academic_dataset_business.json') as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['categories'], row['stars'],row['review_count'], row['postal_code']])

outfile.close()



In [128]:
business_nei_df= pd.read_csv('business_nei.tsv', delimiter ="\t")

In [130]:
business_nei_df.dtypes

business_id      object
categories       object
stars           float64
review_count      int64
postal code      object
dtype: object

In [137]:
merge_postal_df = pd.merge(business_nei_df, df_ready_for_sklearn, on='business_id')

In [139]:
merge_postal_df.head()

Unnamed: 0,business_id,categories,stars,review_count,postal code,all_reviews
0,DR30lzIHVTF6xhyMI-3IlQ,"Thrift Stores, Shopping, Used, Vintage & Consi...",3.5,17,89102,"b""If this place was on on fire i wouldn't reac..."
1,YIez_A3WOt9J2SXN7OMa2Q,"Caribbean, Food, Bakeries, Restaurants",4.0,105,M3A 1K8,b'Love the jerk chicken sandwich and jerk chic...
2,Gc8R7b3I3CTwAiWv7MjtSg,"Body Shops, Auto Repair, Automotive",4.5,24,28206,b'My experience was excellent. They expedited ...
3,pIzuXtFdkj8fHuzJfYiwqw,"Restaurants, Event Planning & Services, Italia...",4.5,3,T2P 1G4,"b""I'm visiting Calgary from Toronto for a few ..."
4,5T6kFKFycym_GkhgOiysIw,"Poutineries, Restaurants, Diners",4.0,1565,H2J 2J3,"b'This place is amazing. I mean, you really ca..."


In [133]:
hotcoded_df = pd.get_dummies(merge_postal_df['postal code'], sparse = 'true')

In [134]:
hotcoded_df.head()

Unnamed: 0,12200,15017,15026,15044,15056,15063,15085,15090,15101,15102,...,T4B 0N2,T4B 3G6,T4B 3K3,T4B 3L8,V5H 1J9,YO11 1PE,YO11 2ED,YO21 3PU,YO22 4RG,YO22 5LY
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [140]:
result_df = pd.concat([merge_postal_df, hotcoded_df], axis=1, sort=False)

In [141]:
result_df.head()

Unnamed: 0,business_id,categories,stars,review_count,postal code,all_reviews,12200,15017,15026,15044,...,T4B 0N2,T4B 3G6,T4B 3K3,T4B 3L8,V5H 1J9,YO11 1PE,YO11 2ED,YO21 3PU,YO22 4RG,YO22 5LY
0,DR30lzIHVTF6xhyMI-3IlQ,"Thrift Stores, Shopping, Used, Vintage & Consi...",3.5,17,89102,"b""If this place was on on fire i wouldn't reac...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,YIez_A3WOt9J2SXN7OMa2Q,"Caribbean, Food, Bakeries, Restaurants",4.0,105,M3A 1K8,b'Love the jerk chicken sandwich and jerk chic...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Gc8R7b3I3CTwAiWv7MjtSg,"Body Shops, Auto Repair, Automotive",4.5,24,28206,b'My experience was excellent. They expedited ...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,pIzuXtFdkj8fHuzJfYiwqw,"Restaurants, Event Planning & Services, Italia...",4.5,3,T2P 1G4,"b""I'm visiting Calgary from Toronto for a few ...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5T6kFKFycym_GkhgOiysIw,"Poutineries, Restaurants, Diners",4.0,1565,H2J 2J3,"b'This place is amazing. I mean, you really ca...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
x_matrix=np.column_stack((x_matrix,hotcoded_df))

In [144]:
x_matrix.shape

(8577, 9610)

In [145]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_matrix, merge_df['encoded_stars'] , test_size=0.2, random_state=42)

In [147]:
# MNB

mnb_model = MultinomialNB()

mnb_model.fit(x_train2, y_train2)

y_pred2 = mnb_model.predict(x_test2)

y_pred2

array([5, 6, 5, ..., 6, 6, 7])

In [152]:
print("accuracy: %.2f"
      % accuracy_score(y_test2, y_pred2)) 

accuracy: 0.32


In [149]:
# SVM

svm_model = SVC(kernel="linear")

svm_model.fit(x_train2, y_train2)

y_pred2 = svm_model.predict(x_test2)

y_pred2

array([6, 6, 5, ..., 6, 6, 7])