In [44]:
import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
from scipy import sparse
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score


In [45]:
#Open business.json file, create tsv file with business_id, business name, categories, and review count to be used as features 
#and stars as label

outfile = open("business.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','categories', 'stars', 'review_count'])
with open('yelp_academic_dataset_business.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['categories'], row['stars'],row['review_count'] ])

outfile.close()

business_df= pd.read_csv('business.tsv', delimiter ="\t")

In [46]:
#Open review.json file, create tsv file with business_id,text to be used as features 
#and stars as label

outfile = open("review_stars.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'text'])
with open('yelp_academic_dataset_review.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])

outfile.close()

review_df= pd.read_csv('review_stars.tsv', delimiter ="\t")

In [47]:
# Group all reviews by business_id
review_agg_df = review_df.groupby('business_id')['text'].sum()
df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})


In [48]:
#Merge the resulting review aggregate dataframe with business dataframe
merge_df = pd.merge(business_df, df_ready_for_sklearn, on='business_id')


In [49]:
#print the first 5 rows using head
merge_df.head()

Unnamed: 0,business_id,categories,stars,review_count,all_reviews
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,24,b'Great place. Major flaw is how early it clos...
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,3,"b""CK's BBQ is off the charts best BBQ I have E..."
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,5,"b""La nourriture est excellente, le service Imp..."
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,8,b'GEICO for auto is great! But they are really...
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,4,"b""This listing was originally under gardening ..."


In [50]:
#Normalization of review count field so it becomes comparable and remove bias
merge_df.insert(3,'normalized_count',((merge_df['review_count'] - merge_df['review_count'].min()) / (merge_df['review_count'].max() - merge_df['review_count'].min())).astype(float))
merge_df.drop('review_count', axis=1, inplace=True)

In [51]:
#print the normalized counts for first 5 rows
merge_df.head()

Unnamed: 0,business_id,categories,stars,normalized_count,all_reviews
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,0.002637,b'Great place. Major flaw is how early it clos...
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,0.0,"b""CK's BBQ is off the charts best BBQ I have E..."
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,0.000251,"b""La nourriture est excellente, le service Imp..."
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,0.000628,b'GEICO for auto is great! But they are really...
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,0.000126,"b""This listing was originally under gardening ..."


In [52]:
#TF-IDF calculation

tfidf = sk_text.TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

In [53]:
# Fit the reviews column with TFIDFvectorizer
matrix = tfidf.fit_transform(merge_df['all_reviews'])
matrix = matrix.toarray()

In [54]:
# We are adding the normalized count to the original matrix with TFIDFvectorizer
x_matrix=np.column_stack((matrix, merge_df['normalized_count']))

In [12]:
#train test data for linear regression

x_train, x_test, y_train, y_test = train_test_split(x_matrix, merge_df['stars'] , test_size=0.2, random_state=42)

In [13]:
# Checking the shape of train and test data
x_train.shape

(150874, 1001)

In [14]:
x_test.shape


(37719, 1001)

In [15]:
y_train.shape

(150874,)

In [16]:
y_test.shape

(37719,)

In [17]:
# linear regression

lin_reg_model = LinearRegression()

lin_reg_model.fit(x_train, y_train)

y_pred = lin_reg_model.predict(x_test)

y_pred

array([2.79065971, 4.04537282, 4.74927598, ..., 2.42110101, 3.49581221,
       2.88075762])

In [45]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test[idx], y_pred[i]))

business id - y6cfbFoQuPBf4H6mubticw actual stars label - 2 predicted - 2
business id - tVW7pefO8xIgWMF86oVhKQ actual stars label - 4 predicted - 4
business id - WYSaOdpqFAV9NxrVdKKR9w actual stars label - 5 predicted - 4
business id - MJjTgjRZJLnU0VHF1TeMGQ actual stars label - 3 predicted - 3
business id - 5bwIr6lySrlr2fUQJEoqkg actual stars label - 5 predicted - 4
business id - ZYyRkdrSKG7tYwuE8R38og actual stars label - 4 predicted - 3
business id - T1iskW4W6jGRTwzq3DKN_Q actual stars label - 4 predicted - 4
business id - 83lCWOhpSJ0DcTczGfGcxg actual stars label - 4 predicted - 3
business id - SpO3kyBfxN0wpfdkLSNepA actual stars label - 5 predicted - 4
business id - Cbu0DpFdk47f7FUwNku7-A actual stars label - 5 predicted - 4


In [18]:
#Performance of Linear Regression Model
# The lower the mean squared the better performance
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
print('R2 score: %.2f' % r2_score(y_test, y_pred))

Mean squared error: 0.27
R2 score: 0.74


In [None]:
print("accuracy: %.2f"
      % accuracy_score(y_test, y_pred))

In [19]:
#stars need to be encoded for all the models we are going to train
le = preprocessing.LabelEncoder()
merge_df['encoded_stars'] = le.fit_transform(merge_df['stars'])

In [20]:
#Display encoded stars
merge_df.head()

Unnamed: 0,business_id,categories,stars,normalized_count,all_reviews,encoded_stars
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,0.002637,b'Great place. Major flaw is how early it clos...,6
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,0.0,"b""CK's BBQ is off the charts best BBQ I have E...",7
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,0.000251,"b""La nourriture est excellente, le service Imp...",6
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,0.000628,b'GEICO for auto is great! But they are really...,1
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,0.000126,"b""This listing was originally under gardening ...",2


In [21]:
# train test dat afor other models

x_train1, x_test1, y_train1, y_test1 = train_test_split(x_matrix, merge_df['encoded_stars'] , test_size=0.2, random_state=42)

In [22]:
# logistic Regression

Log_reg_model = LogisticRegression()

Log_reg_model.fit(x_train1, y_train1)

y_pred1 = Log_reg_model.predict(x_test1)

y_pred1

array([4, 6, 8, ..., 4, 5, 5], dtype=int64)

In [23]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test1.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test1[idx], y_pred1[i]))

business id - y6cfbFoQuPBf4H6mubticw actual stars label - 3 predicted - 4
business id - tVW7pefO8xIgWMF86oVhKQ actual stars label - 6 predicted - 6
business id - WYSaOdpqFAV9NxrVdKKR9w actual stars label - 8 predicted - 8
business id - MJjTgjRZJLnU0VHF1TeMGQ actual stars label - 5 predicted - 5
business id - 5bwIr6lySrlr2fUQJEoqkg actual stars label - 8 predicted - 8
business id - ZYyRkdrSKG7tYwuE8R38og actual stars label - 6 predicted - 5
business id - T1iskW4W6jGRTwzq3DKN_Q actual stars label - 7 predicted - 8
business id - 83lCWOhpSJ0DcTczGfGcxg actual stars label - 6 predicted - 8
business id - SpO3kyBfxN0wpfdkLSNepA actual stars label - 8 predicted - 8
business id - Cbu0DpFdk47f7FUwNku7-A actual stars label - 8 predicted - 8


In [24]:
#Performance of Linear Regression Model
# The lower the mean squared the better performance
print("Mean squared error: %.2f"
      % mean_squared_error(y_test1, y_pred1))
print('R2 score: %.2f' % r2_score(y_test1, y_pred1))

Mean squared error: 1.46
R2 score: 0.64


In [None]:
print("accuracy: %.2f"
      % accuracy_score(y_test1, y_pred1))

In [25]:
#implementing Nearest Neighbor

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(x_train1, y_train1) 

y_pred1 = knn.predict(x_test1)

y_pred1 

array([5, 5, 7, ..., 2, 5, 7], dtype=int64)

In [26]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test1.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test1[idx], y_pred1[i]))

business id - y6cfbFoQuPBf4H6mubticw actual stars label - 3 predicted - 5
business id - tVW7pefO8xIgWMF86oVhKQ actual stars label - 6 predicted - 5
business id - WYSaOdpqFAV9NxrVdKKR9w actual stars label - 8 predicted - 7
business id - MJjTgjRZJLnU0VHF1TeMGQ actual stars label - 5 predicted - 6
business id - 5bwIr6lySrlr2fUQJEoqkg actual stars label - 8 predicted - 7
business id - ZYyRkdrSKG7tYwuE8R38og actual stars label - 6 predicted - 5
business id - T1iskW4W6jGRTwzq3DKN_Q actual stars label - 7 predicted - 7
business id - 83lCWOhpSJ0DcTczGfGcxg actual stars label - 6 predicted - 8
business id - SpO3kyBfxN0wpfdkLSNepA actual stars label - 8 predicted - 8
business id - Cbu0DpFdk47f7FUwNku7-A actual stars label - 8 predicted - 5


In [27]:
print("accuracy: %.2f"
      % accuracy_score(y_test1, y_pred1))

accuracy: 0.32


In [28]:
# SVM

svm_model = SVC()

svm_model.fit(x_train1, y_train1)

y_pred1 = svm_model.predict(x_test1)

y_pred1

array([8, 6, 8, ..., 6, 6, 6], dtype=int64)

In [29]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test1.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test1[idx], y_pred1[i]))

business id - y6cfbFoQuPBf4H6mubticw actual stars label - 3 predicted - 8
business id - tVW7pefO8xIgWMF86oVhKQ actual stars label - 6 predicted - 6
business id - WYSaOdpqFAV9NxrVdKKR9w actual stars label - 8 predicted - 8
business id - MJjTgjRZJLnU0VHF1TeMGQ actual stars label - 5 predicted - 6
business id - 5bwIr6lySrlr2fUQJEoqkg actual stars label - 8 predicted - 8
business id - ZYyRkdrSKG7tYwuE8R38og actual stars label - 6 predicted - 6
business id - T1iskW4W6jGRTwzq3DKN_Q actual stars label - 7 predicted - 6
business id - 83lCWOhpSJ0DcTczGfGcxg actual stars label - 6 predicted - 8
business id - SpO3kyBfxN0wpfdkLSNepA actual stars label - 8 predicted - 8
business id - Cbu0DpFdk47f7FUwNku7-A actual stars label - 8 predicted - 8


In [30]:
print("accuracy: %.2f"
      % accuracy_score(y_test1, y_pred1))

accuracy: 0.28


In [31]:
# MNB

mnb_model = MultinomialNB()

mnb_model.fit(x_train1, y_train1)

y_pred1 = mnb_model.predict(x_test1)

y_pred1

array([8, 6, 6, ..., 5, 5, 5], dtype=int64)

In [32]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test1.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test1[idx], y_pred1[i]))

business id - y6cfbFoQuPBf4H6mubticw actual stars label - 3 predicted - 8
business id - tVW7pefO8xIgWMF86oVhKQ actual stars label - 6 predicted - 6
business id - WYSaOdpqFAV9NxrVdKKR9w actual stars label - 8 predicted - 6
business id - MJjTgjRZJLnU0VHF1TeMGQ actual stars label - 5 predicted - 5
business id - 5bwIr6lySrlr2fUQJEoqkg actual stars label - 8 predicted - 8
business id - ZYyRkdrSKG7tYwuE8R38og actual stars label - 6 predicted - 5
business id - T1iskW4W6jGRTwzq3DKN_Q actual stars label - 7 predicted - 7
business id - 83lCWOhpSJ0DcTczGfGcxg actual stars label - 6 predicted - 8
business id - SpO3kyBfxN0wpfdkLSNepA actual stars label - 8 predicted - 8
business id - Cbu0DpFdk47f7FUwNku7-A actual stars label - 8 predicted - 8


In [33]:
print("accuracy: %.2f"
      % accuracy_score(y_test1, y_pred1))

accuracy: 0.34


** Additional Features **

** POSTAL CODE **

In [92]:
# creating new TSV for business dataset adding postal code as an additional feature

outfile = open("business_postal.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','categories', 'stars', 'review_count','postal code'])
with open('yelp_academic_dataset_business.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['categories'], row['stars'],row['review_count'], row['postal_code']])

outfile.close()

In [93]:
# Creating new dataframe

business_postal_df= pd.read_csv('business_postal.tsv', delimiter ="\t")

In [104]:
# merging the new dataframe with the reviews
merge_postal_df = pd.merge(business_postal_df, df_ready_for_sklearn, on='business_id')
merge_postal_df = merge_postal_df[0:9999]
x_matrix1 = x_matrix[0:9999]

In [105]:
# one-hot cooding of postal codes 

hotcoded_df = pd.get_dummies(merge_postal_df['postal code'], sparse = 'true')

In [112]:
#stars need to be encoded for all the models we are going to train
le = preprocessing.LabelEncoder()
merge_postal_df['encoded_stars'] = le.fit_transform(merge_postal_df['stars'])

In [113]:
# printing the One Hot Coded postal codes

hotcoded_df.head()

Unnamed: 0,06502,06632,15003,15015,15017,15024,15025,15026,15044,15056,...,T4B 0N2,T4B 1R9,WA15 9PA,WA15 9SN,YO11 1PE,YO11 1PQ,YO11 2AG,YO12,YO12 5EG,YO22 4DE
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
# merge the encoded postal codes with the matrix of TFIDF and Review count

x_matrix1=np.column_stack((x_matrix1,hotcoded_df))

In [115]:
# Spliting the new data set

x_train2, x_test2, y_train2, y_test2 = train_test_split(x_matrix1, merge_postal_df['encoded_stars'] , test_size=0.2, random_state=42)

In [116]:
# MNB

mnb_model = MultinomialNB()

mnb_model.fit(x_train2, y_train2)

y_pred2 = mnb_model.predict(x_test2)

y_pred2

array([6, 5, 5, ..., 8, 8, 6], dtype=int64)

In [117]:
print("accuracy: %.2f"
      % accuracy_score(y_test2, y_pred2))

accuracy: 0.27


In [118]:
# SVM

svm_model = SVC(kernel="linear")

svm_model.fit(x_train2, y_train2)

y_pred2 = svm_model.predict(x_test2)

y_pred2

array([6, 4, 4, ..., 3, 7, 4], dtype=int64)

In [119]:
print("accuracy: %.2f"
      % accuracy_score(y_test2, y_pred2))

accuracy: 0.40


** CATEGORIES **

In [26]:
del merge_df_1

In [27]:
business_df1= pd.read_csv('business.tsv', delimiter ="\t", nrows=10000)

In [28]:
review_df1= pd.read_csv('review_stars.tsv', delimiter ="\t" , nrows=10000)

In [29]:
review_agg_df1 = review_df1.groupby('business_id')['text'].sum()
df_ready_for_sklearn1 = pd.DataFrame({'business_id': review_agg_df1.index, 'all_reviews': review_agg_df1.values})

In [30]:
merge_df_1 = pd.merge(business_df1, df_ready_for_sklearn1, on='business_id')

merge_df_1

Unnamed: 0,business_id,categories,stars,review_count,all_reviews
0,DR30lzIHVTF6xhyMI-3IlQ,"Thrift Stores, Shopping, Used, Vintage & Consi...",3.5,17,"b""If this place was on on fire i wouldn't reac..."
1,YIez_A3WOt9J2SXN7OMa2Q,"Caribbean, Food, Bakeries, Restaurants",4.0,105,b'Love the jerk chicken sandwich and jerk chic...
2,Gc8R7b3I3CTwAiWv7MjtSg,"Body Shops, Auto Repair, Automotive",4.5,24,b'My experience was excellent. They expedited ...
3,pIzuXtFdkj8fHuzJfYiwqw,"Restaurants, Event Planning & Services, Italia...",4.5,3,"b""I'm visiting Calgary from Toronto for a few ..."
4,5T6kFKFycym_GkhgOiysIw,"Poutineries, Restaurants, Diners",4.0,1565,"b'This place is amazing. I mean, you really ca..."
5,OyJDaAAMr220qkZsovCARQ,"Food, Coffee & Tea",3.0,49,"b""My favorite Starbucks. Extremely friendly st..."
6,YkAIlxYZ1guSqbbowU9X4g,"Restaurants, Chinese, Dim Sum, Breakfast & Brunch",3.5,171,b'Came here for a lovely dinner with husband ...
7,ZQ-7uFQk21NHoOzJfhEjBw,"Coffee & Tea, Food",3.0,59,"b""Wish I had positive things to say. I ordered..."
8,2ktKjN5z8EcqmUv6EDiDgA,"Fashion, Department Stores, Automotive, Shoppi...",3.5,121,b'Got $1000 worth of tires today. They told me...
9,ohYgabP6PqkNsF0vnZUxeg,"Arts & Entertainment, Coffee & Tea, Bars, Food...",4.5,149,"b""Just. Yes.\n\nCoffee: 9.2/10\nBeer: 10/10 \n..."


In [31]:
merge_df_1.insert(3,'normalized_count',((merge_df_1['review_count'] - merge_df_1['review_count'].min()) / (merge_df_1['review_count'].max() - merge_df_1['review_count'].min())).astype(float))
merge_df_1.drop('review_count', axis=1, inplace=True)
merge_df_1['categories'][0]

'Thrift Stores, Shopping, Used, Vintage & Consignment, Antiques, Fashion, Auction Houses'

In [32]:
merge_df_1['categories'] = merge_df_1['categories'].str.lower()

In [33]:
merge_df_1['categories'] = merge_df_1['categories'].str.replace('[^\w\s]','')

In [34]:
merge_df_1['categories'].str.split(',').head()

0    [thrift stores shopping used vintage  consignm...
1                [caribbean food bakeries restaurants]
2                  [body shops auto repair automotive]
3    [restaurants event planning  services italian ...
4                     [poutineries restaurants diners]
Name: categories, dtype: object

In [35]:
def get_element(my_list, position):
    return my_list[position]

In [36]:
merge_df_1['categories'] = merge_df_1['categories'].str.split(' ').apply(get_element, position=0)

In [37]:
merge_df_1['categories']

0             thrift
1          caribbean
2               body
3        restaurants
4        poutineries
5               food
6        restaurants
7             coffee
8            fashion
9               arts
10          japanese
11              food
12            hotels
13          desserts
14       restaurants
15             tapas
16         nightlife
17       restaurants
18       restaurants
19             pizza
20       restaurants
21       restaurants
22              food
23          shopping
24              meat
25             local
26          shopping
27          american
28         nightlife
29            beauty
           ...      
394          jewelry
395      electronics
396             thai
397      restaurants
398           bubble
399      restaurants
400        breakfast
401             home
402         american
403             bars
404       sandwiches
405         printing
406       gymnastics
407      restaurants
408         shopping
409      restaurants
410          

In [38]:
# one hot coding of categories

hotcoded_df = pd.get_dummies(merge_df_1['categories'], sparse = 'true')

In [39]:
hotcoded_df

Unnamed: 0,acai,active,acupuncture,american,arts,asian,auto,automotive,bagels,bakeries,...,tobacco,trailer,translation,transportation,turkish,venues,veterinarians,vietnamese,water,wine
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# merge the one hot coded categories with the matrix of TFIDF and Review count
x_matrix_1 = x_matrix[0:424]
x_matrix_1 = np.column_stack((x_matrix_1, hotcoded_df))

In [56]:
#stars need to be encoded for all the models we are going to train
le = preprocessing.LabelEncoder()
merge_df_1['encoded_stars'] = le.fit_transform(merge_df_1['stars'])

In [57]:
# Spliting the new data set

x_train3, x_test3, y_train3, y_test3 = train_test_split(x_matrix_1, merge_df_1['encoded_stars'] , test_size=0.2, random_state=42)

In [58]:
# MNB

mnb_model = MultinomialNB()

mnb_model.fit(x_train3, y_train3)

y_pred3 = mnb_model.predict(x_test3)

y_pred3

array([6, 6, 5, 5, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 5, 6, 5,
       6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 5, 6, 5, 6, 5, 6, 5, 5, 5, 6,
       5, 6, 6, 6, 5, 6, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 6, 6, 5, 5, 5, 6,
       6, 5, 5, 5, 6, 5, 6, 6, 5, 6, 6, 6, 5, 6, 6, 6, 5, 6, 5],
      dtype=int64)

In [59]:
print("accuracy: %.2f"
      % accuracy_score(y_test3, y_pred3))

accuracy: 0.22
