In [1]:
import pandas as pd

# Load the Yelp review dump, then order it by author and, within each author,
# by review date, so that consecutive rows of the same user_id are in date order.
reviews = pd.read_csv('csv_data/yelp_academic_dataset_review.csv')
reviews = reviews.sort_values(['user_id', 'date'])

In [2]:
# Attach to every review the rating and author of the row directly above it.
# The shift runs over the whole sorted frame, so the first review of each user
# receives the values of a different user's review; previous_user_id is stored
# so that those rows can be recognised.
reviews = reviews.assign(
    previous_stars=reviews['stars'].shift(periods=1),
    previous_user_id=reviews['user_id'].shift(periods=1),
)

# check the result from data manipulation
reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,previous_stars,previous_user_id
1094389,rJ3CASyRfG-7ZviCBFCJQg,---1lKK3aKOuomHnwAkAow,f19eLfhXqR47Ct8Hz2y_pA,5.0,0.0,0.0,0.0,I hard think luxurious experience begins to de...,2018-12-19 22:26:22,,
2228827,f-M5V-LeQKIy3U6vQ_FfYQ,---2PmXbF47D870stH1jqA,pgO-fORYt4nb5Tj0x1F_aQ,5.0,0.0,0.0,0.0,"Dine there once a week....Great Food, Great Se...",2012-10-24 13:33:39,5.0,---1lKK3aKOuomHnwAkAow
5798416,hepOKRE2O5gXDWLKkAjn5w,---2PmXbF47D870stH1jqA,1An4DxtMmvvSe0HX4viRCA,5.0,1.0,0.0,1.0,"Dinner here last night...Their 7"" personal pan...",2012-10-28 17:16:13,5.0,---2PmXbF47D870stH1jqA
1671443,PtiOktOk5COHoNjc6K4gcw,---2PmXbF47D870stH1jqA,eR7ieJD12PUzsYrP8fw6rQ,5.0,0.0,0.0,0.0,Great lunch spot @ Citrus Park Mall. Had the 1...,2012-11-02 00:30:24,5.0,---2PmXbF47D870stH1jqA
3130128,xSRDnEBpog6z0IwltFEDxg,---2PmXbF47D870stH1jqA,HpWi2CRJlxVCYKd8kS0X-A,5.0,5.0,0.0,2.0,Met Friends there that we hadn't seen in Years...,2013-02-17 15:13:21,5.0,---2PmXbF47D870stH1jqA
...,...,...,...,...,...,...,...,...,...,...,...
857328,67tpVRdPI-6OBQHDJl60kA,zzzCg-_lpyYE82TlJCdwdw,LttC5xNMFcgOg3bt_MlXTg,5.0,1.0,1.0,1.0,McAlister's is a wonderful place. The owner i...,2014-02-13 05:11:06,1.0,zzz-M4QvkEpUWWPL9RTzLA
1979549,GxXQHQzx2kMd01FD5mHgwA,zzzGgfvrSJ4AQeKtcgocIw,XyYsl2OyoMi6OMvzsdcVoQ,3.0,0.0,0.0,0.0,We went last nite the hostess and our waitress...,2021-03-13 13:42:23,5.0,zzzCg-_lpyYE82TlJCdwdw
2680312,76BiRS2w9FBEUAnUC8EgZA,zzzMBVS73g3ZJ7qL8JyhiA,3FKIev7ZB_KE6XHL9sUJCg,1.0,0.0,0.0,0.0,I was scheduled to have a small wedding recept...,2020-07-02 19:42:37,3.0,zzzGgfvrSJ4AQeKtcgocIw
5972321,m86ROmbVUDdTbcjkFWrrUQ,zzzUFM4HFe0SFG0bPjntQA,xe2L_RvBNgkrVburQrfW_Q,5.0,0.0,0.0,0.0,"Go see Jack, at the Shack, in Frontenac!! No j...",2019-12-15 15:13:52,1.0,zzzMBVS73g3ZJ7qL8JyhiA


In [3]:
# keep only reviews whose preceding row belongs to the same user (this drops the
# first review of each user, which has no earlier review to compare against) and
# restrict the analysis to reviews rated exactly 3 stars
has_previous_review = reviews['user_id'] == reviews['previous_user_id']
is_three_star = reviews['stars'] == 3
# explicit copy: the columns added below go onto an independent frame instead of
# onto a filtered selection of the full dataset
reviews = reviews.loc[has_previous_review & is_three_star].copy()

# calculate the delta in the review rating (current rating minus previous rating)
reviews['delta_stars'] = reviews['stars'] - reviews['previous_stars']

# add categories based on delta:
#   A -> delta < 0  (also the fallback when delta is missing)
#   B -> delta == 0
#   C -> delta > 0
reviews['class'] = 'A'
reviews.loc[reviews['delta_stars'] == 0, 'class'] = 'B'
reviews.loc[reviews['delta_stars'] > 0, 'class'] = 'C'

In [4]:
# reduce the dataset to 2000 reviews of each category
SAMPLES_PER_CLASS = 2000

# Sample each class separately with the same seed and stack the samples in
# class order. Iterating over the groups (rather than groupby.apply) always
# keeps the 'class' column in the result, and ignore_index=True gives every
# sampled row a unique label instead of repeating 0..N-1 once per class.
# NOTE: sample() raises ValueError if a class has fewer rows than requested.
sampled_reviews = pd.concat(
    [
        class_reviews.sample(SAMPLES_PER_CLASS, random_state=0)
        for _, class_reviews in reviews.groupby('class')
    ],
    ignore_index=True,
)

# check if the sampling went well
sampled_reviews.groupby('class').count()

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,previous_stars,previous_user_id,delta_stars
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000
B,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000
C,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000,2000


In [5]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import unidecode

# spaCy English language object, used here only to split text into tokens
spacy_tokenizer = English()

# transliterate accented characters with unidecode and lowercase every review
normalized_texts = [unidecode.unidecode(raw_text).lower() for raw_text in sampled_reviews['text']]
sampled_reviews['text'] = normalized_texts

# tokenize each review and drop the tokens listed in spaCy's stop-word set
kept_tokens = [
    [token.text for token in spacy_tokenizer(text) if token.text not in STOP_WORDS]
    for text in normalized_texts
]
sampled_reviews['spacy_token'] = kept_tokens

# one space-separated string per review: the input format used for BoW and TF-IDF
pre_processed_inputs = [' '.join(tokens) for tokens in kept_tokens]

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# create the BoW representation for the set
# (unigrams and bigrams, vocabulary capped at 512 features)
count_vec = CountVectorizer(max_features=512, ngram_range=(1, 2))
bow_representation = count_vec.fit_transform(pre_processed_inputs)

# create the array with BoW and normalize it: every row is divided by its total
# count so that it sums to 1; rows without any counted term stay all-zero
bow_array = bow_representation.toarray()
row_totals = bow_array.sum(axis=1, keepdims=True)
row_totals[row_totals == 0] = 1  # avoids 0/0 on empty rows; 0/1 leaves them at zero
# kept as a list of per-review row vectors, as before
bow_normalized = list(bow_array / row_totals)

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# labels are the delta classes; features are the normalized BoW vectors
review_group = sampled_reviews['class'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    np.array(bow_normalized),
    np.array(review_group),
    test_size=0.25,
    random_state=0,
)

# the three classifiers to compare: k-nearest neighbours, SVM and random forest
knn = KNeighborsClassifier(n_neighbors=20)
svm = SVC()
rfc = RandomForestClassifier(max_depth=6, random_state=0)

# fit each model on the training split and report accuracy on both splits
for model, test_caption, train_caption in (
    (knn, "KNN test score:", "KNN train score:"),
    (svm, "SVM test score:", "SVM train score:"),
    (rfc, "RF test score:", "RF train score:"),
):
    model.fit(X_train, y_train)
    print(test_caption, model.score(X_test, y_test))
    print(train_caption, model.score(X_train, y_train))

KNN test score: 0.32866666666666666
KNN train score: 0.45222222222222225
SVM test score: 0.36466666666666664
SVM train score: 0.8068888888888889
RF test score: 0.36
RF train score: 0.6662222222222223
