In [22]:
import pandas as pd

# load the restaurant reviews table from the processed_data folder
REVIEWS_CSV = 'processed_data/reviews_restaurants.csv'
reviews = pd.read_csv(REVIEWS_CSV)

In [23]:
# Keep only the two ends of the price scale by dropping reviews whose
# price_range is exactly 2. The explicit .copy() makes the filtered frame
# independent of the one it was sliced from, so the column assignments below
# write to a real DataFrame instead of a possible view (this is what triggers
# pandas' SettingWithCopyWarning).
reviews = reviews[reviews['price_range'] != 2].copy()

# add categories based on price:
#   price_range == 1 -> 'cheap'
#   price_range >= 3 -> 'expensive'
# any row matching neither condition (e.g. a missing price_range) keeps the
# empty-string placeholder and would show up as its own group in the check below
reviews['price_class'] = ''
reviews.loc[reviews['price_range'] == 1, 'price_class'] = 'cheap'
reviews.loc[reviews['price_range'] >= 3, 'price_class'] = 'expensive'

# check results: number of non-null values per column for each price class
reviews.groupby('price_class').count()

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,price_range,state
price_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cheap,967113,967113,967113,967113,967113,967113,967113,967113,967113,967113,967113
expensive,332694,332694,332694,332694,332694,332694,332694,332694,332694,332694,332694


In [24]:
# reduce the dataset to 10000 reviews of each category
#
# Every price class is sampled separately with the same seed, and the pieces
# are concatenated in group order. Compared with groupby(...).apply(...) plus
# droplevel this:
#   * keeps the 'price_class' column regardless of how the installed pandas
#     treats grouping columns inside apply (newer versions deprecate passing
#     them to the applied function), and
#   * produces one unique 0..N-1 index instead of two overlapping 0..9999
#     ranges, so label-based lookups on `reviews` are unambiguous.
# The selected rows and their order are unchanged.
# NOTE: sample() raises if a class has fewer than 10000 reviews.
reviews = pd.concat(
    [
        class_reviews.sample(10000, random_state=0)
        for _, class_reviews in reviews.groupby('price_class')
    ],
    ignore_index=True,
)

# check if the sampling went well: both classes should report 10000 rows
reviews.groupby('price_class').count()

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,price_range,state
price_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cheap,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
expensive,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000


In [25]:
from sklearn.model_selection import train_test_split
import numpy as np

# the price class of each review is the label; the review text is the only input.
# a quarter of the reviews is held out for testing, with a fixed seed so the
# split is repeatable
review_group = reviews['price_class'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    reviews[['text']],
    np.array(review_group),
    test_size=0.25,
    random_state=0,
)

In [26]:
from yelp_functions import get_processed_inputs, sum_to_one
from sklearn.feature_extraction.text import CountVectorizer

# run the project's pre-processing helper on the training inputs, then set up
# a bag-of-words vectorizer limited to 4096 features over unigrams and bigrams
X_train = get_processed_inputs(X_train)
bow_vec = CountVectorizer(ngram_range=(1, 2), max_features=4096)

# fit the vocabulary on the training set only, turn the sparse counts into a
# dense array and pass it through sum_to_one
# (presumably a per-row normalisation: row 0 sums to 1.0 in the check below)
X_train = sum_to_one(bow_vec.fit_transform(X_train).toarray())

# the test set goes through the same steps but reuses the vocabulary learned
# from the training set (transform, not fit_transform)
X_test = sum_to_one(bow_vec.transform(get_processed_inputs(X_test)).toarray())

# sanity check on X_train: number of rows, features per row, sum of the first row
len(X_train), X_train[0].shape[0], X_train[0].sum()

(15000, 4096, 1.0)

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

def fit_and_report(label, model):
    """Fit a classifier on the training split and print its scores.

    Uses the notebook-level X_train / y_train for fitting and scores the
    fitted model on both X_test / y_test and X_train / y_train, so a large
    gap between the two printed numbers points at over-fitting.

    Parameters
    ----------
    label : str
        Short name used as the prefix of the two printed lines.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``score``.

    Returns
    -------
    estimator
        The same object that was passed in, now fitted.
    """
    model.fit(X_train, y_train)
    print(f"{label} test score:", model.score(X_test, y_test))
    print(f"{label} train score:", model.score(X_train, y_train))
    return model


# using KNNs for classification
knn = fit_and_report("KNN", KNeighborsClassifier(n_neighbors=20))

# using SVMs for classification (default SVC settings)
svm = fit_and_report("SVM", SVC())

# using random forests for classification
rfc = fit_and_report("RF", RandomForestClassifier(max_depth=6, random_state=0))

# using logistic regression for classification
lrc = fit_and_report("LR", LogisticRegression(random_state=0))

# using multinomial naive Bayes for classification
nbc = fit_and_report("NB", MultinomialNB())

KNN test score: 0.5908
KNN train score: 0.5988666666666667
SVM test score: 0.9206
SVM train score: 0.9847333333333333
RF test score: 0.8324
RF train score: 0.8426
LR test score: 0.8902
LR train score: 0.8923333333333333
NB test score: 0.925
NB train score: 0.9302
