In [1]:
import pandas as pd

# get data for reviews
reviews = pd.read_csv('csv_data/yelp_academic_dataset_review.csv').sample(n=200000, random_state=0)

# get list of businesses which are restaurants
businesses = pd.read_csv('csv_data/yelp_academic_dataset_business.csv')
businesses = businesses[businesses['categories'].str.contains('Restaurants', na=False)]

# filter only reviews for restaurants
reviews = reviews[reviews['business_id'].isin(businesses['business_id'].tolist())]

# check dataframe for reviews
reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
3014718,EXt74EO6JhK-2dM44f8KiA,JapJQZIsWLn8MpsHOH6J-w,ySHS6i9RxGZOIkVPcj-ckQ,3.0,0.0,0.0,0.0,I had the 6 ounce Filet with the mixed vegetab...,2014-04-08 00:37:08
6473116,xQ0skqjmT1o4o_QqbVjzeg,Q0Y7P42DgCjCN4gOHpL91Q,Cbktlvci_z4HwdqPUW-5cg,3.0,0.0,0.0,0.0,Super friendly service. Fair price on drinks. ...,2018-09-17 20:43:45
5966131,2ov0mIOaYUCanSxXiemjJQ,ryBRMyty7d2lxO19q_oPjg,hJTwBhYBTkiHaDMml_v_sw,1.0,2.0,0.0,0.0,I swear you could order just a side baguette a...,2018-11-06 19:47:46
2066348,Bv8gCzzR3gdacxeQ8ZrHgw,W1GvlnCar_UjOK3Go3iVYQ,mFWFtD6bXdpLz8bDXe-LHw,5.0,0.0,0.0,0.0,Just WOW! My friends know how much I'm a tea ...,2019-10-16 14:48:38
2625623,fNKn-QiJ_pZ_kSsWS9UtUQ,Jq_GTGcuw5jU5Izzh1w5Yg,16tjKOvMw_nOgNViq2LwOQ,5.0,0.0,0.0,0.0,I went while on vacation in September. The chi...,2020-01-21 09:25:12
...,...,...,...,...,...,...,...,...,...
6944129,KH1KO8E65II6Y9VhGjWScw,FR8UGOX13F0CAwpTwM2t1g,RBnQoIRF3V-FMJSn_BHy0A,1.0,3.0,2.0,0.0,[Sigh]...it pains me to have to give PFC one s...,2014-04-14 00:00:45
1078136,TNkZ8yawBb0nMPThgw2qcg,16RMPIx3diBsRCvgkshRkg,n5TNfoXg1i8wX5R3iY4WUA,2.0,2.0,0.0,0.0,"Everything, from the food to the drinks (great...",2016-12-15 02:20:47
2434958,RFKHSI1_69GffhK-qSl8MA,OIoiAfpuBLvOaOh_GfYUzA,yID8zq7b4YRcWn5Vcc-MuA,5.0,0.0,0.0,0.0,I love my neighborhood Vietnamese restaurant! ...,2019-01-11 23:41:56
6602594,aP-tvUjlD0xNQ7-J5kpa7A,SgdS9i7RxMUxwuV22teaLA,I_mCFePUG2MGuH-zuws7bA,5.0,0.0,0.0,0.0,Had been disappointed the last couple of times...,2018-10-04 23:44:15


In [2]:
# check amount of reviews per category of stars
reviews.groupby('stars').count()

Unnamed: 0_level_0,review_id,user_id,business_id,useful,funny,cool,text,date
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,16151,16151,16151,16151,16151,16151,16151,16151
2.0,11518,11518,11518,11518,11518,11518,11518,11518
3.0,15379,15379,15379,15379,15379,15379,15379,15379
4.0,32353,32353,32353,32353,32353,32353,32353,32353
5.0,59720,59720,59720,59720,59720,59720,59720,59720


In [3]:
from yelp_functions import get_processed_inputs

# get processed inputs
tfidf_inputs = get_processed_inputs(reviews)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create the TF-IDF representation for the set
tfidf_vec = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf_representation = tfidf_vec.fit_transform(tfidf_inputs)

# create the array with TF-IDF and normalize it
tfidf_array = tfidf_representation.toarray()
tfidf_normalized = [vector/sum(vector) if sum(vector) != 0 else vector for vector in tfidf_array]

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# split the data into train and test set
ratings = reviews['stars'].tolist()
X_train, X_test, y_train, y_test = train_test_split(np.array(tfidf_normalized), np.array(ratings), test_size = 0.25, random_state=0)

# using logistic regression for classification
lrc = LogisticRegression(random_state=0).fit(X_train, y_train)
print("LR test score:", lrc.score(X_test, y_test))
print("LR train score:", lrc.score(X_train, y_train))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR test score: 0.5853882359906456
LR train score: 0.5927373199131636
