In [1]:
import pandas as pd

# import data for businesses
businesses = pd.read_csv('csv_data/yelp_academic_dataset_business.csv')

# import data from states and sort it
states_data = pd.read_csv('processed_data/states_data.csv')
states_data = states_data.sort_values(by='gini_index_2019').reset_index(drop=True)

# add the level of inequality to states data
states_data['inequality'] = ''
states_data.loc[states_data.index < 17, 'inequality'] = 'low'
states_data.loc[states_data.index > 33, 'inequality'] = 'high'
states_data.loc[states_data['inequality'] == '', 'inequality'] = 'medium'

# check results
states_data

Unnamed: 0,state,postal_code,gini_index_2019,gdppc_2021,inequality
0,Utah,UT,0.4268,66011,low
1,Idaho,ID,0.4337,49616,low
2,Wyoming,WY,0.4345,71911,low
3,South Dakota,SD,0.436,68357,low
4,Alaska,AK,0.4376,75027,low
5,Wisconsin,WI,0.4391,62065,low
6,Hawaii,HI,0.4397,62474,low
7,Nebraska,NE,0.44,76584,low
8,New Hampshire,NH,0.4406,70729,low
9,Iowa,IA,0.4422,68849,low


In [2]:
# add inequality level to each business
states_dict = dict(zip(states_data['postal_code'].tolist(), states_data['inequality'].tolist()))
businesses['state_inequality'] = businesses['state'].apply(lambda x: states_dict.get(x))

# check results
businesses

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,state_inequality
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",,high
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...",medium
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...",medium
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",high
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...",high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3...",
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3...",high
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",,medium
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': ""{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",high


In [3]:
# create a dict to transfer the data
businesses_dict = dict(zip(businesses['business_id'].tolist(), businesses['state_inequality'].tolist()))

# import data for reviews and sample it
reviews = pd.read_csv('csv_data/yelp_academic_dataset_review.csv').sample(frac=0.1, random_state=0)

# add inequality levels for reviews
reviews['state_inequality'] = reviews['business_id'].apply(lambda x: businesses_dict.get(x))

# check results
reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,state_inequality
3014718,EXt74EO6JhK-2dM44f8KiA,JapJQZIsWLn8MpsHOH6J-w,ySHS6i9RxGZOIkVPcj-ckQ,3.0,0.0,0.0,0.0,I had the 6 ounce Filet with the mixed vegetab...,2014-04-08 00:37:08,medium
6473116,xQ0skqjmT1o4o_QqbVjzeg,Q0Y7P42DgCjCN4gOHpL91Q,Cbktlvci_z4HwdqPUW-5cg,3.0,0.0,0.0,0.0,Super friendly service. Fair price on drinks. ...,2018-09-17 20:43:45,high
5966131,2ov0mIOaYUCanSxXiemjJQ,ryBRMyty7d2lxO19q_oPjg,hJTwBhYBTkiHaDMml_v_sw,1.0,2.0,0.0,0.0,I swear you could order just a side baguette a...,2018-11-06 19:47:46,high
2066348,Bv8gCzzR3gdacxeQ8ZrHgw,W1GvlnCar_UjOK3Go3iVYQ,mFWFtD6bXdpLz8bDXe-LHw,5.0,0.0,0.0,0.0,Just WOW! My friends know how much I'm a tea ...,2019-10-16 14:48:38,high
112607,1d0l2nyTEDjrpwS3BV-ABQ,UTsHQia_JFcg1deWozrvXQ,6exYCuj4iSl1Hd6hb0C_Bg,5.0,0.0,0.0,1.0,I gave Leigh Anne a photo that I found on the ...,2015-01-21 23:50:03,medium
...,...,...,...,...,...,...,...,...,...,...
4130384,bRBnGQtWV-giTCvl6RiuBQ,zZvlvWrWxka2Q0wEEgaYqA,CjotjEjteeBeBsURCPBZxQ,1.0,3.0,0.0,1.0,I had a technician come out on 6/29 after my A...,2019-07-03 16:45:48,medium
1625675,8J_OD5dcVfcLV8RwnEwuMw,DuwZ_C3ZncPtEtTWeEPFjw,Vn6fOha6UZ2HGyZoAcxPkw,5.0,0.0,0.0,0.0,BEST gophers and pralines ever. Seriously supe...,2018-03-10 01:07:39,high
3262276,wZZSsUVFwlyMhTrRngtC-Q,TQ2PSzXAvLTjDheHP9S7lg,jz41zeEyGVEBhE2LiROICQ,1.0,1.0,0.0,0.0,Tragic. I used to visit Thai Nana when they w...,2015-03-04 19:41:39,high
4601676,2V6jKH9KfSBJOgXT8b3ceA,TNLaYYt1Tpy4YqghZUvt1A,WnyiCJfpsl1RViIWafNwIA,4.0,6.0,2.0,5.0,Think of how much you enjoy a fine adult bever...,2009-12-29 21:18:16,medium


In [4]:
# use only reviews that have either high or low inequality
reviews = reviews[reviews['state_inequality'].isin(['high', 'low'])]

# reduce the dataset to 10000 reviews of each category
sampled_reviews = reviews.groupby('state_inequality').apply(lambda x: x.sample(10000, random_state=0).reset_index(drop=True))
sampled_reviews = sampled_reviews.droplevel(level=0)

# check results
sampled_reviews.groupby('state_inequality').count()

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
state_inequality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
high,10000,10000,10000,10000,10000,10000,10000,10000,10000
low,10000,10000,10000,10000,10000,10000,10000,10000,10000


In [5]:
# check mean rating for each group
sampled_reviews.groupby('state_inequality').mean()

Unnamed: 0_level_0,stars,useful,funny,cool
state_inequality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high,3.7748,1.1527,0.3262,0.4669
low,3.6695,1.1169,0.2583,0.3941


In [6]:
# check correlation between inequality and gdppc
states_data.corr()

Unnamed: 0,gini_index_2019,gdppc_2021
gini_index_2019,1.0,-0.022353
gdppc_2021,-0.022353,1.0


In [7]:
from yelp_functions import get_processed_inputs, sum_to_one
from sklearn.feature_extraction.text import CountVectorizer

# use BoW to create vectors
bow_inputs = get_processed_inputs(sampled_reviews)

# create the BoW representation for the set
count_vec = CountVectorizer(max_features=1024, ngram_range=(1,2))
bow_representation = count_vec.fit_transform(bow_inputs)

# create the array with BoW and normalize it
bow_array = bow_representation.toarray()
bow_normalized = sum_to_one(bow_array)

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

review_group = sampled_reviews['state_inequality'].tolist()
X_train, X_test, y_train, y_test = train_test_split(np.array(bow_normalized), np.array(review_group), test_size = 0.25, random_state=0)

# using KNNs for classification
knn = KNeighborsClassifier(n_neighbors=20).fit(X_train, y_train)
print("KNN test score:", knn.score(X_test, y_test))
print("KNN train score:", knn.score(X_train, y_train))

# using SVMs for classification
svm = SVC().fit(X_train, y_train)
print("SVM test score:", svm.score(X_test, y_test))
print("SVM train score:", svm.score(X_train, y_train))

# using random forests for classification
rfc = RandomForestClassifier(max_depth=6, random_state=0).fit(X_train, y_train)
print("RF test score:", rfc.score(X_test, y_test))
print("RF train score:", rfc.score(X_train, y_train))

KNN test score: 0.5444
KNN train score: 0.6201333333333333
SVM test score: 0.6134
SVM train score: 0.8924666666666666
RF test score: 0.613
RF train score: 0.6877333333333333


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# using logistic regression for classification
lrc = LogisticRegression(random_state=0).fit(X_train, y_train)
print("LR test score:", lrc.score(X_test, y_test))
print("LR train score:", lrc.score(X_train, y_train))

# using logistic regression for classification
nbc = MultinomialNB().fit(X_train, y_train)
print("NB test score:", nbc.score(X_test, y_test))
print("NB train score:", nbc.score(X_train, y_train))

LR test score: 0.6192
LR train score: 0.6519333333333334
NB test score: 0.6226
NB train score: 0.6612


In [10]:
import gensim.downloader as api

# getting the pre-trained word2vec model
w2v = api.load('word2vec-google-news-300')

In [14]:
# we now test the model (just for demonstrations purposes)
vec_king = w2v['king']
w2v.similarity('king', 'queen')

0.6510957

In [12]:
# create inputs for the w2v model
w2v_inputs = get_processed_inputs(sampled_reviews, mode='list_of_lists')
w2v_inputs

[['heard',
  'place',
  'donuts',
  'philly',
  '.',
  'donuts',
  'hot',
  'fresh',
  'lacked',
  'flavor',
  '.',
  'chose',
  'cookies',
  'cream',
  'raspberry',
  ',',
  'maybe',
  'way',
  'excited',
  'eat',
  ',',
  'bit',
  'disappointed',
  '.',
  'customer',
  'service',
  'good',
  'coffee',
  '.'],
 ['ok',
  ',',
  'bestie',
  'needed',
  'drive',
  ',',
  'decided',
  'drive',
  'santa',
  'barbara',
  'los',
  'angeles',
  ',',
  'minute',
  'decision',
  'arrived',
  'santa',
  'barbara',
  '3',
  'pm',
  'hungry',
  'breakfast',
  ',',
  'yelped',
  'mexican',
  'restaurant',
  'dine',
  'restaurant',
  'closest',
  ',',
  'bestie',
  'thought',
  'place',
  'authentic',
  '/',
  'mexican',
  'good',
  ',',
  'like',
  'american',
  'mexican',
  ',',
  ',',
  'went',
  'low',
  'expectations',
  ',',
  'bestie',
  'asked',
  'server',
  'places',
  '"',
  'better',
  '"',
  'mexican',
  'food',
  ',',
  'server',
  'kept',
  'insisting',
  'beat',
  'place',
  ',',
  '

In [21]:
# we now copy the methodology used in the reference book
w2v_array = []
for text in w2v_inputs:
    text_array = np.zeros(300)
    len_count = 0
    for token in text:
        if token in w2v:
            text_array += w2v[token]
            len_count += 1
    if len_count != 0:
        text_array /= len_count
    w2v_array.append(text_array)

In [22]:
# prepare for train
X_train, X_test, y_train, y_test = train_test_split(np.array(w2v_array), np.array(review_group), test_size = 0.25, random_state=0)

# using KNNs for classification
knn = KNeighborsClassifier(n_neighbors=20).fit(X_train, y_train)
print("KNN test score:", knn.score(X_test, y_test))
print("KNN train score:", knn.score(X_train, y_train))

# using SVMs for classification
svm = SVC().fit(X_train, y_train)
print("SVM test score:", svm.score(X_test, y_test))
print("SVM train score:", svm.score(X_train, y_train))

# using random forests for classification
rfc = RandomForestClassifier(max_depth=6, random_state=0).fit(X_train, y_train)
print("RF test score:", rfc.score(X_test, y_test))
print("RF train score:", rfc.score(X_train, y_train))

# using logistic regression for classification
lrc = LogisticRegression(random_state=0).fit(X_train, y_train)
print("LR test score:", lrc.score(X_test, y_test))
print("LR train score:", lrc.score(X_train, y_train))

# using logistic regression for classification
nbc = MultinomialNB().fit(X_train, y_train)
print("NB test score:", nbc.score(X_test, y_test))
print("NB train score:", nbc.score(X_train, y_train))

KNN test score: 0.5584
KNN train score: 0.6151333333333333
SVM test score: 0.6086
SVM train score: 0.6942666666666667
RF test score: 0.5642
RF train score: 0.7124
LR test score: 0.606
LR train score: 0.6183333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: Negative values in data passed to MultinomialNB (input X)