In [1]:
import os
import json
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import nltk
stdout = sys.stdout
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
reviews = pd.read_csv("all_chains_cs_reviews.csv")
reviews.head(5)

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,...,categories,hours,review_id,user_id,stars_y,useful,funny,cool,text,date
0,44,c32IpmTXxiDxKYKvhJiSuQ,Popeyes,6000 N Terminal Pkwy,Atlanta,GA,30320,33.640737,-84.429305,3.5,...,"Fast Food, Chicken Wings, Restaurants","{'Monday': '10:0-1:0', 'Tuesday': '10:0-1:0', ...",tr6X_kKyXgnnyOnax5o0Ng,cZA_G7kIkyIrR15EKXoVFw,1.0,0,0,0,Awful. Waited in line for 15-20 for a chicken...,2016-05-25 17:00:07
1,91,c32IpmTXxiDxKYKvhJiSuQ,Popeyes,6000 N Terminal Pkwy,Atlanta,GA,30320,33.640737,-84.429305,3.5,...,"Fast Food, Chicken Wings, Restaurants","{'Monday': '10:0-1:0', 'Tuesday': '10:0-1:0', ...",3EJq4ihQoTVmvTunSP2-RQ,zKMCLxQnAOXpHJIKMZCI_Q,1.0,5,1,0,This is my first time at the Hartsfield Airpor...,2015-06-08 16:13:57
2,131,IpNJfLJ6R3RjZJF_ucGaZA,Popeyes,4932 State Route 46,Sanford,FL,32771,28.811882,-81.344261,1.5,...,"Chicken Wings, Restaurants, Fast Food, America...","{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",4ASda-KRC4gz5n194yHQ1A,CpRBM-El-mqvbv93lYX5QA,4.0,5,2,4,They weren't ready for the return of the chick...,2019-11-03 17:56:41
3,132,IpNJfLJ6R3RjZJF_ucGaZA,Popeyes,4932 State Route 46,Sanford,FL,32771,28.811882,-81.344261,1.5,...,"Chicken Wings, Restaurants, Fast Food, America...","{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",4v8hM7d-Zu5x5bdsYfxU1A,ui1vL68Ty9_aeKGtzJNSHg,1.0,1,0,0,I have being several times there but they neve...,2020-02-04 13:37:25
4,134,IpNJfLJ6R3RjZJF_ucGaZA,Popeyes,4932 State Route 46,Sanford,FL,32771,28.811882,-81.344261,1.5,...,"Chicken Wings, Restaurants, Fast Food, America...","{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",thb_gq39zatj4dEv_iyAbw,Lk_clm7vulcrkvcBAyO6fg,1.0,0,0,0,I have been trying to get the spicy chicken sa...,2019-12-17 02:20:50


In [3]:
reviews.shape

(2424, 23)

In [4]:
lemmatizer = WordNetLemmatizer()
reviews['lemmatized'] = reviews.text.map(lambda x: ' '.join([lemmatizer.lemmatize(y) for y in x.split(' ')]))

In [5]:
stemmer = SnowballStemmer("english")
reviews['stemmed'] = reviews.text.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
reviews.stemmed.head()

0    awful.  wait in line for 15-20 for a chicken s...
1    this is my first time at the hartsfield airpor...
2    they weren't readi for the return of the chick...
3    i have be sever time there but they never have...
4    i have been tri to get the spici chicken sandw...
Name: stemmed, dtype: object

In [6]:
cvec = CountVectorizer(stop_words='english', min_df=1, max_df=.5, ngram_range=(1,2))
cvec

CountVectorizer(max_df=0.5, ngram_range=(1, 2), stop_words='english')

In [7]:
from itertools import islice
cvec.fit(reviews.stemmed)
list(islice(cvec.vocabulary_.items(), 20))

[('awful', 6252),
 ('wait', 91514),
 ('line', 47921),
 ('15', 419),
 ('20', 578),
 ('biscuit', 8715),
 ('didn', 22322),
 ('order', 59599),
 ('bone', 9471),
 ('staff', 80694),
 ('50', 1119),
 ('miser', 54677),
 ('compar', 16976),
 ('helpful', 39400),
 ('unacceptable', 89470),
 ('awful wait', 6278),
 ('wait line', 91649),
 ('line 15', 47928),
 ('15 20', 422),
 ('20 chicken', 584)]

In [8]:
len(cvec.vocabulary_)

96816

In [9]:
cvec = CountVectorizer(stop_words='english', min_df=.0025, max_df=.1, ngram_range=(1,2))
cvec.fit(reviews.stemmed)
len(cvec.vocabulary_)

3378

In [10]:
cvec_counts = cvec.transform(reviews.stemmed)
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (2424, 3378)
nonzero count: 87747
sparsity: 1.07%


In [11]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
384,burger,351
3267,wendi,318
3297,window,308
1361,grill,304
2299,pretti,301
1843,mcdonald,297
1781,lunch,283
2274,popey,282
949,everi,281
1755,lot,281


In [12]:
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

<2424x3378 sparse matrix of type '<class 'numpy.float64'>'
	with 87747 stored elements in Compressed Sparse Row format>

In [13]:
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
3267,wendi,0.013365
384,burger,0.012847
1361,grill,0.012155
2299,pretti,0.011813
2274,popey,0.0117
1781,lunch,0.011253
2714,service,0.011111
2343,quick,0.011042
1843,mcdonald,0.010888
3352,wrong,0.010671


In [14]:
#tfidf for all reviews combined(across all chains)
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer(min_df=.0025, max_df=.1, stop_words='english')
tvec_weights = tvec.fit_transform(reviews.lemmatized.dropna())
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
774,grilled,0.015108
1314,popeyes,0.014992
1334,pretty,0.014844
231,burger,0.014816
476,delicious,0.014403
441,crispy,0.014266
1031,lunch,0.013858
293,cheese,0.01332
1936,wrong,0.01325
341,cold,0.012913


In [15]:
#independent tfidf for each chain
df1,df2,df3,df4,df5,df6,df7 = [x for _,x in reviews.groupby(reviews.name)]

In [16]:
dfs = [df1,df2,df3,df4,df5,df6,df7]

In [17]:
results = []

In [18]:
for df in dfs:
    #stemming the words(lemmatizer works better)
#     stemmer = SnowballStemmer("english")
#     df['stemmed'] = df.text.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
    lemmatizer = WordNetLemmatizer()
    df['lemmatized'] = df.text.map(lambda x: ' '.join([lemmatizer.lemmatize(y) for y in x.split(' ')]))
    #create tfidf with stopwords
    tvec = TfidfVectorizer(min_df=.0025, max_df=.1, stop_words='english')
    tvec_weights = tvec.fit_transform(df.lemmatized.dropna())
    weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    results.append(weights_df)

In [19]:
results[0].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
1448,mayo,0.015013
2660,wrong,0.014572
359,burnt,0.014497
327,bread,0.014194
1169,hot,0.013462
1109,hard,0.013284
1475,menu,0.013086
2053,sauce,0.012722
2043,sandwiches,0.012481
2159,sitting,0.012474


In [20]:
results[1].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
475,salad,0.030678
383,new,0.028564
34,ask,0.023499
630,wish,0.020338
582,times,0.019648
337,look,0.019581
91,charbroiled,0.019568
553,super,0.019525
646,zero,0.019156
394,offer,0.01871


In [21]:
results[2].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
1689,sandwiches,0.015694
746,favorite,0.015113
333,cheese,0.014284
546,deluxe,0.014021
112,area,0.013823
362,clean,0.013777
1571,quick,0.013731
84,amazing,0.013516
1333,nuggets,0.013321
811,free,0.01308


In [22]:
results[3].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
999,taco,0.02178
426,free,0.020739
468,great,0.019635
102,bell,0.018044
730,piece,0.017959
158,care,0.017748
313,donut,0.017322
1102,walked,0.017237
1101,waited,0.016722
200,cold,0.016655


In [23]:
results[4].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
2006,sauce,0.015661
750,dry,0.015065
1424,mayo,0.014666
2329,tasted,0.013193
1674,patty,0.013175
473,coffee,0.013055
2335,tea,0.01302
981,gave,0.01253
15,15,0.012193
2551,wanted,0.011999


In [24]:
results[5].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
146,best,0.016995
584,fresh,0.016515
1116,pretty,0.016145
964,nice,0.015753
897,mayo,0.015507
464,eat,0.015275
590,fries,0.015244
1473,tender,0.015071
365,crispy,0.014738
881,manager,0.014241


In [25]:
results[6].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
1309,great,0.017536
1811,mayo,0.015644
1688,lettuce,0.014991
3033,tomato,0.014323
1989,nugget,0.014278
3326,worst,0.013361
1964,night,0.013087
1319,grilled,0.012991
2532,sauce,0.01289
3340,wrong,0.012884


In [26]:
#chain orders and names
reviews.name.unique()

array(['Popeyes', 'KFC', "Wendy's", 'Chick-fil-A', "Carl's Jr.",
       "McDonald's", 'Burger King'], dtype=object)

In [27]:
#tfidf based on sentiment(positive/negative labels from the sentiment analysis)
pos_neg = pd.read_csv("pos_and_neg_reviews.csv")
pos_neg

Unnamed: 0.1,Unnamed: 0,name,stars,text,lemma,polarity,analysis
0,0,Popeyes,1.0,Awful. Waited in line for 15-20 for a chicken...,Awful Waited line chicken sandwich biscuit g...,-0.800000,Negative
1,1,Popeyes,1.0,This is my first time at the Hartsfield Airpor...,first time Hartsfield Airport idea service H...,-0.061905,Negative
2,2,Popeyes,4.0,They weren't ready for the return of the chick...,ready return chicken stop take order restaur...,0.079940,Positive
3,3,Popeyes,1.0,I have being several times there but they neve...,several time never chicken sandwich problem ...,0.079592,Positive
4,4,Popeyes,1.0,I have been trying to get the spicy chicken sa...,try get spicy chicken sandwich last three mo...,-0.166667,Negative
...,...,...,...,...,...,...,...
2419,2419,Burger King,1.0,We stopped in this evening about 7:50 and orde...,stop evening order dinner whopper onion ring...,-0.345238,Negative
2420,2420,Burger King,1.0,"I just (December 4th, 2010 at 1:50PM) went thr...",December th PM go Burger King drive store SE...,-0.122917,Negative
2421,2421,Burger King,1.0,Do not be fooled by the commercials for the ch...,fool commercial chicken parmesan sandwich fa...,-0.261429,Negative
2422,2422,Burger King,1.0,The strangest experience ever!\nWe pulled up t...,strange experience ever pull drive Inside cl...,-0.107639,Negative


In [28]:
[neg_reviews,pos_reviews,neutral_reviews] = [x for _,x in pos_neg.groupby(pos_neg.analysis)]

In [29]:
sentiment_results = []
for df in [neg_reviews,pos_reviews]:
    #stemming the words(lemmatizer works better)
    lemmatizer = WordNetLemmatizer()
    df['lemmatized'] = df.text.map(lambda x: ' '.join([lemmatizer.lemmatize(y) for y in x.split(' ')]))
    #create tfidf with stopwords
    tvec = TfidfVectorizer(min_df=.0025, max_df=.1, stop_words='english')
    tvec_weights = tvec.fit_transform(df.lemmatized.dropna())
    weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    sentiment_results.append(weights_df)

In [30]:
sentiment_results[0].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
1252,popeyes,0.018034
1449,sauce,0.015594
1445,sandwiches,0.015467
729,grilled,0.015202
1834,waited,0.014104
425,crispy,0.01372
544,employee,0.013655
287,cheese,0.013589
1835,waiting,0.013173
234,bun,0.0129


In [31]:
sentiment_results[1].sort_values(by='weight', ascending=False).head(40)

Unnamed: 0,term,weight
1221,hot,0.017642
641,crispy,0.016977
362,burger,0.015831
1890,pretty,0.015559
1516,mcdonald,0.014557
388,came,0.014295
2667,waffle,0.013936
2637,usually,0.013583
1107,grilled,0.013028
369,busy,0.012804


In [34]:
# results[0].to_csv('Popeyes.csv')
# results[1].to_csv('KFC.csv')
# results[2].to_csv('Wendys.csv')
# results[3].to_csv('Chick-fil-A.csv')
# results[4].to_csv('CarlsJr.csv')
# results[5].to_csv('McDonalds.csv')
# results[6].to_csv('BurgerKing.csv')

In [35]:
# sentiment_results[0].to_csv('negative.csv')
# sentiment_results[1].to_csv('positive.csv')