In [1]:
import os
import json
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from nltk.stem.snowball import SnowballStemmer
import nltk
stdout = sys.stdout
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
reviews = pd.read_csv("all_chains_cs_reviews.csv")
reviews.head(5)

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,...,categories,hours,review_id,user_id,stars_y,useful,funny,cool,text,date
0,44,c32IpmTXxiDxKYKvhJiSuQ,Popeyes,6000 N Terminal Pkwy,Atlanta,GA,30320,33.640737,-84.429305,3.5,...,"Fast Food, Chicken Wings, Restaurants","{'Monday': '10:0-1:0', 'Tuesday': '10:0-1:0', ...",tr6X_kKyXgnnyOnax5o0Ng,cZA_G7kIkyIrR15EKXoVFw,1.0,0,0,0,Awful. Waited in line for 15-20 for a chicken...,2016-05-25 17:00:07
1,91,c32IpmTXxiDxKYKvhJiSuQ,Popeyes,6000 N Terminal Pkwy,Atlanta,GA,30320,33.640737,-84.429305,3.5,...,"Fast Food, Chicken Wings, Restaurants","{'Monday': '10:0-1:0', 'Tuesday': '10:0-1:0', ...",3EJq4ihQoTVmvTunSP2-RQ,zKMCLxQnAOXpHJIKMZCI_Q,1.0,5,1,0,This is my first time at the Hartsfield Airpor...,2015-06-08 16:13:57
2,131,IpNJfLJ6R3RjZJF_ucGaZA,Popeyes,4932 State Route 46,Sanford,FL,32771,28.811882,-81.344261,1.5,...,"Chicken Wings, Restaurants, Fast Food, America...","{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",4ASda-KRC4gz5n194yHQ1A,CpRBM-El-mqvbv93lYX5QA,4.0,5,2,4,They weren't ready for the return of the chick...,2019-11-03 17:56:41
3,132,IpNJfLJ6R3RjZJF_ucGaZA,Popeyes,4932 State Route 46,Sanford,FL,32771,28.811882,-81.344261,1.5,...,"Chicken Wings, Restaurants, Fast Food, America...","{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",4v8hM7d-Zu5x5bdsYfxU1A,ui1vL68Ty9_aeKGtzJNSHg,1.0,1,0,0,I have being several times there but they neve...,2020-02-04 13:37:25
4,134,IpNJfLJ6R3RjZJF_ucGaZA,Popeyes,4932 State Route 46,Sanford,FL,32771,28.811882,-81.344261,1.5,...,"Chicken Wings, Restaurants, Fast Food, America...","{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",thb_gq39zatj4dEv_iyAbw,Lk_clm7vulcrkvcBAyO6fg,1.0,0,0,0,I have been trying to get the spicy chicken sa...,2019-12-17 02:20:50


In [3]:
reviews.shape

(2424, 23)

In [5]:
stemmer = SnowballStemmer("english")
reviews['stemmed'] = reviews.text.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
reviews.stemmed.head()

0    awful. wait line 15-20 chicken sandwich biscui...
1    this first time hartsfield airport i idea serv...
2    they readi return chicken. they stop take orde...
3    i sever time never chicken sandwich problem ki...
4    i tri get spici chicken sandwich last three mo...
Name: stemmed, dtype: object

In [6]:
cvec = CountVectorizer(stop_words='english', min_df=1, max_df=.5, ngram_range=(1,2))
cvec

CountVectorizer(max_df=0.5, ngram_range=(1, 2), stop_words='english')

In [7]:
from itertools import islice
cvec.fit(reviews.stemmed)
list(islice(cvec.vocabulary_.items(), 20))

[('awful', 6031),
 ('wait', 88921),
 ('line', 45923),
 ('15', 418),
 ('20', 577),
 ('biscuit', 8002),
 ('order', 57236),
 ('bone', 8756),
 ('staff', 78386),
 ('50', 1117),
 ('miser', 52658),
 ('compar', 16254),
 ('helpful', 37878),
 ('unacceptable', 87082),
 ('awful wait', 6058),
 ('wait line', 89056),
 ('line 15', 45930),
 ('15 20', 421),
 ('20 chicken', 583),
 ('sandwich biscuit', 70376)]

In [8]:
len(cvec.vocabulary_)

93818

In [9]:
cvec = CountVectorizer(stop_words='english', min_df=.0025, max_df=.1, ngram_range=(1,2))
cvec.fit(reviews.stemmed)
len(cvec.vocabulary_)

3206

In [10]:
cvec_counts = cvec.transform(reviews.stemmed)
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (2424, 3206)
nonzero count: 84466
sparsity: 1.09%


In [11]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
355,burger,353
3101,wendi,319
1294,grill,309
3129,window,308
2176,pretti,301
1748,mcdonald,297
2151,popey,283
1684,lunch,283
1658,lot,281
886,everi,281


In [12]:
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

<2424x3206 sparse matrix of type '<class 'numpy.float64'>'
	with 84466 stored elements in Compressed Sparse Row format>

In [13]:
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
3101,wendi,0.013571
355,burger,0.013127
1294,grill,0.012459
2151,popey,0.01202
2176,pretti,0.011981
1684,lunch,0.01149
2220,quick,0.011245
2589,service,0.011168
1748,mcdonald,0.011073
3180,wrong,0.010887


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer(min_df=.0025, max_df=.1, stop_words='english')
tvec_weights = tvec.fit_transform(reviews.stemmed.dropna())
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
1798,wendi,0.015916
754,grill,0.015405
231,burger,0.015184
1283,pretti,0.014927
1260,popey,0.014503
1469,service,0.014063
990,lunch,0.01401
1315,quick,0.013986
1842,wrong,0.013511
340,cold,0.013136


In [15]:
#Popeyes
df1,df2,df3,df4,df5,df6,df7 = [x for _,x in reviews.groupby(reviews.name)]

In [16]:
dfs = [df1,df2,df3,df4,df5,df6,df7]

In [17]:
results = []

In [18]:
for df in dfs:
    #stemming the words
    stemmer = SnowballStemmer("english")
    df['stemmed'] = df.text.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
    tvec = TfidfVectorizer(min_df=.0025, max_df=.1, stop_words='english')
    tvec_weights = tvec.fit_transform(df.stemmed.dropna())
    weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    results.append(weights_df)
    weights_df.sort_values(by='weight', ascending=False).head(20)

In [25]:
results[1].sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
478,salad,0.030864
386,new,0.027711
335,lie,0.022006
593,times,0.020715
633,wish,0.020084
396,offer,0.019439
647,zero,0.01917
91,charbroil,0.019073
389,nom,0.018908
558,super,0.018844


NameError: name 'test' is not defined