In [33]:
import pandas as pd

# Load the Yelp business table from the local CSV export.
businesses = pd.read_csv('csv_data/yelp_academic_dataset_business.csv')

# Keep only the businesses whose name begins with "Starbucks"; rows with a
# missing name are treated as non-matches.
is_starbucks = businesses['name'].str.startswith("Starbucks", na=False)
starbucks = businesses[is_starbucks]
print(starbucks.shape)

(730, 14)


In [34]:
# Rank cities by their number of Starbucks rows, largest first.
per_city_counts = starbucks.groupby('city').count()
print(per_city_counts.sort_values('business_id', ascending=False))

# Collect the business ids of the Starbucks rows whose city is Philadelphia.
in_philadelphia = starbucks['city'] == 'Philadelphia'
list_of_ids = starbucks.loc[in_philadelphia, 'business_id'].tolist()

                business_id  name  address  state  postal_code  latitude  \
city                                                                       
Philadelphia             57    57       57     57           57        57   
Tucson                   54    54       54     54           54        54   
Indianapolis             50    50       50     50           50        50   
Tampa                    47    47       47     47           47        47   
Edmonton                 41    41       41     41           41        41   
...                     ...   ...      ...    ...          ...       ...   
Isla Vista                1     1        1      1            1         1   
Jenkintown                1     1        1      1            1         1   
Kennett Square            1     1        1      1            1         1   
Kirkwood                  1     1        1      1            1         1   
Malvern                   1     1        1      1            1         1   

           

In [35]:
# Load the Yelp review table from the local CSV export.
reviews = pd.read_csv('csv_data/yelp_academic_dataset_review.csv')

# Keep only the reviews whose business_id is one of the Philadelphia
# Starbucks ids gathered in list_of_ids.
is_selected_business = reviews['business_id'].isin(list_of_ids)
starbucks_reviews = reviews[is_selected_business]

In [36]:
# Count reviews per business once, sorted with the most-reviewed first, and
# reuse the result for both the printed ranking and the top-5 selection.
reviews_per_business = (
    starbucks_reviews.groupby('business_id')
    .count()
    .sort_values('review_id', ascending=False)
)
print(reviews_per_business)

# Keep only the five businesses with the most reviews.
top_5_ids = reviews_per_business.reset_index()['business_id'][0:5]
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so new columns can be assigned to it without pandas emitting
# SettingWithCopyWarning.
starbucks_reviews = starbucks_reviews.loc[starbucks_reviews['business_id'].isin(top_5_ids)].copy()

                        review_id  user_id  stars  useful  funny  cool  text  \
business_id                                                                    
EFci0tbSrb7wko6tpVDnbA        126      126    126     126    126   126   126   
lgSO-bzx0USaPe34zdLyCw         73       73     73      73     73    73    73   
9c1p8TTOG_F55i3yEiVVDQ         71       71     71      71     71    71    71   
mBcvp7NarEQEviJyRiX23Q         70       70     70      70     70    70    70   
90LsqLbaoQpz_xNbSQsCFA         63       63     63      63     63    63    63   
7Klk8UuK1sdJt5_44eUtDg         61       61     61      61     61    61    61   
f22prjTaNUWwqvVsI4HWxQ         60       60     60      60     60    60    60   
s6nOfGZJpkZ3JplCO9lGiA         58       58     58      58     58    58    58   
PTn0hZz7BiP5T2muyoxrjQ         58       58     58      58     58    58    58   
UrYj-HI2I61BK_nsPEa_QQ         58       58     58      58     58    58    58   
HxdaEwpoXKll0Ze4nhS0XA         57       

In [37]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import unidecode

# Strip accents (via unidecode) and lowercase every review text.
starbucks_reviews['text'] = starbucks_reviews['text'].map(
    lambda raw_text: unidecode.unidecode(raw_text).lower()
)

# spaCy English pipeline, used here only to split text into tokens.
spacy_tokenizer = English()


def _tokens_without_stop_words(review_text):
    """Tokenize review_text with spaCy and return the token texts not in STOP_WORDS."""
    return [
        token.text
        for token in spacy_tokenizer(review_text)
        if token.text not in STOP_WORDS
    ]


# One list of non-stop-word tokens per review.
starbucks_reviews['spacy_token'] = starbucks_reviews['text'].map(_tokens_without_stop_words)

# Show the first few rows to check the pre-processing result.
print(starbucks_reviews[['text', 'spacy_token']].head())

# Re-join each token list into one space-separated string, the input format
# expected by the BoW / TF-IDF vectorizers.
pre_processed_inputs = [' '.join(tokens) for tokens in starbucks_reviews['spacy_token']]

                                                    text  \
37897  i got my coffee right away like after i placed...   
38525  have waited 25 minutes for my order on numerou...   
41073  always a pleasure to stop in while waiting for...   
55469  the lady forgot to write vanilla on my latte. ...   
61244  pretty fast service (even during a very busy d...   

                                             spacy_token  
37897  [got, coffee, right, away, like, placed, order...  
38525  [waited, 25, minutes, order, numerous, occasio...  
41073  [pleasure, stop, waiting, bus, simply, walking...  
55469  [lady, forgot, write, vanilla, latte, ., left,...  
61244  [pretty, fast, service, (, busy, day, ), decen...  


In [59]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words representation: unigrams and bigrams, capped at
# 1024 features.
count_vec = CountVectorizer(max_features=1024, ngram_range=(1,2))
bow_representation = count_vec.fit_transform(pre_processed_inputs)

# Densify the counts and normalise every row so its entries sum to 1.
bow_array = bow_representation.toarray()
row_totals = bow_array.sum(axis=1, keepdims=True)
# Rows with no counted terms have a total of 0; dividing by 1 instead leaves
# them as all-zero rows and avoids a division by zero.
row_totals[row_totals == 0] = 1
# Vectorised division replaces the per-row Python sum(); the result is a 2-D
# float array with one normalised row per review.
bow_normalized = bow_array / row_totals

In [87]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Features are the normalised BoW rows; labels are the business ids of the
# corresponding reviews.
starbucks_ids = starbucks_reviews['business_id'].tolist()
features = np.array(bow_normalized)
targets = np.array(starbucks_ids)
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.25, random_state=0
)

# The three classifiers being compared: k-nearest neighbours, SVM and random forest.
knn = KNeighborsClassifier(n_neighbors=20)
svm = SVC()
rfc = RandomForestClassifier(max_depth=6, random_state=0)

evaluations = (
    (knn, "KNN test score:", "KNN train score:"),
    (svm, "SVM test score:", "SVM train score:"),
    (rfc, "RF test score:", "RF train score:"),
)

# Fit each model on the training split, then report its score on the test
# split followed by its score on the training split.
for model, test_caption, train_caption in evaluations:
    model.fit(X_train, y_train)
    print(test_caption, model.score(X_test, y_test))
    print(train_caption, model.score(X_train, y_train))

KNN test score: 0.36633663366336633
KNN train score: 0.46357615894039733
SVM test score: 0.36633663366336633
SVM train score: 0.8807947019867549
RF test score: 0.4158415841584158
RF train score: 0.5860927152317881


In [78]:
# Per-business mean of the numeric review columns. numeric_only=True is
# required because the frame also holds text and token-list columns, which
# pandas >= 2.0 no longer drops silently and would raise TypeError on.
starbucks_reviews.groupby('business_id').mean(numeric_only=True)

Unnamed: 0_level_0,stars,useful,funny,cool
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
90LsqLbaoQpz_xNbSQsCFA,2.936508,0.825397,0.507937,0.412698
9c1p8TTOG_F55i3yEiVVDQ,2.070423,0.957746,0.295775,0.380282
EFci0tbSrb7wko6tpVDnbA,3.095238,1.31746,0.436508,0.277778
lgSO-bzx0USaPe34zdLyCw,2.671233,0.438356,0.232877,0.178082
mBcvp7NarEQEviJyRiX23Q,3.228571,0.757143,0.6,0.514286
