In [1]:
# Import all necessary packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# df = pd.read_csv('/content/drive/My Drive/merged_df.csv', index_col=0)
# df.head()

In [4]:
df = pd.read_csv('merged_df.csv', index_col=0)
df.head()

Unnamed: 0,business_id,name,city,state,avg_stars,user_stars,text,processed_reviews,review_words
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,4.0,4.0,This is nice little Chinese bakery in the hear...,nice little chinese bakery heart philadelphia ...,40.0
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,4.0,4.0,This is the bakery I usually go to in Chinatow...,bakery usually go chinatown decent variety bun...,58.0
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,4.0,5.0,"A delightful find in Chinatown! Very clean, an...",delightful find chinatown clean kind service e...,22.0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,4.0,5.0,I ordered a graduation cake for my niece and i...,ordered graduation cake niece came absolutely ...,16.0
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,4.0,4.0,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,hkstyle milk tea four star quite sure two sain...,103.0


In [5]:
df.shape

(2812484, 9)

In [8]:
df.isnull().sum()

business_id           1
name                  1
city                  2
state                 2
avg_stars             2
user_stars            2
text                  2
processed_reviews    20
review_words          3
dtype: int64

In [6]:
df.dropna(inplace=True)
df.isnull().sum()

business_id          0
name                 0
city                 0
state                0
avg_stars            0
user_stars           0
text                 0
processed_reviews    0
review_words         0
dtype: int64

In [10]:
top_cities = ['Philadelphia', 'New Orleans', 'Nashville', 'Tampa', 'Tucson', 'Indianapolis', 'Saint Louis',
             'Reno', 'Santa Barbara', 'Saint Petersburg', 'Boise', 'Clearwater', 'Metairie',
             'Sparks', 'Franklin', 'Wilmington', 'Meridian', 'Saint Louis']

In [11]:
review_count_per_city = df.groupby('city')['business_id'].count().reset_index(name='review_count').sort_values(by='review_count', ascending=True)


print(review_count_per_city)


                city  review_count
4           Meridian         23683
16        Wilmington         26947
2           Franklin         34214
13            Sparks         37977
5           Metairie         38028
1         Clearwater         49913
0              Boise         63634
11  Saint Petersburg         76578
12     Santa Barbara        147611
9               Reno        176096
10       Saint Louis        187539
3       Indianapolis        213377
15            Tucson        219305
14             Tampa        264899
6          Nashville        284559
7        New Orleans        420313
8       Philadelphia        547791


In [7]:
## Creating 13 different datsets for each county

# Philadelphia
padf = df[df['city'] == 'Philadelphia']
print('Philadelphia: ',padf.shape)

# New Orleans
nodf = df[df['city'] == 'New Orleans']
print('New Orleans: ', nodf.shape)

# Nashville
nadf = df[df['city'] == 'Nashville']
print('Nashville: ', nadf.shape)

# Saint Petersburg, Tampa, and Clearwater are neighboring areas in Pineallas County, FL
florida_cities = ['Saint Petersburg', 'Tampa', 'Clearwater']
fldf = df[df['city'].isin(florida_cities)]
print('Florida: ', fldf.shape)

# Tucson
tudf = df[df['city'] == 'Tucson']
print('Tucson: ', tudf.shape)

# Indianapolis
indf = df[df['city'] == 'Indianapolis']
print('Indianapolis: ', indf.shape)

# Sparks and Reno are neighboring areas in Washoe County, NV
nv_cities = ['Reno', 'Sparks']
nvdf = df[df['city'].isin(nv_cities)]
print('Washoe County: ', nvdf.shape)

# Meridian and Boise are within Ada County, ID
ada = ['Meridian', 'Boise']
adadf = df[df['city'].isin(ada)]
print('Ada County: ', adadf.shape)

# Metairie
metdf = df[df['city'] == 'Metairie']
print('Metairie: ', metdf.shape)

# Franklin
fadf = df[df['city'] == 'Franklin']
print('Franklin: ', fadf.shape)

# Wilmington
wdf = df[df['city'] == 'Wilmington']
print('Wilmington: ', wdf.shape)

# Santa Barbara
sbdf = df[df['city'] == 'Santa Barbara']
print('Santa Barbara: ', sbdf.shape)

# Saint Louis
sldf = df[df['city'] == 'Saint Louis']
print('Saint Louis: ', sldf.shape)



Philadelphia:  (547791, 9)
New Orleans:  (420313, 9)
Nashville:  (284559, 9)
Florida:  (391390, 9)
Tucson:  (219305, 9)
Indianapolis:  (213377, 9)
Washoe County:  (214073, 9)
Ada County:  (87317, 9)
Metairie:  (38028, 9)
Franklin:  (34214, 9)
Wilmington:  (26947, 9)
Santa Barbara:  (147611, 9)
Saint Louis:  (187539, 9)


We will run TF-IDF in this order since kernel keeps dying with larger corpus (like for Philadelphia):
1. Wilmington - 26947
2. Franklin - 34214
3. Metairie - 38028
4. Ada County - 87317
5. Santa Barbara - 147611
6. Saint Louis - 187539
7. Indianapolis - 213377
8. Washoe County - 214073
9. Tucson - 219305
10. Nashville - 284559
11. Florida - 391390
12. New Orleans - 420313
13. Philadelphia - 547791


## TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.stats import pearsonr

## First 30 reviews for PA
For proof of concept.

In [14]:
# test TF-IDF on first 30 rows of reviews

corpus = padf['processed_reviews'].tolist()
corpus_reduced = corpus[:30]


In [15]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus_reduced)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_padf_reduced = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_padf_reduced['user_rating'] = df['user_stars']

In [16]:
# Compute Pearson correlation for each word with the rating
correlations = {}
for word in tfidf_padf_reduced.columns[:-1]:  # Exclude the 'rating' column
    correlations[word] = pearsonr(tfidf_padf_reduced[word], tfidf_padf_reduced['user_rating'])[0]

# Convert to DataFrame for nicer display and sort by correlation
correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))  # Show top 20 words most positively correlated with ratings

top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_padf_reduced[top_20_words]

            correlation
also           0.435733
flavor         0.405554
strawberry     0.397731
world          0.344618
gem            0.344618
nice           0.343942
absolutely     0.343621
come           0.323810
go             0.320112
place          0.319157
ton            0.312283
ive            0.311602
made           0.306853
delicious      0.285178
ordered        0.275230
cake           0.259879
without        0.256012
perfect        0.244142
upon           0.240206
stumbled       0.240206


### Wilmington

In [17]:
# Start with Wilmington since it has the smallest review count

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = wdf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_wdf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_wdf['user_rating'] = df['user_stars']

In [18]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_wdf.columns[:-1]:  # Exclude the 'rating' column
    correlations[word] = pearsonr(tfidf_wdf[word], tfidf_wdf['user_rating'])[0]

# Convert to DataFrame for nicer display and sort by correlation
correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))  # Show top 20 words most positively correlated with ratings

top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_wdf[top_20_words]

             correlation
hill            0.025091
tea             0.020431
provided        0.020103
ulysses         0.019569
experienced     0.019032
burrito         0.018666
iron            0.018401
decision        0.017937
el              0.017451
suite           0.017389
lady            0.017267
community       0.016936
dave            0.016691
soggy           0.015975
gelato          0.015930
charged         0.015800
valet           0.015743
worked          0.015732
bellefonte      0.015685
sherry          0.015676


In [22]:
# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
wm_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
wdf['Wilmington_SVD'] = wm_words_svd[:, 0]


### Franklin

In [23]:
corpus = fadf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)

# Convert to DataFrame for easier manipulation
tfidf_fadf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_fadf['user_rating'] = df['user_stars']


In [24]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_fadf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_fadf[word], tfidf_fadf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))

            correlation
chuys          0.030843
mellow         0.021062
cheesecake     0.021005
shake          0.020739
buca           0.019631
level          0.019162
special        0.018129
duck           0.018128
bagel          0.018118
cake           0.018072
pony           0.017773
salad          0.017622
upstairs       0.017277
breakfast      0.017185
long           0.017051
delish         0.016726
dressing       0.016525
fried          0.016220
southern       0.016112
waiter         0.016008


In [25]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_fadf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
fa_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
fadf['Franklin_SVD'] = fa_words_svd[:, 0]

### Metairie

In [26]:
corpus = metdf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_metdf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_metdf['user_rating'] = df['user_stars']

In [27]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_metdf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_metdf[word], tfidf_metdf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))

           correlation
pizza         0.043201
coffee        0.023135
italian       0.022580
crust         0.022559
frozen        0.021577
beer          0.020788
old           0.020669
wine          0.020364
daiquiri      0.019048
beignet       0.018457
great         0.018432
martin        0.018375
lasagna       0.017182
age           0.016621
ravioli       0.016553
pepperoni     0.016410
york          0.015993
bar           0.015766
twain         0.015300
queso         0.015284


In [28]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_metdf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
met_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
metdf['Metairie_SVD'] = met_words_svd[:, 0]

### Ada County

In [29]:
corpus = adadf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus) 

# Convert to DataFrame for easier manipulation
tfidf_adadf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_adadf['user_rating'] = df['user_stars']

In [30]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_adadf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_adadf[word], tfidf_adadf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))


           correlation
thai          0.023555
kabob         0.017276
pad           0.015682
chang         0.014999
goldys        0.014293
brazilian     0.014226
chandler      0.014131
gernika       0.014111
guru          0.013718
grinder       0.013500
beer          0.013473
pf            0.013458
janjou        0.012768
tucanos       0.012326
ew            0.012317
pineapple     0.012297
wrap          0.012066
croquetas     0.012047
chinese       0.012015
pho           0.011845


In [31]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_adadf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
ada_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
adadf['AdaCounty_SVD'] = ada_words_svd[:, 0]

### Santa Barbara

In [9]:
corpus = sbdf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)

# Convert to DataFrame for easier manipulation
tfidf_sbdf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_sbdf['user_rating'] = df['user_stars']

In [10]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_sbdf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_sbdf[word], tfidf_sbdf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))


         correlation
wine        0.020956
paella      0.017478
tasting     0.015685
tapa        0.014866
korean      0.014770
pulpo       0.014735
loquita     0.014169
mezcal      0.013140
bravas      0.012518
patatas     0.012475
pho         0.012171
sevtap      0.012104
brewery     0.011881
ramen       0.011574
cask        0.011198
beer        0.010945
room        0.010659
bottle      0.010572
eos         0.010570
winery      0.010415


In [11]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_sbdf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
sb_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
sbdf['SantaBarbara_SVD'] = sb_words_svd[:, 0]

In [12]:
sbdf.to_csv("SantaBarbara_SVD.csv")

## Saint Louis

In [9]:
corpus = sldf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus) 

# Convert to DataFrame for easier manipulation
tfidf_sldf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_sldf['user_rating'] = df['user_stars']

In [10]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_sldf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_sldf[word], tfidf_sldf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))


           correlation
cupcake       0.018718
afghan        0.014209
kingside      0.013420
jillys        0.012088
scottish      0.011936
irish         0.011877
eastern       0.011574
retreat       0.011573
lorenzos      0.011069
pudding       0.010988
pw            0.010884
farmhouse     0.010841
sameem        0.010504
lumiere       0.010384
lorussos      0.010265
bar           0.010104
corned        0.010093
falafel       0.010014
sameems       0.009956
katies        0.009856


In [11]:
from sklearn.decomposition import TruncatedSVD

top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_sldf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
sl_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
sldf['SaintLouis_SVD'] = sl_words_svd[:, 0]

In [None]:
sldf.to_csv("SaintLouis_SVD.csv")


## Indianapolis

Since we keep running out of RAM memory and running into issues with computing power, resulting in a dead kernel, we decided to randomly select a portion of the city's dataset to reduce its size.  

In [12]:
reduced_indf = indf.sample(frac=0.8, replace=False, random_state=1)

In [14]:
corpus = reduced_indf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_indf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_indf['user_rating'] = df['user_stars']

In [15]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_indf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_indf[word], tfidf_indf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))


            correlation
clever         0.008970
sea            0.008118
amazed         0.007919
powered        0.007524
saigon         0.007424
flavor         0.007421
coma           0.007396
linguine       0.007331
dinner         0.007309
curse          0.007219
precaution     0.007182
citrus         0.007181
silverware     0.007115
genuine        0.007111
grub           0.007002
ripoff         0.006930
walking        0.006930
later          0.006881
finishing      0.006823
baba           0.006765


In [16]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_indf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
in_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
reduced_indf['Indianapolis_SVD'] = in_words_svd[:, 0]

In [17]:
reduced_indf.to_csv("Indianapolis_SVD.csv")

## Washoe County

In [7]:
reduced_nvdf = nvdf.sample(frac=0.8, replace=False, random_state=1)

In [8]:
corpus = reduced_nvdf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_nvdf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_nvdf['user_rating'] = df['user_stars']

In [9]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_nvdf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_nvdf[word], tfidf_nvdf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))


               correlation
screen            0.009992
saloon            0.008214
amazed            0.008095
mia               0.007894
cleanliness       0.007694
inconvenience     0.007436
scrambled         0.007418
fountain          0.007415
discounted        0.007381
various           0.007340
raf               0.007272
servicing         0.007244
identify          0.007120
777               0.007108
available         0.007002
mels              0.006917
proceeded         0.006914
celebrate         0.006875
irked             0.006860
category          0.006848


In [10]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_nvdf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
nv_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
reduced_nvdf['WashoeCounty_SVD'] = nv_words_svd[:, 0]

In [11]:
reduced_nvdf.to_csv("WashoeCounty_SVD.csv")

## Tucson

In [12]:
reduced_tudf = tudf.sample(frac=0.75, replace=False, random_state=1)

In [13]:
corpus = reduced_tudf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_tudf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_tudf['user_rating'] = df['user_stars']

In [14]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_tudf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_tudf[word], tfidf_tudf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))

            correlation
ruben          0.009323
liquor         0.008513
dressing       0.007609
lucky          0.007470
smaller        0.007440
occasion       0.007425
lately         0.007394
525            0.007389
meager         0.007321
automatic      0.007118
convenient     0.007072
ie             0.007054
rear           0.007043
energetic      0.006933
polenta        0.006770
nonprofit      0.006724
upscale        0.006688
ruin           0.006628
doable         0.006618
yummmm         0.006597


In [15]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_tudf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
tu_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
reduced_tudf['Tucson_SVD'] = tu_words_svd[:, 0]

In [16]:
reduced_tudf.to_csv("Tucson_SVD.csv")

## Nashville

In [12]:
reduced_nadf = nadf.sample(frac=0.6, replace=False, random_state=1)

In [13]:
corpus = reduced_nadf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_nadf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_nadf['user_rating'] = df['user_stars']

In [14]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_nadf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_nadf[word], tfidf_nadf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))

               correlation
nipper            0.008528
missed            0.008199
chalkboard        0.007748
patio             0.007450
spacious          0.007438
delight           0.007296
money             0.007213
located           0.007197
square            0.007159
glad              0.007083
german            0.007012
tofu              0.006852
1130              0.006845
chill             0.006664
krogers           0.006649
hungover          0.006594
highest           0.006587
hawker            0.006565
knowledgeable     0.006521
darfons           0.006521


In [15]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_nadf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
na_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
reduced_nadf['Nashville_SVD'] = na_words_svd[:, 0]

In [16]:
reduced_nadf.to_csv("Nashville_SVD.csv")

## Florida

In [12]:
reduced_fldf = fldf.sample(frac=0.45, replace=False, random_state=1)

In [13]:
corpus = reduced_fldf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_fldf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_fldf['user_rating'] = df['user_stars']

In [14]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_fldf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_fldf[word], tfidf_fldf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))

               correlation
underwhelming     0.008292
pouring           0.008038
admit             0.007908
onion             0.007778
questionable      0.007752
pulled            0.007735
horrid            0.007276
counter           0.007210
blanc             0.007189
candle            0.007175
gimmicky          0.007014
saturday          0.006876
furry             0.006873
opening           0.006735
grocery           0.006698
cheesesteaks      0.006537
kosher            0.006519
stomping          0.006454
pure              0.006450
structure         0.006423


In [15]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_fldf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
fl_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
reduced_fldf['Florida_SVD'] = fl_words_svd[:, 0]

In [16]:
reduced_fldf.to_csv("Florida_SVD.csv")

## New Orleans

In [12]:
reduced_nodf = nodf.sample(frac=0.4, replace=False, random_state=1)

In [13]:
corpus = reduced_nodf['processed_reviews'].tolist()

vectorizer = TfidfVectorizer(min_df=5, max_df=0.6)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_nodf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_nodf['user_rating'] = df['user_stars']


In [15]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_nodf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_nodf[word], tfidf_nodf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))


              correlation
ginger           0.009219
dude             0.008545
sammich          0.007391
bachelorette     0.007383
fuck             0.007338
relation         0.007305
kale             0.007295
gris             0.007231
tighten          0.007189
35               0.007147
medallion        0.006896
giant            0.006804
werent           0.006801
frustration      0.006637
wedding          0.006635
tilapia          0.006614
appease          0.006593
deboned          0.006533
breakfast        0.006517
foreign          0.006504


In [16]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_nodf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
no_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
reduced_nodf['NewOrleans_SVD'] = no_words_svd[:, 0]

In [17]:
reduced_nodf.to_csv("NewOrleans_SVD.csv")

## Philadelphia

In [12]:
reduced_padf = padf.sample(frac=0.3, replace=False, random_state=1)

In [13]:
corpus = reduced_padf['processed_reviews'].tolist()


vectorizer = TfidfVectorizer(min_df=5, max_df=0.5)
X = vectorizer.fit_transform(corpus)  # 'corpus' is a list of review texts

# Convert to DataFrame for easier manipulation
tfidf_padf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add the ratings to the DataFrame
tfidf_padf['user_rating'] = df['user_stars']


In [14]:
# Compute Pearson correlation for each word with the rating

correlations = {}
for word in tfidf_padf.columns[:-1]:
    correlations[word] = pearsonr(tfidf_padf[word], tfidf_padf['user_rating'])[0]

correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['correlation'])
correlations_df = correlations_df.sort_values(by='correlation', ascending=False)

print(correlations_df.head(20))


            correlation
picnic         0.008358
vegetarian     0.007793
touring        0.007754
flagged        0.007324
scheme         0.007201
chewiness      0.007152
converted      0.006892
pang           0.006807
diy            0.006690
guard          0.006663
bra            0.006501
headphone      0.006477
pop            0.006454
marcie         0.006437
pile           0.006428
mummer         0.006400
liking         0.006384
eyeing         0.006381
nog            0.006371
madness        0.006368


In [None]:
top_20_words = correlations_df.head(20).index.tolist()
top_words = tfidf_padf[top_20_words]

# Initializing Truncated SVD
svd = TruncatedSVD(n_components=1)  # Reducing to one feature

# Fitting and transforming the data
pa_words_svd = svd.fit_transform(top_words)

# Adding this feature back to your DataFrame (or use as needed)
reduced_padf['Philadelphia_SVD'] = pa_words_svd[:, 0]

In [None]:
reduced_padf.to_csv("Philadelphia_SVD.csv")