Import necessary libraries for data preprocessing, modeling, and evaluation

In [None]:
import os, glob, time, gc, psutil

import ast, re, string
from collections import Counter
from itertools import islice, tee

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from scipy import sparse
from scipy.spatial.distance import cosine

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

import tqdm
from google.colab import files

import ipywidgets as widgets
from ipywidgets import Text, Button, Output, HBox

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Expose the project's Drive folder under a short path without spaces.
# The link is only created when nothing exists at the link path yet, so re-running
# this cell is a no-op. An unguarded `ln -s` against an existing link follows it and
# tries to create a second link inside the target, failing with "File exists".
DRIVE_PROJECT_DIR = '/content/drive/MyDrive/talkofthetown'
PROJECT_LINK = '/content/talkofthetown'
if not os.path.lexists(PROJECT_LINK):
    os.symlink(DRIVE_PROJECT_DIR, PROJECT_LINK)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ln: failed to create symbolic link '/content/talkofthetown/talkofthetown': File exists


In [None]:
# Download spaCy's large English model so that spacy.load('en_core_web_lg') can find it.
# This is an IPython shell escape; spaCy's installer notes that the runtime may need a
# restart afterwards before the freshly installed package can be loaded.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Load the review and business data into DataFrames; the per-city DataFrames are built further below.

In [None]:
# Load the processed Yelp reviews; the 'name' column is dropped right after reading.

reviews_path = '/content/talkofthetown/data/yelp_academic_dataset_processed_reviews.csv'

reviews_df = pd.read_csv(reviews_path)
reviews_df = reviews_df.drop('name', axis='columns')

# Schema overview first, then a five-row preview as the cell output
reviews_df.info()
reviews_df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  250000 non-null  object 
 1   stars                 250000 non-null  float64
 2   date                  250000 non-null  object 
 3   business_id           250000 non-null  object 
 4   text_clean            249999 non-null  object 
 5   text_length           250000 non-null  int64  
 6   word_count            250000 non-null  int64  
 7   sentiment_binary      250000 non-null  int64  
 8   sentiment_multiclass  250000 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 17.2+ MB


Unnamed: 0,text,stars,date,business_id,text_clean,text_length,word_count,sentiment_binary,sentiment_multiclass
0,I am a long term frequent customer of this est...,1.0,2015-09-23 23:10:31,04UD14gamNjLY0IDYVhHJg,i am a long term frequent customer of this est...,341,65,0,negative
1,"I got the gyro ""platter"" for lunch, which was ...",1.0,2012-09-10 17:07:50,-OIUunijjcq_ZzyyQhPPFQ,"i got the gyro ""platter"" for lunch, which was ...",279,48,0,negative
2,They served me a frozen pizza and skunked beer...,1.0,2008-04-30 14:54:01,VbItL6RDULtnw4YvB6EhVg,they served me a frozen pizza and skunked beer...,60,11,0,negative
3,I am positively LIVID.\n\nI went to check my a...,1.0,2009-10-15 00:08:18,R-HCwu9UbasUudG1yTM1Ow,i am positively livid. i went to check my acco...,998,193,0,negative
4,This place's posted hours include a 10PM closi...,1.0,2015-08-06 22:35:02,r2cjbHG_WGWUkaPCbLpo-A,this place's posted hours include a 10pm closi...,396,74,0,negative


In [None]:
biz_path = '/content/talkofthetown/data/clean_data_business/business_clean.csv'

# Only the join key and the two descriptive fields are read, all as strings.
biz_read_options = dict(
    usecols=['business_id', 'name', 'city'],
    dtype=str,
)
biz_df = pd.read_csv(biz_path, **biz_read_options)

# Schema overview first, then a five-row preview as the cell output
biz_df.info()
biz_df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  150346 non-null  object
 1   name         150346 non-null  object
 2   city         150346 non-null  object
dtypes: object(3)
memory usage: 3.4+ MB


Unnamed: 0,business_id,name,city
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,Affton
2,tUFrWirKiKi_TAnsVWINQQ,Target,Tucson
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,Green Lane


Data Preprocessing Tasks

In [None]:
# Final column order of the enriched frame (business fields first, then review fields)
cols = [
    'business_id', 'name', 'city',
    'date', 'stars',
    'text', 'text_clean', 'text_length', 'word_count',
    'sentiment_binary', 'sentiment_multiclass',
]

# Left join keeps every review and attaches the business name and city to it
enriched_reviews_df = pd.merge(reviews_df, biz_df, on='business_id', how='left')
enriched_reviews_df = enriched_reviews_df[cols]

# Star ratings are read as floats; store them as integers
enriched_reviews_df.stars = enriched_reviews_df.stars.astype(int)

enriched_reviews_df.info()
enriched_reviews_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   business_id           250000 non-null  object
 1   name                  250000 non-null  object
 2   city                  250000 non-null  object
 3   date                  250000 non-null  object
 4   stars                 250000 non-null  int64 
 5   text                  250000 non-null  object
 6   text_clean            249999 non-null  object
 7   text_length           250000 non-null  int64 
 8   word_count            250000 non-null  int64 
 9   sentiment_binary      250000 non-null  int64 
 10  sentiment_multiclass  250000 non-null  object
dtypes: int64(4), object(7)
memory usage: 21.0+ MB


Unnamed: 0,business_id,name,city,date,stars,text,text_clean,text_length,word_count,sentiment_binary,sentiment_multiclass
0,04UD14gamNjLY0IDYVhHJg,Dmitri's,Philadelphia,2015-09-23 23:10:31,1,I am a long term frequent customer of this est...,i am a long term frequent customer of this est...,341,65,0,negative
1,-OIUunijjcq_ZzyyQhPPFQ,Zorba's Taverna,Philadelphia,2012-09-10 17:07:50,1,"I got the gyro ""platter"" for lunch, which was ...","i got the gyro ""platter"" for lunch, which was ...",279,48,0,negative
2,VbItL6RDULtnw4YvB6EhVg,Jon's Bar & Grille,Philadelphia,2008-04-30 14:54:01,1,They served me a frozen pizza and skunked beer...,they served me a frozen pizza and skunked beer...,60,11,0,negative
3,R-HCwu9UbasUudG1yTM1Ow,TD Banknorth,Philadelphia,2009-10-15 00:08:18,1,I am positively LIVID.\n\nI went to check my a...,i am positively livid. i went to check my acco...,998,193,0,negative
4,r2cjbHG_WGWUkaPCbLpo-A,Philly Fusion & Grill Chicken Bowl,Philadelphia,2015-08-06 22:35:02,1,This place's posted hours include a 10PM closi...,this place's posted hours include a 10pm closi...,396,74,0,negative


In [None]:
# Number of reviews per city (rows without a city are excluded), largest first
reviews_per_city = enriched_reviews_df['city'].value_counts(dropna=True)
city_counts = reviews_per_city.sort_values(ascending=False)
city_counts

Unnamed: 0_level_0,count
city,Unnamed: 1_level_1
Philadelphia,25000
New Orleans,25000
Tampa,25000
Nashville,25000
Tucson,25000
Indianapolis,25000
Reno,25000
Santa Barbara,25000
Saint Louis,25000
Boise,25000


In [None]:
#enriched_reviews_df.to_csv('/content/talkofthetown/data/enriched_yelp_academic_dataset_processed_reviews.csv', index=False)

In [None]:
top_cities = city_counts.head(11).index.tolist()

# Every city gets its own independent frame. Without .copy() the frames are
# boolean-mask slices of enriched_reviews_df, and writing into such a slice raises
# SettingWithCopyWarning and is not guaranteed to take effect.
city_dfs = {
    city: enriched_reviews_df[enriched_reviews_df['city'] == city].copy()
    for city in top_cities
}

# Same frame objects as in city_dfs, ordered from fewest to most reviews
sorted_city_dfs = sorted(city_dfs.items(), key=lambda x: len(x[1]))

nltk.download('stopwords')
nltk.download('punkt_tab')

# NOTE(review): `nlp` is not used by the tokenisation below; the load is kept because
# other cells may rely on the name - confirm before removing.
nlp = spacy.load('en_core_web_lg')

stop_words = set(stopwords.words('english'))
stop_words.update(string.punctuation)
stemmer = PorterStemmer()

# Built once instead of once per review
DIGITS_RE = re.compile(r'\d+')
URL_RE = re.compile(r"http\S+|www\.\S+")  # dot escaped so only a literal "www." prefix matches
PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def tokenize_review(raw_text):
    """Turn one raw review into a list of stemmed tokens.

    Steps, in order: lowercase, remove digit runs, strip surrounding whitespace,
    remove URLs, remove punctuation, tokenise with NLTK, drop stop words and
    one-character tokens, then Porter-stem what is left.

    Args:
        raw_text: the review text; any value is accepted and converted with str().

    Returns:
        list[str]: the stemmed tokens in their original order.
    """
    review = str(raw_text).lower()
    review = DIGITS_RE.sub('', review)
    review = review.strip()
    review = URL_RE.sub('', review)
    review = review.translate(PUNCT_TABLE)

    return [
        stemmer.stem(w)
        for w in word_tokenize(review)
        if w not in stop_words and len(w) > 1
    ]


for city, df in sorted_city_dfs:
    print(f"Processing {city}...")

    # Replace the raw text (column position 5) with its token list in one pass
    df['text'] = df['text'].map(tokenize_review)

    print(f"{city} processing completed!")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Processing Philadelphia...
Philadelphia processing completed!
Processing New Orleans...
New Orleans processing completed!
Processing Tampa...
Tampa processing completed!
Processing Nashville...
Nashville processing completed!
Processing Tucson...
Tucson processing completed!
Processing Indianapolis...
Indianapolis processing completed!
Processing Reno...
Reno processing completed!
Processing Santa Barbara...
Santa Barbara processing completed!
Processing Saint Louis...
Saint Louis processing completed!
Processing Boise...
Boise processing completed!


In [None]:
# Write each city's frame to <City_Name>_reviews.csv in the Drive city_data folder.
for city in top_cities:
    if city not in city_dfs:
        print(f"No data found for {city}.")
        continue

    # Spaces in the city name become underscores in the file name
    filename = f'/content/drive/My Drive/talkofthetown/data/city_data/{city.replace(" ", "_")}_reviews.csv'
    print(f"Saving {city} data to {filename}")

    city_frame = city_dfs[city]
    city_frame.to_csv(filename, index=False)
    print(f"{city} data saved.")

print("All specified city data saved.")

Saving Philadelphia data to /content/drive/My Drive/talkofthetown/data/city_data/Philadelphia_reviews.csv
Philadelphia data saved.
Saving New Orleans data to /content/drive/My Drive/talkofthetown/data/city_data/New_Orleans_reviews.csv
New Orleans data saved.
Saving Tampa data to /content/drive/My Drive/talkofthetown/data/city_data/Tampa_reviews.csv
Tampa data saved.
Saving Nashville data to /content/drive/My Drive/talkofthetown/data/city_data/Nashville_reviews.csv
Nashville data saved.
Saving Tucson data to /content/drive/My Drive/talkofthetown/data/city_data/Tucson_reviews.csv
Tucson data saved.
Saving Indianapolis data to /content/drive/My Drive/talkofthetown/data/city_data/Indianapolis_reviews.csv
Indianapolis data saved.
Saving Reno data to /content/drive/My Drive/talkofthetown/data/city_data/Reno_reviews.csv
Reno data saved.
Saving Santa Barbara data to /content/drive/My Drive/talkofthetown/data/city_data/Santa_Barbara_reviews.csv
Santa Barbara data saved.
Saving Saint Louis data 

**Word clouds of Bad (1–2 stars), Mixed (3 stars), and Good (4–5 stars) Yelp reviews for each of the 10 cities.**

In [None]:
# Star ratings that make up each sentiment bucket
categories = {
    'Bad': [1, 2],
    'Mixed': [3],
    'Good': [4, 5]
}

# One word cloud per (city, bucket) pair
for city, df in sorted_city_dfs:
    print(f"Processing: {city} (Reviews: {len(df)})")

    # Row-wise array of the frame; position 4 is the star rating, position 5 the token list
    rows = df.to_numpy()

    for category, ratings in categories.items():
        # Pool the tokens of every review in this bucket; entries that are not lists are ignored
        all_tokens = [
            token
            for row in rows
            if row[4] in ratings and isinstance(row[5], list)
            for token in row[5]
        ]

        if len(all_tokens) == 0:
            print(f"→ No tokens found for {category} reviews in {city}. Skipping wordcloud.")
            continue

        text = " ".join(all_tokens)
        cloud_builder = WordCloud(width=800, height=600, background_color='white')
        wordcloud = cloud_builder.generate(text)

        plt.figure(figsize=(10, 8))
        plt.title(f"{city} - {category} Reviews", fontsize=14)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()

Output hidden; open in https://colab.research.google.com to view.

**GloVe Application Workflow**

In [None]:
# Read from the folder the per-city CSVs were saved to (<project>/data/city_data,
# reached through the /content/talkofthetown link). The previous path carried an extra
# 'talkofthetown' component and only resolved when a nested entry of that name existed.
city_dir = '/content/talkofthetown/data/city_data'

# glob returns files in arbitrary order; sort so city_vars is the same on every run
city_paths = sorted(glob.glob(os.path.join(city_dir, '*_reviews.csv')))
if not city_paths:
    raise FileNotFoundError(f"No '*_reviews.csv' files found in {city_dir}")

city_vars = []

for path in city_paths:

    city_name = os.path.basename(path).replace("_reviews.csv", "")
    var_name  = f"{city_name.replace(' ', '_')}_reviews"

    # Later cells look these frames up by name, so each one is published as a global
    globals()[var_name] = pd.read_csv(path)   # Creating dataframe objects
    city_vars.append(var_name)

print("Created the following city dataframe objects:\n\n" + ",\n".join(city_vars))


Created the following city dataframe objects:

New_Orleans_reviews,
Philadelphia_reviews,
Tampa_reviews,
Nashville_reviews,
Tucson_reviews,
Indianapolis_reviews,
Reno_reviews,
Santa_Barbara_reviews,
Saint_Louis_reviews,
Boise_reviews


In [None]:
# MIN_TOKEN_FREQ: keep only tokens that occur at least this many times in total
# across all of a city's reviews (occurrences are counted, not documents)
MIN_TOKEN_FREQ = 5 # Adjustable


def _parse_token_list(value):
    """Return `value` as a token list, parsing it only if it is not a list already.

    The CSV round trip stores each token list as its string repr. Skipping values
    that are already lists lets this cell be re-run without ast.literal_eval
    raising on parsed data.
    """
    return value if isinstance(value, list) else ast.literal_eval(value)


for var in city_vars:
    base = var.replace("_reviews", "")
    print(f"Building {base}_keywords.")

    df = globals()[var]
    token_col = df.columns[5]  # column position 5 holds the token lists
    df[token_col] = df[token_col].apply(_parse_token_list)
    token_lists = df[token_col].tolist()

    # Total number of occurrences of each token over all reviews of the city
    tok_ctr = Counter(tok for doc in token_lists for tok in doc)

    keywords = sorted(tok for tok, cnt in tok_ctr.items() if cnt >= MIN_TOKEN_FREQ)

    # Published as a global because later cells look it up by name
    globals()[f"{base}_keywords"] = keywords

    print(f"{len(keywords):,} unique tokens\n")

Building New_Orleans_keywords.
8,395 unique tokens

Building Philadelphia_keywords.
9,406 unique tokens

Building Tampa_keywords.
8,710 unique tokens

Building Nashville_keywords.
8,632 unique tokens

Building Tucson_keywords.
8,965 unique tokens

Building Indianapolis_keywords.
8,861 unique tokens

Building Reno_keywords.
8,866 unique tokens

Building Santa_Barbara_keywords.
8,857 unique tokens

Building Saint_Louis_keywords.
9,106 unique tokens

Building Boise_keywords.
8,800 unique tokens



In [None]:

def sliding_windows(tokens, k=2):
    """Yield ordered pairs (token, follower) for followers at most k positions ahead.

    Each token is paired with each of the up-to-k tokens that come after it, in
    order of increasing distance. Pairs are directional: the earlier token is
    always first.

    Args:
        tokens: an indexable sequence with a length (e.g. a list of strings).
        k: how many following positions to pair each token with.

    Yields:
        tuple: (tokens[i], tokens[i + j]) for 1 <= j <= k while i + j is in range.
    """
    last = len(tokens) - 1
    for pos, left in enumerate(tokens):
        # Stop at the window edge or at the end of the sequence, whichever comes first
        for right_pos in range(pos + 1, min(pos + k, last) + 1):
            yield left, tokens[right_pos]

def build_cooc(token_lists, vocab,
                            window,
                            min_tok_freq,
                            min_pair_freq):
    """Build a directional co-occurrence matrix over `vocab` from tokenised documents.

    Args:
        token_lists: iterable of documents, each a sequence of tokens. It is
            iterated twice, so pass a list rather than a one-shot generator.
        vocab: candidate tokens; the order of the survivors fixes the row/column order.
        window: how many following tokens each token is paired with
            (passed to sliding_windows as k).
        min_tok_freq: vocab tokens occurring fewer times than this in total are dropped.
        min_pair_freq: ordered pairs counted fewer times than this are dropped.

    Returns:
        tuple: (matrix, frame). `matrix` is a scipy CSR matrix of shape (n, n) where n
        is the number of surviving vocab tokens and entry [i, j] counts how often
        token j followed token i within the window. `frame` is the same data as a
        sparse pandas DataFrame labelled with the surviving tokens. When no token or
        no pair survives the filters, both are all-zero but still have the right
        shape, so callers can rely on `.shape` unconditionally.
    """
    # Token-frequency filter
    tok_ctr = Counter(t for doc in token_lists for t in doc)
    vocab = [t for t in vocab if tok_ctr[t] >= min_tok_freq]
    n = len(vocab)

    pair_ctr = Counter()
    if vocab:  # nothing to count against an empty vocabulary
        # Map tokens -> row/col indices
        vmap = {w: i for i, w in enumerate(vocab)}

        # Count ordered pairs inside the sliding window; both tokens must be in vocab
        for tokens in token_lists:
            for w1, w2 in sliding_windows(tokens, window):
                if w1 in vmap and w2 in vmap:
                    pair_ctr[(vmap[w1], vmap[w2])] += 1

    # Pair-frequency filter
    kept = {pair: c for pair, c in pair_ctr.items() if c >= min_pair_freq}

    if kept:
        # Build sparse coo -> csr
        rows, cols, data = zip(*((i, j, c) for (i, j), c in kept.items()))
        mat = sparse.coo_matrix((data, (rows, cols)),
                                shape=(n, n),
                                dtype=int).tocsr()
    else:
        # Return an empty matrix instead of None so callers do not crash on `.shape`
        if vocab:
            print('No pairs survived.')
        mat = sparse.csr_matrix((n, n), dtype=int)

    glove_df = pd.DataFrame.sparse.from_spmatrix(mat,
                                                 index=vocab,
                                                 columns=vocab)
    return mat, glove_df

cities = [
    "Indianapolis", "Boise", "Philadelphia",
    "New_Orleans", "Nashville", "Tampa",
    "Santa_Barbara", "Saint_Louis", "Reno", "Tucson",
]

# Same window and frequency thresholds for every city
cooc_settings = dict(window=10, min_tok_freq=5, min_pair_freq=3)

for city in cities:
    print(f"\nBuilding {city} co-occurrence…")

    # Inputs were published as globals named "<City>_reviews" / "<City>_keywords"
    df     = globals()[f"{city}_reviews"]
    tokens = df.iloc[:, 5].tolist()   # column position 5 holds the token lists
    vocab  = globals()[f"{city}_keywords"]

    mat, glove = build_cooc(token_lists=tokens, vocab=vocab, **cooc_settings)

    # Results are published the same way, as "<City>_matrix" / "<City>_glove"
    globals().update({
        f"{city}_matrix": mat,
        f"{city}_glove":  glove,
    })

    print(f"Shapes:  _matrix {mat.shape}, _glove {glove.shape}\n")


Building Indianapolis co-occurrence…
Shapes:  _matrix (8861, 8861), _glove (8861, 8861)


Building Boise co-occurrence…
Shapes:  _matrix (8800, 8800), _glove (8800, 8800)


Building Philadelphia co-occurrence…
Shapes:  _matrix (9406, 9406), _glove (9406, 9406)


Building New_Orleans co-occurrence…
Shapes:  _matrix (8395, 8395), _glove (8395, 8395)


Building Nashville co-occurrence…
Shapes:  _matrix (8632, 8632), _glove (8632, 8632)


Building Tampa co-occurrence…
Shapes:  _matrix (8710, 8710), _glove (8710, 8710)


Building Santa_Barbara co-occurrence…
Shapes:  _matrix (8857, 8857), _glove (8857, 8857)


Building Saint_Louis co-occurrence…
Shapes:  _matrix (9106, 9106), _glove (9106, 9106)


Building Reno co-occurrence…
Shapes:  _matrix (8866, 8866), _glove (8866, 8866)


Building Tucson co-occurrence…
Shapes:  _matrix (8965, 8965), _glove (8965, 8965)



In [None]:
# Matrix validation: display the repr of the Indianapolis co-occurrence matrix
# (dtype, number of stored elements, shape) as a spot check.
Indianapolis_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 871157 stored elements and shape (8861, 8861)>

In [None]:
# GloVe df validation: display the Indianapolis co-occurrence DataFrame, whose rows
# and columns are labelled with the vocabulary tokens, as a spot check.
Indianapolis_glove

Unnamed: 0,aaa,ab,aback,abandon,abil,abl,abomin,abound,aboveaverag,abras,...,zing,zinger,zionsvil,zip,ziti,zombi,zone,zoo,zucchini,étouffé
aaa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aback,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abandon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abil,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombi,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zucchini,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0


In [None]:
# Collect the per-city co-occurrence matrices published as globals "<City>_matrix".
# list(...) snapshots the namespace before iterating; the issparse check skips any
# unrelated name that merely ends in "_matrix" (or holds None).
matrices = {
    name.replace("_matrix", ""): obj
    for name, obj in list(globals().items())
    if name.endswith("_matrix") and sparse.issparse(obj)
}

records = []
for city, mat in matrices.items():
    n_rows, n_cols = mat.shape
    n_cells  = n_rows * n_cols
    nnz      = int(mat.nnz)                # number of stored (nonzero) entries
    total    = int(mat.data.sum())         # sum of all co-occurrence counts
    # Density: share of matrix cells that are nonzero. Dividing nnz by the summed
    # counts (as before) is not a percentage of nonzero cells.
    pct_nz   = 100 * nnz / n_cells if n_cells else 0.0

    records.append({
        "city":        city,
        "vocab_size":  n_rows,
        "nonzero_count": nnz,
        "total_count": total,
        "pct_nonzero": pct_nz
    })

summary = pd.DataFrame.from_records(records).set_index("city")
display(summary.style.format({
    "vocab_size":"{:,}",
    "nonzero_count":"{:,}",
    "total_count":"{:,}",
    "pct_nonzero":"{:.2f}%"
}))

Unnamed: 0_level_0,vocab_size,nonzero_count,total_count,pct_nonzero
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Indianapolis,8861,871157,9597519,9.08%
Boise,8800,838831,9129393,9.19%
Philadelphia,9406,900944,9946065,9.06%
New_Orleans,8395,815996,9182692,8.89%
Nashville,8632,860291,9709422,8.86%
Tampa,8710,858486,9378870,9.15%
Santa_Barbara,8857,844576,9119670,9.26%
Saint_Louis,9106,913797,10347694,8.83%
Reno,8866,901764,10116007,8.91%
Tucson,8965,858595,9249874,9.28%


In [None]:
n = 20
reports = {}

for var in city_vars:
    base = var.replace("_reviews","")
    mat  = globals()[f"{base}_matrix"].tocoo()

    # Label rows/columns with the index of the matching "<City>_glove" frame. It is
    # built together with the matrix, so positions always line up. The "<City>_keywords"
    # list only lines up while both frequency thresholds happen to be equal.
    labels = globals()[f"{base}_glove"].index.to_numpy()

    # Positions of the n largest counts. A stable sort on the negated counts keeps
    # ties in storage order, the same order a stable sorted(..., key=-count) gives,
    # without materialising every (row, col, count) triple in Python.
    top = np.argsort(-mat.data, kind="stable")[:n]

    df = pd.DataFrame({
        "1":     labels[mat.row[top]],
        "2":     labels[mat.col[top]],
        "count": mat.data[top],
    })
    reports[base] = df

# ________________________________________________________________

# Side-by-side table: one (1, 2, count) column group per city
wide = pd.concat(
    reports.values(),
    axis=1,
    keys=reports.keys())
wide.columns.names = ["city","token"]

display(wide)

#__________________________________________________________________

# For by-city reporting, use the below content in place of the above^ (between the lines):
# for city, df in reports.items():
#     print(f"\nTop {n} Co‐occurrences in {city}\n" + "-"*40)
#     display(df)

city,New_Orleans,New_Orleans,New_Orleans,Philadelphia,Philadelphia,Philadelphia,Tampa,Tampa,Tampa,Nashville,...,Reno,Santa_Barbara,Santa_Barbara,Santa_Barbara,Saint_Louis,Saint_Louis,Saint_Louis,Boise,Boise,Boise
token,1,2,count,1,2,count,1,2,count,1,...,count,1,2,count,1,2,count,1,2,count
0,new,orlean,4435,food,good,2534,food,good,2650,food,...,2630,santa,barbara,4701,st,loui,3555,food,good,2532
1,food,good,2527,order,order,2264,food,food,2269,order,...,2358,food,good,2135,order,order,2499,go,back,2098
2,order,order,2013,go,back,2183,order,order,2226,food,...,2224,order,order,1774,food,good,2467,order,order,2050
3,food,servic,2006,food,food,2101,go,back,2029,good,...,2089,go,back,1672,go,back,2427,food,food,2026
4,food,food,2004,place,food,1947,good,good,1992,go,...,1937,good,good,1642,good,good,2074,good,good,2012
5,good,good,1934,good,good,1848,good,food,1985,food,...,1934,food,servic,1634,ice,cream,2059,food,servic,1975
6,go,back,1813,good,food,1702,food,servic,1934,good,...,1890,food,food,1599,food,food,1892,good,food,1968
7,good,food,1811,food,servic,1653,place,food,1867,place,...,1842,place,food,1566,food,servic,1683,place,food,1653
8,servic,food,1742,food,order,1584,custom,servic,1650,great,...,1800,come,back,1559,good,food,1660,custom,servic,1634
9,place,food,1732,come,back,1546,great,food,1634,order,...,1676,good,food,1479,place,food,1648,great,food,1592


In [None]:
def cosine_distances_df(glove_map, w1, w2):
    """Cosine distance between the rows of two words, computed per city.

    Args:
        glove_map: mapping of city name -> DataFrame whose index holds the words.
        w1: first word (row label).
        w2: second word (row label).

    Returns:
        pandas.DataFrame: columns "city" and "<w1>_<w2>_cosine_dist", sorted by
        distance ascending with a fresh 0..n-1 index. A city whose frame lacks
        either word gets NaN.
    """
    dist_col = f"{w1}_{w2}_cosine_dist"
    wanted = {w1, w2}

    def _distance(frame):
        # NaN marks a city where at least one of the words is not in the index
        if not wanted.issubset(frame.index):
            return float("nan")
        return cosine(frame.loc[w1].to_numpy(), frame.loc[w2].to_numpy())

    rows = [(city, _distance(frame)) for city, frame in glove_map.items()]
    table = pd.DataFrame(rows, columns=["city", dist_col])
    table = table.sort_values(dist_col)
    return table.reset_index(drop=True)

cities = [
    "Indianapolis", "Boise", "Philadelphia",
    "New_Orleans", "Nashville", "Tampa",
    "Santa_Barbara", "Saint_Louis", "Reno", "Tucson",
]

# city -> co-occurrence frame, looked up from the globals named "<City>_glove"
glove_map = {}
for city in cities:
    glove_map[city] = globals()[f"{city}_glove"]

# Two text inputs, a trigger button and an output area for the result table
w1_dist = Text(value="menu", description="Word 1:")
w2_dist = Text(value="great", description="Word 2:")
btn_dist = Button(description="Compute Distances", button_style="info")
out_dist = Output()

def on_dist_click(_):
    """Button callback: recompute the per-city distances for the two entered words."""
    with out_dist:
        out_dist.clear_output()
        first_word = w1_dist.value.strip()
        second_word = w2_dist.value.strip()
        df = cosine_distances_df(glove_map, first_word, second_word)
        display(df)

btn_dist.on_click(on_dist_click)

input_row = HBox([w1_dist, w2_dist, btn_dist])
display(input_row, out_dist)

HBox(children=(Text(value='menu', description='Word 1:'), Text(value='great', description='Word 2:'), Button(b…

Output()

In [None]:
def cosine_sims_df(glove_map, w1, w2):
    """Cosine similarity between the rows of two words, computed per city.

    Args:
        glove_map: mapping of city name -> DataFrame whose index holds the words.
        w1: first word (row label).
        w2: second word (row label).

    Returns:
        pandas.DataFrame: columns "city" and "<w1>_<w2>_cosine_sim", sorted by
        similarity ascending with a fresh 0..n-1 index. A city whose frame lacks
        either word gets NaN.
    """
    sim_col = f"{w1}_{w2}_cosine_sim"
    city_names, sims = [], []

    for city_name, frame in glove_map.items():
        city_names.append(city_name)

        if not all(word in frame.index for word in (w1, w2)):
            # At least one of the words is not in this city's index
            sims.append(float("nan"))
            continue

        first_vec = frame.loc[w1].to_numpy()
        second_vec = frame.loc[w2].to_numpy()
        # scipy's cosine() is a distance; similarity is its complement
        sims.append(1.0 - cosine(first_vec, second_vec))

    table = pd.DataFrame(list(zip(city_names, sims)), columns=["city", sim_col])
    return table.sort_values(sim_col).reset_index(drop=True)

cities = [
    "Indianapolis", "Boise", "Philadelphia",
    "New_Orleans", "Nashville", "Tampa",
    "Santa_Barbara", "Saint_Louis", "Reno", "Tucson",
]

# city -> co-occurrence frame, looked up from the globals named "<City>_glove"
glove_map = dict(
    (city, globals()[f"{city}_glove"]) for city in cities
)

# Two text inputs, a trigger button and an output area for the result table
w1_sim = Text(value="menu", description="Word 1:")
w2_sim = Text(value="great", description="Word 2:")
btn_sim = Button(description="Compute Similarities", button_style="success")
out_sim = Output()

def on_sim_click(_):
    """Button callback: recompute the per-city similarities for the two entered words."""
    with out_sim:
        out_sim.clear_output()
        first_word = w1_sim.value.strip()
        second_word = w2_sim.value.strip()
        df = cosine_sims_df(glove_map, first_word, second_word)
        display(df)

btn_sim.on_click(on_sim_click)

input_row = HBox([w1_sim, w2_sim, btn_sim])
display(input_row, out_sim)

HBox(children=(Text(value='menu', description='Word 1:'), Text(value='great', description='Word 2:'), Button(b…

Output()