In [None]:
import re
import string
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
import spacy

import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

import tqdm

In [None]:
# Mount Google Drive (Colab only); the CSV paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import data & create respective Dataframes for each city.

In [None]:
# Load the processed Yelp reviews CSV into a DataFrame.

reviews_path = '/content/drive/MyDrive/talkofthetown/data/yelp_academic_dataset_processed_reviews.csv'

# The 'name' column is dropped right after reading; the business table loaded
# further down carries its own 'name' column, which is merged in later.
reviews_df = pd.read_csv(reviews_path).drop(columns=['name'])
reviews_df.info()
reviews_df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   text                  50000 non-null  object 
 1   stars                 50000 non-null  float64
 2   date                  50000 non-null  object 
 3   business_id           50000 non-null  object 
 4   text_clean            50000 non-null  object 
 5   text_length           50000 non-null  int64  
 6   word_count            50000 non-null  int64  
 7   sentiment_binary      50000 non-null  int64  
 8   sentiment_multiclass  50000 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 3.4+ MB


Unnamed: 0,text,stars,date,business_id,text_clean,text_length,word_count,sentiment_binary,sentiment_multiclass
0,I am a long term frequent customer of this est...,1.0,2015-09-23 23:10:31,04UD14gamNjLY0IDYVhHJg,i am a long term frequent customer of this est...,341,65,0,negative
1,If you want to pay for everything a la carte t...,1.0,2014-08-24 20:14:12,jNL5KUPz2-tHUJM__ysSaw,if you want to pay for everything a la carte t...,1016,189,0,negative
2,The TV shows are $4.99 and they have commercia...,1.0,2012-12-04 02:40:49,pAgtmlIGqFYaWdBoCUeitw,the tv shows are $4.99 and they have commercia...,298,52,0,negative
3,"If I could give it a zero, I would. I order a ...",1.0,2011-08-24 23:07:08,Wy8Hswf2cLQGRZN6armkag,"if i could give it a zero, i would. i order a ...",712,138,0,negative
4,We visited once and were very disappointed in ...,1.0,2017-08-16 15:43:19,aY_n9RSaD2Yw09jSFFePew,we visited once and were very disappointed in ...,315,60,0,negative


In [None]:
biz_path = '/content/drive/MyDrive/talkofthetown/data/clean_data_business/business_clean.csv'

# Read only the join key plus the two descriptive columns, all as strings.
biz_df = pd.read_csv(biz_path, usecols=['business_id', 'name', 'city'], dtype=str)
biz_df.info()
biz_df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  150346 non-null  object
 1   name         150346 non-null  object
 2   city         150346 non-null  object
dtypes: object(3)
memory usage: 3.4+ MB


Unnamed: 0,business_id,name,city
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,Affton
2,tUFrWirKiKi_TAnsVWINQQ,Target,Tucson
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,Green Lane


Data Preprocessing Tasks

In [None]:
# Attach business name and city to every review. The join is validated as
# many-to-one: a duplicated business_id in biz_df raises pandas.errors.MergeError
# instead of silently duplicating review rows.
all_cities_reviews = reviews_df.merge(
    biz_df, how='left', on='business_id', validate='many_to_one'
)
cols = [
    'business_id',
    'name',
    'city',
    'date',
    'stars',
    'text',
    'text_clean',
    'text_length',
    'word_count',
    'sentiment_binary',
    'sentiment_multiclass'
]

# Reorder the columns and convert 'stars' (read from the CSV as float64) to integers.
all_cities_reviews = all_cities_reviews[cols].assign(
    stars=lambda frame: frame['stars'].astype(int)
)
all_cities_reviews.info()
all_cities_reviews.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   business_id           50000 non-null  object
 1   name                  50000 non-null  object
 2   city                  50000 non-null  object
 3   date                  50000 non-null  object
 4   stars                 50000 non-null  int64 
 5   text                  50000 non-null  object
 6   text_clean            50000 non-null  object
 7   text_length           50000 non-null  int64 
 8   word_count            50000 non-null  int64 
 9   sentiment_binary      50000 non-null  int64 
 10  sentiment_multiclass  50000 non-null  object
dtypes: int64(4), object(7)
memory usage: 4.2+ MB


Unnamed: 0,business_id,name,city,date,stars,text,text_clean,text_length,word_count,sentiment_binary,sentiment_multiclass
0,04UD14gamNjLY0IDYVhHJg,Dmitri's,Philadelphia,2015-09-23 23:10:31,1,I am a long term frequent customer of this est...,i am a long term frequent customer of this est...,341,65,0,negative
1,jNL5KUPz2-tHUJM__ysSaw,El Chicanito Mexican Restaurant,Port Richey,2014-08-24 20:14:12,1,If you want to pay for everything a la carte t...,if you want to pay for everything a la carte t...,1016,189,0,negative
2,pAgtmlIGqFYaWdBoCUeitw,Holiday Inn Nashville-Vanderbilt,Nashville,2012-12-04 02:40:49,1,The TV shows are $4.99 and they have commercia...,the tv shows are $4.99 and they have commercia...,298,52,0,negative
3,Wy8Hswf2cLQGRZN6armkag,Jack in the Box,Goleta,2011-08-24 23:07:08,1,"If I could give it a zero, I would. I order a ...","if i could give it a zero, i would. i order a ...",712,138,0,negative
4,aY_n9RSaD2Yw09jSFFePew,PizzaMan Dan's,Carpinteria,2017-08-16 15:43:19,1,We visited once and were very disappointed in ...,we visited once and were very disappointed in ...,315,60,0,negative


In [None]:
#cities = all_cities_reviews['city'].unique()
#sorted(cities)

In [None]:
# Number of reviews per city, largest first; rows with a missing city are excluded (dropna=True).
city_counts = all_cities_reviews['city'].value_counts(dropna=True).sort_values(ascending=False)
city_counts.head(20)

Unnamed: 0_level_0,count
city,Unnamed: 1_level_1
Philadelphia,8579
New Orleans,5304
Nashville,3463
Tampa,2779
Tucson,2439
Indianapolis,2402
Saint Louis,2318
Reno,2282
Santa Barbara,1668
Edmonton,869


Comparing `text` with `text_clean` to see what further preprocessing `text_clean` still needs.

In [None]:
# Show full cell contents instead of truncating long strings. This is a global
# pandas option, so it also affects every later display in the notebook.
pd.set_option('display.max_colwidth', None)
all_cities_reviews['text'].head(3)

Unnamed: 0,text
0,I am a long term frequent customer of this establishment. I just went in to order take out (3 apps) and was told they're too busy to do it. Really? The place is maybe half full at best. Does your dick reach your ass? Yes? Go fuck yourself! I'm a frequent customer AND great tipper. Glad that Kanella just opened. NEVER going back to dmitris!
1,"If you want to pay for everything a la carte this is the place for you. \nFood wasn't terrible not impressive.\nThey brought a basket of chips and some tomato sauce which I asked politely for something spicier and some pico de gallo. She brought them happily to me and the salsa was much better. When asked what we would like to drink I asked for a coke and she brought out a bottle which I stated I wanted the fountain drink. She said ""oh that's only Pepsi"". Never mentioned that they only had bottle drinks for coke. \nWe ordered our food which was reasonably priced, asked for sour cream and also to put cheese on the taco. She let us know cheese was extra. \nIt was $2.50 extra for another basket of chips. \nWhen I received the bill we paid more for the condiments then the actual food. Side of sour cream 2.00, pico de gallo 2.50, salsa 2.00, chips 2.50, cheese 1.00 and the bottled coke that we didn't want 5.00. \nJust a suggestion...when you order anything make sure to ask if there is an extra charge."
2,"The TV shows are $4.99 and they have commercials! What a cheesy way to make money and a sign of a less than classy hotel, particularly when you pay more than $150 a night. And there is NO complimentary breakfast, just an overpriced buffet, something even the cheapest hotels in California provide."


In [None]:
# The same three rows from the 'text_clean' column, for comparison with the raw 'text'.
all_cities_reviews['text_clean'].head(3)

Unnamed: 0,text_clean
0,i am a long term frequent customer of this establishment. i just went in to order take out (3 apps) and was told they're too busy to do it. really? the place is maybe half full at best. does your dick reach your ass? yes? go fuck yourself! i'm a frequent customer and great tipper. glad that kanella just opened. never going back to dmitris!
1,"if you want to pay for everything a la carte this is the place for you. food wasn't terrible not impressive. they brought a basket of chips and some tomato sauce which i asked politely for something spicier and some pico de gallo. she brought them happily to me and the salsa was much better. when asked what we would like to drink i asked for a coke and she brought out a bottle which i stated i wanted the fountain drink. she said ""oh that's only pepsi"". never mentioned that they only had bottle drinks for coke. we ordered our food which was reasonably priced, asked for sour cream and also to put cheese on the taco. she let us know cheese was extra. it was $2.50 extra for another basket of chips. when i received the bill we paid more for the condiments then the actual food. side of sour cream 2.00, pico de gallo 2.50, salsa 2.00, chips 2.50, cheese 1.00 and the bottled coke that we didn't want 5.00. just a suggestion...when you order anything make sure to ask if there is an extra charge."
2,"the tv shows are $4.99 and they have commercials! what a cheesy way to make money and a sign of a less than classy hotel, particularly when you pay more than $150 a night. and there is no complimentary breakfast, just an overpriced buffet, something even the cheapest hotels in california provide."


In [None]:
# Count each (sentiment_multiclass, stars) combination to see how labels line up with star ratings.
all_cities_reviews[['sentiment_multiclass', 'stars']].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
sentiment_multiclass,stars,Unnamed: 2_level_1
negative,1,10000
negative,2,10000
neutral,3,10000
positive,4,10000
positive,5,10000


**Classical TF-IDF --> Logistic Regression Workflow**

In [None]:
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

stop_words = set(nltk.corpus.stopwords.words('english'))
# Punctuation table: apostrophes are deleted so contractions remain a single token
# ("wasn't" -> "wasnt", unchanged from before); every other ASCII punctuation mark
# becomes a space so neighbouring words are no longer fused
# ("suggestion...when" -> "suggestion when" rather than "suggestionwhen").
punct_tbl = str.maketrans({ch: (None if ch == "'" else ' ') for ch in string.punctuation})
url_re = re.compile(r'https?://\S+|@\w+', flags=re.IGNORECASE) # secondary url cleanse pass for quality control

def tfidf_prep(txt: str) -> str:
    """Normalise one review for the TF-IDF + logistic-regression workflow.

    Lower-cases the text, removes URLs and @mentions, strips punctuation and
    digit runs, and drops NLTK English stop words.

    Args:
        txt: Review text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    txt = txt.lower() # secondary lowercase assurance
    txt = url_re.sub('', txt)
    txt = txt.translate(punct_tbl) # apostrophes removed, other punctuation -> space
    txt = re.sub(r'\d+', ' ', txt)
    # split() also collapses the extra whitespace left by the substitutions above
    tokens = [w for w in txt.split() if w not in stop_words]

    return ' '.join(tokens)

all_cities_reviews['text_lr'] = (all_cities_reviews['text_clean'].astype(str).map(tfidf_prep)) # function application, stored as column 'text_lr'


In [None]:
# Spot-check the output of tfidf_prep on the first three reviews.
all_cities_reviews['text_lr'].head(3)

In [None]:
# Hold out 20% of the reviews, stratified on the three-way sentiment label.
sentiment_labels = all_cities_reviews['sentiment_multiclass']
X_train, X_test, y_train, y_test = train_test_split(
    all_cities_reviews['text_lr'],
    sentiment_labels,
    test_size=0.20,
    random_state=42,
    stratify=sentiment_labels,
)

# Unigram + bigram TF-IDF features feeding a class-balanced logistic regression.
tfidf_lr = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1,2),
        min_df=5,
        stop_words='english',
        sublinear_tf=True,
    ),
    LogisticRegression(max_iter=400, class_weight='balanced'),
)
tfidf_lr.fit(X_train, y_train)

In [None]:
# Predict once and reuse the result for both metrics instead of running the
# whole pipeline over X_test twice.
y_pred_lr = tfidf_lr.predict(X_test)
print(classification_report(y_test, y_pred_lr, digits=3))
print("macro-F1:", f1_score(y_test, y_pred_lr, average='macro'))

**Pretrained-Embedding BiLSTM Workflow**

In [None]:
# Apostrophes are deleted so contractions stay a single token ("wasn't" -> "wasnt");
# every other ASCII punctuation mark becomes a space so adjacent words are not
# fused together ("suggestion...when" -> "suggestion when").
punc_gl = str.maketrans({ch: (None if ch == "'" else ' ') for ch in string.punctuation})

def glove_clean(txt):
  """Normalise one review before tokenisation for the embedding model.

  Lower-cases the text, removes URLs and @mentions, strips punctuation and
  collapses whitespace.

  Args:
    txt: Review text.

  Returns:
    The cleaned text with words separated by single spaces and no leading or
    trailing whitespace.
  """
  txt = txt.lower()
  txt = re.sub(r'https?://\S+|@\w+', '', txt)

  # split/join trims both ends and collapses the runs of spaces that the
  # punctuation and URL removal leave behind.
  return ' '.join(txt.translate(punc_gl).split())

# Apply glove_clean to every 'text_clean' entry and store the result in a new 'text_gl' column.
all_cities_reviews['text_gl'] = (all_cities_reviews['text_clean'].map(glove_clean))

In [None]:
# Split again, this time on the 'text_gl' column, with the same test_size,
# random_state and stratify arguments as the TF-IDF split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, so the TF-IDF cells
# above must not be re-run after this one without re-running their own split first.
X_train, X_test, y_train, y_test = train_test_split(
    all_cities_reviews['text_gl'],
    all_cities_reviews['sentiment_multiclass'],
    test_size=0.20, random_state=42, stratify=all_cities_reviews['sentiment_multiclass']
)

In [None]:
# Distribution of whitespace-token counts per review, reported at a fine grid of upper percentiles.
# NOTE(review): presumably used to pick max_length in the next cell - confirm.
all_cities_reviews['text_gl'].str.split().str.len().describe([.75, 0.85, .9, 0.925, 0.95, 0.955, 0.96, 0.965, 0.97, 0.975, .98, 0.985, .99, 0.995, 0.998])

In [None]:
# One-time setup: uncomment to download the spaCy model used below.
#!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')
# Maximum number of tokens kept per review: vocab_build and encode slice each
# document to this length, and encode pads shorter sequences up to it.
# NOTE(review): 306 presumably comes from the percentile table above - confirm.
max_length = 306

In [None]:
def vocab_build(texts):
  """Build the token -> integer-id lookup from a collection of texts.

  Only spaCy's tokenizer is run: the function reads nothing but the token
  text, so the remaining pipeline components that nlp.pipe would execute
  contribute nothing to the result.

  Args:
    texts: Iterable of strings (e.g. a pandas Series of cleaned reviews).

  Returns:
    dict mapping every distinct token text found in the first max_length
    tokens of each document to an id. Ids are assigned in sorted token
    order starting at 2, leaving 0 and 1 free (encode uses 0 for padding
    and 1 for unknown tokens).
  """
  vocab = set()

  # Give tqdm a total when the input is sized so the bar shows progress/ETA.
  total = len(texts) if hasattr(texts, '__len__') else None
  docs = nlp.tokenizer.pipe(texts, batch_size=1000)

  for doc in tqdm.tqdm(docs, total=total):
    vocab.update(t.text for t in doc[:max_length])

  return {t: i + 2 for i, t in enumerate(sorted(vocab))}

gl_token_index = vocab_build(X_train)

In [None]:
def encode(texts): # token-to-integer mapping - matrix predecessor
    """Convert texts into a padded 2-D array of token ids.

    Only spaCy's tokenizer is run, because nothing but the token text is
    read here.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of cleaned reviews).

    Returns:
        Array with one row per text and max_length columns. Each token is
        looked up in gl_token_index; tokens missing from it get id 1.
        Documents are cut to their first max_length tokens and shorter rows
        are right-padded with 0.
    """
    # Give tqdm a total when the input is sized so the bar shows progress/ETA.
    total = len(texts) if hasattr(texts, '__len__') else None
    docs = nlp.tokenizer.pipe(texts, batch_size=1000)

    seq = [
        [gl_token_index.get(t.text, 1) for t in doc[:max_length]]
        for doc in tqdm.tqdm(docs, total=total)
    ]

    return tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_length, padding='post', value=0)

X_train_gl, X_test_gl = map(encode, [X_train, X_test])

In [None]:
""" This matrix holds a row per token (absent of first two padding/unknown indices)
    The matrix is of 300 dimensional vectors - 300 seems to be a common dimension choice"""
# Two extra rows are allocated because token ids start at 2; rows 0 (padding)
# and 1 (unknown token) are never assigned below and therefore stay all-zero.
# NOTE(review): 300 is assumed to equal the vector width of the loaded spaCy model - confirm.
glove_matrix = np.zeros((len(gl_token_index)+2, 300), dtype='float32')
for token, index in gl_token_index.items():
    # Copy the spaCy vocabulary vector of this token into the row given by its id.
    glove_matrix[index] = nlp.vocab[token].vector

In [None]:
"""I believe we are learning about this model in Module 10"""

def build_lstm(vocab_size, emb_matrix):
    """Build and compile a bidirectional-LSTM sentiment classifier.

    The embedding layer is initialised from ``emb_matrix`` and is left
    trainable. Its width is taken from the matrix rather than hard-coded.

    Args:
        vocab_size: Number of rows in the embedding table (largest token id + 1).
        emb_matrix: 2-D array of shape (vocab_size, embedding_dim) holding the
            initial embedding vectors.

    Returns:
        A compiled tf.keras.Model that maps integer id sequences of length
        max_length to a 3-way softmax, trained with sparse categorical
        cross-entropy and the Adam optimiser.

    Raises:
        ValueError: If emb_matrix is not 2-D or its row count differs from
            vocab_size.
    """
    emb_matrix = np.asarray(emb_matrix)
    if emb_matrix.ndim != 2 or emb_matrix.shape[0] != vocab_size:
        raise ValueError(
            f"emb_matrix must have shape (vocab_size={vocab_size}, dim); got {emb_matrix.shape}"
        )
    emb_dim = emb_matrix.shape[1]

    inp = tf.keras.Input(shape=(max_length,))

    # A Constant initializer is used instead of the legacy `weights=[...]`
    # constructor kwarg, which not every Keras release accepts.
    x = tf.keras.layers.Embedding(
        vocab_size,
        emb_dim,
        embeddings_initializer=tf.keras.initializers.Constant(emb_matrix),
        trainable=True,
        mask_zero=True,  # id 0 is padding and is masked out for the LSTM
    )(inp)
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(256, return_sequences=False))(x)
    x = tf.keras.layers.Dropout(0.3)(x)

    out = tf.keras.layers.Dense(3, activation='softmax')(x)

    model = tf.keras.Model(inp, out)

    model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

    return model

In [None]:
label_map = {'negative':0, 'neutral':1, 'positive':2}
class_weights = {0:1.0, 1:1.5, 2:1.0} # trying to boost 'neutral' recall

# Seed the Python, NumPy and TensorFlow generators before the model is built so
# weight initialisation, dropout and batch shuffling are repeatable
# (as far as the hardware backend allows).
tf.keras.utils.set_random_seed(42)

# Encode the string labels once and reuse them. Series.map turns any label
# missing from label_map into NaN, so fail loudly instead of training on it.
y_train_ids = y_train.map(label_map)
y_test_ids = y_test.map(label_map)
assert y_train_ids.notna().all() and y_test_ids.notna().all(), "label missing from label_map"
y_train_ids = y_train_ids.values
y_test_ids = y_test_ids.values

lstm_model = build_lstm(len(gl_token_index)+2, glove_matrix)
lstm_model.fit(
    X_train_gl, y_train_ids,
    epochs=10,
    batch_size=512,
    validation_split=0.1,
    class_weight=class_weights,
    verbose=2)

# argmax over the softmax outputs gives the predicted class id per review.
pred = lstm_model.predict(X_test_gl, batch_size=1024).argmax(axis=1)
print(classification_report(y_test_ids, pred, digits=3))
print("macro-F1:", f1_score(y_test_ids, pred, average='macro'))