# Multilabel Classification on Yelp Restaurant Reviews for Segmented Sentiment Analysis
***
#### Training and applying a neural network model to label restaurant reviews as: Service, Food, Miscellaneous, and/or Ambience. <br> After reviews are segmented by topic, a restaurant could analyze in which categories they are underperforming to know where resources should be devoted for improvement
***

### Environment Set-up

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
import gensim.parsing.preprocessing as preprocessing
from gensim.utils import simple_preprocess
import nltk
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
nltk.download(["averaged_perceptron_tagger", "wordnet", "punkt"])

from tqdm.auto import tqdm
tqdm.pandas()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Caitl\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Caitl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Caitl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Caitl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [6]:
from sklearn.model_selection import train_test_split

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

## Phase 1: Train Model on Labeled Dataset

### Import Labeled Data

In [8]:
labeled_data = pd.read_excel("semeval2014_added.xlsx")
labeled_data

# Downloaded a previously labeled dataset and added more labeled samples to it
#https://github.com/Lucasfrota/MultilabelClassificationExample/blob/master/semeval2014.csv

Unnamed: 0,text,service,food,anecdotes/miscellaneous,price,ambience
0,but the staff was so horrible to us,1,0,0,0,0
1,to be completely fair the only redeeming facto...,0,1,1,0,0
2,the food is uniformly exceptional with a very ...,0,1,0,0,0
3,where gabriela personaly greets you and recomm...,1,0,0,0,0
4,for those that go once and dont enjoy it all i...,0,0,1,0,0
...,...,...,...,...,...,...
3814,"Thank you again to the nice host, your company...",1,0,0,0,0
3815,The setting is dark and romantic.,0,0,0,0,1
3816,There is dreamy lighting and a bubbling water ...,0,0,0,0,1
3817,The dish layouts are beautiful.,0,1,0,0,0


### Preprocess text by simplifying and tokenizing

In [10]:
def lemmatize_text(token_list, wnl):
  # POS tag each word
  for word, tag in nltk.pos_tag(token_list):
    # Mapping the pos tags to the types supported by wnl
    if tag.startswith("NN"):
      yield wnl.lemmatize(word, pos='n')
    elif tag.startswith('VB'):
      yield wnl.lemmatize(word, pos='v')
    elif tag.startswith('JJ'):
      yield wnl.lemmatize(word, pos='a')
    elif tag.startswith('RB'):
      yield wnl.lemmatize(word, pos='r')
    else:
      yield wnl.lemmatize(word)

# lower letters, strip_multiple_whitespaces, and stopwords
CUSTOM_FILTERS = [lambda x: x.lower(), preprocessing.strip_multiple_whitespaces, preprocessing.remove_stopwords]
labeled_data["processed_text"] = labeled_data["text"].apply(lambda x: preprocessing.preprocess_string(x, CUSTOM_FILTERS))

# lemmatize the tokens
wnl = nltk.WordNetLemmatizer()
labeled_data["processed_text"] = labeled_data["processed_text"].progress_apply(lambda x: simple_preprocess(" ".join(lemmatize_text(x, wnl))))

  0%|          | 0/3819 [00:00<?, ?it/s]

In [11]:
# Checking tokenization
labeled_data

Unnamed: 0,text,service,food,anecdotes/miscellaneous,price,ambience,processed_text
0,but the staff was so horrible to us,1,0,0,0,0,"[staff, horrible]"
1,to be completely fair the only redeeming facto...,0,1,1,0,0,"[completely, fair, redeem, factor, food, avera..."
2,the food is uniformly exceptional with a very ...,0,1,0,0,0,"[food, uniformly, exceptional, capable, kitche..."
3,where gabriela personaly greets you and recomm...,1,0,0,0,0,"[gabriela, personaly, greets, recommend, eat]"
4,for those that go once and dont enjoy it all i...,0,0,1,0,0,"[dont, enjoy, dont]"
...,...,...,...,...,...,...,...
3814,"Thank you again to the nice host, your company...",1,0,0,0,0,"[thank, nice, host, company, conversation, enj..."
3815,The setting is dark and romantic.,0,0,0,0,1,"[set, dark, romantic]"
3816,There is dreamy lighting and a bubbling water ...,0,0,0,0,1,"[dreamy, light, bubbling, water, fountain, cor..."
3817,The dish layouts are beautiful.,0,1,0,0,0,"[dish, layout, beautiful]"


### Create Word Embeddings

In [13]:
# Using GloVe
embeddings_dict = {}
with open("glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [14]:
# Testing max length of review sentences in this labeled dataset
list_of_texts = labeled_data['processed_text']

review_lengths = [len(review) for review in list_of_texts]
maxlen = max(review_lengths)
print("Maximum length of sequences:", maxlen)

Maximum length of sequences: 37


In [15]:
# Based on above, maxlen of 75 should be plenty
maxlen = 75
tokenizer = Tokenizer()
tokenizer.fit_on_texts(labeled_data['processed_text'])  
sequences = tokenizer.texts_to_sequences(labeled_data['processed_text'])

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_dim = 50  

# Create embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Pad sequences to ensure all sentences are ultimately stored as the same length
data = pad_sequences(sequences, maxlen=maxlen)


In [16]:
# 5 potential labels for classification
num_labels = 5 

### Recurrent Neural Network Model

In [18]:
# Standard LSTM (Long Short-Term Memory) model, inputs sequences processed from left to right
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    #input_length=maxlen,
                    trainable=False))  
model.add(LSTM(128))
model.add(Dense(num_labels, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [19]:
# Extract the labels from the DataFrame
labels_columns = ['service', 'food', 'anecdotes/miscellaneous', 'price', 'ambience']
labels = labeled_data[labels_columns]

#Convert labels to a binary matrix
num_samples = len(labeled_data)
num_classes = len(labels_columns)
binary_labels = np.zeros((num_samples, num_classes))
for i, col in enumerate(labels_columns):
    binary_labels[:, i] = labeled_data[col]

In [20]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data, binary_labels, test_size=0.3, random_state=42)

In [21]:
# Train model
model.fit(X_train, y_train, epochs=10, batch_size=100, validation_split=0.3)

Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 230ms/step - accuracy: 0.4016 - loss: 0.6235 - val_accuracy: 0.5910 - val_loss: 0.4687
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 203ms/step - accuracy: 0.5698 - loss: 0.4395 - val_accuracy: 0.6309 - val_loss: 0.3844
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 196ms/step - accuracy: 0.6158 - loss: 0.3844 - val_accuracy: 0.6521 - val_loss: 0.3546
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 195ms/step - accuracy: 0.6535 - loss: 0.3455 - val_accuracy: 0.6796 - val_loss: 0.3331
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 193ms/step - accuracy: 0.7158 - loss: 0.3097 - val_accuracy: 0.6908 - val_loss: 0.3278
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 193ms/step - accuracy: 0.7299 - loss: 0.2947 - val_accuracy: 0.7282 - val_loss: 0.3069
Epoch 7/10
[1m19/19[0m [3

<keras.src.callbacks.history.History at 0x186cfe16050>

In [22]:
# Evaluate model

In [23]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.7359 - loss: 0.2908
Test Loss: 0.2860836982727051
Test Accuracy: 0.7338569164276123


### Second Recurrent Neural Network Model

In [25]:
# Bidirectional LSTM model, processes both forward and backward

model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    #input_length=maxlen,
                    trainable=False))  
model.add(Bidirectional(LSTM(128, return_sequences=True)))  
model.add(Bidirectional(LSTM(64)))  
model.add(Dense(num_labels, activation='sigmoid'))  

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=100, validation_split=0.3)

# Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 454ms/step - accuracy: 0.3476 - loss: 0.5945 - val_accuracy: 0.5973 - val_loss: 0.4341
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 391ms/step - accuracy: 0.5829 - loss: 0.4062 - val_accuracy: 0.6297 - val_loss: 0.3790
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 397ms/step - accuracy: 0.6583 - loss: 0.3605 - val_accuracy: 0.6933 - val_loss: 0.3310
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 395ms/step - accuracy: 0.6998 - loss: 0.3120 - val_accuracy: 0.7244 - val_loss: 0.3191
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 420ms/step - accuracy: 0.7402 - loss: 0.2858 - val_accuracy: 0.7382 - val_loss: 0.2983
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 397ms/step - accuracy: 0.7428 - loss: 0.2805 - val_accuracy: 0.7244 - val_loss: 0.2944
Epoch 7/10
[1m19/19[0m [

#### There's not much of a difference between the two models. Let's evaluate the performance within the different topics.

In [26]:
from sklearn.metrics import classification_report

# Get model predictions
y_pred = model.predict(X_test)

# Round the predictions to get binary values
y_pred_binary = (y_pred > 0.5).astype(int)

# Generate classification report
report = classification_report(y_test, y_pred_binary, target_names=labels_columns)
print(report)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 73ms/step
                         precision    recall  f1-score   support

                service       0.87      0.60      0.71       216
                   food       0.89      0.80      0.84       489
anecdotes/miscellaneous       0.78      0.77      0.78       394
                  price       0.73      0.39      0.51       111
               ambience       0.66      0.54      0.59       152

              micro avg       0.82      0.70      0.75      1362
              macro avg       0.78      0.62      0.69      1362
           weighted avg       0.82      0.70      0.75      1362
            samples avg       0.77      0.73      0.74      1362



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Let's see how imbalanced the labeled dataset is

In [63]:
# Count the number of rows where service = 1
service_count = (labeled_data['service'] == 1).sum()

# Count the number of rows where food = 1
food_count = (labeled_data['food'] == 1).sum()

# Count the number of rows where anecdotes/miscellaneous = 1
anecdotes_count = (labeled_data['anecdotes/miscellaneous'] == 1).sum()

# Count the number of rows where price = 1
price_count = (labeled_data['price'] == 1).sum()

# Count the number of rows where ambience = 1
ambience_count = (labeled_data['ambience'] == 1).sum()

# Print the counts
print("Number of rows where service = 1:", service_count)
print("Number of rows where food = 1:", food_count)
print("Number of rows where anecdotes/miscellaneous = 1:", anecdotes_count)
print("Number of rows where price = 1:", price_count)
print("Number of rows where ambience = 1:", ambience_count)

Number of rows where service = 1: 746
Number of rows where food = 1: 1600
Number of rows where anecdotes/miscellaneous = 1: 1335
Number of rows where price = 1: 359
Number of rows where ambience = 1: 487


#### Not surprisingly, the quality of the F1 score follows the quantity of labeled data for each topic
<br>
<br>
<br>

## Phase 2: Apply Model to Yelp Restaurant Reviews and Understand Restaurant Performance by Topic
***
#### Taking a subset of Restaurant Yelp Reviews that are in the 3 star range. This group is performing OK, but could use more information on how they can moderately improve to jump into the 4 star range.

### Import Yelp Datasets, Filter, and Combine

In [29]:
# Load Yelp Review dataset
reviews = pd.read_json('yelp_academic_dataset_review.json', lines=True)
reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5,1,2,1,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20
6990276,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5,2,1,2,"This spot offers a great, affordable east week...",2021-03-31 16:55:10
6990277,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
6990278,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5,1,0,0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27


In [30]:
# Load Yelp Business dataset
business = pd.read_json('yelp_academic_dataset_business.json', lines=True)
business

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [31]:
# Drop N/As in Cateogry column and filter for Restaurants only
business.dropna(subset=['categories'], inplace=True) 
restaurants = business[business['categories'].str.contains('Restaurants', case=False)]
restaurants

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...",
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.768170,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.456320,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150325,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,11,1,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Restaurants, Sandwiches, Convenience Stores, C...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
150327,cM6V90ExQD6KMSU3rRB5ZA,Dutch Bros Coffee,1181 N Milwaukee St,Boise,ID,83704,43.615401,-116.284689,4.0,33,1,"{'WiFi': ''free'', 'RestaurantsGoodForGroups':...","Cafes, Juice Bars & Smoothies, Coffee & Tea, R...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-17:0', '..."
150336,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,1,"{'WheelchairAccessible': 'False', 'Restaurants...","Restaurants, Mexican","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
150339,2O2K6SXPWv56amqxCECd4w,The Plum Pit,4405 Pennell Rd,Aston,DE,19014,39.856185,-75.427725,4.5,14,1,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."


In [32]:
# Narrow down list to businesses where there's room for minor improvements
restaurants = restaurants[(restaurants['stars'] >= 3) & (restaurants['stars'] <= 4)]
restaurants

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...",
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.456320,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,4.0,245,1,"{'RestaurantsReservations': 'True', 'Restauran...","Sushi Bars, Restaurants, Japanese","{'Tuesday': '13:30-22:0', 'Wednesday': '13:30-..."
20,WKMJwqnfZKsAae75RMP6jA,Roast Coffeehouse and Wine Bar,10359 104 Street NW,Edmonton,AB,T5J 1B9,53.546045,-113.499169,4.0,40,0,"{'OutdoorSeating': 'False', 'Caters': 'True', ...","Coffee & Tea, Food, Cafes, Bars, Wine Bars, Re...","{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150313,sf_oQ62L8UEnOOLf00nNGA,Pizza Hut,5028 Old Hickory,Hermitage,TN,37076,36.193201,-86.614748,3.0,6,1,"{'RestaurantsTakeOut': 'True', 'GoodForKids': ...","Restaurants, Pizza, Fast Food, Chicken Wings, ...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
150319,8n93L-ilMAsvwUatarykSg,Kitchen Gia,3716 Spruce St,Philadelphia,PA,19104,39.951018,-75.198240,3.0,22,0,"{'RestaurantsGoodForGroups': 'True', 'BikePark...","Coffee & Tea, Food, Sandwiches, American (Trad...","{'Monday': '9:0-19:30', 'Tuesday': '9:0-19:30'..."
150322,2MAQeAqmD8enCT2ZYqUgIQ,The Melting Pot - Nashville,"166 2nd Ave N, Ste A",Nashville,TN,37201,36.163875,-86.776311,4.0,204,0,"{'RestaurantsDelivery': 'False', 'RestaurantsR...","Fondue, Beer, Wine & Spirits, Food, Restaurants","{'Monday': '0:0-0:0', 'Tuesday': '16:0-21:0', ..."
150325,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,11,1,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Restaurants, Sandwiches, Convenience Stores, C...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."


In [33]:
# Narrow down list to businesses with a minimum of 80 reviews so there's datapoints
restaurants = restaurants[restaurants['review_count'] >= 80]
restaurants

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,4.0,245,1,"{'RestaurantsReservations': 'True', 'Restauran...","Sushi Bars, Restaurants, Japanese","{'Tuesday': '13:30-22:0', 'Wednesday': '13:30-..."
27,tMkwHmWFUEXrC9ZduonpTg,The Green Pheasant,215 1st Ave S,Nashville,TN,37201,36.159886,-86.773197,4.0,161,0,"{'RestaurantsGoodForGroups': 'True', 'HappyHou...","Restaurants, Japanese, Seafood","{'Wednesday': '16:0-22:0', 'Thursday': '16:0-2..."
33,kV_Q1oqis8Qli8dUoGpTyQ,Ardmore Pizza,10 Rittenhouse Pl,Ardmore,PA,19003,40.006707,-75.289671,3.5,109,1,"{'RestaurantsGoodForGroups': 'True', 'WiFi': '...","Pizza, Restaurants","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ..."
41,ljxNT9p0y7YMPx0fcNBGig,Tony's Restaurant & 3rd Street Cafe,312 Piasa St,Alton,IL,62002,38.896563,-90.186203,3.0,94,1,"{'RestaurantsReservations': 'True', 'Restauran...","Restaurants, Specialty Food, Steakhouses, Food...","{'Monday': '0:0-0:0', 'Tuesday': '16:0-21:30',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150254,Ld3qgGua7MqrFnZY8AIo0A,Café Ventana,3919 W Pine Blvd,Saint Louis,MO,63108,38.638030,-90.242133,3.5,290,0,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Food, Breakfast & Brunch, Cajun/Creole, Restau...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
150260,N8fK2E6YNyo04DbVNvgIQw,Sage Mediterranean,150 Bridge St,Phoenixville,PA,19460,40.134042,-75.514528,4.0,118,1,"{'WiFi': ''no'', 'RestaurantsAttire': ''casual...","Restaurants, Mediterranean","{'Tuesday': '11:30-22:30', 'Wednesday': '11:30..."
150275,IeSD0nMKRFYUTnR5nZH1CQ,HighWire Lounge,14 S Arizona Ave,Tucson,AZ,85701,32.221828,-110.967969,3.5,111,1,"{'BusinessParking': '{'garage': False, 'street...","Bars, Tapas Bars, Restaurants, Nightlife, Gast...","{'Tuesday': '17:0-2:0', 'Wednesday': '17:0-2:0..."
150290,uriD7RFuHhLJeDdKaf0nFA,Pizza Guru,3534 State St,Santa Barbara,CA,93105,34.440689,-119.739681,4.0,299,0,"{'NoiseLevel': 'u'average'', 'RestaurantsGoodF...","Restaurants, Pizza, Food","{'Monday': '15:0-21:0', 'Tuesday': '15:0-21:0'..."


In [34]:
# Merge restaraunt dataframe with reviews dataframe
mid_rest_reviews = pd.merge(reviews, restaurants, on='business_id', how='inner')
mid_rest_reviews

Unnamed: 0,review_id,user_id,business_id,stars_x,useful,funny,cool,text,date,name,...,state,postal_code,latitude,longitude,stars_y,review_count,is_open,attributes,categories,hours
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2,0,0,0,This is the second time we tried turning point...,2017-05-13 17:06:55,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4,2,0,1,The place is cute and the staff was very frien...,2017-08-08 00:58:18,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,We came on a Saturday morning after waiting a ...,2017-11-19 02:20:23,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2,0,0,0,"Mediocre at best. The decor is very nice, and ...",2017-09-09 17:49:47,Turning Point of North Wales,...,PA,19454,40.210196,-75.223639,3.0,169,1,"{'NoiseLevel': 'u'average'', 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673092,rbPKXUuCEohjKLfLP_4CVg,-6bsEIMFz6ndaWXaakISSw,c3QxX3toWdqJnKQmmIliRQ,5,4,1,3,"ITS BRINGING SEXY BACK. New bar, 5th and Broad...",2021-04-27 22:20:51,Honky Tonk - The Twelve Thirty Club,...,TN,37203,36.160630,-86.778829,4.0,110,1,"{'HasTV': 'True', 'OutdoorSeating': 'True', 'R...","American (Traditional), Restaurants, Cocktail ...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-0:0', '..."
2673093,yvZTL0ffB7B5KIbn_CTURw,sGgJvAky5QSKI26bqUAqxQ,c3QxX3toWdqJnKQmmIliRQ,4,0,0,0,"Live music, great food, and excellent service(...",2021-08-06 14:25:56,Honky Tonk - The Twelve Thirty Club,...,TN,37203,36.160630,-86.778829,4.0,110,1,"{'HasTV': 'True', 'OutdoorSeating': 'True', 'R...","American (Traditional), Restaurants, Cocktail ...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-0:0', '..."
2673094,-O5_SIS43vIta6ZcZe8nNA,swpVVjVkQU5FmpPuueOAzA,c3QxX3toWdqJnKQmmIliRQ,4,0,0,0,"Very nice, large and airy restaurant right in ...",2021-09-24 14:33:49,Honky Tonk - The Twelve Thirty Club,...,TN,37203,36.160630,-86.778829,4.0,110,1,"{'HasTV': 'True', 'OutdoorSeating': 'True', 'R...","American (Traditional), Restaurants, Cocktail ...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-0:0', '..."
2673095,mJHxrN38MxEW910nhNupzg,uqhRApncBooRxJ0dVadS3w,c3QxX3toWdqJnKQmmIliRQ,4,5,1,1,The food and drinks were good! I ordered the p...,2021-04-22 13:49:47,Honky Tonk - The Twelve Thirty Club,...,TN,37203,36.160630,-86.778829,4.0,110,1,"{'HasTV': 'True', 'OutdoorSeating': 'True', 'R...","American (Traditional), Restaurants, Cocktail ...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-0:0', '..."


In [35]:
# Simplify dataframe
mid_rest_reviews.rename(columns={'stars_x': 'review_stars'}, inplace=True)
mid_rest_reviews.rename(columns={'stars_y': 'overall_stars'}, inplace=True)

desired_order = ['business_id', 'overall_stars', 'text', 'review_stars']  
mid_rest_reviews = mid_rest_reviews[desired_order]  
mid_rest_reviews

Unnamed: 0,business_id,overall_stars,text,review_stars
0,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,"If you decide to eat here, just be aware it is...",3
1,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,This is the second time we tried turning point...,2
2,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,The place is cute and the staff was very frien...,4
3,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,We came on a Saturday morning after waiting a ...,3
4,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,"Mediocre at best. The decor is very nice, and ...",2
...,...,...,...,...
2673092,c3QxX3toWdqJnKQmmIliRQ,4.0,"ITS BRINGING SEXY BACK. New bar, 5th and Broad...",5
2673093,c3QxX3toWdqJnKQmmIliRQ,4.0,"Live music, great food, and excellent service(...",4
2673094,c3QxX3toWdqJnKQmmIliRQ,4.0,"Very nice, large and airy restaurant right in ...",4
2673095,c3QxX3toWdqJnKQmmIliRQ,4.0,The food and drinks were good! I ordered the p...,4


### Pick random restaurant to test on as proof of concept

In [36]:
# Get a unique list of business_ids
unique_business_ids = mid_rest_reviews['business_id'].unique()

# Set the random seed to a fixed value for repeatability
np.random.seed(42)

# Pick one business_id at random
random_business_id = np.random.choice(unique_business_ids)

# Filter the dataframe to get all reviews for the randomly chosen business_id
selected_business = mid_rest_reviews[mid_rest_reviews['business_id'] == random_business_id]

# Expand to see full text
pd.set_option('display.max_colwidth', None)
selected_business

Unnamed: 0,business_id,overall_stars,text,review_stars
1839941,grBPIq_eJCT_SGuhvMrUZQ,3.5,"Decent place to come grab pizza. Crust was a little underdone on mine, but then again, I came on the busiest day of the year.\n\nI did a build your own pizza with the following:\n\nSauce: Red Sauce\nCheese: Shredded mozzarella with feta sprinkled around\nMeat: Smoked Ham, Bacon bits, Pepperoni\nVeggies: Red pepper, roasted garlic, basil, mushrooms\n\nThere really wasn't much to complain about on the pizza, but at the same time, nothing that made you take a step back and go ""wow, that was amazing.""\n\nI stopped by last night with a friend to take advantage of their Pi Day deal. At $3.14 a pizza, it was a steal, but then again, not every day is pie day. Decent pizza, and it's nice that you can customize it completely without spending a fortune. Just nothing outstanding.\n\nI just got this vibe that they minimize their ingredients. Thinnest layer of sauce I've seen on a pizza, 5 leaves of basil used when I asked for it as a topping, etc. \n\nIMHO, better than pieology by a lot, but there are other places I'd rather go. An A-OK from me.",3
1839942,grBPIq_eJCT_SGuhvMrUZQ,3.5,"This place is amazing. I love that you can make your own pizza the way you want. The girl at the register, Tatiana, was so friendly. The restaurant was clean as well.",5
1839943,grBPIq_eJCT_SGuhvMrUZQ,3.5,"This location is great! Only opened for about a month but service was fast and friendly. I love me some personalized pizza and sometimes franchise establishments become jaded, but this one is wonderful. So close to my house which means it's so dangerous! Plenty of parking in the 5 points plaza.\n\nMake sure to get the pesto drizzle at the end ;)\n\n- J",5
1839944,grBPIq_eJCT_SGuhvMrUZQ,3.5,They get an instant review from me. This is a build your own pizza kind of place although they have some default menu. The service was quick and the staff was attentive! Easy 4 star. Keep it up!,4
1839945,grBPIq_eJCT_SGuhvMrUZQ,3.5,"This is a great place to stop in and get a quick, hardy pizza and salad. There isn't anything else on the menu and the air conditioning is too cold for our liking, but they have really tasty pizza and salads served up quick. We are definitely returning here because they also have gluten-free pizza crust available and daiya cheese alternative for my lactose and gluten free children. We are happy!",4
...,...,...,...,...
1840087,grBPIq_eJCT_SGuhvMrUZQ,3.5,Loooove coming here! I get the allergy GF crust build your own pizza. The staff is really nice & they are pretty fast at cooking your pizza. I checked their ingredients online for the sauce & crust- no additives!,5
1840088,grBPIq_eJCT_SGuhvMrUZQ,3.5,"Had a BOGO coupon, figured it couldn't hurt to try it out... most impressed at the new State Steet location! Staff was super friendly and we're very good at guiding you through the process of choosing a ton of different toppings/add ons. Got the Green Stripe and Art Lover... you can add any additional toppings with no extra charge, you can also add to their already listed signature pizzas. The line moved very quickly. It cooked so fast, it was hot and very flavorful. Very delish! To go boxes are ready on a shelf if neeed, a ton of Parm, red peppers and basil shakers available. The selection of lemonades looked great! Seating is a little cramped and has a weird layout and they should add one more trash receptacle towards the back, so people could have easier access. Customer service was def 2 up, we were checked in on twice. Would definitely go back. For a personal pizza for $8.25, you can't go wrong! Convenient, good parking, good price, filling and tasty, definitely recommend.",5
1840089,grBPIq_eJCT_SGuhvMrUZQ,3.5,"Customize your pizza for only $9.00\nQuick, simple, good. \nEverything is very fresh and there are a lot of topping options, though none of them are organic. \nIt is the best affordable option in this area of SB.",3
1840090,grBPIq_eJCT_SGuhvMrUZQ,3.5,"First off, there's like 13 kids working and doing absolutely nothing. Literally running into each other laughing and joking. I got to the counter and ordered my pizza I asked the kid if the cauliflower crust was any good, he said ""dude I don't know ...some people like it""\nIt Would be wise for blaze Pizza to have their employees at least try some of their menu options so they can give positive or negative feedback to the customer, It would help the customer with his or her choices.\nI ordered the veg out... Big mistake! There's literally three vegetables on it Gorgonzola and a ton of marinara!! The combination of broccoli, Gorgonzola and red onion taste like fart! I was very disappointed because I have had this pizza before at a Blaze in Los Angeles and it was choice! This kid piled it on like he was at Subway sandwich! Lol!!! Why do you have so many employees? Sorry for the bad review i'm only trying to help!!\nThe kid at the register was totally cool. He was polite and intelligent I don't know his name wish I did because you guys should make him manager!!",1


### Apply training model on each text segment of a review
***
#### A review can cover multiple topics in one review, or even multiple topics in one sentence, such as "Food was great, but service was terrible". Therefore, the model needs to be applied on each segment instead of the reviews as a whole or sentences as a whole.

In [37]:
# List of punctuation marks to divide text segments on
punctuation_marks = [",", ".", "?", "!", ";"]

In [38]:
def custom_tokenize(text):
    # Manually tokenize the text to separate punctuation
    tokens = []
    current_word = ""
    for char in text:
        if char in string.punctuation:
            if current_word:
                tokens.append(current_word)
                current_word = ""
            tokens.append(char)  # Add the punctuation as a separate token
        elif char.isspace():
            if current_word:
                tokens.append(current_word)
                current_word = ""
        else:
            current_word += char
    if current_word:
        tokens.append(current_word)  # Add the last word if exists
    return tokens

# Apply the custom tokenize function
tokenized_texts = [custom_tokenize(text) for text in selected_business['text']]

# Initialize Tokenizer without filtering out any punctuation
tokenizer = Tokenizer(filters='')  # No filters to retain punctuation

# Fit tokenizer on the tokenized texts
tokenizer.fit_on_texts(tokenized_texts)

# Initialize an empty list to store all predictions
all_predictions = []

# Iterate through each tokenized text
for tokenized_text in tokenized_texts:
    # Initialize lists to store segment texts and their corresponding predictions
    segment_texts = []
    segment_predictions = []
    
    # Initialize variables to store the current segment text and its punctuation mark
    current_segment = ""
    current_punctuation = None
    
    # Iterate through each token in the tokenized text
    for token in tokenized_text:
        # If the token is a punctuation mark
        if token in punctuation_marks:
            # Check if the current segment is not empty
            if current_segment:
                # Append the current segment text and make prediction for it
                segment_texts.append(current_segment.strip())
                segment_predictions.append(model.predict(pad_sequences(tokenizer.texts_to_sequences([current_segment]), maxlen=maxlen))[0])
            
            # Update the current punctuation mark and reset the current segment
            current_punctuation = token
            current_segment = ""
        else:
            # Append the token to the current segment text
            current_segment += token + " "
    
    # Append the last segment text and make prediction for it
    if current_segment:
        segment_texts.append(current_segment.strip())
        segment_predictions.append(model.predict(pad_sequences(tokenizer.texts_to_sequences([current_segment]), maxlen=maxlen))[0])
    
    # Append the segment predictions to the list of all predictions
    all_predictions.append((segment_texts, segment_predictions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4

In [39]:
# Check
all_predictions

[(['Decent place to come grab pizza',
   'Crust was a little underdone on mine',
   'but then again',
   'I came on the busiest day of the year',
   'I did a build your own pizza with the following : Sauce : Red Sauce Cheese : Shredded mozzarella with feta sprinkled around Meat : Smoked Ham',
   'Bacon bits',
   'Pepperoni Veggies : Red pepper',
   'roasted garlic',
   'basil',
   "mushrooms There really wasn ' t much to complain about on the pizza",
   'but at the same time',
   'nothing that made you take a step back and go " wow',
   'that was amazing',
   '" I stopped by last night with a friend to take advantage of their Pi Day deal',
   'At $ 3',
   '14 a pizza',
   'it was a steal',
   'but then again',
   'not every day is pie day',
   'Decent pizza',
   "and it ' s nice that you can customize it completely without spending a fortune",
   'Just nothing outstanding',
   'I just got this vibe that they minimize their ingredients',
   "Thinnest layer of sauce I ' ve seen on a pizz

### Map text, according to model's predictions, into an array that groups by topic

In [58]:
# Mapping of index to topics
topics = ['service', 'food', 'anecdotes/miscellaneous', 'price', 'ambience']

# Dictionary to hold the tokenized texts classified under each label
tokens_for_labels = {topic: [] for topic in topics}

# Threshold for classification
threshold = 0.80

# Iterate through all_predictions
for segment_texts, segment_predictions in all_predictions:
    for text, prediction in zip(segment_texts, segment_predictions):
        # Iterate through each probability in prediction
        for index, prob in enumerate(prediction):
            # Check if probability exceeds the threshold
            if prob > threshold:
                # Append the text to the corresponding topic in the dictionary
                tokens_for_labels[topics[index]].append(text)

In [59]:
# Check
tokens_for_labels

{'service': ['I came on the busiest day of the year',
  'I did a build your own pizza with the following : Sauce : Red Sauce Cheese : Shredded mozzarella with feta sprinkled around Meat : Smoked Ham',
  '" I stopped by last night with a friend to take advantage of their Pi Day deal',
  'I just got this vibe that they minimize their ingredients',
  "Thinnest layer of sauce I ' ve seen on a pizza",
  "but there are other places I ' d rather go",
  'I love that you can make your own pizza the way you want',
  'I love me some personalized pizza and sometimes franchise establishments become jaded',
  "So close to my house which means it ' s so dangerous",
  'hardy pizza and salad',
  'We are definitely returning here because they also have gluten - free pizza crust available and daiya cheese alternative for my lactose and gluten free children',
  'I called the State Street location at 6 : 40pm last night and placed my order for 1 pizza',
  'so the order definitely came through',
  "I showed

#### Threshold will need to be further fine tuned. This was just a general proof of concept for one restaurant. 
<br>
<br>
<br>

### Retrieve phrases to understand how model is performing on classification. 

In [44]:
# Define a function to search for a text component and retrieve corresponding prediction
def retrieve_segment_and_prediction(all_predictions, search_text_component):
    for segment_texts, segment_predictions in all_predictions:
        for text, prediction in zip(segment_texts, segment_predictions):
            if search_text_component in text:
                return text, prediction
    # If the text component is not found, return None
    return None, None

# Example usage:
search_text_component = "the service was always fast and friendly but lately we"  # Replace "example" with the text component you want to search for
segment_text, segment_prediction = retrieve_segment_and_prediction(all_predictions, search_text_component)

if segment_text is not None:
    print("Segment Text:", segment_text)
    print("Corresponding Prediction:", segment_prediction)
else:
    print("Text component not found in any segment.")


Segment Text: as the service was always fast and friendly but lately we ' ve noticed a disappointing decline in the customer service
Corresponding Prediction: [0.21523188 0.81354433 0.10245086 0.04549364 0.6187326 ]


#### There is much room for improvement. In this example, the segment is being allocated to the Food topic instead of the Service topic. Training dataset likely needs to be larger, and more balanced. Additionally, an even more robust model with transformer architecture, such as BERT and ChatGPT would likely also increase performance.
<br>
<br>

### Continuing proof of concept by applying a Sentiment Analyzer to text in each topic

In [51]:
# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Dictionary to store sentiment scores for each topic
sentiment_scores = {topic: [] for topic in topics}

# Iterate through topics and corresponding texts
for topic, texts in tokens_for_labels.items():
    for text in texts:
        # Calculate sentiment scores for each text
        sentiment_score = sia.polarity_scores(text)
        sentiment_scores[topic].append(sentiment_score)

# Check
for topic, scores in sentiment_scores.items():
    print(f"{topic} sentiment scores: {scores[:10]}")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Caitl\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


service sentiment scores: [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.105, 'neu': 0.608, 'pos': 0.287, 'compound': 0.5106}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 0.645, 'pos': 0.355, 'compound': 0.6705}, {'neg': 0.165, 'neu': 0.57, 'pos': 0.266, 'compound': 0.3818}, {'neg': 0.307, 'neu': 0.693, 'pos': 0.0, 'compound': -0.6113}, {'neg': 0.0, 'neu': 0.526, 'pos': 0.474, 'compound': 0.4019}]
food sentiment scores: [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.2, 'neu': 0.8, 'pos': 0.0, 'compound': -0.3612}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 0.506,

### Averaging sentiment scores within each topic for a generalized sentiment score for each

In [60]:
# Dictionary to store overall sentiment scores for each topic
overall_sentiment_scores = {}

# Iterate through topics and corresponding sentiment scores
for topic, scores in sentiment_scores.items():
    # Calculate the sum of compound scores for each topic
    compound_sum = sum(score['compound'] for score in scores)
    # Calculate the average compound score for each topic
    num_scores = len(scores)
    average_compound_score = compound_sum / num_scores
    overall_sentiment_scores[topic] = average_compound_score

# Check
for topic, average_score in overall_sentiment_scores.items():
    print(f"{topic} overall sentiment score (average): {average_score}")

service overall sentiment score (average): 0.15729909365558886
food overall sentiment score (average): 0.16541710526315784
anecdotes/miscellaneous overall sentiment score (average): 0.09867857142857142
price overall sentiment score (average): 0.04827727272727273
ambience overall sentiment score (average): 0.19590322580645161


***
#### Once model is in a good place after many iterations with different restaurants, and improvements mentioned above, you could use the ouput above for analysis. In this example, you could maybe conclude there is a mismatch between price and what a customer is getting food-wise and service-wise. However in the model's current state, you likely could not feel confident in its analysis. 


Number of rows where service = 1: 746
Number of rows where food = 1: 1600
Number of rows where anecdotes/miscellaneous = 1: 1335
Number of rows where price = 1: 359
Number of rows where ambience = 1: 487
