#### Imports

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.test.utils import datapath


import pickle # to save models and results
import os
# hyperopt - used for Bayesian hyperparameter tuning
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.pyll import scope

# for timing
from time import time
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicjh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicjh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  def _figure_formats_changed(self, name, old, new):


#### Read in Dataset

In [2]:
df_reviews = pd.read_csv('data/final_restaurant_review_data.csv', lineterminator='\n')

In [3]:
df_reviews.head(3)

Unnamed: 0,rating,date,title,description,date_of_visit,url,num_of_img_uploaded,full_review,num_of_tokens_title,num_of_tokens_description,num_of_tokens_full_review,title_sentiment,description_sentiment,full_review_sentiment,review_sentiment_category
0,3.0,2021-12-04,Actually ordered online with Deliveroo! disapp...,Ordered a chicken Cobb salad which is meant to...,2021-12-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Actually ordered online with Deliveroo! disapp...,6.0,61.0,67.0,-0.3,-0.3625,-0.3625,0.0
1,1.0,2021-01-26,Hate to write bad reviews but....,"Bad service, bad attitude of owner and average...",2021-01-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,"Hate to write bad reviews but.... Bad service,...",6.0,16.0,22.0,-0.75,-0.13,-0.13,0.0
2,4.0,2020-08-21,Tempting pizzas,I went with a couple a family friend pair who ...,2020-03-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Tempting pizzas I went with a couple a family ...,2.0,34.0,36.0,0.0,0.566667,0.566667,1.0


#### Processing Dataset

In [4]:
df_reviews['title'] = df_reviews['title'].replace(np.nan, "")
df_reviews['description'] = df_reviews['description'].replace(np.nan, "")
df_reviews['content'] = df_reviews['title'] + ". " + df_reviews["description"]
# lowercase all
df_reviews['content'] = df_reviews['content'].str.lower()

In [5]:
# Convert to list
data = df_reviews['content'].values.tolist()

In [6]:
# Remove new line characters
data = [re.sub(r'\s+', ' ', i) for i in data]

#### Text Preprocessing - Remove Emojis

In [7]:
import re
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [8]:
no_emoji = [remove_emoji(i) for i in data]

In [9]:
print(data[:1])

['actually ordered online with deliveroo! disappointing. ordered a chicken cobb salad which is meant to come with poached egg which didn’t exist. not sure what cobb means i thought corn. i called restaurant to just give feedback before writing this review we’ll talk about disappointing- check your order if ordering online....my husband had a prawn marinara which had exactly 4 tiny prawns 🍤 in it why bothermore']


In [10]:
print(no_emoji[:1])

['actually ordered online with deliveroo! disappointing. ordered a chicken cobb salad which is meant to come with poached egg which didn’t exist. not sure what cobb means i thought corn. i called restaurant to just give feedback before writing this review we’ll talk about disappointing- check your order if ordering online....my husband had a prawn marinara which had exactly 4 tiny prawns  in it why bothermore']


#### Text Preprocessing - Tokenize words 

- tokenize each sentence into a list of words, removing punctuations

In [11]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(no_emoji))

print(data_words[:1])

[['actually', 'ordered', 'online', 'with', 'deliveroo', 'disappointing', 'ordered', 'chicken', 'cobb', 'salad', 'which', 'is', 'meant', 'to', 'come', 'with', 'poached', 'egg', 'which', 'didn', 'exist', 'not', 'sure', 'what', 'cobb', 'means', 'thought', 'corn', 'called', 'restaurant', 'to', 'just', 'give', 'feedback', 'before', 'writing', 'this', 'review', 'we', 'll', 'talk', 'about', 'disappointing', 'check', 'your', 'order', 'if', 'ordering', 'online', 'my', 'husband', 'had', 'prawn', 'marinara', 'which', 'had', 'exactly', 'tiny', 'prawns', 'in', 'it', 'why', 'bothermore']]


#### Text Preprocessing - Remove Stopwords, numbers, 1-character words

In [12]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = [sw for sw in stopwords.words('english') if sw not in ['not', 'no']]

In [13]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

data_words_nostops = remove_stopwords(data_words)

In [14]:
# Remove numbers, but not words that contain numbers
data_words_nostops = [[token for token in doc if not token.isnumeric()] for doc in data_words_nostops]

# Remove words that are only one character
data_words_nostops = [[token for token in doc if len(token) > 1] for doc in data_words_nostops]

In [15]:
print(data_words_nostops[:1])

[['actually', 'ordered', 'online', 'deliveroo', 'disappointing', 'ordered', 'chicken', 'cobb', 'salad', 'meant', 'come', 'poached', 'egg', 'exist', 'not', 'sure', 'cobb', 'means', 'thought', 'corn', 'called', 'restaurant', 'give', 'feedback', 'writing', 'review', 'talk', 'disappointing', 'check', 'order', 'ordering', 'online', 'husband', 'prawn', 'marinara', 'exactly', 'tiny', 'prawns', 'bothermore']]


#### Text Preprocessing - Lemmatization

A lemmatizer is preferred over a stemmer in this case because it produces more readable words. Output that is easy to read is very desirable in topic modelling.

In [16]:
# Lemmatize the documents
lemmatizer = WordNetLemmatizer()
data_lemmatized = [[lemmatizer.lemmatize(token) for token in doc] for doc in data_words_nostops]

In [17]:
print(data_lemmatized[:1])

[['actually', 'ordered', 'online', 'deliveroo', 'disappointing', 'ordered', 'chicken', 'cobb', 'salad', 'meant', 'come', 'poached', 'egg', 'exist', 'not', 'sure', 'cobb', 'mean', 'thought', 'corn', 'called', 'restaurant', 'give', 'feedback', 'writing', 'review', 'talk', 'disappointing', 'check', 'order', 'ordering', 'online', 'husband', 'prawn', 'marinara', 'exactly', 'tiny', 'prawn', 'bothermore']]


#### Create Bigrams and Trigrams

min_count and threshold. The higher the values of these param, the harder it is for words to be combined to bigrams.

In [18]:
# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=3, threshold=100, connector_words=ENGLISH_CONNECTOR_WORDS) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], threshold=100, connector_words=ENGLISH_CONNECTOR_WORDS)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [20]:
# Define functions for stopwords, bigrams, trigrams and lemmatization

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [21]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_lemmatized)

# Form Trigrams
data_words_trigrams = make_trigrams(data_lemmatized)

#### Build Model for Unigrams 

1. Create the Dictionary and Corpus needed for Topic Modeling

A dictionary is a mapping of word ids to words. To create our dictionary, we can create a built in gensim.corpora.Dictionary object. From there, the filter_extremes() method is essential in order to ensure that we get a desirable frequency and representation of tokens in our dictionary.

In [22]:
# Create Dictionary
id2word_unigram = corpora.Dictionary(data_lemmatized)
id2word_unigram.filter_extremes(no_below=5, no_above=0.6)
# filter out tokens that appear in less than 5 documents
# filter out tokens that appear in more than 60% of documents

In [23]:
print(id2word_unigram)

Dictionary(28734 unique tokens: ['actually', 'called', 'check', 'chicken', 'cobb']...)


In [24]:
# Create Corpus
texts_unigram = data_lemmatized

# Term Document Frequency
corpus_unigram = [id2word_unigram.doc2bow(text) for text in texts_unigram]

In [25]:
# produced corpus shown above is a mapping of (word_id, word_frequency).
print(corpus_unigram[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 2), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]]


In [26]:
# see what word a given id corresponds to, pass the id as a key to the dictionary.
id2word_unigram[0]

'actually'

In [27]:
# Human readable format of corpus (term-frequency)
[[(id2word_unigram[id], freq) for id, freq in cp] for cp in corpus_unigram[:1]]

[[('actually', 1),
  ('called', 1),
  ('check', 1),
  ('chicken', 1),
  ('cobb', 2),
  ('come', 1),
  ('corn', 1),
  ('deliveroo', 1),
  ('disappointing', 2),
  ('egg', 1),
  ('exactly', 1),
  ('exist', 1),
  ('feedback', 1),
  ('give', 1),
  ('husband', 1),
  ('marinara', 1),
  ('mean', 1),
  ('meant', 1),
  ('not', 1),
  ('online', 2),
  ('order', 1),
  ('ordered', 2),
  ('ordering', 1),
  ('poached', 1),
  ('prawn', 2),
  ('restaurant', 1),
  ('review', 1),
  ('salad', 1),
  ('sure', 1),
  ('talk', 1),
  ('thought', 1),
  ('tiny', 1),
  ('writing', 1)]]

Building the Topic Model

- alpha and eta are hyperparameters that affect sparsity of the topics. According to the Gensim docs, both defaults to 1.0/num_topics prior.
- chunksize is the number of documents to be used in each training chunk
- update_every determines how often the model parameters should be updated 
- passes is the total number of training passes
- per_word_topics, when set to True, computes a list of topics, sorted in descending order of most likely topics for each word, along with their phi values multiplied by the feature length (i.e. word count)

In [30]:
# Build LDA model
# 68 minutes
lda_model_unigram = gensim.models.ldamodel.LdaModel(corpus=corpus_unigram,
                                           id2word=id2word_unigram,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           eta = 'auto',
                                           per_word_topics=True)

In [31]:
# Save model to disk.
temp_file = datapath("unigram_base")
lda_model_unigram.save(temp_file)

In [None]:
# temp_file = datapath("unigram_base")
# lda_model = gensim.models.ldamodel.LdaModel.load(temp_file)

In [32]:
# Visualize the topics
plt.style.use('default') 
pyLDAvis.enable_notebook()
vis_unigram = pyLDAvis.gensim_models.prepare(lda_model_unigram, corpus_unigram, id2word_unigram)
vis_unigram

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [33]:
pyLDAvis.save_html(vis_unigram, 'LDAvis/unigram_base.html')

View the topics in LDA model

- each topic is a combination of keywords and each keyword contributes a certain weightage to the topic
- see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() 

In [34]:
top_topics = lda_model_unigram.top_topics(corpus_unigram) #, num_words=20)

In [35]:
top_topics

[([(0.1015753, 'restaurant'),
   (0.06430924, 'time'),
   (0.062226698, 'one'),
   (0.060567353, 'u'),
   (0.05024025, 'like'),
   (0.040033415, 'would'),
   (0.039817426, 'go'),
   (0.036449384, 'meal'),
   (0.0334548, 'also'),
   (0.027914979, 'first'),
   (0.021266697, 'never'),
   (0.020535976, 'visit'),
   (0.018607236, 'long'),
   (0.015646145, 'day'),
   (0.015466291, 'made'),
   (0.014660337, 'want'),
   (0.014287299, 'make'),
   (0.012997946, 'need'),
   (0.011982154, 'serving'),
   (0.011748431, 'area')],
  -2.4029608345405298),
 ([(0.18797716, 'not'),
   (0.06037248, 'order'),
   (0.055246647, 'ordered'),
   (0.04282611, 'chicken'),
   (0.036879595, 'taste'),
   (0.034448013, 'came'),
   (0.028682925, 'served'),
   (0.028224416, 'soup'),
   (0.027660945, 'try'),
   (0.021156019, 'could'),
   (0.019874252, 'sauce'),
   (0.018777903, 'still'),
   (0.018033447, 'got'),
   (0.017469406, 'set'),
   (0.015162581, 'say'),
   (0.014891025, 'took'),
   (0.014567696, 'bit'),
   (0.014

In [36]:
# Print the Keyword in the 20 topics
pprint(lda_model_unigram.print_topics())
doc_lda_unigram = lda_model_unigram[corpus_unigram]

[(0,
  '0.135*"ever" + 0.113*"side" + 0.095*"going" + 0.084*"full" + '
  '0.060*"selection" + 0.059*"probably" + 0.058*"wrong" + 0.053*"couple" + '
  '0.049*"flavour" + 0.049*"late"'),
 (1,
  '0.102*"restaurant" + 0.064*"time" + 0.062*"one" + 0.061*"u" + 0.050*"like" '
  '+ 0.040*"would" + 0.040*"go" + 0.036*"meal" + 0.033*"also" + 0.028*"first"'),
 (2,
  '0.204*"people" + 0.101*"cheap" + 0.088*"evening" + 0.088*"group" + '
  '0.065*"far" + 0.059*"saw" + 0.053*"trip" + 0.042*"low" + 0.040*"real" + '
  '0.037*"everyone"'),
 (3,
  '0.095*"table" + 0.088*"bad" + 0.062*"went" + 0.059*"average" + 0.058*"much" '
  '+ 0.038*"around" + 0.038*"money" + 0.036*"little" + 0.036*"review" + '
  '0.033*"wait"'),
 (4,
  '0.188*"not" + 0.060*"order" + 0.055*"ordered" + 0.043*"chicken" + '
  '0.037*"taste" + 0.034*"came" + 0.029*"served" + 0.028*"soup" + 0.028*"try" '
  '+ 0.021*"could"'),
 (5,
  '0.368*"price" + 0.068*"left" + 0.065*"see" + 0.062*"reasonable" + '
  '0.058*"offer" + 0.052*"hard" + 0.049

- Perplexity captures how surprised a model is of new data it has not seen before, and is measured as the normalized log-likelihood of a held-out test set.
- Coherence measures the degree of semantic similarity between high scoring words in the topic. The c_v measure is based on a sliding window, one-set segmentation of the top words and an indirect confirmation measure that uses normalized pointwise mutual information (NPMI) and cosine similarity.

In [37]:
# Compute Perplexity, lower is better
print('\nPerplexity: ', lda_model_unigram.log_perplexity(corpus_unigram))  # a measure of how good the model is. lower the better.

# Compute Coherence Score, higher is better
coherence_model_lda = CoherenceModel(model=lda_model_unigram, texts=data_lemmatized, dictionary=id2word_unigram, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -17.08423231239158

Coherence Score:  0.34292170748616524


#### Build Model for Bigrams 

In [38]:
# Create Dictionary
id2word_bigram = corpora.Dictionary(data_words_bigrams)
id2word_bigram.filter_extremes(no_below=5, no_above=0.6)
# filter out tokens that appear in less than 5 documents
# filter out tokens that appear in more than 60% of documents

In [39]:
print(id2word_bigram)

Dictionary(34813 unique tokens: ['actually', 'called', 'check', 'chicken', 'cobb']...)


In [40]:
# Create Corpus
texts_bigram = data_words_bigrams

# Term Document Frequency
corpus_bigram = [id2word_bigram.doc2bow(text) for text in texts_bigram]

In [41]:
# produced corpus is a mapping of (word_id, word_frequency).
print(corpus_bigram[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 2), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]]


In [42]:
# see what word a given id corresponds to, pass the id as a key to the dictionary.
id2word_bigram[0]

'actually'

In [43]:
# Human readable format of corpus (term-frequency)
[[(id2word_bigram[id], freq) for id, freq in cp] for cp in corpus_bigram[:1]]

[[('actually', 1),
  ('called', 1),
  ('check', 1),
  ('chicken', 1),
  ('cobb', 1),
  ('cobb_salad', 1),
  ('come', 1),
  ('corn', 1),
  ('deliveroo', 1),
  ('disappointing', 2),
  ('egg', 1),
  ('exactly', 1),
  ('exist', 1),
  ('feedback', 1),
  ('give', 1),
  ('husband', 1),
  ('marinara', 1),
  ('mean', 1),
  ('meant', 1),
  ('not', 1),
  ('online', 2),
  ('order', 1),
  ('ordered', 2),
  ('ordering', 1),
  ('poached', 1),
  ('prawn', 2),
  ('restaurant', 1),
  ('review', 1),
  ('sure', 1),
  ('talk', 1),
  ('thought', 1),
  ('tiny', 1),
  ('writing', 1)]]

In [44]:
# Build LDA model
# 80 minutes
lda_model_bigram = gensim.models.ldamodel.LdaModel(corpus=corpus_bigram,
                                           id2word=id2word_bigram,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           eta = 'auto',
                                           per_word_topics=True)

In [45]:
# Save model to disk.
temp_file = datapath("bigram_base")
lda_model_bigram.save(temp_file)

In [None]:
# temp_file = datapath("bigram_base")
# lda_model = gensim.models.ldamodel.LdaModel.load(temp_file)

In [46]:
# Visualize the topics
plt.style.use('default') 
pyLDAvis.enable_notebook()
vis_bigram = pyLDAvis.gensim_models.prepare(lda_model_bigram, corpus_bigram, id2word_bigram)
vis_bigram

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [47]:
pyLDAvis.save_html(vis_bigram, 'LDAvis/bigram_base.html')

In [48]:
top_topics_bigram = lda_model_bigram.top_topics(corpus_bigram) #, num_words=20)

In [49]:
top_topics_bigram

[([(0.11812233, 'restaurant'),
   (0.06459214, 'ordered'),
   (0.06201005, 'dish'),
   (0.04440356, 'even'),
   (0.042387176, 'meal'),
   (0.04027524, 'came'),
   (0.038904756, 'also'),
   (0.03353496, 'served'),
   (0.03055448, 'much'),
   (0.024734799, 'could'),
   (0.024290947, 'never'),
   (0.023540366, 'portion'),
   (0.021402193, 'many'),
   (0.02108403, 'got'),
   (0.019272551, 'take'),
   (0.019251583, 'though'),
   (0.019078316, 'tasty'),
   (0.017032001, 'bit'),
   (0.01661207, 'make'),
   (0.016566169, 'last')],
  -2.392951336924859),
 ([(0.12743169, 'service'),
   (0.12522742, 'good'),
   (0.0928024, 'place'),
   (0.06917484, 'time'),
   (0.068303026, 'staff'),
   (0.04616416, 'great'),
   (0.04514008, 'nice'),
   (0.036117908, 'lunch'),
   (0.030568203, 'well'),
   (0.029810542, 'quality'),
   (0.02208974, 'visit'),
   (0.021998575, 'friend'),
   (0.020112664, 'worth'),
   (0.019014653, 'friendly'),
   (0.017053466, 'family'),
   (0.01647806, 'outlet'),
   (0.016437547, 'd

In [50]:
# Print the Keyword in the 20 topics
pprint(lda_model_bigram.print_topics())
doc_lda_bigram = lda_model_bigram[corpus_bigram]

[(0,
  '0.333*"taste" + 0.108*"know" + 0.062*"felt" + 0.057*"authentic" + '
  '0.051*"simple" + 0.041*"deal" + 0.038*"disappointment" + 0.034*"live" + '
  '0.033*"door" + 0.031*"real"'),
 (1,
  '0.118*"restaurant" + 0.065*"ordered" + 0.062*"dish" + 0.044*"even" + '
  '0.042*"meal" + 0.040*"came" + 0.039*"also" + 0.034*"served" + 0.031*"much" '
  '+ 0.025*"could"'),
 (2,
  '0.168*"table" + 0.153*"drink" + 0.106*"better" + 0.095*"eat" + '
  '0.074*"still" + 0.044*"arrived" + 0.042*"next" + 0.033*"something" + '
  '0.033*"outside" + 0.031*"busy"'),
 (3,
  '0.169*"overall" + 0.167*"choice" + 0.130*"variety" + 0.105*"gave" + '
  '0.063*"twice" + 0.048*"everyone" + 0.039*"tomato" + 0.032*"filling" + '
  '0.031*"wide" + 0.031*"tasting"'),
 (4,
  '0.335*"fresh" + 0.211*"tasted" + 0.186*"without" + 0.094*"ingredient" + '
  '0.089*"use" + 0.067*"guess" + 0.000*"crab" + 0.000*"rice" + 0.000*"fish" + '
  '0.000*"seafood"'),
 (5,
  '0.058*"sauce" + 0.045*"tried" + 0.040*"side" + 0.038*"thing" + 0.0

In [51]:
# Compute Perplexity, lower is better
print('\nPerplexity: ', lda_model_bigram.log_perplexity(corpus_bigram))  # a measure of how good the model is. lower the better.


Perplexity:  -17.017742832399524

Coherence Score:  0.35892959629033727


In [52]:
# Compute Coherence Score, higher is better
coherence_model_lda_bigram = CoherenceModel(model=lda_model_bigram, texts=data_words_bigrams, dictionary=id2word_bigram, coherence='c_v')
coherence_lda_bigram = coherence_model_lda_bigram.get_coherence()
print('\nCoherence Score: ', coherence_lda_bigram)


Coherence Score:  0.35892959629033727


#### Build Model for Trigrams 

In [53]:
# Create Dictionary
id2word_trigram = corpora.Dictionary(data_words_trigrams)
id2word_trigram.filter_extremes(no_below=5, no_above=0.6)
# filter out tokens that appear in less than 5 documents
# filter out tokens that appear in more than 60% of documents

In [54]:
print(id2word_trigram)

Dictionary(36312 unique tokens: ['actually', 'called', 'check', 'chicken', 'cobb']...)


In [55]:
# Create Corpus
texts_trigram = data_words_trigrams

# Term Document Frequency
corpus_trigram = [id2word_trigram.doc2bow(text) for text in texts_trigram]

In [56]:
# produced corpus shown above is a mapping of (word_id, word_frequency).
print(corpus_trigram[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 2), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]]


In [57]:
# see what word a given id corresponds to, pass the id as a key to the dictionary.
id2word_trigram[0]

'actually'

In [58]:
# Human readable format of corpus (term-frequency)
[[(id2word_trigram[id], freq) for id, freq in cp] for cp in corpus_trigram[:1]]

[[('actually', 1),
  ('called', 1),
  ('check', 1),
  ('chicken', 1),
  ('cobb', 1),
  ('cobb_salad', 1),
  ('come', 1),
  ('corn', 1),
  ('deliveroo', 1),
  ('disappointing', 2),
  ('exactly', 1),
  ('exist', 1),
  ('feedback', 1),
  ('give', 1),
  ('husband', 1),
  ('marinara', 1),
  ('mean', 1),
  ('meant', 1),
  ('not', 1),
  ('online', 2),
  ('order', 1),
  ('ordered', 2),
  ('ordering', 1),
  ('poached_egg', 1),
  ('prawn', 2),
  ('restaurant', 1),
  ('review', 1),
  ('sure', 1),
  ('talk', 1),
  ('thought', 1),
  ('tiny', 1),
  ('writing', 1)]]

In [59]:
# Build LDA model
# 85 minutes
lda_model_trigram = gensim.models.ldamodel.LdaModel(corpus=corpus_trigram,
                                           id2word=id2word_trigram,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           eta = 'auto',
                                           per_word_topics=True)

In [60]:
# Save model to disk.
temp_file = datapath("trigram_base")
lda_model_trigram.save(temp_file)

In [None]:
# temp_file = datapath("trigram_base")
# lda_model = gensim.models.ldamodel.LdaModel.load(temp_file)

In [61]:
# Visualize the topics
plt.style.use('default') 
pyLDAvis.enable_notebook()
vis_trigram = pyLDAvis.gensim_models.prepare(lda_model_trigram, corpus_trigram, id2word_trigram)
vis_trigram

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [62]:
pyLDAvis.save_html(vis_trigram, 'LDAvis/trigram_base.html')

In [63]:
top_topics_trigram = lda_model_trigram.top_topics(corpus_trigram) #, num_words=20)

In [64]:
top_topics_trigram

[([(0.09977928, 'order'),
   (0.09155373, 'ordered'),
   (0.0878939, 'dish'),
   (0.07094258, 'chicken'),
   (0.043655638, 'soup'),
   (0.042824958, 'small'),
   (0.035059407, 'could'),
   (0.028950036, 'set'),
   (0.026055796, 'ever'),
   (0.02510668, 'say'),
   (0.024000563, 'beef'),
   (0.023983572, 'took'),
   (0.02383643, 'tried'),
   (0.020878697, 'cold'),
   (0.019983828, 'main'),
   (0.019750407, 'serving'),
   (0.0190879, 'tasted'),
   (0.018282158, 'find'),
   (0.018056344, 'dessert'),
   (0.01599034, 'full')],
  -2.601163408589241),
 ([(0.10100777, 'time'),
   (0.09513055, 'u'),
   (0.067352675, 'table'),
   (0.061629668, 'drink'),
   (0.0599729, 'even'),
   (0.052546, 'also'),
   (0.051848304, 'get'),
   (0.045293394, 'served'),
   (0.043844867, 'first'),
   (0.028322458, 'long'),
   (0.026030174, 'take'),
   (0.0260018, 'though'),
   (0.023941189, 'outlet'),
   (0.022374833, 'last'),
   (0.020415338, 'need'),
   (0.017804591, 'ask'),
   (0.017644739, 'arrived'),
   (0.0174

In [65]:
# Print the Keyword in the 20 topics
pprint(lda_model_trigram.print_topics())
doc_lda_trigram = lda_model_trigram[corpus_trigram]

[(0,
  '0.348*"dinner" + 0.086*"new" + 0.086*"cake" + 0.062*"recommended" + '
  '0.055*"cuisine" + 0.049*"pleasant" + 0.039*"birthday" + 0.033*"environment" '
  '+ 0.033*"payment" + 0.031*"making"'),
 (1,
  '0.442*"not" + 0.094*"bad" + 0.062*"average" + 0.061*"much" + 0.050*"ok" + '
  '0.040*"around" + 0.038*"review" + 0.034*"want" + 0.027*"area" + '
  '0.026*"pizza"'),
 (2,
  '0.207*"great" + 0.149*"back" + 0.136*"went" + 0.120*"come" + 0.099*"friend" '
  '+ 0.077*"family" + 0.049*"view" + 0.029*"enjoy" + 0.026*"yummy" + '
  '0.024*"lovely"'),
 (3,
  '0.370*"lunch" + 0.230*"never" + 0.114*"excellent" + 0.106*"disappointed" + '
  '0.072*"delivery" + 0.034*"serf" + 0.029*"setting" + 0.017*"bacon" + '
  '0.015*"thanks" + 0.002*"personnel"'),
 (4,
  '0.238*"one" + 0.160*"singapore" + 0.152*"go" + 0.069*"best" + 0.064*"beer" '
  '+ 0.057*"way" + 0.054*"location" + 0.049*"pay" + 0.041*"eating" + '
  '0.041*"enough"'),
 (5,
  '0.192*"service" + 0.189*"good" + 0.165*"restaurant" + 0.059*"meal

In [66]:
# Compute Perplexity, lower is better
print('\nPerplexity: ', lda_model_trigram.log_perplexity(corpus_trigram))  # a measure of how good the model is. lower the better.

# Compute Coherence Score, higher is better
coherence_model_lda_trigram = CoherenceModel(model=lda_model_trigram, texts=data_words_trigrams, dictionary=id2word_trigram, coherence='c_v')
coherence_lda_trigram = coherence_model_lda_trigram.get_coherence()
print('\nCoherence Score: ', coherence_lda_trigram)


Perplexity:  -17.02182282593306

Coherence Score:  0.3570824672361784


In [69]:
# Compute Coherence Score, higher is better
coherence_model_lda_trigram = CoherenceModel(model=lda_model_trigram, texts=data_words_trigrams, dictionary=id2word_trigram, coherence='c_v')
coherence_lda_trigram = coherence_model_lda_trigram.get_coherence()
print('\nCoherence Score: ', coherence_lda_trigram)


Coherence Score:  0.3570824672361784


#### Hyperparameter Tuning 

In [3]:
if not os.path.exists('Tuning'): 
    os.mkdir('Tuning')
    
def save_file(path, data):
    output = open(path, 'wb')
    pickle.dump(data, output)
    output.close()

def load_file(path):
    pkl_file = open(path, 'rb')
    data = pickle.load(pkl_file)
    pkl_file.close()
    return data

In [72]:
def evaluate_coherence(lda_model, data_words_lda, id2word):
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_lda, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    return coherence_lda

Helper function to perform Bayesian hyperparameter optimisation

In [73]:
def hyperopt(param_space, num_eval, classifier, data_words_lda, id2word):  
    start = time()
    def objective_function(params):
        clf = classifier(**params, random_state=1)
        score = evaluate_coherence(clf, data_words_lda, id2word) 
        return {'loss': -score, 'status': STATUS_OK}
        
    trials = Trials()
    best_param = fmin(objective_function, param_space, algo=tpe.suggest, max_evals=num_eval, 
                      trials=trials, rstate= np.random.RandomState(1))
    loss = [x['result']['loss'] for x in trials.trials] 
    best_param_values = best_param
    print(loss)
    return trials, best_param_values

In [75]:
LDA_param_hyperopt = {
    'corpus': corpus_bigram,
    'id2word': id2word_bigram,
    'num_topics': scope.int(hp.quniform('num_topics', 10, 30, 5)), 
    'random_state': 100,
    'update_every': 1,
    'chunksize': scope.int(hp.quniform('chunksize', 80, 130, 10)), 
    'passes': scope.int(hp.quniform('passes', 5, 20, 1)),
    'alpha': hp.choice('alpha', ['auto','symmetric', 'asymmetric']),
    'eta': hp.choice('eta', ['auto','symmetric']),
    'decay': hp.uniform('learning_rate', 0.5, 1),
    'per_word_topics': True,
}

In [None]:
load = False 
num_eval = 3

if(load == False):
    LDA_hyperopt_bigram_v0 = hyperopt(LDA_param_hyperopt,  num_eval, LdaModel,  data_words_bigrams, id2word_bigram)
    save_file('LDA_hyperopt_bigram_v0.pkl', LDA_hyperopt_bigram_v0)
else:
    try:
        LDA_hyperopt_bigram_v1 = load_file('LDA_hyperopt_bigram_v0.pkl')
    except Exception as e:
        print(e)

In [None]:
chosen = LDA_hyperopt_bigram_v0[1]
chosen

Replacing emojis with words

In [None]:
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

In [None]:
# Function for converting emojis into word
def convert_emojis(text):
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
    return text

In [None]:
# takes 11 minutes LOL
replace_emoji_words = [convert_emojis(i) for i in data]

In [None]:
print(replace_emoji_words[:1])