In [19]:
#import the above packages and libraries for working with files and the file system
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path

In [20]:
#assign to the "directory" variable the path to the directory containing our documents
directory = "Corpus_selected_texts/all"

#use `glob.gob()` function to make a list of all the `.txt` files in that directory.
files = glob.glob(f"{directory}/*.txt")

In [21]:
files

['Corpus_selected_texts/all\\1857_browne-grannys-wonderful-chair.txt',
 'Corpus_selected_texts/all\\1857_hughes-tom-browns-school-days.txt',
 'Corpus_selected_texts/all\\1865_carroll-alices-adventures-in-wonderland.txt',
 'Corpus_selected_texts/all\\1869_dickens-david-copperfield.txt',
 'Corpus_selected_texts/all\\1869_ewing-mrs-overtheways-remembrances.txt',
 'Corpus_selected_texts/all\\1871_macdonald-at-the-back-of-the-north-wind.txt',
 'Corpus_selected_texts/all\\1872_de-la-ramee-a-dog-of-flanders.txt',
 'Corpus_selected_texts/all\\1876_twain-the-adventures-of-tom-sawyer.txt',
 'Corpus_selected_texts/all\\1877_molesworth-the-cuckoo-clock.txt',
 'Corpus_selected_texts/all\\1877_sewell-black-beauty.txt',
 'Corpus_selected_texts/all\\1883_stevenson-treasure-island.txt',
 'Corpus_selected_texts/all\\1886_hodgson-burnett-little-lord-fauntleroy.txt',
 'Corpus_selected_texts/all\\1888_wilde-the-happy-prince-and-other-tales.txt',
 'Corpus_selected_texts/all\\1894_kipling-the-jungle-book.txt

In [22]:
# Load the regular expression library
import re

#initialize training_data as empty
training_data = []
texts_as_string_lists = []

#process each file and add it to the training_data list
for file in files:
    text = open(file, encoding='utf-8').read()
    processed_text = little_mallet_wrapper.process_string(text, remove_stop_words=False, numbers='remove')
    training_data.append(processed_text)
    list_of_strings = processed_text.split() #create a list from all the words in a book
    texts_as_string_lists.append(list_of_strings) #append the list of strings for the book in texts_as_string_lists
    


In [23]:
#sample: the first 24 words of the first book in the list
texts_as_string_lists[0][:24]

['granny',
 'wonderful',
 'chair',
 'chapter',
 'introductory',
 'old',
 'time',
 'long',
 'ago',
 'when',
 'the',
 'fairies',
 'were',
 'the',
 'world',
 'there',
 'lived',
 'little',
 'girl',
 'very',
 'fair',
 'and',
 'pleasant',
 'look']

** **
#### Step 3: Phrase Modeling: Bigram and Trigram Models
** **

Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring. Some examples in our example are: 'back_bumper', 'oil_leakage', 'maryland_college_park' etc.

Gensim's Phrases model can build and implement the bigrams, trigrams, quadgrams and more. The two important arguments to Phrases are min_count and threshold.

In [24]:
import gensim

# Build the bigram and trigram models
bigram = gensim.models.Phrases(texts_as_string_lists, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[texts_as_string_lists], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

#### Remove Stopwords, Make Bigrams and Lemmatize

The phrase models are ready. Let’s define the functions to remove the stopwords, make trigrams and lemmatization and call them sequentially.

In [25]:
# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

#>>>> IMPORTANTE: QUANDO HAI I RISULTATI DEFINITIVI AGGIUNGI I NOMI DEI PERSONAGGI ALLA LISTA DELLE STOPWORDS!

stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'one', 'said']) # i added 'one' and 'said', as they're expected to be equally likely on all the novels and yet non particularly significant

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\media\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
import nltk
import spacy

# Define functions for stopwords, bigrams, trigrams and lemmatization

def remove_stopwords(texts_as_string_lists):
    return [[word for word in texts_as_string_lists if word not in stop_words] for text in texts_as_string_lists]



def make_bigrams(texts_as_string_lists):
    return [bigram_mod[doc] for doc in texts_as_string_lists]

def make_trigrams(texts_as_string_lists):
    return [trigram_mod[bigram_mod[doc]] for doc in texts_as_string_lists]

def lemmatization(texts_as_string_lists, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts_as_string_lists:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

Let's call the functions in order.

In [27]:
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     -------------------------------------- 13.9/13.9 MB 248.1 kB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
from unittest.util import _MAX_LENGTH
import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(texts_as_string_lists)

# Form Bigrams
data_words_bigrams = make_bigrams(texts_as_string_lists)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv

#First, set nlp.max_length to a value higher than the default one and the length of the text you have to process, otherwise the following error 
#will be returned:
"""
ValueError: [E088] Text of length 1571315 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory 
per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER,
it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs 
are too long by checking `len(text)`.
"""

nlp.max_length = 1600000 #sets nlp.max_length, in AppData\Roaming\Python\Python39\site-packages\spacy\language.py , to a value high enough to process the data. Change this value accordingly to the size of your data

data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1][0][:30])

['granny', 'wonderful', 'chair', 'chapter', 'introductory', 'old', 'time', 'long', 'ago', 'fairy', 'world', 'there', 'live', 'little', 'girl', 'very', 'fair', 'pleasant', 'look', 'call', 'snowflower', 'girl', 'good', 'pretty', 'one', 'ever', 'see', 'frown', 'hear', 'say']


** **
#### Step 4: Data transformation: Corpus and Dictionary
** **

The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them.

In [29]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 2), (2, 3), (3, 6), (4, 1), (5, 1), (6, 4), (7, 4), (8, 1), (9, 24), (10, 4), (11, 6), (12, 1), (13, 2), (14, 2), (15, 5), (16, 2), (17, 2), (18, 5), (19, 3), (20, 1), (21, 10), (22, 22), (23, 3), (24, 1), (25, 1), (26, 10), (27, 14), (28, 1), (29, 1)]


** **
#### Step 5: Base Model 
** **

We have everything required to train the base LDA model. In addition to the corpus and dictionary, you need to provide the number of topics as well. Apart from that, alpha and eta are hyperparameters that affect sparsity of the topics. According to the Gensim docs, both defaults to 1.0/num_topics prior (we'll use default for the base model).

chunksize controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory.

passes controls how often we train the model on the entire corpus (set to 10). Another word for passes might be "epochs". iterations is somewhat technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of "passes" and "iterations" high enough.

In [30]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

** **
The above LDA model is built with 10 different topics where each topic is a combination of keywords and each keyword contributes a certain weightage to the topic.

You can see the keywords for each topic and the weightage(importance) of each keyword using `lda_model.print_topics()`

In [31]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.024*"say" + 0.008*"know" + 0.008*"come" + 0.008*"see" + 0.008*"go" + '
  '0.007*"look" + 0.007*"make" + 0.007*"little" + 0.007*"think" + '
  '0.006*"very"'),
 (1,
  '0.002*"say" + 0.001*"come" + 0.001*"see" + 0.001*"look" + 0.001*"go" + '
  '0.001*"little" + 0.001*"then" + 0.001*"very" + 0.001*"know" + 0.001*"make"'),
 (2,
  '0.015*"say" + 0.008*"old" + 0.008*"boy" + 0.008*"little" + 0.008*"come" + '
  '0.007*"see" + 0.007*"look" + 0.007*"very" + 0.006*"know" + 0.006*"go"'),
 (3,
  '0.002*"say" + 0.001*"see" + 0.001*"come" + 0.001*"know" + 0.001*"very" + '
  '0.001*"little" + 0.001*"look" + 0.001*"think" + 0.001*"time" + 0.001*"go"'),
 (4,
  '0.030*"say" + 0.012*"come" + 0.011*"very" + 0.010*"see" + 0.010*"then" + '
  '0.009*"know" + 0.009*"little" + 0.009*"get" + 0.009*"go" + 0.009*"think"'),
 (5,
  '0.020*"say" + 0.010*"come" + 0.009*"man" + 0.009*"little" + 0.008*"mowgli" '
  '+ 0.007*"then" + 0.007*"know" + 0.007*"see" + 0.006*"go" + 0.006*"head"'),
 (6,
  '0.002*"say" + 

#### Compute Model Perplexity and Coherence Score

Let's calculate the baseline coherence score

In [32]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.2634267588518763


** **
#### Step 6: Hyperparameter tuning
** **
First, let's differentiate between model hyperparameters and model parameters :

- `Model hyperparameters` can be thought of as settings for a machine learning algorithm that are tuned by the data scientist before training. Examples would be the number of trees in the random forest, or in our case, number of topics K

- `Model parameters` can be thought of as what the model learns during training, such as the weights for each word in a given topic.

Now that we have the baseline coherence score for the default LDA model, let's perform a series of sensitivity tests to help determine the following model hyperparameters: 
- Number of Topics (K)
- Dirichlet hyperparameter alpha: Document-Topic Density
- Dirichlet hyperparameter beta: Word-Topic Density

We'll perform these tests in sequence, one parameter at a time by keeping others constant and run them over the two difference validation corpus sets. We'll use `C_v` as our choice of metric for performance comparison 

In [33]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

Let's call the function, and iterate it over the range of topics, alpha, and beta parameter values

In [None]:
import numpy as np
import tqdm
import pandas as pd

In [36]:
import tqdm 

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
    pbar.close()

 41%|████      | 219/540 [4:29:55<6:35:38, 73.95s/it]  
100%|██████████| 540/540 [5:46:14<00:00, 27.79s/it]    

FileNotFoundError: [Errno 2] No such file or directory: './results/lda_tuning_results.csv'

In [37]:
#specify where you want to store the output of the function above (best to choose a new destination at each run, so as not to overwrite the results!!)
#you should create the destination folder and file BEFOREHAND, otherwise an error will be prompted

pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)

** **
#### Step 7: Final Model
** **

Based on external evaluation (Code to be added from Excel based analysis), let's train the final model with parameters yielding highest coherence score

In [41]:
num_topics = 6

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.01,
                                           eta=0.61)

In [42]:
from pprint import pprint

# Print the Keyword in the 8 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.025*"say" + 0.008*"come" + 0.008*"little" + 0.008*"very" + 0.008*"go" + '
  '0.008*"see" + 0.008*"then" + 0.007*"think" + 0.007*"know" + 0.007*"get"'),
 (1,
  '0.020*"say" + 0.015*"diamond" + 0.011*"come" + 0.010*"see" + 0.009*"know" + '
  '0.009*"get" + 0.009*"very" + 0.008*"horse" + 0.008*"good" + 0.008*"go"'),
 (2,
  '0.014*"boy" + 0.012*"school" + 0.005*"get" + 0.004*"east" + 0.003*"tom" + '
  '0.003*"old" + 0.003*"master" + 0.003*"fight" + 0.003*"then" + '
  '0.003*"other"'),
 (3,
  '0.014*"say" + 0.007*"come" + 0.007*"man" + 0.007*"little" + 0.006*"mowgli" '
  '+ 0.005*"then" + 0.005*"know" + 0.005*"see" + 0.005*"go" + 0.005*"head"'),
 (4,
  '0.021*"say" + 0.008*"come" + 0.008*"see" + 0.007*"know" + 0.007*"go" + '
  '0.007*"look" + 0.007*"little" + 0.007*"make" + 0.006*"very" + '
  '0.006*"think"'),
 (5,
  '0.001*"say" + 0.001*"come" + 0.001*"see" + 0.001*"little" + 0.000*"know" + '
  '0.000*"think" + 0.000*"look" + 0.000*"go" + 0.000*"very" + 0.000*"make"')]


** **
#### Step 8: Visualize Results
** **

In [66]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
# Importing modules
import pandas as pd
import os

os.chdir('..')

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_data_filepath = os.path.join('./results/ldavis_tuned_'+str(num_topics))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_tuned_'+ str(num_topics) +'.html')

LDAvis_prepared
"""

"\n# # this is a bit time consuming - make the if statement True\n# # if you want to execute visualization prep yourself\nif 1 == 1:\n    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)\n    with open(LDAvis_data_filepath, 'wb') as f:\n        pickle.dump(LDAvis_prepared, f)\n\n# load the pre-prepared pyLDAvis data from disk\nwith open(LDAvis_data_filepath, 'rb') as f:\n    LDAvis_prepared = pickle.load(f)\n\npyLDAvis.save_html(LDAvis_prepared, './results/ldavis_tuned_'+ str(num_topics) +'.html')\n\nLDAvis_prepared\n"

In [63]:
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_tuned_'+ str(num_topics) +'.html')

FileNotFoundError: [Errno 2] No such file or directory: './results/ldavis_tuned_6.html'

** **
#### Closing Notes

We started with understanding why evaluating the topic model is essential. Next, we reviewed existing methods and scratched the surface of topic coherence, along with the available coherence measures. Then we built a default LDA model using Gensim implementation to establish the baseline coherence score and reviewed practical ways to optimize the LDA hyperparameters.

Hopefully, this article has managed to shed light on the underlying topic evaluation strategies, and intuitions behind it.

** **
#### References:
1. http://qpleple.com/perplexity-to-evaluate-topic-models/
2. https://www.amazon.com/Machine-Learning-Probabilistic-Perspective-Computation/dp/0262018020
3. https://papers.nips.cc/paper/3700-reading-tea-leaves-how-humans-interpret-topic-models.pdf
4. https://github.com/mattilyra/pydataberlin-2017/blob/master/notebook/EvaluatingUnsupervisedModels.ipynb
5. https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
6. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
7. http://palmetto.aksw.org/palmetto-webapp/