In [1]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import Label
from urllib.request import urlopen
from urllib.error import HTTPError
import pickle
import requests
import datetime as dt
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import wordnet
import numpy as np
import gzip
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
from gensim.models import word2vec
import tqdm

# Reading in necessary data

In [2]:
fp=gzip.open(urlopen('https://github.com/Galeforse/DST-Assessment-03/raw/master/Data/main/corpus_10.pkl.gz'),'rb')
print("Reading corpus from pickle")
corpus=pickle.load(fp)
fp.close()
dictionary=pickle.load(urlopen('https://github.com/Galeforse/DST-Assessment-03/raw/master/Data/main/dictionary_10.pkl'))
print("Reading dictionary from pickle")

Reading corpus from pickle
Reading dictionary from pickle


In [3]:
print(dictionary)

Dictionary(350 unique tokens: ['failed', 'password', 'port', 'authentication', 'disconnecting']...)


We have used the pickle files from our data processing earlier to cut down on run time as the preprocessing stage providing some lengthy run times for some.

# Training LDA model

Now we have our documents in the form of our corpus and all its tokenised entries in our dictionary we are simply ready to train our LDA model using the Gensim package which allows for easy visualisation later on.

In [4]:
#lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=100, update_every=1, chunksize=10, passes=10, alpha='symmetric', iterations=100, per_word_topics=True)
#print(lda_model.print_topics())

In [6]:
global lda_model
try:
    print("Attempting to read model from pickle")
    fp=gzip.open(urlopen('https://github.com/Galeforse/DST-Assessment-03/raw/master/Data/main/lda_model_10.pkl.gz'),'rb')
    lda_model=pickle.load(fp)
    fp.close()
    print("Model read from pickle.")
except HTTPError as err:
    if err.code == 404:
        print("Pickle not found, creating model and saving to pickle")
        start=dt.datetime.now()
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=8, random_state=100, update_every=1, chunksize=20, passes=10, alpha='symmetric', iterations=80, per_word_topics=True)
        fp=gzip.open('G:/Users/Gabriel/Documents/Education/UoB/GitHubDesktop/DST-Assessment-03/data/main/lda_model_10.pkl.gz','wb')
        pickle.dump(lda_model,fp)
        fp.close()
        print("Pickle saved. Time taken: " + str(dt.datetime.now()-start))
    else:
        raise
print(lda_model.print_topics())

Attempting to read model from pickle
Pickle not found, creating model and saving to pickle
Pickle saved. Time taken: 0:48:41.586213
[(0, '0.619*"invalid" + 0.308*"request" + 0.023*"userauth" + 0.023*"input" + 0.007*"identification" + 0.007*"receive" + 0.007*"string" + 0.001*"account" + 0.001*"subsystem" + 0.001*"sftp"'), (1, '0.518*"closed" + 0.442*"connection" + 0.008*"fatal" + 0.008*"peer" + 0.008*"reset" + 0.008*"socket" + 0.008*"read" + 0.000*"allowed" + 0.000*"allowusers" + 0.000*"listed"'), (2, '0.382*"error" + 0.382*"getting" + 0.059*"disconnecting" + 0.059*"authentication" + 0.059*"failure" + 0.059*"many" + 0.000*"allowusers" + 0.000*"listed" + 0.000*"allowed" + 0.000*"greendata"'), (3, '0.500*"auth" + 0.334*"winbind" + 0.167*"wbclogonuser" + 0.000*"listed" + 0.000*"allowed" + 0.000*"allowusers" + 0.000*"zeilstraj" + 0.000*"greendata" + 0.000*"error" + 0.000*"oregon"'), (4, '0.300*"allowed" + 0.300*"allowusers" + 0.300*"listed" + 0.014*"static" + 0.014*"comcastbusiness" + 0.012

# Visualisation

The gensim package contains a wonderful visualisation package which allows us to see what each indivdual topic is made up of and how they interact with other topics.

In [5]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [6]:

try:
    print("Attempting to read model display from pickle")
    fp=gzip.open(urlopen('https://github.com/Galeforse/DST-Assessment-03/raw/master/Data/main/lda_display.pkl.gz'),'rb')
    lda_display=pickle.load(fp)
    fp.close()
    print("Model display read from pickle.")
except HTTPError as err:
    if err.code == 404:
        print("Pickle not found, creating model display and saving to pickle")
        start=dt.datetime.now()
        lda_display=pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds='mmds')global lda_model
        fp=gzip.open('G:/Users/Gabriel/Documents/Education/UoB/GitHubDesktop/DST-Assessment-03/data/main/lda_model_10.pkl.gz','wb')
        pickle.dump(lda_display,fp)
        fp.close()
        print("Pickle saved. Time taken: " + str(dt.datetime.now()-start))
    else:
        raise
print(lda_model.print_topics())

  and should_run_async(code)


In [7]:
pyLDAvis.display(lda_display, template_type='notebook')

  and should_run_async(code)


In [None]:
pyLDAvis.save_html(lda_display,"G:/Users/Gabriel/Documents/Education/UoB/GitHubDesktop/DST-Assessment-03/Visulisations/lda_display.html")

# Perplexity and Coherence

Perplexity is the measure of how well a given model predicts a sample and in terms of LDA, we estimate the LDA model. After this we compare the theoritcal word distribution to the distribution of the words in our set of documents and the lower the perplexity score the better. Worth noting however is that perplexity and human judgment have been shown to have little correlation and sometimes anti-correlation implying that even if we obtain a good perplexity score, often we might not understand fully why!

The Coherence of a topic is based off the hypothesis that states that words with similar meaning tend to co-occur within a similar context. Also worth noting here is our measure of Coherence. The Coherence score here is calculated using a measure called C_umass which uses logritmic conditional probability so we can expect some differnces based on this. (The way we have found to calculate Coherence later on is called C_v and is based on a sliding window which is a one-set segmentation of the top words and a confirmation measure which uses Normalised Pointwise Mutual Information (NPMI) and the cosine similarity.)

In [8]:
def getCoherence(m,c,d):
    coherence_lda_model = CoherenceModel(model=m, corpus=c, dictionary=d, coherence='u_mass')
    coherence_lda = coherence_lda_model.get_coherence()
    return(coherence_lda)

  and should_run_async(code)


In [12]:
coherence = {'Coherence': getCoherence(lda_model, corpus, dictionary)}
coherence

  and should_run_async(code)


{'Coherence': -15.280204491716}

In [13]:
log_perplexitites = {lda_model.log_perplexity(corpus)}
log_perplexitites

  and should_run_async(code)


{-7.282650114756305}

# Hyper Parameter Tuning

Here we take a look at some hyperparameter tuning. First we differentiate between model hyperparameters and model parameters. Model hyperparameters for machine learning are effectively the 'settings' for the algoritthm which we control before training the model, for instance, the number of trees in a random forest or in this case, the number of topics. Model parameters are thought of as what the model itself learns during the training process such as the weights of word in a given topic.

Here we have introduced two hyperparameters Alpha and Beta, which are the Document-Topic density and Word-Topic density respectively. LDA distributions can be either symmetric or asymmetric with the former being generally more common. In a symmetric distribution, a higher alpha score implies the documents are made up of more topics while a high beta score implies that topics are made up of most of the words in our corpus. In a asymmetric distribution, a high alpha results in a more specific topic distribution per document and likewise, higher beta shows a more speicific word distribution per topic.


In this code below we aim to create a chart outlining our Coherence score, C_v, which in turn would allow us the determine the optimal number of topics with some fixed alpha and beta and we iterate over the range of topics and different alpha and beta parameters.

In [11]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=8, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=dictionary, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

  and should_run_async(code)


In [10]:
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 10
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# get the coherence score for the given parameters
cv = compute_coherence_values(corpus=corpus, dictionary=dictionary, 
                                                  k=k, a=a, b=b)
cv       

  and should_run_async(code)


NameError: name 'compute_coherence_values' is not defined

Unfortunately, I have been unable to get this portion of code to run after many failed attempts of trouble-shooting. If you follow the URL in the refernces section, you can find what I hoped this chart would of appeared like and from this we could have determined the optimal number of topics and other parameters to ensure that we were getting the best results for our data. 

# References

https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

