In [1]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import Label
from urllib.request import urlopen
from urllib.error import HTTPError
import pickle
import requests
import datetime as dt
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import wordnet
import numpy as np
import gzip
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
from gensim.models import word2vec
import tqdm

In [2]:
corpus=pickle.load(urlopen('https://github.com/Galeforse/DST-Assessment-03/raw/master/Data/main/corpus_5.pkl'))
dictionary=pickle.load(urlopen('https://github.com/Galeforse/DST-Assessment-03/raw/master/Data/main/dictionary_5.pkl'))
print("Reading dictionary from pickle")

Reading dictionary from pickle


In [3]:
print(dictionary)

Dictionary(301 unique tokens: ['Failed', 'password', 'port', 'Disconnecting', 'authentication']...)


In [4]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=100, update_every=1, chunksize=10, passes=10, alpha='symmetric', iterations=100, per_word_topics=True)
print(lda_model.print_topics())

[(0, '0.781*"request" + 0.117*"invalid" + 0.051*"input" + 0.051*"userauth" + 0.000*"list" + 0.000*"User" + 0.000*"allow" + 0.000*"AllowUsers" + 0.000*"subsystem" + 0.000*"sftp"'), (1, '0.607*"session" + 0.279*"unix" + 0.110*"open" + 0.000*"list" + 0.000*"AllowUsers" + 0.000*"User" + 0.000*"allow" + 0.000*"ppsuc" + 0.000*"mail" + 0.000*"greendata"'), (2, '0.538*"preauth" + 0.182*"disconnect" + 0.174*"Received" + 0.106*"Connection" + 0.000*"Shutdown" + 0.000*"Normal" + 0.000*"Goodbye" + 0.000*"socket" + 0.000*"Read" + 0.000*"User"'), (3, '0.249*"winbind" + 0.249*"auth" + 0.126*"AUTH" + 0.126*"ERROR" + 0.126*"wbcLogonUser" + 0.123*"getting" + 0.000*"allow" + 0.000*"AllowUsers" + 0.000*"User" + 0.000*"list"'), (4, '0.188*"fail" + 0.101*"ATTEMPT" + 0.101*"BREAK" + 0.101*"HHHHH" + 0.101*"POSSIBLE" + 0.080*"check" + 0.080*"reverse" + 0.080*"getaddrinfo" + 0.080*"mapping" + 0.021*"map"'), (5, '0.951*"user" + 0.044*"close" + 0.002*"subsystem" + 0.002*"sftp" + 0.000*"User" + 0.000*"list" + 0.000

In [9]:
try:
    print("Attempting to read model from pickle")
    fp=gzip.open(urlopen('https://github.com/Galeforse/DST-Assessment-03/raw/master/Data/main/lda_model.pkl.gz'),'rb')
    lda_model=pickle.load(fp)
    fp.close()
    print("Model read from pickle.")
except HTTPError as err:
    if err.code == 404:
        print("Pickle not found, creating model and saving to pickle")
        start=dt.datetime.now()
        lda_model_ex = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=100, update_every=1, chunksize=10, passes=10, alpha='symmetric', iterations=100, per_word_topics=True)
        fp=gzip.open('G:/Users/Gabriel/Documents/Education/UoB/GitHubDesktop/DST-Assessment-03/data/main/lda_model.pkl.gz','wb')
        pickle.dump(lda_model_ex,fp)
        fp.close()
        print("Pickle saved. Time taken: " + str(dt.datetime.now()-start))
    else:
        raise

Attempting to read model from pickle
Pickle not found, creating model and saving to pickle
Pickle saved. Time taken: 0:00:00.001995


In [10]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [11]:
lda_display=pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds='mmds')

In [19]:
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_display, template_type='notebook')

# Perplexitiy and Coherence

In [56]:
log_perplexitites = {lda_model.log_perplexity(corpus)}
log_perplexitites

{-6.8586075031796465}

In [65]:
def getCoherence(m,c,d):
    coherence_lda_model = CoherenceModel(model=m, corpus=c, dictionary=d, coherence='u_mass')
    coherence_lda = coherence_lda_model.get_coherence()
    return(coherence_lda)

In [66]:
coherence = {'lda_model': getCoherence(lda_model, corpus, dictionary)}
coherence

{'lda_model': -16.233680822188415}

In [85]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=8, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=dictionary, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [86]:
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 4
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# get the coherence score for the given parameters
cv = compute_coherence_values(corpus=corpus, dictionary=dictionary, 
                                                  k=k, a=a, b=b)
cv       

  numerator = (co_occur_count / num_docs) + EPSILON
  denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
  co_doc_prob = co_occur_count / num_docs


nan