In [25]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.ldamodel import LdaModel
from gensim.models.phrases import Phrases, Phraser
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from spellchecker import SpellChecker
from gensim import corpora, models
#nltk.download('wordnet')

In [26]:
data = pd.read_excel('data/Chats_scrubbed.xlsx',sheet_name='chatdata', error_bad_lines=False)
data = data[(data['Activity Type']=='Network Optimisation') & (data['WO Status']=='Incomplete')]
data_text = data[['CLOSURE_SUMMARY']]
documents = data_text

print(len(documents))
print(documents[:5])

969
                                       CLOSURE_SUMMARY
67                                           Ongoing\r
70   Workforce Delay / Time Constraint / Return to ...
75                                       RTM ongoing\r
183               More time required being monitored\r
191  Realigned node fixed some noise ingress still ...


In [27]:
#Define Preprocessing - tokenise, bigrams, remove stop words and lemmatisation

# Convert to list
data = documents.CLOSURE_SUMMARY.values.tolist()
print(data[18])

# Tokenising each comment into a list of words
def comments_to_words(data):
    for comment in data:
        yield(gensim.utils.simple_preprocess(str(comment), deacc=False))  # deacc=True removes punctuations

# Testing
data_words = list(comments_to_words(data))
print(data_words[18])

Incompleted. More time required.
['incompleted', 'more', 'time', 'required']


In [28]:
# take out 3 or more consecutive letters
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def letter_reduction(comment):
    return [reduce_lengthening(word) for word in comment]

#Testing
print (reduce_lengthening( "finallllly soooon" ))        
data_words = [letter_reduction(comment) for comment in data_words]
print(data_words[18])

finally soon
['incompleted', 'more', 'time', 'required']


In [29]:
# Spell checker
def spell_correct(comment):
    spell = SpellChecker()
    misspelled = spell.unknown(comment)
    for word in misspelled:
        comment[comment.index(word)] = spell.correction(word)
    return comment

In [30]:
test_comment = ['somethng', 'iss', 'hapenning', 'heare', 'asdf']
spell_correct(test_comment)

['something', 'iss', 'happening', 'here', 'asda']

In [31]:
# N-Grams - 2 or 3 words consistently go together

test = 11

# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=1, threshold=10) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], threshold=10)

print('bigram test - {}'.format(bigram[['no','access','help','up','me','test','pick','up']]))


# Get sentence formed into sets of trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# See an example
#print(bigram_mod[0])
print(f'original - {data_words[test]}')
print(f'bigrams - {bigram_mod[data_words[test]]}')
print(f'trigrams - {trigram_mod[data_words[test]]}')
print(f'trigrams on bigrams - {trigram_mod[bigram_mod[data_words[test]]]}')

bigram test - ['no_access', 'help', 'up', 'me', 'test', 'pick_up']
original - ['workforce', 'delay', 'time', 'constraint', 'return', 'to', 'me', 'next', 'day']
bigrams - ['workforce_delay', 'time_constraint', 'return', 'to', 'me_next', 'day']
trigrams - ['workforce', 'delay', 'time', 'constraint', 'return', 'to', 'me_next', 'day']
trigrams on bigrams - ['workforce_delay_time_constraint', 'return', 'to', 'me_next_day']


In [32]:
#Lemmatization
#text here is every comment

POS = ['a', 's', 'r', 'n', 'v']

def lemmatize_stemming(text):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(text, pos='v')

def preprocess(comment,spelling_correct = True, trigram=True):
    #spelling correction
    if spell_correct:
        comment = spell_correct(comment)
        
    result = []
    bigram = Phrases(data_words, min_count=1, threshold=10) # higher threshold fewer phrases
    trigram = Phrases(bigram[data_words], threshold=10)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    
    #tokenised
    if trigram:
        for token in trigram_mod[comment]:
            #removed stopwords and words with length 3
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) >= 3:
                #lemmatisation
                result.append(lemmatize_stemming(token))
    else:
        for token in bigram_mod[comment]:
            #removed stopwords and words with length 3
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) >= 3:
                #lemmatisation
                result.append(lemmatize_stemming(token))

    return result


In [33]:
# check a sample if preprocessed correctly
doc_sample = data_words[5]

print(doc_sample)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

['replaced', 'way', 'db', 'tap', 'and', 'monitoring', 'device', 'from', 'tap', 'at', 'due', 'to', 'noise', 'will', 'return', 'tomorrow', 'and', 'check', 'the', 'other', 'leg', 'never', 'had', 'much', 'time', 'due', 'to', 'being', 'in', 'part', 'day', 'due', 'to', 'call', 'out', 'last', 'night']


 tokenized and lemmatized document: 
['replace', 'way', 'db_tap', 'monitor', 'device', 'tap', 'noise', 'will_return', 'tomorrow', 'check', 'leg', 'time', 'day', 'night']


In [34]:
# Apply preprocessing to every document
preprocessed_docs = [preprocess(comment) for comment in data_words]

#check a sample
print(preprocessed_docs[:5])

[['ongoing'], ['workforce', 'delay', 'time', 'constraint', 'return', 'me_next', 'day'], ['ram', 'ongoing'], ['more_time', 'require', 'monitor'], ['realign', 'node', 'fix', 'noise', 'ingress', 'noise']]


In [35]:
# Bag of Words on the Data set
dictionary = gensim.corpora.Dictionary(preprocessed_docs)

# check dictionary
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 ongoing
1 constraint
2 day
3 delay
4 me_next
5 return
6 time
7 workforce
8 ram
9 monitor
10 more_time


In [36]:
'''
no_below - Filter out tokens that appear in less than n documents
no_above - Filter out tokens that appear in more than k documents (proportion of corpus size)
keep_n = keep only the first x most frequent tokens
'''

dictionary.filter_extremes(no_below=10, no_above=0.2, keep_n=100000)


In [37]:
'''
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.
'''

#Bag-of-words representation of the documents
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
print(bow_corpus[43])


# Preview - just checking one document
bow_doc_43 = bow_corpus[43]

for i in range(len(bow_doc_43)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_43[i][0], 
                                               dictionary[bow_doc_43[i][0]], 
bow_doc_43[i][1]))

[(15, 1), (27, 1)]
Word 15 ("check") appears 1 time.
Word 27 ("please_return") appears 1 time.


In [38]:
# Create tf-idf model object using bow_corpus 
# then apply transformation to the entire corpus

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 1.0)]


In [39]:
#LDA using BOW
import gensim.models
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel

nof_list=list(np.arange(2,16))

lda_best_coherence = []


# Perplexity - a measure of how good the model is. lower the better.

for n in nof_list:
    lda_model = LdaMulticore(bow_corpus, num_topics=n, id2word=dictionary, passes=10, workers=4)
    # Compute Coherence Score
    coherence_model_lda_model = CoherenceModel(model=lda_model, texts=preprocessed_docs, dictionary=dictionary, coherence='c_v')
    coherence_lda_model = coherence_model_lda_model.get_coherence()
    lda_best_coherence.append((n,coherence_lda_model,lda_model))
    print('\n',n,'Topics Coherence Score: ', coherence_lda_model, ' Perplexity: ', lda_model.log_perplexity(bow_corpus))

optimal_num_topics = max(lda_best_coherence, key=lambda x: x[1])[0]
print('\n','Best Number of Topics: ',optimal_num_topics)


# Show Topics

optimal_lda_model = max(lda_best_coherence, key=lambda x: x[1])[2]

for idx, topic in optimal_lda_model.print_topics():
    print('Topic: {} \nWords: {}'.format(idx, topic))



 2 Topics Coherence Score:  0.3535166456547471  Perplexity:  -4.775661687416168

 3 Topics Coherence Score:  0.39434823839123884  Perplexity:  -4.793840876454288

 4 Topics Coherence Score:  0.39607529232568084  Perplexity:  -4.838030253160892

 5 Topics Coherence Score:  0.3777800858399092  Perplexity:  -4.8822289586231875

 6 Topics Coherence Score:  0.39721653968107096  Perplexity:  -4.891937706851258

 7 Topics Coherence Score:  0.40458049802422275  Perplexity:  -4.9423416938724865

 8 Topics Coherence Score:  0.39229450686356315  Perplexity:  -4.936126347851159

 9 Topics Coherence Score:  0.37215426740867585  Perplexity:  -4.956840628560867

 10 Topics Coherence Score:  0.38921141999367775  Perplexity:  -4.966068556276186

 11 Topics Coherence Score:  0.36873791737224726  Perplexity:  -4.9801478668196335

 12 Topics Coherence Score:  0.38706625558293534  Perplexity:  -4.977219377345843

 13 Topics Coherence Score:  0.37419796935132965  Perplexity:  -4.995867116601469

 14 Topics

In [40]:
#LDA using TF-IDF

lda_best_coherence_tfidf = []

for n in nof_list:
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=n, id2word=dictionary, passes=5, workers=4)
    # Compute Coherence Score
    coherence_model_lda_model_tfidf = CoherenceModel(model=lda_model_tfidf, texts=preprocessed_docs, dictionary=dictionary, coherence='c_v')
    coherence_lda_model_tfidf = coherence_model_lda_model_tfidf.get_coherence()
    lda_best_coherence_tfidf.append((n,coherence_lda_model_tfidf,lda_model_tfidf))
    print('\n',n,'Topics Coherence Score: ', coherence_lda_model_tfidf, ' Perplexity: ', lda_model.log_perplexity(corpus_tfidf))

optimal_num_topics_tfidf = max(lda_best_coherence_tfidf, key=lambda x: x[1])[0]
print('\n','Best Number of Topics: ',optimal_num_topics_tfidf)


# Show Topics

optimal_lda_model_tfidf = max(lda_best_coherence_tfidf, key=lambda x: x[1])[2]

for idx, topic in optimal_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


 2 Topics Coherence Score:  0.3039527155473939  Perplexity:  -6.56802808927757

 3 Topics Coherence Score:  0.4705728271153065  Perplexity:  -6.570249461439319

 4 Topics Coherence Score:  0.4254164768775908  Perplexity:  -6.570133278702745

 5 Topics Coherence Score:  0.47146627760282256  Perplexity:  -6.568394511663691

 6 Topics Coherence Score:  0.44296073679678244  Perplexity:  -6.567784198894151

 7 Topics Coherence Score:  0.40943908149415803  Perplexity:  -6.567240561691847

 8 Topics Coherence Score:  0.4459154254575408  Perplexity:  -6.567819393960271

 9 Topics Coherence Score:  0.40846697492871326  Perplexity:  -6.569203599558804

 10 Topics Coherence Score:  0.39923237079585394  Perplexity:  -6.570924434726305

 11 Topics Coherence Score:  0.38871964605853965  Perplexity:  -6.569180931387067

 12 Topics Coherence Score:  0.41171521814679995  Perplexity:  -6.5678784632662675

 13 Topics Coherence Score:  0.3956599744847435  Perplexity:  -6.5697762985829655

 14 Topics Cohe

In [41]:
# Plotting tools
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



In [42]:
# Visualize the topics
pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim.prepare(optimal_lda_model, bow_corpus, dictionary)
vis = pyLDAvis.gensim.prepare(lda_best_coherence[7][2], bow_corpus, dictionary)
#vis = pyLDAvis.gensim.prepare(optimal_lda_model_tfidf, corpus_tfidf, dictionary)


vis

#Bubbles are topics. The larger the bubble, the more prevalent is that topic.

#Good topic model = non-overlapping bubbles and not too many topics.

In [43]:
# Building LDA Mallet Model
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# The difference between Mallet and Gensim’s standard LDA is that Gensim uses a Variational Bayes sampling method which is faster but less precise that Mallet’s Gibbs Sampling. 

import os
from gensim.models.wrappers import LdaMallet
os.environ['MALLET_HOME'] = 'C:\\Users\\franz.iskandar\\Downloads\\mallet-2.0.8\\mallet-2.0.8\\'

mallet_path = r'C:\Users\franz.iskandar\Downloads\mallet-2.0.8\mallet-2.0.8\bin\mallet' # update this path

nof_list=list(np.arange(2,30))

mallet_best_coherence = []

for n in nof_list:
    ldamallet = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=n, id2word=dictionary,  workers=6)
    # Compute Coherence Score
    coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=preprocessed_docs, dictionary=dictionary, coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    mallet_best_coherence.append((n,coherence_ldamallet,ldamallet))
    print('\n',n,'Topics Coherence Score: ', coherence_ldamallet)

optimal_num_topics_mallet = max(mallet_best_coherence, key=lambda x: x[1])[0]
print('\n','Best Number of Topics: ',optimal_num_topics_mallet)

# Show Topics

optimal_model_mallet = max(mallet_best_coherence, key=lambda x: x[1])[2]

#ldamallet = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=optimal_num_topics, id2word=dictionary)

for idx, topic in optimal_model_mallet.print_topics():
    print('Topic: {} \nWords: {}'.format(idx, topic))

    
# print(ldamallet.show_topics(formatted=False))


 2 Topics Coherence Score:  0.37514120217170266

 3 Topics Coherence Score:  0.4167772946327393

 4 Topics Coherence Score:  0.43843140977999684

 5 Topics Coherence Score:  0.4115814099900767

 6 Topics Coherence Score:  0.4553878766562816

 7 Topics Coherence Score:  0.4643233189533638

 8 Topics Coherence Score:  0.4949415370122413

 9 Topics Coherence Score:  0.4654730957850657

 10 Topics Coherence Score:  0.5031320565014005

 11 Topics Coherence Score:  0.5011159570193958

 12 Topics Coherence Score:  0.514803789514675

 13 Topics Coherence Score:  0.5338076961884203

 14 Topics Coherence Score:  0.5482998910068672

 15 Topics Coherence Score:  0.5348353708657141

 16 Topics Coherence Score:  0.5440357732868326

 17 Topics Coherence Score:  0.5606957070996249

 18 Topics Coherence Score:  0.5443520979501468

 19 Topics Coherence Score:  0.5622883219327092

 20 Topics Coherence Score:  0.5417196363623102

 21 Topics Coherence Score:  0.5570170837643469

 22 Topics Coherence Score

In [44]:
# Building LDA Mallet Model TFIDF
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip

mallet_best_coherence_tfidf = []

for n in nof_list:
    ldamallet_tfidf = LdaMallet(mallet_path, corpus=corpus_tfidf, num_topics=n, id2word=dictionary,  workers=6)
    # Compute Coherence Score
    coherence_model_ldamallet_tfidf = CoherenceModel(model=ldamallet_tfidf, texts=preprocessed_docs, dictionary=dictionary, coherence='c_v')
    coherence_ldamallet_tfidf = coherence_model_ldamallet_tfidf.get_coherence()
    mallet_best_coherence_tfidf.append((n,coherence_ldamallet_tfidf,ldamallet_tfidf))
    print('\n',n,'Topics Coherence Score: ', coherence_ldamallet_tfidf)

optimal_num_topics_mallet_tfidf = max(mallet_best_coherence_tfidf, key=lambda x: x[1])[0]
print('\n','Best Number of Topics: ',optimal_num_topics_mallet_tfidf)

# Show Topics

optimal_model_mallet_tfidf = max(mallet_best_coherence_tfidf, key=lambda x: x[1])[2]

#ldamallet = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=optimal_num_topics, id2word=dictionary)

for idx, topic in optimal_model_mallet_tfidf.print_topics():
    print('Topic: {} \nWords: {}'.format(idx, topic))

    
# print(ldamallet_tfidf.show_topics(formatted=False))


 2 Topics Coherence Score:  0.5773480945527969

 3 Topics Coherence Score:  0.5735517558263796

 4 Topics Coherence Score:  0.5777486367279865

 5 Topics Coherence Score:  0.5797691300400526

 6 Topics Coherence Score:  0.5789113309294295

 7 Topics Coherence Score:  0.5977169792283161

 8 Topics Coherence Score:  0.5869508508502614

 9 Topics Coherence Score:  0.586738625067593

 10 Topics Coherence Score:  0.581331943964537

 11 Topics Coherence Score:  0.5997048158014384

 12 Topics Coherence Score:  0.5791843588516387

 13 Topics Coherence Score:  0.5887204587128573

 14 Topics Coherence Score:  0.5868290795532319

 15 Topics Coherence Score:  0.5874385346824555

 16 Topics Coherence Score:  0.5764239435866438

 17 Topics Coherence Score:  0.5848240637373032

 18 Topics Coherence Score:  0.5974244054316575

 19 Topics Coherence Score:  0.5896057890316158

 20 Topics Coherence Score:  0.5852698032295225

 21 Topics Coherence Score:  0.5924996438419

 22 Topics Coherence Score:  0.5

In [45]:
# Visualize the topics from Mallet
pyLDAvis.enable_notebook()

#Convert the class of your mallet model into a LdaModel before pyLDAvis

model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_best_coherence[7][2])
#(optimal_model_mallet)
vis = pyLDAvis.gensim.prepare(model, bow_corpus, dictionary)

#model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_best_coherence_tfidf[12][2])
#(optimal_model_mallet_tfidf)
#vis = pyLDAvis.gensim.prepare(model, corpus_tfidf, dictionary)

vis


#Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

#A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

#A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)


In [46]:
#Dominant Topics

def dominant_topic(ldamodel, corpus):
     #Function to find the dominant topic in each review
     sent_topics_df = pd.DataFrame() 
     # Get main topic in each review
     for i, row in enumerate(ldamodel[corpus]):
         row = sorted(row, key=lambda x: (x[1]), reverse=True)
         # Get the Dominant topic, Perc Contribution and Keywords for each review
         for j, (topic_num, prop_topic) in enumerate(row):
             if j == 0:  # =&gt; dominant topic
                 wp = ldamodel.show_topic(topic_num,topn=4)
                 topic_keywords = ", ".join([word for word, prop in wp])
                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
             else:
                 break
     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
     #contents = pd.Series(texts)
     #sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
     return(sent_topics_df)

In [54]:
data = pd.read_excel('data\Chats_scrubbed.xlsx',sheet_name='chatdata', error_bad_lines=False)
data = data[(data['Activity Type']=='Network Optimisation') & (data['WO Status']=='Incomplete')]
data = data.reset_index()
data_text = data['ID']

df_dominant_topic = dominant_topic(ldamodel=optimal_lda_model, corpus=bow_corpus) 
contents = pd.Series(data_text)
output_df = pd.concat([df_dominant_topic,contents], axis=1)

display(output_df.head(10))

#output_df.to_csv('Incompletes_CS_State.csv')

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,ID
0,5.0,0.5712,"complete, egress, work, check",42121
1,6.0,0.8928,"fault, day, time, return",42128
2,1.0,0.7138,"fix, noise, continue, return",42141
3,4.0,0.7855,"return, more_time, require, further_investigation",42261
4,1.0,0.5155,"fix, noise, continue, return",42269
5,6.0,0.5531,"fault, day, time, return",42294
6,2.0,0.4474,"node, replace, check, level",42318
7,5.0,0.7141,"complete, egress, work, check",42334
8,1.0,0.6315,"fix, noise, continue, return",42345
9,1.0,0.4166,"fix, noise, continue, return",42347
