Original tutorial: https://www.tutorialspoint.com/gensim/gensim_creating_lda_topic_model.htm<br>
Secondary tutorial: https://radimrehurek.com/gensim/models/ldamodel.html

In [11]:
import warnings
warnings.filterwarnings('ignore') #There is deprication warnings using VSCode I would rather avoid

import re
import numpy as np 
import pandas as pd 
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt 

from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

In [12]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use'])

In [13]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data = newsgroups_train.data
data = [re.sub('\S*@\Ss?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'","",sent) for sent in data]
data_words = list(sent_to_words(newsgroups_train.data))

In [14]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [16]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_lemmatized = lemmatization(data_words_bigrams)

In [17]:
# Build the LDA model
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus
    ,id2word=id2word
    ,num_topics=20
    ,random_state=100
    ,update_every=1
    ,chunksize=100
    ,passes=10
    ,alpha='auto'
    ,per_word_topics=True
)

In [28]:
# Save the model
lda_model.save('20newsgroups_lda_t20')

In [29]:
# Loading a model example
lda_model = gensim.models.LdaModel.load('20newsgroups_lda_t20')

In [26]:
#Print the topics
lda_model.print_topics(num_topics=5, num_words=5)

[(4,
  '0.067*"cable" + 0.049*"sgi" + 0.000*"trunk" + 0.000*"jon" + 0.000*"wiring"'),
 (7,
  '0.090*"m" + 0.040*"fi" + 0.039*"boy" + 0.025*"brave" + 0.022*"saturday"'),
 (11,
  '0.016*"evidence" + 0.011*"case" + 0.011*"group" + 0.010*"book" + 0.009*"issue"'),
 (8,
  '0.033*"say" + 0.027*"people" + 0.019*"god" + 0.019*"think" + 0.014*"believe"'),
 (14,
  '0.037*"line" + 0.036*"com" + 0.034*"organization" + 0.027*"write" + 0.022*"get"')]

In [22]:
# Print some model stats
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v'
)
coherence_lda = coherence_model_lda.get_coherence()

print('Perplexity: ', lda_model.log_perplexity(corpus), '\nCoherence Score: ', coherence_lda)

Perplexity:  -14.328053890995294 
Coherence Score:  0.5086267008315802


In [25]:
# Visualize the model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, '20newsgroups_lda_model_vis.html')

In [31]:
# Query the model using previously unseen texts
other_texts = [
    ['computer','time','graph']
    ,['survey','response','eps']
    ,['human','system','computer']
]

other_corpus = [id2word.doc2bow(text) for text in other_texts]
vector = lda_model[other_corpus[0]]
print(vector)

([(2, 0.06496077), (3, 0.038348276), (6, 0.017036414), (8, 0.13890307), (9, 0.013866927), (10, 0.025442459), (11, 0.10610145), (13, 0.055529993), (14, 0.4666749), (17, 0.018005379)], [(99, [14, 2, 13]), (179, [14, 8, 11]), (23807, [])], [(99, [(2, 0.35869116), (13, 0.06876848), (14, 0.57246506)]), (179, [(8, 0.12819229), (11, 0.04464748), (14, 0.8271367)]), (23807, [])])


In [32]:
# update the model by incrementally training on the new corpus
lda_model.update(other_corpus)
vector = lda_model[other_corpus[0]]
print(vector)

([(2, 0.06872155), (3, 0.03776076), (6, 0.016843356), (8, 0.1387931), (9, 0.013720744), (10, 0.02510831), (11, 0.10827711), (13, 0.053645287), (14, 0.4646846), (17, 0.017797304)], [(99, [14, 2, 13]), (179, [14, 8, 11]), (23807, [])], [(99, [(2, 0.3840592), (13, 0.04543065), (14, 0.5704694)]), (179, [(8, 0.1192261), (11, 0.04241071), (14, 0.8383426)]), (23807, [])])


### Determining the number of topics

In [35]:
def coherence_values_computation(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [37]:
start=1; limit=50; step=8

model_list, coherence_values = coherence_values_computation(
    corpus=corpus
    ,dictionary=id2word
    ,texts=data_lemmatized
    ,start=start, limit=limit, step=step
)

plt.plot(x=range(start,limit,step), y=coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence values"), loc="best")
plt.show()

SyntaxError: cannot assign to literal (<ipython-input-37-84f2839a56c9>, line 1)