This notebook uses the `gensim` library's Latent Dirichlet Allocation (LDA) model to perform topic modeling on a collection of newsgroup posts.
It is derived from the tutorial at https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
import re
import warnings

# Register the filter BEFORE the third-party imports: one of the libraries
# imported below emits a NumPy DeprecationWarning at import time, and a filter
# installed after the imports is too late to silence it.
warnings.filterwarnings('ignore', category=DeprecationWarning)

import gensim
from   gensim import corpora
from   gensim.models import CoherenceModel
from   gensim.utils import simple_preprocess
import matplotlib.pyplot as plt
from   nltk.corpus import stopwords
from   nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
from   pprint import pprint
import pyLDAvis
import pyLDAvis.gensim

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [2]:
# NLTK's English stop words plus extra tokens to drop from this corpus
# (e.g. the header words 'from' and 'subject').
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [3]:
# Relative path of the directory holding the input datasets
# (resolved against the kernel's working directory).
DATA = '../../data'

In [4]:
!ls ../../data

GoogleNews-vectors-negative300.bin fr_embeddings.pkl
WSJ_02-21.pos                      hmm_vocab.txt
WSJ_24.pos                         newsgroups.json
aclImdb_v1.tar.gz                  [34mnmt[m[m
[34mag_news[m[m                            sarcasm.json
bbc-text.csv                       shakespeare.txt
[34mbooks[m[m                              [34msurnames[m[m
capitals.txt                       test.words
en-fr.test.txt                     [34mtwitter_samples[m[m
en-fr.train.txt                    wiki.multi.fr.vec
en_US.twitter.txt                  [34myelp[m[m
en_embeddings.pkl


In [5]:
# Load the newsgroups posts into a DataFrame.
df = pd.read_json(f'{DATA}/newsgroups.json')
# Show the distinct newsgroup labels, then preview the first rows.
print(df['target_names'].unique())
df.head(5)

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [6]:
data = df.content.values.tolist()
# Scrub email addresses: any whitespace-delimited token containing '@',
# together with one trailing whitespace character.
# Raw strings are required here: '\S' and '\s' in a plain literal are invalid
# escape sequences (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12).
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Collapse every run of whitespace (newlines included) into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove single quotes (a literal character, so no regex is needed)
data = [sent.replace("'", '') for sent in data]
pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [7]:
def sent2words(sentences):
    """Lazily tokenize documents, yielding one list of tokens per document.

    Each item of ``sentences`` is coerced with ``str()`` and tokenized with
    gensim's ``simple_preprocess``; results are yielded in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    # deacc=True strips accent marks from the tokens. Punctuation is dropped
    # by simple_preprocess's tokenization itself, not by this flag.
    yield from (tokenize(str(document), deacc=True) for document in sentences)

In [8]:
# Materialize the lazy tokenizer output: the token lists are reused by
# several later cells.
data_words = [tokens for tokens in sent2words(data)]
print(data_words[0])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [9]:
# bi-/tri-gram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [10]:
def remove_stops(texts, stop_words):
    """Tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()`` and
            then ``simple_preprocess``; for an already-tokenized list this
            means the list's string representation is re-tokenized.
        stop_words: Iterable of tokens to discard.

    Returns:
        A list holding one list of kept tokens per document, in input order.
    """
    # Build the lookup set once. Testing membership against a list is a linear
    # scan per token, and a one-shot iterable would be exhausted after the
    # first few lookups.
    stop_set = frozenset(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set]
            for doc in texts]

In [11]:
def make_bigrams(texts, bigram_mod):
    """Run every document through ``bigram_mod`` and collect the results.

    ``bigram_mod`` is applied by indexing (``bigram_mod[doc]``); the returned
    list has one transformed document per input document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

In [12]:
def make_trigrams(texts, bigram_mod, trigram_mod):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Both models are applied by indexing; the returned list has one
    transformed document per input document, in order.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

In [13]:
# Single WordNet lemmatizer instance, passed to lemmatize() later on.
wn = WordNetLemmatizer()

In [14]:
def lemmatize(texts, lemmatizer):
    """Lemmatize every token of every document.

    Args:
        texts: Iterable of documents, each an iterable of tokens.
        lemmatizer: Object exposing ``lemmatize(word)``.

    Returns:
        A list of token lists mirroring ``texts``, with each token replaced
        by ``lemmatizer.lemmatize(token)``.
    """
    # NOTE(review): lemmatize() is called with the word only, so whatever
    # default the lemmatizer applies (e.g. part of speech) is used throughout.
    return [[lemmatizer.lemmatize(token) for token in tokens]
            for tokens in texts]

In [15]:
# Drop stop words from the tokenized corpus.
data_words_nostops = remove_stops(data_words, stop_words)

In [16]:
#data_words_nostops[0]

In [17]:
# Merge detected phrases into single underscore-joined tokens.
data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

In [18]:
#data_words_bigrams[0]

In [19]:
# Lemmatize each token with the WordNet lemmatizer instance `wn`.
data_lemmatized = lemmatize(data_words_bigrams, wn)

In [20]:
#data_lemmatized[0]

In [21]:
# Build the vocabulary: a mapping between integer ids and tokens.
id2word = corpora.Dictionary(data_lemmatized)

In [22]:
texts = data_lemmatized
# Bag-of-words encoding: one list of (word_id, word_freq) pairs per document.
corpus = list(map(id2word.doc2bow, texts))
# Peek at the first ten pairs of the first document.
corpus[0][:10]

[(0, 1),
 (1, 2),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 5),
 (7, 1),
 (8, 1),
 (9, 2)]

In [23]:
# Look up the token stored under id 6.
id2word[6]

'car'

In [24]:
# Same bag-of-words data with every integer id swapped for its token.
human_readable = [
    [(id2word[word_id], count) for word_id, count in bow]
    for bow in corpus
]
human_readable[0][:10]

[('addition', 1),
 ('anyone', 2),
 ('body', 1),
 ('bricklin', 1),
 ('brought', 1),
 ('called', 1),
 ('car', 5),
 ('could', 1),
 ('day', 1),
 ('door', 2)]

In [25]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# NOTE(review): the parameter notes below follow the gensim LdaModel docs —
# confirm against the installed version.
lda_mod = gensim.models.LdaModel(corpus=corpus, 
                                 id2word=id2word, 
                                 num_topics=10,
                                 random_state=1103, # fixed seed
                                 update_every=1,
                                 chunksize=100, # documents per training chunk
                                 passes=10, # (epochs)
                                 alpha='auto', # presumably learns the document-topic prior from the data
                                 per_word_topics=True)

In [26]:
# Top 10 words per topic:
# Each entry is (topic_id, string of weight*"word" terms).
pprint(lda_mod.print_topics())

[(0,
  '0.280*"ax" + 0.022*"max" + 0.009*"ei" + 0.009*"dn" + 0.008*"slave" + '
  '0.007*"um" + 0.007*"md" + 0.007*"lebanese" + 0.006*"cx" + 0.005*"c_"'),
 (1,
  '0.011*"system" + 0.010*"line" + 0.010*"window" + 0.009*"file" + '
  '0.008*"drive" + 0.008*"thanks" + 0.008*"mail" + 0.007*"card" + '
  '0.007*"program" + 0.007*"computer"'),
 (2,
  '0.016*"people" + 0.012*"israel" + 0.010*"gun" + 0.009*"government" + '
  '0.008*"state" + 0.007*"right" + 0.007*"said" + 0.006*"child" + '
  '0.006*"israeli" + 0.006*"death"'),
 (3,
  '0.072*"key" + 0.023*"ripem" + 0.021*"encryption" + 0.016*"clipper" + '
  '0.016*"security" + 0.016*"chip" + 0.015*"algorithm" + 0.013*"government" + '
  '0.012*"serial_number" + 0.012*"public"'),
 (4,
  '0.010*"space" + 0.007*"information" + 0.007*"may" + 0.007*"year" + '
  '0.006*"physical" + 0.005*"research" + 0.005*"new" + 0.005*"system" + '
  '0.005*"national" + 0.004*"first"'),
 (5,
  '0.022*"team" + 0.021*"game" + 0.014*"year" + 0.009*"win" + 0.009*"play" + '


In [27]:
# Apply the trained model to the corpus.
# NOTE(review): doc_lda is not referenced again in the cells visible here.
doc_lda = lda_mod[corpus]

In [28]:
# Log-perplexity score of the model on the corpus.
# NOTE(review): this is evaluated on the same corpus the model was trained on,
# not on held-out documents.
lda_mod.log_perplexity(corpus)

-9.641363196864923

In [29]:
# Topic-coherence score ('c_v' measure) of the fitted model, computed from the
# lemmatized token lists and the same dictionary used for training.
coherence_mod_lda = CoherenceModel(model=lda_mod, 
                                   texts=data_lemmatized, 
                                   dictionary=id2word, 
                                   coherence='c_v')
coherence_lda = coherence_mod_lda.get_coherence()
# Display the score as the cell output.
coherence_lda

0.5873626376283271

In [30]:
# Build the pyLDAvis visualization of the fitted topics and show it inline.
# NOTE(review): the saved run of this cell ended in a pyLDAvis ValidationError
# ("Not all rows (distributions) in topic_term_dists sum to 1."), so no
# visualization was produced. Root cause is not established here.
# TODO: check the model's topic-term matrix for NaNs or rows that do not
# normalize before calling prepare().
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_mod, corpus, id2word)
vis

ValidationError: 
 * Not all rows (distributions) in topic_term_dists sum to 1.