# Gensim LDA

Adapted from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [58]:
import pandas as pd

csv_file = 'npr.csv'            # the input csv file for training
test_file = 'money.csv'    # contains the transcript
topic_file = 'gensim_topics_'+test_file

data_path = 'data/'+csv_file
test_path = 'data/'+test_file
topic_path = 'topics/'+topic_file

df = pd.read_csv(data_path);
# df = df[['headline_text']]
df = df[['Article']]
df['index'] = df.index
documents = df


In [59]:
print(len(documents))
print(documents[:5])

11992
                                             Article  index
0  In the Washington of 2016, even when the polic...      0
1    Donald Trump has used Twitter  —   his prefe...      1
2    Donald Trump is unabashedly praising Russian...      2
3  Updated at 2:50 p. m. ET, Russian President Vl...      3
4  From photography, illustration and video, to d...      4


In [60]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer("english")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bansharee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
# words in third person are changed to first person and verbs in past and future tenses are changed into present
# words are reduced to their root form
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [62]:
# displaying how preprocessing works
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Oklahoma', 'City', 'residents', 'woke', 'early', 'New', 'Year’s', 'Day', 'to', 'a', 'magnitude', '4.', '2', 'quake.', 'Earlier', 'this', 'week,', 'a', 'magnitude', '4.', '3', 'quake', 'struck', 'the', 'same', 'area.', 'The', 'state', 'isn’t', 'historically', 'known', 'for', 'earthquakes,', 'but', 'NPR’s', 'Nell', 'Greenfieldboyce', 'told', 'our', 'Newscast', 'unit', 'that', 'Oklahoma', '”has', 'recently', 'seen', 'a', 'dramatic', 'rise', 'in', 'seismic', 'activity.”', 'Here’s', 'more:', '”If', 'you', 'think', 'of', 'a', 'U.', 'S.', 'state', 'associated', 'with', 'earthquakes,', 'it’s', 'probably', 'California.', 'But', 'really,', 'you', 'should', 'think', 'Oklahoma.', 'In', '2015,', 'Oklahoma', 'hit', 'an', '', '', 'high,', 'with', 'more', 'than', '800', 'quakes', 'of', 'magnitude', '3', 'or', 'greater.', 'That', 'busts', 'the', 'record', 'set', 'in', '2014,', 'which', 'topped', 'the', 'previous', 'record', 'set', 'the', 'year', 'before.', 'State', 'officials', 'h

In [63]:
# preprocess 'headline_text' text from training set
# processed_docs = documents['headline_text'].map(preprocess)
processed_docs = documents['Article'].map(preprocess)
processed_docs[:10]

0    [washington, polici, bipartisan, polit, sens, ...
1    [donald, trump, twitter, prefer, mean, communi...
2    [donald, trump, unabash, prais, russian, presi...
3    [updat, russian, presid, vladimir, putin, say,...
4    [photographi, illustr, video, data, visual, im...
5    [want, join, yoga, class, hat, beatif, instruc...
6    [public, support, debunk, claim, vaccin, caus,...
7    [stand, airport, exit, debat, snack, young, ro...
8    [movi, tri, realist, summon, batman, shouldn, ...
9    [eighteen, year, year, david, fisher, visit, f...
Name: Article, dtype: object

In [64]:
# creating dictionary using words in the training set, mapped to how many times the word appears in the set
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abil
1 accept
2 account
3 act
4 action
5 actual
6 add
7 administr
8 advis
9 affair
10 afloat


In [65]:
# filtering out:
#   * less than 15 documents (absolute number) or
#   * more than 0.5 documents (fraction of total corpus size, not absolute number)
#   * after the above two steps, keep only the first 100000 most frequent tokens

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [66]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


In [67]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(
                                                    bow_doc_4310[i][0], 
                                                    dictionary[bow_doc_4310[i][0]], 
                                                    bow_doc_4310[i][1]))

Word 8 ("advis") appears 1 time.
Word 51 ("citi") appears 1 time.
Word 61 ("compromis") appears 1 time.
Word 63 ("concern") appears 1 time.
Word 105 ("econom") appears 1 time.
Word 140 ("greater") appears 1 time.
Word 158 ("includ") appears 1 time.
Word 166 ("issu") appears 1 time.
Word 179 ("look") appears 1 time.
Word 195 ("nation") appears 1 time.
Word 206 ("offici") appears 1 time.
Word 247 ("recent") appears 1 time.
Word 259 ("respons") appears 1 time.
Word 298 ("suggest") appears 1 time.
Word 314 ("tri") appears 1 time.
Word 323 ("unit") appears 1 time.
Word 334 ("week") appears 1 time.
Word 357 ("compani") appears 2 time.
Word 362 ("council") appears 2 time.
Word 374 ("earlier") appears 1 time.
Word 406 ("line") appears 1 time.
Word 451 ("state") appears 3 time.
Word 497 ("earli") appears 1 time.
Word 515 ("increas") appears 1 time.
Word 569 ("unlik") appears 1 time.
Word 572 ("wake") appears 1 time.
Word 615 ("specif") appears 1 time.
Word 636 ("area") appears 1 time.
Word 694 

In [68]:
# TF-IDF weights words based on how often they appear in a document 
# versus how often they appear in the entire corpus
# this helps LDA distinguish topics by weighting more important words higher

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint

# preview of how this works
pprint(corpus_tfidf[0])
for i in range(len(bow_corpus[0])):
    print(dictionary[bow_corpus[0][i][0]], corpus_tfidf[0][i][1])

[(0, 0.023067990364550667),
 (1, 0.02138290697692375),
 (2, 0.019120276543562995),
 (3, 0.023312189215803015),
 (4, 0.07177692380940279),
 (5, 0.012376338229759715),
 (6, 0.012652765926049652),
 (7, 0.015681833239163178),
 (8, 0.022752553413297974),
 (9, 0.027595984325063022),
 (10, 0.05605640784415791),
 (11, 0.055234263426146644),
 (12, 0.029834588795734765),
 (13, 0.03757301028755038),
 (14, 0.04175927604924723),
 (15, 0.046873736806329634),
 (16, 0.028261189971447376),
 (17, 0.032253677145107414),
 (18, 0.015158469095917521),
 (19, 0.009910816499937069),
 (20, 0.06389342674765507),
 (21, 0.04649554154969994),
 (22, 0.044285416005865226),
 (23, 0.027833764343638497),
 (24, 0.023250503546463713),
 (25, 0.03779800986995305),
 (26, 0.04946093135473547),
 (27, 0.0786047603506468),
 (28, 0.02922961188090144),
 (29, 0.02138290697692375),
 (30, 0.03995684959520006),
 (31, 0.023573953955485928),
 (32, 0.0321408155994278),
 (33, 0.013108675965096848),
 (34, 0.04804888404574625),
 (35, 0.0266

In [69]:
# TODO: HDP goes here for num_topics
# see https://medium.com/analytics-vidhya/text-classification-using-lda-35d5b98d4f05 for HDP 

num_topics = 3

In [70]:
# training the model using the bow corpus

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))

Topic: 0 Words: 0.004*"want" + 0.004*"thing" + 0.004*"school" + 0.003*"state" + 0.003*"report" + 0.003*"polic" + 0.003*"live" + 0.003*"right" + 0.003*"take" + 0.003*"write"
Topic: 1 Words: 0.005*"health" + 0.003*"care" + 0.003*"help" + 0.003*"need" + 0.003*"state" + 0.003*"want" + 0.003*"studi" + 0.003*"look" + 0.003*"live" + 0.003*"research"
Topic: 2 Words: 0.014*"trump" + 0.007*"state" + 0.006*"clinton" + 0.006*"presid" + 0.005*"report" + 0.004*"campaign" + 0.004*"countri" + 0.003*"vote" + 0.003*"nation" + 0.003*"elect"


In [71]:
# training the model using the bow corpus

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"trump" + 0.002*"clinton" + 0.002*"presid" + 0.002*"republican" + 0.002*"elect" + 0.002*"health" + 0.002*"vote" + 0.002*"obama" + 0.002*"state" + 0.002*"campaign"
Topic: 1 Word: 0.002*"music" + 0.002*"trump" + 0.001*"song" + 0.001*"clinton" + 0.001*"vote" + 0.001*"report" + 0.001*"album" + 0.001*"stori" + 0.001*"citi" + 0.001*"patient"
Topic: 2 Word: 0.002*"clinton" + 0.002*"trump" + 0.001*"polic" + 0.001*"women" + 0.001*"student" + 0.001*"song" + 0.001*"studi" + 0.001*"state" + 0.001*"report" + 0.001*"school"


In [72]:
# check how part of the training set is clustered
# first appearing topic is the one assigned to it
print(processed_docs[4310])
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 5)))

['oklahoma', 'citi', 'resid', 'wake', 'earli', 'year', 'magnitud', 'quak', 'earlier', 'week', 'magnitud', 'quak', 'strike', 'area', 'state', 'histor', 'know', 'earthquak', 'nell', 'greenfieldboyc', 'tell', 'newscast', 'unit', 'oklahoma', 'recent', 'see', 'dramat', 'rise', 'seismic', 'activ', 'think', 'state', 'associ', 'earthquak', 'probabl', 'california', 'think', 'oklahoma', 'oklahoma', 'high', 'quak', 'magnitud', 'greater', 'bust', 'record', 'top', 'previous', 'record', 'year', 'state', 'offici', 'say', 'rise', 'unlik', 'repres', 'natur', 'occur', 'process', 'concern', 'quak', 'link', 'drill', 'specif', 'wastewat', 'produc', 'drill', 'pump', 'deep', 'underground', 'dispos', 'well', 'oklahoma', 'tri', 'address', 'issu', 'coordin', 'council', 'seismic', 'activ', 'includ', 'regul', 'scientist', 'industri', 'repres', 'wertz', 'stateimpact', 'oklahoma', 'explain', 'connect', 'industri', 'increas', 'number', 'quak', 'weekend', 'edit', 'saturday', 'novemb', 'product', 'creat', 'toxic', 'wa

In [73]:
# evaluate the tfidf version
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))


Score: 0.945931613445282	 
Topic: 0.002*"music" + 0.002*"trump" + 0.001*"song" + 0.001*"clinton" + 0.001*"vote"

Score: 0.05070444196462631	 
Topic: 0.006*"trump" + 0.002*"clinton" + 0.002*"presid" + 0.002*"republican" + 0.002*"elect"


In [74]:
# unseen_document = 'if you love traveling, you\'ll love this'
# bow_vector = dictionary.doc2bow(preprocess(unseen_document))
# for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
#     topic = lda_model_tfidf.print_topic(index, 10)
#     print("Score: {}\t Topic words:{}".format(score, re.sub('[^A-Za-z]+', ',', topic)))

# adapted from https://towardsdatascience.com/unsupervised-nlp-topic-models-as-a-supervised-learning-input-cf8ee9e5cf28
unseen_df = pd.read_csv(test_path)

train_vecs = []
for text_entry in unseen_df['text']:
    bow_vector = dictionary.doc2bow(preprocess(text_entry))
    top_topics = (
        lda_model_tfidf.get_document_topics(bow_vector,
                                        minimum_probability=0.0)
    )

    # probabilities for topics 0-num_topics, unsorted
    train_vecs.append(top_topics)
    

pprint(train_vecs)
    

[[(0, 0.06368509), (1, 0.06775403), (2, 0.8685609)],
 [(0, 0.17948042), (1, 0.21530297), (2, 0.60521656)],
 [(0, 0.80160165), (1, 0.09750986), (2, 0.10088849)],
 [(0, 0.060702354), (1, 0.06941794), (2, 0.86987966)],
 [(0, 0.048654966), (1, 0.052578095), (2, 0.89876693)],
 [(0, 0.08876355), (1, 0.08056355), (2, 0.83067286)],
 [(0, 0.10297102), (1, 0.105059825), (2, 0.7919692)],
 [(0, 0.09844649), (1, 0.62668407), (2, 0.27486947)],
 [(0, 0.096074216), (1, 0.7944771), (2, 0.10944868)],
 [(0, 0.09452536), (1, 0.102656566), (2, 0.8028181)],
 [(0, 0.065517455), (1, 0.17258857), (2, 0.7618939)],
 [(0, 0.06674719), (1, 0.06720671), (2, 0.86604613)],
 [(0, 0.10952174), (1, 0.10294947), (2, 0.78752875)],
 [(0, 0.89448845), (1, 0.054179635), (2, 0.051331926)],
 [(0, 0.03954476), (1, 0.9202981), (2, 0.040157195)],
 [(0, 0.11109244), (1, 0.09428943), (2, 0.7946181)],
 [(0, 0.10793408), (1, 0.10577496), (2, 0.78629094)],
 [(0, 0.9134531), (1, 0.04282846), (2, 0.04371845)],
 [(0, 0.04151245), (1, 0.4

In [75]:
topic_results = []
for doc in train_vecs:
    topic_idx, score = max(doc, key=lambda x: x[1]) # numerical topic
    topic_results.append(topic_idx)

unseen_df['topics'] = topic_results
unseen_df.head()


Unnamed: 0,text,topics
0,"Let’s say I had an apple, and you had an orang...",2
1,an apple.,2
2,"We could trade, and both of us would end up ha...",0
3,"But now let’s say I had an apple tree, and you...",2
4,"the apple tree were ready, but the oranges wou...",2


In [76]:
unseen_df.to_csv(topic_path, sep=',', index=False, encoding='utf-8')

In [78]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
    
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
