# Gensim LDA

Adapted from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [95]:
import pandas as pd

csv_file = 'npr.csv'      # the input csv file
topic_file = 'topics_'+csv_file

data_path = 'data/'+csv_file
topic_path = 'topics/'+topic_file

df = pd.read_csv(data_path);
# df = df[['headline_text']]
df = df[['Article']]
df['index'] = df.index
documents = df


In [96]:
print(len(documents))
print(documents[:5])

11992
                                             Article  index
0  In the Washington of 2016, even when the polic...      0
1    Donald Trump has used Twitter  —   his prefe...      1
2    Donald Trump is unabashedly praising Russian...      2
3  Updated at 2:50 p. m. ET, Russian President Vl...      3
4  From photography, illustration and video, to d...      4


In [97]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer("english")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bansharee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [98]:
# words in third person are changed to first person and verbs in past and future tenses are changed into present
# words are reduced to their root form
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [99]:
# displaying how preprocessing works
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Oklahoma', 'City', 'residents', 'woke', 'early', 'New', 'Year’s', 'Day', 'to', 'a', 'magnitude', '4.', '2', 'quake.', 'Earlier', 'this', 'week,', 'a', 'magnitude', '4.', '3', 'quake', 'struck', 'the', 'same', 'area.', 'The', 'state', 'isn’t', 'historically', 'known', 'for', 'earthquakes,', 'but', 'NPR’s', 'Nell', 'Greenfieldboyce', 'told', 'our', 'Newscast', 'unit', 'that', 'Oklahoma', '”has', 'recently', 'seen', 'a', 'dramatic', 'rise', 'in', 'seismic', 'activity.”', 'Here’s', 'more:', '”If', 'you', 'think', 'of', 'a', 'U.', 'S.', 'state', 'associated', 'with', 'earthquakes,', 'it’s', 'probably', 'California.', 'But', 'really,', 'you', 'should', 'think', 'Oklahoma.', 'In', '2015,', 'Oklahoma', 'hit', 'an', '', '', 'high,', 'with', 'more', 'than', '800', 'quakes', 'of', 'magnitude', '3', 'or', 'greater.', 'That', 'busts', 'the', 'record', 'set', 'in', '2014,', 'which', 'topped', 'the', 'previous', 'record', 'set', 'the', 'year', 'before.', 'State', 'officials', 'h

In [102]:
# preprocess 'headline_text' text from training set
# processed_docs = documents['headline_text'].map(preprocess)
processed_docs = documents['Article'].map(preprocess)
processed_docs[:10]

0    [washington, polici, bipartisan, polit, sens, ...
1    [donald, trump, twitter, prefer, mean, communi...
2    [donald, trump, unabash, prais, russian, presi...
3    [updat, russian, presid, vladimir, putin, say,...
4    [photographi, illustr, video, data, visual, im...
5    [want, join, yoga, class, hat, beatif, instruc...
6    [public, support, debunk, claim, vaccin, caus,...
7    [stand, airport, exit, debat, snack, young, ro...
8    [movi, tri, realist, summon, batman, shouldn, ...
9    [eighteen, year, year, david, fisher, visit, f...
Name: Article, dtype: object

In [103]:
# creating dictionary using words in the training set, mapped to how many times the word appears in the set
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abil
1 accept
2 account
3 act
4 action
5 actual
6 add
7 administr
8 advis
9 affair
10 afloat


In [104]:
# filtering out:
#   * less than 15 documents (absolute number) or
#   * more than 0.5 documents (fraction of total corpus size, not absolute number)
#   * after the above two steps, keep only the first 100000 most frequent tokens

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [108]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

# bow_corpus = []
# for doc in processed_docs:
#     bow = dictionary.doc2bow(doc)
#     bow_corpus.append(bow)


[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 4),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 3),
 (12, 1),
 (13, 2),
 (14, 1),
 (15, 2),
 (16, 2),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 2),
 (21, 1),
 (22, 3),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 2),
 (27, 2),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 2),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 7),
 (44, 4),
 (45, 1),
 (46, 1),
 (47, 2),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 2),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 5),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 2),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 2),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 1),
 (95, 2),
 (96, 1),
 (97, 1),
 (98, 2),
 (99, 1),
 (100, 1),

In [109]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(
                                                    bow_doc_4310[i][0], 
                                                    dictionary[bow_doc_4310[i][0]], 
                                                    bow_doc_4310[i][1]))

Word 8 ("advis") appears 1 time.
Word 51 ("citi") appears 1 time.
Word 61 ("compromis") appears 1 time.
Word 63 ("concern") appears 1 time.
Word 105 ("econom") appears 1 time.
Word 140 ("greater") appears 1 time.
Word 158 ("includ") appears 1 time.
Word 166 ("issu") appears 1 time.
Word 179 ("look") appears 1 time.
Word 195 ("nation") appears 1 time.
Word 206 ("offici") appears 1 time.
Word 247 ("recent") appears 1 time.
Word 259 ("respons") appears 1 time.
Word 298 ("suggest") appears 1 time.
Word 314 ("tri") appears 1 time.
Word 323 ("unit") appears 1 time.
Word 334 ("week") appears 1 time.
Word 357 ("compani") appears 2 time.
Word 362 ("council") appears 2 time.
Word 374 ("earlier") appears 1 time.
Word 406 ("line") appears 1 time.
Word 451 ("state") appears 3 time.
Word 497 ("earli") appears 1 time.
Word 515 ("increas") appears 1 time.
Word 569 ("unlik") appears 1 time.
Word 572 ("wake") appears 1 time.
Word 615 ("specif") appears 1 time.
Word 636 ("area") appears 1 time.
Word 694 

In [110]:
# TF-IDF weights words based on how often they appear in a document 
# versus how often they appear in the entire corpus
# this helps LDA distinguish topics by weighting more important words higher

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint

# preview of how this works
pprint(corpus_tfidf[0])
for i in range(len(bow_corpus[0])):
    print(dictionary[bow_corpus[0][i][0]], corpus_tfidf[0][i][1])

[(0, 0.023067990364550667),
 (1, 0.02138290697692375),
 (2, 0.019120276543562995),
 (3, 0.023312189215803015),
 (4, 0.07177692380940279),
 (5, 0.012376338229759715),
 (6, 0.012652765926049652),
 (7, 0.015681833239163178),
 (8, 0.022752553413297974),
 (9, 0.027595984325063022),
 (10, 0.05605640784415791),
 (11, 0.055234263426146644),
 (12, 0.029834588795734765),
 (13, 0.03757301028755038),
 (14, 0.04175927604924723),
 (15, 0.046873736806329634),
 (16, 0.028261189971447376),
 (17, 0.032253677145107414),
 (18, 0.015158469095917521),
 (19, 0.009910816499937069),
 (20, 0.06389342674765507),
 (21, 0.04649554154969994),
 (22, 0.044285416005865226),
 (23, 0.027833764343638497),
 (24, 0.023250503546463713),
 (25, 0.03779800986995305),
 (26, 0.04946093135473547),
 (27, 0.0786047603506468),
 (28, 0.02922961188090144),
 (29, 0.02138290697692375),
 (30, 0.03995684959520006),
 (31, 0.023573953955485928),
 (32, 0.0321408155994278),
 (33, 0.013108675965096848),
 (34, 0.04804888404574625),
 (35, 0.0266

In [111]:
# TODO: HDP goes here for num_topics
# see https://medium.com/analytics-vidhya/text-classification-using-lda-35d5b98d4f05 for HDP 

In [112]:
# training the model using the bow corpus

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))

Topic: 0 Words: 0.006*"thing" + 0.006*"want" + 0.004*"feel" + 0.004*"live" + 0.004*"write" + 0.004*"book" + 0.004*"stori" + 0.004*"women" + 0.004*"life" + 0.003*"look"
Topic: 1 Words: 0.007*"food" + 0.004*"famili" + 0.004*"want" + 0.004*"look" + 0.003*"live" + 0.003*"help" + 0.003*"need" + 0.003*"world" + 0.003*"start" + 0.003*"thing"
Topic: 2 Words: 0.020*"trump" + 0.008*"clinton" + 0.006*"campaign" + 0.005*"presid" + 0.005*"polit" + 0.005*"state" + 0.005*"news" + 0.004*"candid" + 0.004*"report" + 0.004*"countri"
Topic: 3 Words: 0.012*"health" + 0.007*"care" + 0.006*"patient" + 0.005*"state" + 0.005*"insur" + 0.005*"medic" + 0.005*"research" + 0.004*"studi" + 0.004*"hospit" + 0.004*"need"
Topic: 4 Words: 0.007*"report" + 0.006*"attack" + 0.004*"kill" + 0.003*"state" + 0.003*"forc" + 0.003*"countri" + 0.003*"call" + 0.003*"world" + 0.003*"live" + 0.003*"group"
Topic: 5 Words: 0.012*"school" + 0.010*"student" + 0.007*"state" + 0.005*"educ" + 0.004*"famili" + 0.004*"report" + 0.003*"help

In [113]:
# training the model using the bow corpus

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"trump" + 0.002*"clinton" + 0.002*"elect" + 0.001*"presid" + 0.001*"report" + 0.001*"campaign" + 0.001*"women" + 0.001*"polic" + 0.001*"state" + 0.001*"vote"
Topic: 1 Word: 0.004*"trump" + 0.003*"zika" + 0.002*"climat" + 0.001*"virus" + 0.001*"hous" + 0.001*"women" + 0.001*"presid" + 0.001*"republican" + 0.001*"clinton" + 0.001*"mosquito"
Topic: 2 Word: 0.003*"music" + 0.002*"song" + 0.002*"album" + 0.002*"school" + 0.002*"dylan" + 0.002*"parent" + 0.001*"health" + 0.001*"artist" + 0.001*"children" + 0.001*"care"
Topic: 3 Word: 0.003*"trump" + 0.002*"polic" + 0.001*"report" + 0.001*"israel" + 0.001*"clinton" + 0.001*"state" + 0.001*"isra" + 0.001*"presid" + 0.001*"palestinian" + 0.001*"vote"
Topic: 4 Word: 0.003*"refuge" + 0.002*"school" + 0.002*"food" + 0.002*"marijuana" + 0.002*"aleppo" + 0.002*"student" + 0.001*"song" + 0.001*"syrian" + 0.001*"trump" + 0.001*"music"
Topic: 5 Word: 0.003*"trump" + 0.002*"dutert" + 0.002*"philippin" + 0.002*"pope" + 0.001*"clinton

In [122]:
# check how part of the training set is classified
# first appearing topic is the one assigned to it
print(processed_docs[4310])
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 5)))

['oklahoma', 'citi', 'resid', 'wake', 'earli', 'year', 'magnitud', 'quak', 'earlier', 'week', 'magnitud', 'quak', 'strike', 'area', 'state', 'histor', 'know', 'earthquak', 'nell', 'greenfieldboyc', 'tell', 'newscast', 'unit', 'oklahoma', 'recent', 'see', 'dramat', 'rise', 'seismic', 'activ', 'think', 'state', 'associ', 'earthquak', 'probabl', 'california', 'think', 'oklahoma', 'oklahoma', 'high', 'quak', 'magnitud', 'greater', 'bust', 'record', 'top', 'previous', 'record', 'year', 'state', 'offici', 'say', 'rise', 'unlik', 'repres', 'natur', 'occur', 'process', 'concern', 'quak', 'link', 'drill', 'specif', 'wastewat', 'produc', 'drill', 'pump', 'deep', 'underground', 'dispos', 'well', 'oklahoma', 'tri', 'address', 'issu', 'coordin', 'council', 'seismic', 'activ', 'includ', 'regul', 'scientist', 'industri', 'repres', 'wertz', 'stateimpact', 'oklahoma', 'explain', 'connect', 'industri', 'increas', 'number', 'quak', 'weekend', 'edit', 'saturday', 'novemb', 'product', 'creat', 'toxic', 'wa

In [121]:
# evaluate the tfidf version
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))


Score: 0.7830478549003601	 
Topic: 0.004*"trump" + 0.002*"clinton" + 0.002*"elect" + 0.001*"presid" + 0.001*"report"

Score: 0.15203642845153809	 
Topic: 0.006*"health" + 0.005*"insur" + 0.003*"trump" + 0.003*"care" + 0.003*"abort"

Score: 0.0598716102540493	 
Topic: 0.002*"health" + 0.002*"food" + 0.002*"patient" + 0.002*"studi" + 0.002*"diseas"


In [120]:
unseen_document = 'writing out the equation for Euler'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6995978355407715	 Topic: 0.006*"thing" + 0.006*"want" + 0.004*"feel" + 0.004*"live" + 0.004*"write"
Score: 0.03338087350130081	 Topic: 0.008*"music" + 0.005*"song" + 0.005*"trump" + 0.004*"album" + 0.004*"record"
Score: 0.03338008001446724	 Topic: 0.011*"trump" + 0.007*"percent" + 0.006*"state" + 0.006*"report" + 0.006*"countri"
Score: 0.03337838500738144	 Topic: 0.007*"report" + 0.006*"attack" + 0.004*"kill" + 0.003*"state" + 0.003*"forc"
Score: 0.03337789326906204	 Topic: 0.012*"health" + 0.007*"care" + 0.006*"patient" + 0.005*"state" + 0.005*"insur"
Score: 0.03337752819061279	 Topic: 0.020*"trump" + 0.008*"clinton" + 0.006*"campaign" + 0.005*"presid" + 0.005*"polit"
Score: 0.03337736800312996	 Topic: 0.014*"trump" + 0.008*"clinton" + 0.007*"state" + 0.007*"presid" + 0.006*"vote"
Score: 0.033377304673194885	 Topic: 0.009*"polic" + 0.008*"report" + 0.005*"state" + 0.005*"offic" + 0.004*"citi"
Score: 0.03337668627500534	 Topic: 0.012*"school" + 0.010*"student" + 0.007*"state" 