# Gensim LDA

Adapted from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [4]:
import pandas as pd

csv_file = 'npr.csv'      # the input csv file
topic_file = 'topics_'+csv_file

data_path = 'data/'+csv_file
topic_path = 'topics/'+topic_file

df = pd.read_csv(data_path);
# df = df[['headline_text']]
df = df[['Article']]
df['index'] = df.index
documents = df


In [5]:
print(len(documents))
print(documents[:5])

11992
                                             Article  index
0  In the Washington of 2016, even when the polic...      0
1    Donald Trump has used Twitter  —   his prefe...      1
2    Donald Trump is unabashedly praising Russian...      2
3  Updated at 2:50 p. m. ET, Russian President Vl...      3
4  From photography, illustration and video, to d...      4


In [6]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer("english")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bansharee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# words in third person are changed to first person and verbs in past and future tenses are changed into present
# words are reduced to their root form
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [8]:
# displaying how preprocessing works
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Oklahoma', 'City', 'residents', 'woke', 'early', 'New', 'Year’s', 'Day', 'to', 'a', 'magnitude', '4.', '2', 'quake.', 'Earlier', 'this', 'week,', 'a', 'magnitude', '4.', '3', 'quake', 'struck', 'the', 'same', 'area.', 'The', 'state', 'isn’t', 'historically', 'known', 'for', 'earthquakes,', 'but', 'NPR’s', 'Nell', 'Greenfieldboyce', 'told', 'our', 'Newscast', 'unit', 'that', 'Oklahoma', '”has', 'recently', 'seen', 'a', 'dramatic', 'rise', 'in', 'seismic', 'activity.”', 'Here’s', 'more:', '”If', 'you', 'think', 'of', 'a', 'U.', 'S.', 'state', 'associated', 'with', 'earthquakes,', 'it’s', 'probably', 'California.', 'But', 'really,', 'you', 'should', 'think', 'Oklahoma.', 'In', '2015,', 'Oklahoma', 'hit', 'an', '', '', 'high,', 'with', 'more', 'than', '800', 'quakes', 'of', 'magnitude', '3', 'or', 'greater.', 'That', 'busts', 'the', 'record', 'set', 'in', '2014,', 'which', 'topped', 'the', 'previous', 'record', 'set', 'the', 'year', 'before.', 'State', 'officials', 'h

In [9]:
# preprocess 'headline_text' text from training set
# processed_docs = documents['headline_text'].map(preprocess)
processed_docs = documents['Article'].map(preprocess)
processed_docs[:10]

0    [washington, polici, bipartisan, polit, sens, ...
1    [donald, trump, twitter, prefer, mean, communi...
2    [donald, trump, unabash, prais, russian, presi...
3    [updat, russian, presid, vladimir, putin, say,...
4    [photographi, illustr, video, data, visual, im...
5    [want, join, yoga, class, hat, beatif, instruc...
6    [public, support, debunk, claim, vaccin, caus,...
7    [stand, airport, exit, debat, snack, young, ro...
8    [movi, tri, realist, summon, batman, shouldn, ...
9    [eighteen, year, year, david, fisher, visit, f...
Name: Article, dtype: object

In [10]:
# creating dictionary using words in the training set, mapped to how many times the word appears in the set
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abil
1 accept
2 account
3 act
4 action
5 actual
6 add
7 administr
8 advis
9 affair
10 afloat


In [11]:
# filtering out:
#   * less than 15 documents (absolute number) or
#   * more than 0.5 documents (fraction of total corpus size, not absolute number)
#   * after the above two steps, keep only the first 100000 most frequent tokens

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [12]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# bow_corpus[0]

# bow_corpus = []
# for doc in processed_docs:
#     bow = dictionary.doc2bow(doc)
#     bow_corpus.append(bow)


In [None]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(
                                                    bow_doc_4310[i][0], 
                                                    dictionary[bow_doc_4310[i][0]], 
                                                    bow_doc_4310[i][1]))

In [None]:
# TF-IDF weights words based on how often they appear in a document 
# versus how often they appear in the entire corpus
# this helps LDA distinguish topics by weighting more important words higher

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint

# preview of how this works
pprint(corpus_tfidf[0])
for i in range(len(bow_corpus[0])):
    print(dictionary[bow_corpus[0][i][0]], corpus_tfidf[0][i][1])

In [15]:
# TODO: HDP goes here for num_topics
# see https://medium.com/analytics-vidhya/text-classification-using-lda-35d5b98d4f05 for HDP 

In [16]:
# training the model using the bow corpus

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))

Topic: 0 Words: 0.005*"want" + 0.005*"thing" + 0.004*"feel" + 0.004*"live" + 0.004*"write" + 0.004*"book" + 0.004*"stori" + 0.003*"women" + 0.003*"world" + 0.003*"right"
Topic: 1 Words: 0.006*"food" + 0.004*"famili" + 0.004*"want" + 0.004*"look" + 0.003*"live" + 0.003*"need" + 0.003*"start" + 0.003*"help" + 0.003*"call" + 0.003*"thing"
Topic: 2 Words: 0.019*"trump" + 0.006*"clinton" + 0.006*"presid" + 0.005*"campaign" + 0.005*"polit" + 0.005*"state" + 0.004*"news" + 0.004*"report" + 0.004*"countri" + 0.004*"nation"
Topic: 3 Words: 0.012*"health" + 0.007*"care" + 0.006*"state" + 0.005*"patient" + 0.005*"insur" + 0.004*"medic" + 0.004*"research" + 0.004*"plan" + 0.004*"hospit" + 0.004*"need"
Topic: 4 Words: 0.007*"report" + 0.005*"attack" + 0.004*"state" + 0.003*"countri" + 0.003*"call" + 0.003*"forc" + 0.003*"polic" + 0.003*"kill" + 0.003*"live" + 0.003*"group"
Topic: 5 Words: 0.012*"school" + 0.009*"student" + 0.006*"state" + 0.004*"educ" + 0.004*"report" + 0.004*"famili" + 0.003*"help

In [17]:
# training the model using the bow corpus

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"trump" + 0.002*"clinton" + 0.001*"presid" + 0.001*"polic" + 0.001*"report" + 0.001*"elect" + 0.001*"women" + 0.001*"state" + 0.001*"campaign" + 0.001*"student"
Topic: 1 Word: 0.004*"zika" + 0.003*"trump" + 0.002*"virus" + 0.002*"mosquito" + 0.002*"climat" + 0.001*"hous" + 0.001*"women" + 0.001*"health" + 0.001*"presid" + 0.001*"clinton"
Topic: 2 Word: 0.002*"music" + 0.002*"song" + 0.002*"album" + 0.001*"health" + 0.001*"parent" + 0.001*"school" + 0.001*"trump" + 0.001*"artist" + 0.001*"dylan" + 0.001*"band"
Topic: 3 Word: 0.003*"trump" + 0.002*"polic" + 0.001*"isra" + 0.001*"clinton" + 0.001*"israel" + 0.001*"state" + 0.001*"palestinian" + 0.001*"report" + 0.001*"presid" + 0.001*"attack"
Topic: 4 Word: 0.003*"refuge" + 0.002*"olymp" + 0.002*"food" + 0.002*"school" + 0.001*"student" + 0.001*"song" + 0.001*"trump" + 0.001*"game" + 0.001*"water" + 0.001*"women"
Topic: 5 Word: 0.003*"trump" + 0.002*"dutert" + 0.002*"drug" + 0.002*"philippin" + 0.001*"clinton" + 0.001

In [18]:
# check how part of the training set is classified
# first appearing topic is the one assigned to it
print(processed_docs[4310])
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 5)))

['oklahoma', 'citi', 'resid', 'wake', 'earli', 'year', 'magnitud', 'quak', 'earlier', 'week', 'magnitud', 'quak', 'strike', 'area', 'state', 'histor', 'know', 'earthquak', 'nell', 'greenfieldboyc', 'tell', 'newscast', 'unit', 'oklahoma', 'recent', 'see', 'dramat', 'rise', 'seismic', 'activ', 'think', 'state', 'associ', 'earthquak', 'probabl', 'california', 'think', 'oklahoma', 'oklahoma', 'high', 'quak', 'magnitud', 'greater', 'bust', 'record', 'top', 'previous', 'record', 'year', 'state', 'offici', 'say', 'rise', 'unlik', 'repres', 'natur', 'occur', 'process', 'concern', 'quak', 'link', 'drill', 'specif', 'wastewat', 'produc', 'drill', 'pump', 'deep', 'underground', 'dispos', 'well', 'oklahoma', 'tri', 'address', 'issu', 'coordin', 'council', 'seismic', 'activ', 'includ', 'regul', 'scientist', 'industri', 'repres', 'wertz', 'stateimpact', 'oklahoma', 'explain', 'connect', 'industri', 'increas', 'number', 'quak', 'weekend', 'edit', 'saturday', 'novemb', 'product', 'creat', 'toxic', 'wa

In [19]:
# evaluate the tfidf version
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))


Score: 0.993514358997345	 
Topic: 0.004*"trump" + 0.002*"clinton" + 0.001*"presid" + 0.001*"polic" + 0.001*"report"


In [20]:
unseen_document = 'writing out the equation for Euler'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.699625551700592	 Topic: 0.005*"want" + 0.005*"thing" + 0.004*"feel" + 0.004*"live" + 0.004*"write"
Score: 0.03337695449590683	 Topic: 0.007*"trump" + 0.007*"music" + 0.004*"state" + 0.004*"clinton" + 0.004*"song"
Score: 0.03337628021836281	 Topic: 0.010*"trump" + 0.006*"percent" + 0.006*"report" + 0.005*"state" + 0.005*"countri"
Score: 0.03337552398443222	 Topic: 0.012*"health" + 0.007*"care" + 0.006*"state" + 0.005*"patient" + 0.005*"insur"
Score: 0.03337501734495163	 Topic: 0.019*"trump" + 0.006*"clinton" + 0.006*"presid" + 0.005*"campaign" + 0.005*"polit"
Score: 0.03337496146559715	 Topic: 0.008*"polic" + 0.008*"report" + 0.005*"state" + 0.005*"offic" + 0.004*"citi"
Score: 0.033374760299921036	 Topic: 0.007*"report" + 0.005*"attack" + 0.004*"state" + 0.003*"countri" + 0.003*"call"
Score: 0.03337417542934418	 Topic: 0.013*"trump" + 0.007*"clinton" + 0.006*"state" + 0.006*"presid" + 0.006*"vote"
Score: 0.033373814076185226	 Topic: 0.012*"school" + 0.009*"student" + 0.006*"sta