# Gensim LDA

Adapted from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
import pandas as pd

csv_file = 'abcnews-date-text.csv'      # the input csv file
topic_file = 'topics_'+csv_file

data_path = 'data/'+csv_file
topic_path = 'topics/'+topic_file

df = pd.read_csv(data_path);
df = df[['headline_text']]
df['index'] = df.index
documents = df


In [2]:
print(len(documents))
print(documents[:5])

1244184
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


In [11]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer("english")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bansharee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# words in third person are changed to first person and verbs in past and future tenses are changed into present
# words are reduced to their root form
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [24]:
# displaying how preprocessing works
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [25]:
# preprocess 'headline_text' text from training set
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [26]:
# creating dictionary using words in the training set, mapped to how many times the word appears in the set
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [27]:
# filtering out:
#   * less than 15 documents (absolute number) or
#   * more than 0.5 documents (fraction of total corpus size, not absolute number)
#   * after the above two steps, keep only the first 100000 most frequent tokens

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [40]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

# bow_corpus = []
# for doc in processed_docs:
#     bow = dictionary.doc2bow(doc)
#     bow_corpus.append(bow)


[(0, 1), (1, 1), (2, 1), (3, 1)]

In [54]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(
                                                    bow_doc_4310[i][0], 
                                                    dictionary[bow_doc_4310[i][0]], 
                                                    bow_doc_4310[i][1]))

Word 4 ("awar") appears 1 time.
Word 5 ("defam") appears 1 time.
Word 6 ("wit") appears 1 time.


In [73]:
# TF-IDF weights words based on how often they appear in a document 
# versus how often they appear in the entire corpus
# this helps LDA distinguish topics by weighting more important words higher

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint

# preview of how this works
pprint(corpus_tfidf[0])
for i in range(len(bow_corpus[0])):
    print(dictionary[bow_corpus[0][i][0]], corpus_tfidf[0][i][1])

[(0, 0.5844216176085719),
 (1, 0.38716866963787633),
 (2, 0.5013820927104505),
 (3, 0.5071171375845095)]
broadcast 0.5844216176085719
communiti 0.38716866963787633
decid 0.5013820927104505
licenc 0.5071171375845095


In [76]:
# TODO: HDP goes here for num_topics

In [79]:
# training the model using the bow corpus

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Words: {}'.format(idx, topic))

Topic: 0 Words: 0.047*"polic" + 0.025*"death" + 0.023*"charg" + 0.019*"murder" + 0.018*"kill" + 0.016*"crash" + 0.016*"alleg" + 0.016*"attack" + 0.016*"woman" + 0.015*"border"
Topic: 1 Words: 0.054*"australian" + 0.034*"govern" + 0.023*"live" + 0.023*"health" + 0.023*"news" + 0.023*"victoria" + 0.022*"nation" + 0.018*"adelaid" + 0.010*"show" + 0.010*"servic"
Topic: 2 Words: 0.048*"trump" + 0.039*"year" + 0.034*"elect" + 0.012*"presid" + 0.012*"fall" + 0.012*"say" + 0.011*"labor" + 0.010*"liber" + 0.010*"vote" + 0.009*"parti"
Topic: 3 Words: 0.030*"melbourn" + 0.026*"test" + 0.019*"restrict" + 0.019*"tasmania" + 0.017*"lockdown" + 0.017*"perth" + 0.015*"coronavirus" + 0.013*"royal" + 0.013*"care" + 0.013*"open"
Topic: 4 Words: 0.019*"plan" + 0.017*"hous" + 0.015*"rise" + 0.014*"region" + 0.012*"council" + 0.012*"brisban" + 0.011*"interview" + 0.011*"farm" + 0.010*"resid" + 0.010*"budget"
Topic: 5 Words: 0.043*"queensland" + 0.028*"china" + 0.022*"report" + 0.016*"north" + 0.015*"worker"

In [78]:
# training the model using the bow corpus

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.016*"countri" + 0.011*"hour" + 0.008*"thursday" + 0.007*"peter" + 0.006*"turnbul" + 0.006*"great" + 0.006*"univers" + 0.005*"right" + 0.005*"beach" + 0.005*"human"
Topic: 1 Word: 0.018*"covid" + 0.015*"coronavirus" + 0.011*"case" + 0.011*"news" + 0.009*"victoria" + 0.009*"market" + 0.009*"record" + 0.009*"street" + 0.009*"crash" + 0.007*"wall"
Topic: 2 Word: 0.017*"polic" + 0.016*"charg" + 0.015*"murder" + 0.012*"alleg" + 0.012*"court" + 0.010*"woman" + 0.009*"death" + 0.009*"restrict" + 0.009*"jail" + 0.008*"sentenc"
Topic: 3 Word: 0.018*"vaccin" + 0.013*"lockdown" + 0.009*"covid" + 0.009*"wednesday" + 0.008*"pandem" + 0.007*"energi" + 0.007*"coronavirus" + 0.007*"alan" + 0.006*"grandstand" + 0.006*"open"
Topic: 4 Word: 0.014*"elect" + 0.009*"andrew" + 0.009*"australia" + 0.008*"friday" + 0.008*"monday" + 0.008*"quarantin" + 0.007*"financ" + 0.007*"biden" + 0.006*"parti" + 0.006*"korea"
Topic: 5 Word: 0.019*"donald" + 0.012*"interview" + 0.009*"updat" + 0.008*"john" +

In [85]:
# check how part of the training set is classified
# first appearing topic is the one assigned to it
print(processed_docs[4310])
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']
ratepayers group wants compulsory local govt voting

Score: 0.6123133897781372	 
Topic: 0.019*"plan" + 0.017*"hous" + 0.015*"rise" + 0.014*"region" + 0.012*"council" + 0.012*"brisban" + 0.011*"interview" + 0.011*"farm" + 0.010*"resid" + 0.010*"budget"

Score: 0.15085595846176147	 
Topic: 0.063*"covid" + 0.034*"coronavirus" + 0.025*"case" + 0.023*"vaccin" + 0.018*"chang" + 0.016*"peopl" + 0.014*"victorian" + 0.013*"australia" + 0.011*"life" + 0.011*"say"

Score: 0.14927779138088226	 
Topic: 0.048*"trump" + 0.039*"year" + 0.034*"elect" + 0.012*"presid" + 0.012*"fall" + 0.012*"say" + 0.011*"labor" + 0.010*"liber" + 0.010*"vote" + 0.009*"parti"

Score: 0.012509193271398544	 
Topic: 0.024*"world" + 0.021*"school" + 0.021*"market" + 0.020*"women" + 0.017*"busi" + 0.016*"australia" + 0.014*"speak" + 0.013*"win" + 0.012*"final" + 0.012*"countri"

Score: 0.012507690116763115	 
Topic: 0.043*"queensland" + 0.028*"china" + 0.022*"

In [86]:
# evaluate the tfidf version
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5040776133537292	 
Topic: 0.011*"govern" + 0.011*"rural" + 0.010*"health" + 0.009*"queensland" + 0.008*"drum" + 0.007*"news" + 0.007*"chang" + 0.006*"fund" + 0.006*"coronavirus" + 0.006*"budget"

Score: 0.39588508009910583	 
Topic: 0.016*"countri" + 0.011*"hour" + 0.008*"thursday" + 0.007*"peter" + 0.006*"turnbul" + 0.006*"great" + 0.006*"univers" + 0.005*"right" + 0.005*"beach" + 0.005*"human"

Score: 0.012506840750575066	 
Topic: 0.014*"elect" + 0.009*"andrew" + 0.009*"australia" + 0.008*"friday" + 0.008*"monday" + 0.008*"quarantin" + 0.007*"financ" + 0.007*"biden" + 0.006*"parti" + 0.006*"korea"

Score: 0.01250490639358759	 
Topic: 0.018*"vaccin" + 0.013*"lockdown" + 0.009*"covid" + 0.009*"wednesday" + 0.008*"pandem" + 0.007*"energi" + 0.007*"coronavirus" + 0.007*"alan" + 0.006*"grandstand" + 0.006*"open"

Score: 0.012504869140684605	 
Topic: 0.012*"royal" + 0.008*"commiss" + 0.007*"victorian" + 0.007*"histori" + 0.007*"august" + 0.006*"cancer" + 0.006*"septemb" + 0.006*"e

In [91]:
unseen_document = 'trade an apple for an orange'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.425210565328598	 Topic: 0.048*"trump" + 0.039*"year" + 0.034*"elect" + 0.012*"presid" + 0.012*"fall"
Score: 0.3746684491634369	 Topic: 0.029*"record" + 0.026*"donald" + 0.020*"warn" + 0.017*"coronavirus" + 0.016*"island"
Score: 0.0250252615660429	 Topic: 0.019*"plan" + 0.017*"hous" + 0.015*"rise" + 0.014*"region" + 0.012*"council"
Score: 0.02501756325364113	 Topic: 0.024*"world" + 0.021*"school" + 0.021*"market" + 0.020*"women" + 0.017*"busi"
Score: 0.025016717612743378	 Topic: 0.054*"australian" + 0.034*"govern" + 0.023*"live" + 0.023*"health" + 0.023*"news"
Score: 0.025016402825713158	 Topic: 0.063*"covid" + 0.034*"coronavirus" + 0.025*"case" + 0.023*"vaccin" + 0.018*"chang"
Score: 0.025014767423272133	 Topic: 0.047*"polic" + 0.025*"death" + 0.023*"charg" + 0.019*"murder" + 0.018*"kill"
Score: 0.025012994185090065	 Topic: 0.030*"melbourn" + 0.026*"test" + 0.019*"restrict" + 0.019*"tasmania" + 0.017*"lockdown"
Score: 0.02501094713807106	 Topic: 0.043*"queensland" + 0.028*"chi