**Inspired by:**
- https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
- https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
- https://www.kaggle.com/therohk/million-headlines/data?select=abcnews-date-text.csv

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
documents = data[['headline_text']].reset_index(drop=True)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
print(len(documents))
documents.head()

1226258


Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


## Data Pre-processing

In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vivianho/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

1. Tokenization:  Segmenting a document into atomic elements
2. Words that have fewer than 3 characters are removed.
3. Remove stop words: 250-300 most common words in English account for 50% or more of a given text.
4. Stemming: producing morphological variants of a root/base word
5. Lemmatization:Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present. Converting the word to its meaningful base form considering the context

In [6]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [14]:
doc_sample = documents[documents.index == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [15]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

### Bag of Words on the Data set

In [16]:
# print the first 10 words of the dictionary
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [17]:
# Filter out tokens that appear in
# 1. less than 15 documents (absolute number) or
# 2. more than 0.5 documents (fraction of total corpus size, not absolute number).
# 3. after the above two steps, keep only the first 100000 most frequent tokens.

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [18]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(162, 1), (240, 1), (292, 1), (589, 1), (838, 1), (3570, 1), (3571, 1)]

In [19]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print(bow_doc_4310[i], dictionary[bow_doc_4310[i][0]])

(162, 1) govt
(240, 1) group
(292, 1) vote
(589, 1) local
(838, 1) want
(3570, 1) compulsori
(3571, 1) ratepay


In [20]:
len(bow_corpus)

1226258

### TF-IDF

In [21]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5842699484464488),
 (1, 0.38798859072167835),
 (2, 0.5008422243250992),
 (3, 0.5071987254965034)]


## Model

### Running LDA using Bag of Words

In [37]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, 
                                       passes=2, workers=2)

In [38]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.026*"death" + 0.025*"charg" + 0.025*"case" + 0.024*"court" + 0.021*"polic" + 0.020*"murder" + 0.015*"alleg" + 0.012*"trial" + 0.012*"arrest" + 0.012*"face"
Topic: 1 
Words: 0.022*"news" + 0.020*"market" + 0.018*"world" + 0.017*"women" + 0.015*"final" + 0.015*"australian" + 0.014*"island" + 0.012*"return" + 0.011*"street" + 0.011*"fall"
Topic: 2 
Words: 0.051*"coronavirus" + 0.028*"covid" + 0.024*"live" + 0.021*"nation" + 0.021*"coast" + 0.016*"restrict" + 0.015*"water" + 0.013*"gold" + 0.011*"plan" + 0.010*"park"
Topic: 3 
Words: 0.038*"sydney" + 0.025*"polic" + 0.021*"crash" + 0.020*"adelaid" + 0.019*"die" + 0.015*"miss" + 0.012*"break" + 0.011*"drug" + 0.011*"driver" + 0.010*"road"
Topic: 4 
Words: 0.037*"year" + 0.031*"melbourn" + 0.022*"open" + 0.021*"canberra" + 0.017*"jail" + 0.015*"work" + 0.014*"high" + 0.014*"life" + 0.013*"interview" + 0.013*"offic"
Topic: 5 
Words: 0.029*"govern" + 0.019*"health" + 0.019*"school" + 0.017*"help" + 0.016*"chang" + 0.015*"fed

### Running LDA using TF-IDF

In [39]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, 
                                             passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.010*"christma" + 0.010*"stori" + 0.009*"sport" + 0.007*"violenc" + 0.007*"domest" + 0.007*"award" + 0.006*"septemb" + 0.006*"celebr" + 0.006*"australian" + 0.006*"dollar"
Topic: 1 Word: 0.014*"murder" + 0.013*"court" + 0.012*"charg" + 0.010*"alleg" + 0.008*"friday" + 0.008*"jail" + 0.008*"drug" + 0.008*"sentenc" + 0.008*"polic" + 0.008*"guilti"
Topic: 2 Word: 0.026*"trump" + 0.012*"australia" + 0.010*"world" + 0.006*"australian" + 0.006*"coronavirus" + 0.006*"south" + 0.006*"india" + 0.006*"test" + 0.006*"open" + 0.006*"korea"
Topic: 3 Word: 0.017*"coronavirus" + 0.014*"covid" + 0.010*"rural" + 0.007*"farmer" + 0.006*"restrict" + 0.006*"australia" + 0.006*"farm" + 0.005*"nation" + 0.005*"govern" + 0.005*"news"
Topic: 4 Word: 0.014*"interview" + 0.012*"market" + 0.010*"morrison" + 0.010*"scott" + 0.010*"monday" + 0.009*"share" + 0.008*"extend" + 0.006*"daniel" + 0.006*"novemb" + 0.006*"australian"
Topic: 5 Word: 0.016*"crash" + 0.015*"polic" + 0.012*"drum" + 0.012*"woma

## Performance evaluation:  Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [40]:
processed_docs[4310]

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

In [41]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4136901795864105	 
Topic: 0.029*"govern" + 0.019*"health" + 0.019*"school" + 0.017*"help" + 0.016*"chang" + 0.015*"feder" + 0.013*"indigen" + 0.012*"state" + 0.012*"coronavirus" + 0.012*"fund"

Score: 0.34723326563835144	 
Topic: 0.025*"call" + 0.025*"tasmania" + 0.020*"rise" + 0.019*"victorian" + 0.017*"morrison" + 0.017*"tasmanian" + 0.015*"million" + 0.015*"farm" + 0.011*"program" + 0.011*"claim"

Score: 0.15154524147510529	 
Topic: 0.070*"australia" + 0.045*"trump" + 0.025*"donald" + 0.017*"elect" + 0.016*"border" + 0.015*"busi" + 0.015*"peopl" + 0.014*"accus" + 0.013*"say" + 0.012*"scott"

Score: 0.012506252154707909	 
Topic: 0.051*"coronavirus" + 0.028*"covid" + 0.024*"live" + 0.021*"nation" + 0.021*"coast" + 0.016*"restrict" + 0.015*"water" + 0.013*"gold" + 0.011*"plan" + 0.010*"park"

Score: 0.01250444259494543	 
Topic: 0.037*"year" + 0.031*"melbourn" + 0.022*"open" + 0.021*"canberra" + 0.017*"jail" + 0.015*"work" + 0.014*"high" + 0.014*"life" + 0.013*"interview" + 0.

### Performance evaluation by classifying sample document using LDA TF-IDF model.

In [42]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.715409517288208	 
Topic: 0.014*"elect" + 0.008*"govern" + 0.008*"andrew" + 0.007*"labor" + 0.007*"michael" + 0.007*"financ" + 0.007*"liber" + 0.006*"peter" + 0.006*"say" + 0.005*"parti"

Score: 0.18454216420650482	 
Topic: 0.019*"countri" + 0.014*"hour" + 0.013*"royal" + 0.010*"commiss" + 0.007*"social" + 0.007*"know" + 0.007*"explain" + 0.007*"coronavirus" + 0.006*"parent" + 0.006*"morn"

Score: 0.012509500607848167	 
Topic: 0.017*"coronavirus" + 0.014*"covid" + 0.010*"rural" + 0.007*"farmer" + 0.006*"restrict" + 0.006*"australia" + 0.006*"farm" + 0.005*"nation" + 0.005*"govern" + 0.005*"news"

Score: 0.012505966238677502	 
Topic: 0.010*"christma" + 0.010*"stori" + 0.009*"sport" + 0.007*"violenc" + 0.007*"domest" + 0.007*"award" + 0.006*"septemb" + 0.006*"celebr" + 0.006*"australian" + 0.006*"dollar"

Score: 0.01250587310642004	 
Topic: 0.014*"interview" + 0.012*"market" + 0.010*"morrison" + 0.010*"scott" + 0.010*"monday" + 0.009*"share" + 0.008*"extend" + 0.006*"daniel" + 0

### Testing model on unseen document

In [43]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.34983640909194946	 Topic: 0.026*"death" + 0.025*"charg" + 0.025*"case" + 0.024*"court" + 0.021*"polic"
Score: 0.3497518002986908	 Topic: 0.022*"news" + 0.020*"market" + 0.018*"world" + 0.017*"women" + 0.015*"final"
Score: 0.18368104100227356	 Topic: 0.042*"queensland" + 0.033*"victoria" + 0.022*"bushfir" + 0.022*"hous" + 0.014*"time"
Score: 0.016682041808962822	 Topic: 0.030*"china" + 0.025*"test" + 0.019*"south" + 0.016*"coronavirus" + 0.013*"north"
Score: 0.016676753759384155	 Topic: 0.025*"call" + 0.025*"tasmania" + 0.020*"rise" + 0.019*"victorian" + 0.017*"morrison"
Score: 0.016675986349582672	 Topic: 0.051*"coronavirus" + 0.028*"covid" + 0.024*"live" + 0.021*"nation" + 0.021*"coast"
Score: 0.016675952821969986	 Topic: 0.029*"govern" + 0.019*"health" + 0.019*"school" + 0.017*"help" + 0.016*"chang"
Score: 0.016673337668180466	 Topic: 0.038*"sydney" + 0.025*"polic" + 0.021*"crash" + 0.020*"adelaid" + 0.019*"die"
Score: 0.016673337668180466	 Topic: 0.037*"year" + 0.031*"melbo