In [8]:
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
#nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\geris\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [9]:
# Load the ABC news headlines CSV. `on_bad_lines="skip"` drops malformed rows;
# it replaces `error_bad_lines=False`, which pandas deprecated in 1.3 and
# removed in 2.0 (requires pandas >= 1.3).
data = pd.read_csv("./data/abcnews-date-text.csv", on_bad_lines="skip")
# Keep only the headline text (ignore publish date). `.copy()` gives an
# independent frame, so adding a column below does not write into a slice of
# `data` (which would raise SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index  # expose the row index as a column for lookups
documents = data_text
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [16]:
# Text data preprocessing:
# Tokenization
# Stopword and "small" words removal
# Lemmatization
# Stemming
# Built once and shared: the constructor arguments never vary, so there is no
# reason to re-create these objects for every token of every headline.
_STEMMER = SnowballStemmer("english")
_LEMMATIZER = WordNetLemmatizer()


def lemma_stem(text):
    """Lemmatize a single token as a verb, then stem the resulting lemma.

    Args:
        text: One token (a string).

    Returns:
        The English Snowball stem of the token's WordNet lemma (pos="v").
    """
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos="v"))

def preprocess(text):
    """Tokenize `text` and return the lemmatized + stemmed form of each kept token.

    Tokens are produced by gensim's `simple_preprocess`. A token is dropped
    when it is in gensim's STOPWORDS set or is three characters or shorter.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemma_stem(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in stopwords
    ]

In [19]:
# Pick the headline whose 'index' column equals 4310 and compare its raw words
# with the output of preprocess().
sample_rows = documents[documents['index'] == 4310]
doc_sample = sample_rows.values[0][0]  # first column of the first matching row

print('original document text: ')
words = doc_sample.split(' ')
print(words)

print('\n\n Preprocessing result: ')
print(preprocess(doc_sample))

original document text: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 Preprocessing result: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [20]:
# Run preprocess() over every headline in the frame
headline_series = documents["headline_text"]
preprocessed_docs = headline_series.map(preprocess)
preprocessed_docs.head(5)

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

In [23]:
# Construct the bag-of-words (BoW) representation of the documents
dictionary = gensim.corpora.Dictionary(preprocessed_docs)
# Prune the vocabulary with the thresholds below (see gensim's
# Dictionary.filter_extremes for the exact semantics of each argument).
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# One BoW vector per preprocessed headline, in the same order
bow_corpus = list(map(dictionary.doc2bow, preprocessed_docs))
bow_corpus[4310]

[(162, 1), (240, 1), (292, 1), (589, 1), (838, 1), (3567, 1), (3568, 1)]

In [24]:
# Train a 10-topic LDA model on the BoW corpus.
# `random_state` seeds the model so a fresh-kernel re-run starts from the same
# initialisation. NOTE(review): with multiple workers, scheduling may still
# introduce small run-to-run differences; confirm if exact reproducibility
# matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)
# Passing -1 requests every topic rather than a subset
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.018*"north" + 0.016*"china" + 0.015*"water" + 0.014*"melbourn" + 0.014*"time" + 0.013*"tasmanian" + 0.013*"rise" + 0.013*"south" + 0.013*"price" + 0.012*"farmer"
Topic: 1 
Words: 0.034*"elect" + 0.022*"news" + 0.016*"royal" + 0.014*"bank" + 0.013*"busi" + 0.012*"commiss" + 0.011*"polit" + 0.010*"announc" + 0.010*"hobart" + 0.010*"parti"
Topic: 2 
Words: 0.034*"court" + 0.023*"canberra" + 0.023*"face" + 0.019*"final" + 0.017*"perth" + 0.017*"win" + 0.016*"high" + 0.016*"coast" + 0.015*"children" + 0.014*"gold"
Topic: 3 
Words: 0.022*"nation" + 0.020*"plan" + 0.019*"live" + 0.018*"chang" + 0.015*"govern" + 0.014*"indigen" + 0.013*"power" + 0.012*"farm" + 0.012*"communiti" + 0.011*"meet"
Topic: 4 
Words: 0.023*"school" + 0.020*"tasmania" + 0.019*"queensland" + 0.019*"state" + 0.016*"countri" + 0.016*"speak" + 0.014*"student" + 0.012*"die" + 0.012*"citi" + 0.011*"care"
Topic: 5 
Words: 0.055*"australia" + 0.039*"trump" + 0.026*"world" + 0.015*"market" + 0.014*"women" + 0

In [25]:
# Score an unseen demo headline against the trained topics, highest score first
test_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(test_document))
topic_scores = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in topic_scores:
    # Show each topic by its 5 highest-weighted words
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.3501793444156647	 Topic: 0.055*"australia" + 0.039*"trump" + 0.026*"world" + 0.015*"market" + 0.014*"women"
Score: 0.3501611649990082	 Topic: 0.036*"year" + 0.025*"home" + 0.022*"open" + 0.020*"health" + 0.017*"adelaid"
Score: 0.1828790307044983	 Topic: 0.041*"polic" + 0.022*"charg" + 0.020*"death" + 0.019*"murder" + 0.016*"woman"
Score: 0.016692381352186203	 Topic: 0.018*"north" + 0.016*"china" + 0.015*"water" + 0.014*"melbourn" + 0.014*"time"
Score: 0.016684703528881073	 Topic: 0.019*"feder" + 0.016*"labor" + 0.016*"say" + 0.015*"fund" + 0.014*"claim"
Score: 0.016684390604496002	 Topic: 0.023*"school" + 0.020*"tasmania" + 0.019*"queensland" + 0.019*"state" + 0.016*"countri"
Score: 0.016679739579558372	 Topic: 0.034*"elect" + 0.022*"news" + 0.016*"royal" + 0.014*"bank" + 0.013*"busi"
Score: 0.016679739579558372	 Topic: 0.034*"court" + 0.023*"canberra" + 0.023*"face" + 0.019*"final" + 0.017*"perth"
Score: 0.016679739579558372	 Topic: 0.022*"nation" + 0.020*"plan" + 0.019*"live