<a href="https://colab.research.google.com/github/dudesparsh/Applied-ML/blob/master/NLP_Topic_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Checking gpu server
# Lists the compute devices TensorFlow can see on this runtime; the returned
# list is displayed as the cell output.
# NOTE(review): nothing else in the visible notebook uses TensorFlow —
# presumably this is only a quick GPU-availability check; confirm it is needed.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()


In [None]:
# Mount Google Drive at /content/gdrive so the dataset stored on Drive can be
# read by path further down. Relies on the `google.colab` package.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
import pandas as pd


In [None]:
# Show the current working directory. A plain-Python call is used instead of the
# bare `pwd` automagic, which only resolves when IPython's automagic is enabled
# and the cell consists of that single line.
import os

os.getcwd()

'/content'

The data used here is of news headlines published over a period of seventeen years.

It is sourced from the reputable Australian news source ABC (Australian Broadcasting Corporation).



In [None]:
# Load the ABC headlines CSV from the mounted Drive folder.
# `on_bad_lines='warn'` drops malformed rows while reporting them. It replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (where passing it raises a TypeError).
data = pd.read_csv('/content/gdrive/My Drive/NLP/abcnews-date-text.csv', on_bad_lines='warn')


In [None]:
# Keep only the headline text. `.copy()` makes `data_text` an independent frame,
# so adding the 'index' column below does not write into a slice of `data`
# (which is what triggers pandas' SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
# Store the row label as a regular column so documents can be looked up by it.
data_text['index'] = data_text.index
documents = data_text


In [None]:
# Taking a look at the data: number of rows, then the first five rows
print(documents.shape[0])
print(documents.head(5))


1186018
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


### Data Pre-processing
- Tokenization
- Stopwords removal
- Lemmatization
- Stemming

Loading gensim and nltk

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator; intended to make runs repeatable.
np.random.seed(2018)


In [None]:
import nltk

# WordNet data backs WordNetLemmatizer. NLTK 3.6.6 and later can also require the
# Open Multilingual WordNet package ('omw-1.4') when lemmatizing, so fetch both
# to keep the lemmatizer cells working on a fresh runtime.
nltk.download('omw-1.4')
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Sanity-check the WordNet lemmatizer imported above on an irregular verb
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('went', pos='v'))


go


In [None]:
# Checking a few stemmed words. `stemmer` is reused later by the preprocessing helpers.
stemmer = SnowballStemmer('english')
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]
singles = list(map(stemmer.stem, original_words))
# Side-by-side table of each word and its stem.
pd.DataFrame({'original word': original_words, 'stemmed': singles})


Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [None]:
# Functions for lemmatizing and stem preprocessing
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    Uses the notebook-level ``stemmer`` object for the stemming step.

    Args:
        text: A single token.

    Returns:
        The result of ``stemmer.stem`` applied to the verb lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text, min_token_len=4):
    """Tokenize ``text`` and return its normalized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens that are
    gensim stopwords or shorter than ``min_token_len`` characters are dropped, and
    every remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: Raw document string (here, a single headline).
        min_token_len: Minimum number of characters a token must have to be kept.
            The default of 4 matches the previously hard-coded ``len(token) > 3``.

    Returns:
        list: Normalized tokens, in the order they appear in ``text``.
    """
    # Resolve the stopword set once instead of on every token.
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) >= min_token_len
    ]


In [None]:
# Previewing a document after preprocessing
# Select the headline by its 'index' value and by column name rather than relying
# on the positional layout of `.values`.
doc_sample = documents.loc[documents['index'] == 4310, 'headline_text'].iloc[0]

print('original document: ')
# str.split already returns the list of words, so no manual copy loop is needed.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))


original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [None]:
# Preprocessing the headlines and saving the results as
# processed_docs
# `preprocess` is applied to every headline via Series.map, so `processed_docs`
# holds one token list per headline.
processed_docs = documents['headline_text'].map(preprocess)


In [None]:
# First ten preprocessed headlines
processed_docs.head(10)


0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

## Bag of Words


In [None]:
# Build a gensim Dictionary from the tokenized headlines: it maps every distinct
# token to an integer id and collects the token statistics that the later
# `filter_extremes` step relies on.
dictionary = gensim.corpora.Dictionary(processed_docs)


In [None]:
# Preview the first 11 (id, token) pairs stored in the dictionary.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break


0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [None]:
# Filtering tokens (modifies `dictionary` in place), following gensim's
# `Dictionary.filter_extremes` semantics:
#   no_below=15   -> drop tokens that appear in fewer than 15 documents
#   no_above=0.5  -> drop tokens that appear in more than 50% of the documents
#   keep_n=100000 -> after that, keep only the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [None]:
# Gensim doc2bow
# Convert every preprocessed headline into its bag-of-words vector, i.e. a list
# of (token id, count) pairs built from `dictionary`.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
# Show the vector of the sample document selected earlier.
bow_corpus[4310]


[(162, 1), (240, 1), (292, 1), (589, 1), (838, 1), (3567, 1), (3568, 1)]

In [None]:
# Previewing bag of words on our document
bow_doc_4310 = bow_corpus[4310]

# Each entry is a (token id, count) pair; unpack it instead of indexing by position.
for token_id, token_count in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], token_count))


Word 162 ("govt") appears 1 time.
Word 240 ("group") appears 1 time.
Word 292 ("vote") appears 1 time.
Word 589 ("local") appears 1 time.
Word 838 ("want") appears 1 time.
Word 3567 ("compulsori") appears 1 time.
Word 3568 ("ratepay") appears 1 time.


## TF-IDF

Term frequency - inverse document frequency

In [None]:
# Creating the tf-idf model object from gensim's `models` module.
# NOTE(review): `corpora` is imported here but not used in the visible cells.
from gensim import corpora, models

# Fit TF-IDF on the bag-of-words corpus; `tfidf[...]` re-weights BoW vectors.
tfidf = models.TfidfModel(bow_corpus)


In [None]:
# Applying the transformation to the entire corpus: indexing the model with the
# whole BoW corpus gives the TF-IDF-weighted version of every document.
corpus_tfidf = tfidf[bow_corpus]


In [None]:
# Previewing TF-IDF scores of our first document
from pprint import pprint

# Pull only the first transformed document; the rest of the corpus is not iterated.
first_doc_tfidf = next(iter(corpus_tfidf), None)
if first_doc_tfidf is not None:
    pprint(first_doc_tfidf)


[(0, 0.5850076620505259),
 (1, 0.38947256567331934),
 (2, 0.4997099083387053),
 (3, 0.5063271308533074)]


# Running LDA using BoW

In [None]:
# Training our LDA model on the bag-of-words corpus using gensim.models
# (10 topics, 2 passes over the corpus, 2 workers).
# NOTE(review): the model is only kept in memory; nothing in this cell saves it.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)


In [None]:
# Print each topic of the BoW LDA model together with its weighted top terms.
bow_topics = lda_model.print_topics(-1)
for topic_id, topic_terms in bow_topics:
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))


Topic: 0 
Words: 0.028*"death" + 0.020*"canberra" + 0.015*"hospit" + 0.013*"water" + 0.010*"flood" + 0.010*"reveal" + 0.010*"hobart" + 0.010*"take" + 0.010*"find" + 0.009*"risk"
Topic: 1 
Words: 0.023*"donald" + 0.021*"china" + 0.017*"island" + 0.015*"rise" + 0.013*"street" + 0.013*"fall" + 0.012*"show" + 0.012*"australian" + 0.010*"wall" + 0.010*"young"
Topic: 2 
Words: 0.031*"elect" + 0.021*"south" + 0.018*"live" + 0.016*"tasmania" + 0.015*"school" + 0.013*"australia" + 0.013*"interview" + 0.012*"perth" + 0.011*"student" + 0.011*"stori"
Topic: 3 
Words: 0.027*"market" + 0.018*"miss" + 0.016*"indigen" + 0.015*"price" + 0.014*"victoria" + 0.012*"sydney" + 0.012*"beat" + 0.012*"citi" + 0.011*"search" + 0.011*"share"
Topic: 4 
Words: 0.024*"charg" + 0.020*"murder" + 0.019*"australia" + 0.018*"melbourn" + 0.016*"world" + 0.014*"court" + 0.014*"face" + 0.014*"alleg" + 0.013*"test" + 0.012*"accus"
Topic: 5 
Words: 0.027*"govern" + 0.021*"chang" + 0.016*"jail" + 0.013*"rural" + 0.013*"busi" 

In [None]:
# Train a second LDA model with the same topic count and number of passes, this
# time on the TF-IDF-weighted corpus.
# NOTE(review): this call uses workers=4 while the BoW model uses workers=2 —
# confirm the difference is intentional.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)


In [None]:
# Print each topic of the TF-IDF LDA model together with its weighted top terms.
tfidf_topics = lda_model_tfidf.print_topics(-1)
for topic_id, topic_terms in tfidf_topics:
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))


Topic: 0 Word: 0.007*"elect" + 0.007*"budget" + 0.006*"sport" + 0.006*"friday" + 0.006*"govern" + 0.006*"grandstand" + 0.005*"celebr" + 0.005*"fund" + 0.005*"histori" + 0.005*"labor"
Topic: 1 Word: 0.018*"news" + 0.013*"market" + 0.010*"rural" + 0.009*"coast" + 0.007*"price" + 0.007*"gold" + 0.007*"weather" + 0.007*"monday" + 0.006*"bushfir" + 0.006*"rise"
Topic: 2 Word: 0.019*"countri" + 0.014*"hour" + 0.009*"wednesday" + 0.009*"michael" + 0.008*"climat" + 0.007*"david" + 0.006*"explain" + 0.006*"age" + 0.006*"chang" + 0.005*"footag"
Topic: 3 Word: 0.011*"live" + 0.008*"turnbul" + 0.007*"morrison" + 0.007*"parliament" + 0.005*"asylum" + 0.005*"anim" + 0.004*"onlin" + 0.004*"kohler" + 0.004*"seeker" + 0.004*"australian"
Topic: 4 Word: 0.017*"crash" + 0.006*"mental" + 0.006*"truck" + 0.006*"pacif" + 0.005*"health" + 0.005*"road" + 0.005*"novemb" + 0.005*"die" + 0.005*"plane" + 0.005*"island"
Topic: 5 Word: 0.017*"charg" + 0.016*"murder" + 0.014*"polic" + 0.011*"court" + 0.010*"alleg" + 

# Performance evaluation

Evaluating our sample document using LDA BoW model

In [None]:
# Token list of the sample headline (row 4310) that the models are evaluated on below.
processed_docs[4310]


['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

In [None]:
# Rank the topics inferred for headline 4310 by descending score and print each one.
ranked_topics = sorted(lda_model[bow_corpus[4310]], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


# Our test document gets its highest probability for the topic that the model
# assigned to it, which the author judges to be the accurate classification.



Score: 0.7624931931495667	 
Topic: 0.027*"govern" + 0.021*"chang" + 0.016*"jail" + 0.013*"rural" + 0.013*"busi" + 0.012*"say" + 0.012*"break" + 0.011*"drum" + 0.010*"climat" + 0.010*"concern"

Score: 0.1374998539686203	 
Topic: 0.039*"trump" + 0.024*"queensland" + 0.020*"crash" + 0.019*"news" + 0.017*"die" + 0.016*"shoot" + 0.016*"coast" + 0.015*"dead" + 0.012*"polic" + 0.011*"north"

Score: 0.012504558078944683	 
Topic: 0.015*"feder" + 0.015*"health" + 0.014*"bushfir" + 0.014*"farmer" + 0.013*"royal" + 0.012*"plan" + 0.012*"speak" + 0.011*"help" + 0.011*"guilti" + 0.010*"commiss"

Score: 0.01250155083835125	 
Topic: 0.020*"warn" + 0.020*"nation" + 0.016*"peopl" + 0.013*"farm" + 0.011*"liber" + 0.011*"victorian" + 0.011*"leader" + 0.010*"parti" + 0.009*"australia" + 0.009*"weather"

Score: 0.012500865384936333	 
Topic: 0.023*"donald" + 0.021*"china" + 0.017*"island" + 0.015*"rise" + 0.013*"street" + 0.013*"fall" + 0.012*"show" + 0.012*"australian" + 0.010*"wall" + 0.010*"young"

Score

Evaluation using the LDA TF-IDF model

In [None]:

# The TF-IDF LDA model was trained on TF-IDF weights, so the query document is
# converted with the same `tfidf` transform before inference instead of being
# passed in as raw bag-of-words counts.
doc_tfidf_4310 = tfidf[bow_corpus[4310]]
for index, score in sorted(lda_model_tfidf[doc_tfidf_4310], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))



Score: 0.53730309009552	 
Topic: 0.007*"elect" + 0.007*"budget" + 0.006*"sport" + 0.006*"friday" + 0.006*"govern" + 0.006*"grandstand" + 0.005*"celebr" + 0.005*"fund" + 0.005*"histori" + 0.005*"labor"

Score: 0.21545980870723724	 
Topic: 0.019*"countri" + 0.014*"hour" + 0.009*"wednesday" + 0.009*"michael" + 0.008*"climat" + 0.007*"david" + 0.006*"explain" + 0.006*"age" + 0.006*"chang" + 0.005*"footag"

Score: 0.15972255170345306	 
Topic: 0.027*"trump" + 0.012*"interview" + 0.010*"australia" + 0.008*"hobart" + 0.008*"tuesday" + 0.007*"cricket" + 0.007*"christma" + 0.007*"world" + 0.006*"india" + 0.006*"peter"

Score: 0.012503769248723984	 
Topic: 0.014*"donald" + 0.012*"drum" + 0.006*"juli" + 0.006*"action" + 0.006*"coal" + 0.005*"jam" + 0.005*"tree" + 0.005*"marriag" + 0.005*"govern" + 0.005*"water"

Score: 0.012502809055149555	 
Topic: 0.011*"live" + 0.008*"turnbul" + 0.007*"morrison" + 0.007*"parliament" + 0.005*"asylum" + 0.005*"anim" + 0.004*"onlin" + 0.004*"kohler" + 0.004*"seeke

## Testing model on unseen document

In [None]:
# Score a new headline against the BoW LDA model: preprocess it, convert it to a
# bag-of-words vector, then list the topics from highest to lowest score.
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

topic_scores = lda_model[bow_vector]
for index, score in sorted(topic_scores, key=lambda pair: pair[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))


Score: 0.34999993443489075	 Topic: 0.039*"trump" + 0.024*"queensland" + 0.020*"crash" + 0.019*"news" + 0.017*"die"
Score: 0.1838952898979187	 Topic: 0.027*"market" + 0.018*"miss" + 0.016*"indigen" + 0.015*"price" + 0.014*"victoria"
Score: 0.18332457542419434	 Topic: 0.023*"donald" + 0.021*"china" + 0.017*"island" + 0.015*"rise" + 0.013*"street"
Score: 0.18276478350162506	 Topic: 0.028*"death" + 0.020*"canberra" + 0.015*"hospit" + 0.013*"water" + 0.010*"flood"
Score: 0.016674192622303963	 Topic: 0.015*"feder" + 0.015*"health" + 0.014*"bushfir" + 0.014*"farmer" + 0.013*"royal"
Score: 0.016669519245624542	 Topic: 0.020*"warn" + 0.020*"nation" + 0.016*"peopl" + 0.013*"farm" + 0.011*"liber"
Score: 0.016669070348143578	 Topic: 0.027*"govern" + 0.021*"chang" + 0.016*"jail" + 0.013*"rural" + 0.013*"busi"
Score: 0.016668107360601425	 Topic: 0.025*"attack" + 0.020*"polic" + 0.018*"kill" + 0.017*"australian" + 0.015*"arrest"
Score: 0.01666782796382904	 Topic: 0.031*"elect" + 0.021*"south" + 0.018