<a href="https://colab.research.google.com/github/perkykooky/NLP/blob/main/Topic_Modelling_Basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Topic Modelling example for News Headlines 



In [None]:
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
import os
# Tell the Kaggle CLI which directory holds its configuration.
# NOTE(review): presumably kaggle.json has been placed in /content — confirm.
os.environ.update(KAGGLE_CONFIG_DIR="/content")

In [None]:
# Fetch the therohk/million-headlines dataset archive with the Kaggle CLI
# (it lands in the current working directory as million-headlines.zip).
!kaggle datasets download -d therohk/million-headlines

Downloading million-headlines.zip to /content
 43% 9.00M/21.1M [00:00<00:00, 44.0MB/s]
100% 21.1M/21.1M [00:00<00:00, 70.1MB/s]


In [None]:
!unzip \*.zip && rm *.zip

Archive:  million-headlines.zip
  inflating: abcnews-date-text.csv   


The data contains 18 years' worth of headlines from the Australian news company ABC (Australian Broadcasting Corporation).

**Schema:** 
  <br> publish_date (YYYYMMDD)
  <br> headline_text (string)

**Date Range:** [2003-02-19, 2020-12-31]

**Size:** (1226258, 2)

In [None]:
# Read the extracted headlines file into a DataFrame and preview its first rows.
headlines_csv = 'abcnews-date-text.csv'
data = pd.read_csv(headlines_csv)
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [None]:
# Keep only the headline text. reset_index() turns the row index into an
# explicit 'index' column placed next to 'headline_text'.
headline_series = data['headline_text']
documents = headline_series.reset_index()
documents.head()

Unnamed: 0,index,headline_text
0,0,aba decides against community broadcasting lic...
1,1,act fire witnesses must be aware of defamation
2,2,a g calls for infrastructure protection summit
3,3,air nz staff in aust strike for pay rise
4,4,air nz strike to affect australian travellers


## Data Preparation

  * Splitting the text into sentences and then into words.
  * Cleaning any unnecessary non-alphanumeric characters.
  * Lowercase all strings.
  * Removing articles, stopwords and other noise (tokens of 3 characters or fewer).
  * Lemmatizing the remaining tokens as verbs (e.g. "decides" becomes "decide").

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk

# WordNet data backs the WordNetLemmatizer used during preprocessing.
# NOTE(review): some NLTK releases also need the 'omw-1.4' corpus before
# WordNet can be loaded, so it is fetched too — harmless where it is not needed.
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Report how many stopwords gensim provides plus a short alphabetical sample,
# instead of dumping the whole frozenset (very long output in arbitrary order).
print("{} stopwords, e.g. {}".format(len(STOPWORDS), sorted(STOPWORDS)[:20]))

frozenset({'which', 'with', 'you', 'does', 'see', 'whence', 'couldnt', 'really', 'etc', 'until', 'describe', 'latterly', 'hers', 'became', 'by', 'out', 'up', 'before', 'back', 'thin', 'thereby', 'least', 'fire', 'somehow', 'across', 'sometimes', 'is', 'nobody', 'just', 'such', 'give', 'indeed', 'due', 'noone', 'two', 'didn', 'without', 'whereafter', 'three', 'beforehand', 're', 'might', 'whither', 'themselves', 'although', 'if', 'first', 'against', 'bottom', 'while', 'bill', 'must', 'whereas', 'whole', 'call', 'she', 'used', 'our', 'nor', 'your', 'hence', 'both', 'each', 'whoever', 'per', 'put', 'now', 'therein', 'everyone', 'behind', 'anyhow', 'eight', 'off', 'kg', 'become', 'always', 'can', 'someone', 'onto', 'than', 'more', 'ourselves', 'below', 'he', 'un', 'either', 'few', 'beyond', 'nevertheless', 'would', 'fill', 'had', 'all', 'another', 'hereupon', 'their', 'de', 'third', 'they', 'itself', 'hasnt', 'the', 'that', 'himself', 'me', 'about', 'along', 'during', 'besides', 'will', 'n

In [None]:
# Lemmatizing (note: despite the function's name, no stemmer is applied)

# One shared lemmatizer, so a new object is not constructed for every token.
_LEMMATIZER = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Return the verb lemma of a single token.

    Args:
        text: one token (string) to normalise.

    Returns:
        The result of ``WordNetLemmatizer.lemmatize(text, pos='v')``, i.e. the
        token lemmatized as a verb. No stemming is performed even though the
        name suggests it; the name is kept so existing callers keep working.
    """
    return _LEMMATIZER.lemmatize(text, pos='v')

def preprocess(text):
    """Tokenise a headline and return its cleaned, lemmatized tokens.

    The text is split with gensim's simple_preprocess; tokens that are in
    STOPWORDS or have 3 characters or fewer are dropped, and every remaining
    token is passed through lemmatize_stemming().

    Args:
        text: the raw headline string.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [None]:
# Sample output for preprocess()

# Select the column by name before taking the first row: calling .values on
# the whole frame converts every column into one array just to read a single
# cell, and a positional [1] silently depends on the column order.
sample = documents['headline_text'].values[0]
print("All Tokens: {}\n".format(sample.split(' ')))
print("Preprocessed and Relevant Tokens: {}".format(preprocess(sample)))

All Tokens: ['aba', 'decides', 'against', 'community', 'broadcasting', 'licence']

Preprocessed and Relevant Tokens: ['decide', 'community', 'broadcast', 'licence']


In [None]:
# Run preprocess() over every headline and store the token lists in a new column
tokenised_headlines = documents['headline_text'].apply(preprocess)
documents['preprocessed'] = tokenised_headlines
documents.head()

Unnamed: 0,index,headline_text,preprocessed
0,0,aba decides against community broadcasting lic...,"[decide, community, broadcast, licence]"
1,1,act fire witnesses must be aware of defamation,"[witness, aware, defamation]"
2,2,a g calls for infrastructure protection summit,"[call, infrastructure, protection, summit]"
3,3,air nz staff in aust strike for pay rise,"[staff, aust, strike, rise]"
4,4,air nz strike to affect australian travellers,"[strike, affect, australian, travellers]"


In [None]:
# Build the gensim Dictionary: every distinct token found in the preprocessed
# headlines is mapped to its own integer id.
token_lists = documents['preprocessed']
dictionary = gensim.corpora.Dictionary(documents=token_lists)

In [None]:
# Accessing values in the gensim dictionary
from pprint import pprint

sample = documents['preprocessed'].values[200] #change indices to check other values

# Look the first token's id up once and reuse it below. It is named token_id
# rather than id so that Python's built-in id() is not shadowed.
first_token = sample[0]
token_id = dictionary.token2id[first_token]

print(sample)
print("\nWord \"{}\" has key {} in the dictionary.\n".format(first_token, token_id))
# dictionary.dfs maps a token id to the number of documents containing it
print("Word \"{}\" appeared in {} documents.\n".format(first_token, dictionary.dfs[token_id]))
print("Total of {} documents processed.\n".format(dictionary.num_docs))
print("Total of {} words processed.\n".format(dictionary.num_pos))

['academic', 'upbeat', 'higher', 'education', 'review']

Word "academic" has key 660 in the dictionary.

Word "academic" appeared in 513 documents.

Total of 1226258 documents processed.

Total of 5726030 words processed.



In [None]:
# Print the first 11 (id, token) pairs held by the dictionary

for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 broadcast
1 community
2 decide
3 licence
4 aware
5 defamation
6 witness
7 call
8 infrastructure
9 protection
10 summit


In [None]:
# Prune the vocabulary by document frequency (DF); `dictionary` is updated by this call:
#   no_below -> drop tokens found in fewer than this many documents
#   no_above -> drop tokens found in more than this fraction of all documents
#   keep_n   -> keep only this many of the most frequent remaining tokens

dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)

# **Creating the Bag of Words Matrix**

The Bag of Words Matrix simply calculates the **Term Frequency** of a token in the document it belongs to. It is the TF part of the TF-IDF matrix with no weightage on the importance of the token in the collection of documents.

In [None]:
# Turn every token list into its bag-of-words form: a list of
# (token id in the dictionary, number of occurrences in the headline) pairs
bow_corpus = list(map(dictionary.doc2bow, documents['preprocessed']))

In [None]:
# Inspect the bag-of-words vector stored at position 100 of the corpus
bow_corpus[100]

[(78, 1), (362, 1), (363, 1)]

In [None]:
# Spell out one bag-of-words vector: token id, the token itself, and its count
bow_sample = bow_corpus[100]

for token_id, token_count in bow_sample:
    print("Word {} (\"{}\") appears {} time in new document.".format(token_id, dictionary[token_id], token_count))

Word 78 ("urge") appears 1 time in new document.
Word 362 ("councillors") appears 1 time in new document.
Word 363 ("women") appears 1 time in new document.


# **Creating the TF-IDF Matrix**

The TF-IDF Matrix calculates the relevance and importance of a token in a document within a collection of documents.

It is based off of 2 measures:
*   **Term Frequency:** count of instances of a token in a document
*   **Inverse Document Frequency:** log inverse of fraction of documents token appears in 






In [None]:
from gensim import corpora, models

# Fit the TF-IDF weighting on the bag-of-words corpus, then wrap the corpus so
# that its documents come out re-weighted
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]


from pprint import pprint

# Show the TF-IDF vector of the first document only
doc = next(iter(corpus_tfidf), None)
if doc is not None:
    pprint(doc)

[(0, 0.5918674193999763),
 (1, 0.3937180767686992),
 (2, 0.5009876624450964),
 (3, 0.49365007440105513)]


## **Latent Dirichlet Allocation on Bag of Words Matrix**


In [None]:
# Fit a 10-topic LDA model on the raw bag-of-words counts.
# random_state fixes the model's random initialisation so the topics printed
# below can be reproduced after a kernel restart.
# NOTE(review): with several worker processes the result may still vary a
# little between runs — confirm if exact repeatability matters.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 10, id2word = dictionary, passes=2, workers =2, random_state=42)

In [None]:
# List every topic of the bag-of-words model together with its top weighted words
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_words in lda_model.print_topics(num_topics=-1):
    print(topic_template.format(topic_id, topic_words))

Topic: 0 
Words: 0.040*"queensland" + 0.023*"test" + 0.016*"australia" + 0.010*"game" + 0.009*"season" + 0.009*"northern" + 0.009*"john" + 0.008*"black" + 0.008*"coronavirus" + 0.008*"city"
Topic: 1 
Words: 0.047*"trump" + 0.038*"sydney" + 0.020*"open" + 0.016*"coronavirus" + 0.014*"hospital" + 0.013*"victorian" + 0.013*"speak" + 0.012*"care" + 0.012*"interview" + 0.010*"age"
Topic: 2 
Words: 0.046*"australian" + 0.033*"case" + 0.032*"court" + 0.021*"face" + 0.017*"people" + 0.013*"morrison" + 0.012*"tell" + 0.012*"release" + 0.011*"hear" + 0.011*"rule"
Topic: 3 
Words: 0.025*"government" + 0.018*"health" + 0.017*"school" + 0.016*"state" + 0.014*"say" + 0.012*"call" + 0.012*"federal" + 0.011*"indigenous" + 0.010*"election" + 0.010*"labor"
Topic: 4 
Words: 0.037*"australia" + 0.023*"news" + 0.013*"protest" + 0.012*"scott" + 0.010*"country" + 0.010*"darwin" + 0.010*"beat" + 0.009*"president" + 0.008*"south" + 0.007*"mark"
Topic: 5 
Words: 0.025*"live" + 0.020*"die" + 0.017*"north" + 0.01

# **Latent Dirichlet Allocation on TF-IDF Matrix**

In [None]:
# Fit a 10-topic LDA model on the TF-IDF weighted corpus.
# random_state fixes the model's random initialisation so the topics printed
# below can be reproduced after a kernel restart.
# NOTE(review): with several worker processes the result may still vary a
# little between runs — confirm if exact repeatability matters.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics = 10, id2word = dictionary, passes=4, workers =4, random_state=42)

In [None]:
# List every topic of the TF-IDF model together with its top weighted words
topic_template = 'Topic: {} Word: {}'
for topic_id, topic_words in lda_model_tfidf.print_topics(num_topics=-1):
    print(topic_template.format(topic_id, topic_words))

Topic: 0 Word: 0.015*"charge" + 0.014*"murder" + 0.010*"court" + 0.010*"child" + 0.009*"assault" + 0.009*"royal" + 0.008*"guilty" + 0.008*"abuse" + 0.008*"sentence" + 0.008*"police"
Topic: 1 Word: 0.015*"crash" + 0.014*"police" + 0.010*"kill" + 0.009*"die" + 0.009*"shoot" + 0.008*"dead" + 0.008*"search" + 0.008*"woman" + 0.007*"hour" + 0.007*"miss"
Topic: 2 Word: 0.010*"government" + 0.008*"border" + 0.007*"restrictions" + 0.006*"coronavirus" + 0.006*"korea" + 0.006*"cattle" + 0.005*"say" + 0.005*"action" + 0.004*"china" + 0.004*"minister"
Topic: 3 Word: 0.027*"trump" + 0.008*"bushfire" + 0.006*"age" + 0.006*"country" + 0.006*"flood" + 0.006*"south" + 0.006*"rain" + 0.006*"storm" + 0.005*"coast" + 0.005*"bushfires"
Topic: 4 Word: 0.020*"news" + 0.013*"market" + 0.012*"rural" + 0.008*"monday" + 0.007*"john" + 0.007*"wall" + 0.007*"national" + 0.006*"business" + 0.006*"street" + 0.006*"share"
Topic: 5 Word: 0.026*"covid" + 0.025*"coronavirus" + 0.008*"climate" + 0.007*"case" + 0.006*"cha

In [None]:
# Bag-of-words vector of the headline at position 1000, used below to query both LDA models
test = bow_corpus[1000]

In [None]:
# Rank the topics the bag-of-words LDA model assigns to the test document, highest score first
ranked_topics = sorted(lda_model[test], key=lambda pair: pair[1], reverse=True)
for topic_id, topic_score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(topic_score, lda_model.print_topic(topic_id, 10)))


Score: 0.6781476736068726	 
Topic: 0.060*"coronavirus" + 0.029*"covid" + 0.022*"victoria" + 0.015*"market" + 0.012*"tasmania" + 0.011*"restrictions" + 0.011*"rise" + 0.010*"record" + 0.008*"tasmanian" + 0.008*"break"

Score: 0.1885150521993637	 
Topic: 0.048*"police" + 0.021*"charge" + 0.020*"death" + 0.016*"murder" + 0.015*"attack" + 0.015*"kill" + 0.015*"woman" + 0.014*"years" + 0.013*"shoot" + 0.013*"jail"

Score: 0.016667615622282028	 
Topic: 0.028*"donald" + 0.022*"coast" + 0.016*"miss" + 0.016*"world" + 0.016*"national" + 0.014*"life" + 0.014*"change" + 0.014*"gold" + 0.012*"drum" + 0.011*"park"

Score: 0.016667520627379417	 
Topic: 0.040*"queensland" + 0.023*"test" + 0.016*"australia" + 0.010*"game" + 0.009*"season" + 0.009*"northern" + 0.009*"john" + 0.008*"black" + 0.008*"coronavirus" + 0.008*"city"

Score: 0.016667397692799568	 
Topic: 0.025*"government" + 0.018*"health" + 0.017*"school" + 0.016*"state" + 0.014*"say" + 0.012*"call" + 0.012*"federal" + 0.011*"indigenous" + 0.

In [None]:
# Rank the topics the TF-IDF LDA model assigns to the test document, highest score first.
# lda_model_tfidf was trained on TF-IDF weights (corpus_tfidf), so the raw
# bag-of-words counts in `test` are passed through the same `tfidf` transform
# before being handed to the model.
test_tfidf = tfidf[test]
for index, score in sorted(lda_model_tfidf[test_tfidf], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.4290045499801636	 
Topic: 0.015*"crash" + 0.014*"police" + 0.010*"kill" + 0.009*"die" + 0.009*"shoot" + 0.008*"dead" + 0.008*"search" + 0.008*"woman" + 0.007*"hour" + 0.007*"miss"

Score: 0.23570944368839264	 
Topic: 0.015*"charge" + 0.014*"murder" + 0.010*"court" + 0.010*"child" + 0.009*"assault" + 0.009*"royal" + 0.008*"guilty" + 0.008*"abuse" + 0.008*"sentence" + 0.008*"police"

Score: 0.21859821677207947	 
Topic: 0.020*"news" + 0.013*"market" + 0.012*"rural" + 0.008*"monday" + 0.007*"john" + 0.007*"wall" + 0.007*"national" + 0.006*"business" + 0.006*"street" + 0.006*"share"

Score: 0.01667347364127636	 
Topic: 0.007*"health" + 0.007*"queensland" + 0.006*"fund" + 0.006*"election" + 0.006*"tuesday" + 0.006*"coronavirus" + 0.006*"federal" + 0.005*"budget" + 0.005*"update" + 0.005*"plan"

Score: 0.01667097955942154	 
Topic: 0.026*"covid" + 0.025*"coronavirus" + 0.008*"climate" + 0.007*"case" + 0.006*"change" + 0.005*"august" + 0.005*"victoria" + 0.005*"quarantine" + 0.005*"au

In [None]:
# Show the original headline and its preprocessed tokens for row 1000 (the document scored above)
documents[['headline_text','preprocessed']].values[1000]

array(['death toll hits 41 during bangladeshs local',
       list(['death', 'toll', 'hit', 'bangladeshs', 'local'])],
      dtype=object)