In [203]:
import pandas as np
import seaborn as sns
import zipfile
import pandas as pd

import nltk
import gensim
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim import corpora, models

In [239]:
# Read the headlines CSV (columns: publish_date, headline_text) and preview it.
headlines_csv = "abcnews-date-text.csv"
df = pd.read_csv(headlines_csv)

df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [215]:
# Only the headline text is modelled, so publish_date is left out from here on.
df = df.loc[:, ['headline_text']]

### Data Pre-processing

In [216]:
# using stemming and lemmatization techniques

# Stemmer/lemmatizer instances are created once on first use and then reused:
# LemStem is called for every token of every headline, so rebuilding them per
# call is pure loop-invariant overhead.
_LEMSTEM_TOOLS = {}


def LemStem(doc):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    doc : str
        One token (PreProcess passes individual words, not whole documents).

    Returns
    -------
    str
        The Snowball (English) stem of the verb-lemmatized token.
    """
    if not _LEMSTEM_TOOLS:
        # the stemmer requires a language parameter
        _LEMSTEM_TOOLS['stemmer'] = SnowballStemmer('english')
        _LEMSTEM_TOOLS['lemmatizer'] = WordNetLemmatizer()

    lemmatize_words = _LEMSTEM_TOOLS['lemmatizer'].lemmatize(doc, pos='v')
    return _LEMSTEM_TOOLS['stemmer'].stem(lemmatize_words)

In [217]:
def PreProcess(doc):
    """Turn one headline into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess`` (token length
    limited to 2..20), gensim stop words are discarded, and every remaining
    token is passed through ``LemStem``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(doc, min_len=2, max_len=20)

    normalized = []
    for word in tokens:
        if word in stop_words:
            continue
        normalized.append(LemStem(word))
    return normalized

In [218]:
# Run PreProcess over every headline; the result holds one token list per row.
processed_text = df['headline_text'].map(PreProcess)

### Creating Word Vectors with Bag of Words Technique

In [219]:
# The dictionary contains the mapping of every word (token) to its integer id,
# built from the token lists in processed_text.
dic = gensim.corpora.Dictionary(processed_text)  

In [212]:
# Prune the vocabulary in place: keep only tokens that occur in at least 15
# documents (no_below) and in no more than 60% of all documents (no_above).
# gensim's default keep_n cap also applies since it is not overridden here.
dic.filter_extremes(no_below=15, no_above=0.6)

In [221]:
# Convert each token list to its bag-of-words form via the dictionary,
# keeping the same order as processed_text.
bag_of_words = list(map(dic.doc2bow, processed_text))

### Topic Modeling with LDA using BoW

In [223]:
# Fit a 10-topic LDA model on the bag-of-words corpus; random_state seeds the run.
lda_topic_modeling_bow = gensim.models.LdaMulticore(
    corpus=bag_of_words,
    id2word=dic,
    num_topics=10,
    random_state=42,
)

In [224]:
# printing index and topics
def print_topics(model):
    """Print every (index, topic) pair from ``model.print_topics()``.

    Each pair is written as the index on one line, the topic on the next,
    followed by an empty line.
    """
    for topic_id, terms in model.print_topics():
        print(topic_id, terms, "", sep="\n")

In [225]:
# Show the index and weighted terms of each topic in the bag-of-words LDA model.
print_topics(lda_topic_modeling_bow)

0
0.055*"council" + 0.029*"execut" + 0.029*"secur" + 0.029*"heritag" + 0.029*"protect" + 0.029*"fail" + 0.029*"posit" + 0.029*"chief" + 0.029*"tas" + 0.029*"dargo"

1
0.025*"rise" + 0.025*"air" + 0.025*"australia" + 0.025*"aust" + 0.025*"iraq" + 0.025*"nz" + 0.025*"strike" + 0.025*"aid" + 0.025*"unit" + 0.025*"celebr"

2
0.036*"climb" + 0.036*"subway" + 0.036*"death" + 0.036*"korean" + 0.036*"toll" + 0.036*"toughen" + 0.036*"continu" + 0.036*"south" + 0.036*"organ" + 0.036*"conduct"

3
0.031*"carew" + 0.031*"ruin" + 0.031*"goal" + 0.031*"opp" + 0.031*"freak" + 0.031*"timet" + 0.031*"lock" + 0.031*"fate" + 0.031*"roma" + 0.031*"war"

4
0.046*"welcom" + 0.046*"council" + 0.024*"commonwealth" + 0.024*"calleri" + 0.024*"final" + 0.024*"dent" + 0.024*"cut" + 0.024*"decis" + 0.024*"fix" + 0.024*"tie"

5
0.037*"kuwait" + 0.037*"suppli" + 0.037*"big" + 0.037*"troop" + 0.037*"paroo" + 0.037*"combat" + 0.037*"plan" + 0.037*"daili" + 0.037*"british" + 0.037*"water"

6
0.019*"council" + 0.019*"aff

### Creating Word Vectors with TF-IDF Technique

In [226]:
# Fit the TF-IDF weighting on the bag-of-words corpus. The corpus variable in
# this notebook is `bag_of_words`; `bow_corpus` is never defined and raised a
# NameError on a fresh kernel.
tf_idf = models.TfidfModel(bag_of_words)

# Applying the above transformation to the entire corpus
tf_idf_corpus = tf_idf[bag_of_words]

### Topic Modeling with LDA using TF-IDF technique

In [227]:
# Fit a 10-topic LDA model on the TF-IDF-weighted corpus.
# id2word is the gensim Dictionary named `dic`; the name `dictionary` is never
# defined in this notebook and raised a NameError on a fresh kernel.
lda_topic_modeling_tfidf = gensim.models.LdaMulticore(
    tf_idf_corpus,
    num_topics=10,
    id2word=dic,
    random_state=42,
)

In [228]:
# Show the index and weighted terms of each topic in the TF-IDF LDA model.
print_topics(lda_topic_modeling_tfidf)

0
0.011*"season" + 0.010*"bomber" + 0.007*"access" + 0.007*"intern" + 0.007*"brown" + 0.007*"find" + 0.006*"industri" + 0.006*"ahead" + 0.006*"honour" + 0.006*"test"

1
0.008*"presid" + 0.008*"stay" + 0.008*"educ" + 0.007*"port" + 0.007*"tri" + 0.007*"lawyer" + 0.007*"plan" + 0.007*"kangaroo" + 0.006*"hop" + 0.006*"food"

2
0.008*"bomber" + 0.007*"beach" + 0.007*"price" + 0.007*"push" + 0.007*"adelaid" + 0.007*"club" + 0.006*"deal" + 0.006*"court" + 0.006*"strong" + 0.001*"access"

3
0.008*"prison" + 0.008*"august" + 0.008*"releas" + 0.008*"worker" + 0.007*"prepar" + 0.007*"million" + 0.007*"eas" + 0.007*"bomber" + 0.007*"melbourn" + 0.006*"alleg"

4
0.008*"drop" + 0.008*"access" + 0.008*"blaze" + 0.007*"shop" + 0.007*"get" + 0.007*"sector" + 0.007*"land" + 0.007*"green" + 0.007*"news" + 0.006*"final"

5
0.007*"bomber" + 0.007*"investig" + 0.007*"live" + 0.007*"leak" + 0.006*"return" + 0.006*"busi" + 0.006*"confirm" + 0.006*"centr" + 0.006*"boss" + 0.006*"strong"

6
0.008*"studi" + 0.0

### Assigning topics to each headline using the topic modeling technique.

In [240]:
# sample data is selected and grounded coding is done on this dataset
# .copy() makes the sample an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df = df[0:40].copy()


def max_prob(topic_prob):
    """Return the id of the most probable topic.

    Parameters
    ----------
    topic_prob : list of (topic_id, probability)
        Topic distribution of one document, as produced by
        ``get_document_topics``.

    Returns
    -------
    The topic_id whose probability is highest (on ties, the one that comes
    last in ``topic_prob``). The input list is left unmodified.
    """
    return sorted(topic_prob, key=lambda pair: pair[1])[-1][0]


# bag_of_words is in the same row order as the headlines, so its first len(df)
# entries are exactly the sampled rows. Iterating over the whole corpus would
# yield one topic per headline in the full dataset and no longer line up with df.
topics = []
for bow in bag_of_words[:len(df)]:
    topic_prob = lda_topic_modeling_bow.get_document_topics(
        bow,
        minimum_probability=None,
        minimum_phi_value=None,
        per_word_topics=False,
    )

    # finding the topic with maximum probability
    topics.append(max_prob(topic_prob))

In [246]:
# adding the topic as a column
# assign() returns a new frame instead of writing into a slice, and
# errors='ignore' lets the drop succeed when publish_date has already been
# removed by the earlier column-selection cell (otherwise a KeyError is raised).
df = df.assign(Topic=topics).drop(columns=['publish_date'], errors='ignore')
df.head()

Unnamed: 0,headline_text,Topic
0,aba decides against community broadcasting lic...,8
1,act fire witnesses must be aware of defamation,9
2,a g calls for infrastructure protection summit,8
3,air nz staff in aust strike for pay rise,1
4,air nz strike to affect australian travellers,6


In [254]:
# Count how many headlines in df were assigned to each topic id.
df['Topic'].value_counts()

6    8
1    5
4    5
8    4
9    4
3    4
0    4
5    2
7    2
2    2
Name: Topic, dtype: int64

In [248]:
# Write the current df to Topic_modeling.csv in the working directory.
df.to_csv('Topic_modeling.csv')