# Topic Modelling

In [11]:
# Import the necessary packages

import pandas as pd
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
import pickle
import operator

[nltk_data] Downloading package punkt to C:\Users\LAPTOP
[nltk_data]     ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Input the data: the source file is tab-separated.
df = pd.read_csv("Required Data.csv", sep='\t')

# Tokenize the data: every processed_text value is coerced to str and split
# into a list of word tokens, one list per row.
df['tokenized_data'] = df['processed_text'].map(lambda text: nltk.word_tokenize(str(text)))

In [3]:
# Preview the frame with the newly added tokenized_data column.
df

Unnamed: 0,id,processed_text,tokenized_data
0,1242602093501800448,yeah missing freedom life covid19,"[yeah, missing, freedom, life, covid19]"
1,1242602237571919872,contribute cm relief fund help delhi govt figh...,"[contribute, cm, relief, fund, help, delhi, go..."
2,1242602411962912769,bhai assalamualaikum possible please call bhai...,"[bhai, assalamualaikum, possible, please, call..."
3,1242602425023787008,bold adress nation activity banned except esse...,"[bold, adress, nation, activity, banned, excep..."
4,1242602501284585472,please understand important stay home responsi...,"[please, understand, important, stay, home, re..."
...,...,...,...
582685,1272317773230886917,URL,[URL]
582686,1272317779409076226,covid,[covid]
582687,1272317806269390849,italy face two new coronavirus outbreak,"[italy, face, two, new, coronavirus, outbreak]"
582688,1272317871318634497,india become top none modi reign india became ...,"[india, become, top, none, modi, reign, india,..."


In [4]:
# Remove covid related stopwords from processed data

# Dataset-specific terms plus NLTK's English stop-word list. Matching below is
# exact and case-sensitive, so 'USER_MENTION' and 'URL' only match in upper case.
stop_words = ['corona','coronavirus','covid-19','covid_19','covid19','covid','pandemic','lockdown','USER_MENTION','URL'] + stopwords.words('english')

# Testing membership against the list scans it once per token; a set removes
# exactly the same tokens with constant-time lookups.
stop_word_set = set(stop_words)
df['data'] = df['tokenized_data'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

# Preview the result; rows made up only of stop words end up with an empty list.
df

Unnamed: 0,id,processed_text,tokenized_data,data
0,1242602093501800448,yeah missing freedom life covid19,"[yeah, missing, freedom, life, covid19]","[yeah, missing, freedom, life]"
1,1242602237571919872,contribute cm relief fund help delhi govt figh...,"[contribute, cm, relief, fund, help, delhi, go...","[contribute, cm, relief, fund, help, delhi, go..."
2,1242602411962912769,bhai assalamualaikum possible please call bhai...,"[bhai, assalamualaikum, possible, please, call...","[bhai, assalamualaikum, possible, please, call..."
3,1242602425023787008,bold adress nation activity banned except esse...,"[bold, adress, nation, activity, banned, excep...","[bold, adress, nation, activity, banned, excep..."
4,1242602501284585472,please understand important stay home responsi...,"[please, understand, important, stay, home, re...","[please, understand, important, stay, home, re..."
...,...,...,...,...
582685,1272317773230886917,URL,[URL],[]
582686,1272317779409076226,covid,[covid],[]
582687,1272317806269390849,italy face two new coronavirus outbreak,"[italy, face, two, new, coronavirus, outbreak]","[italy, face, two, new, outbreak]"
582688,1272317871318634497,india become top none modi reign india became ...,"[india, become, top, none, modi, reign, india,...","[india, become, top, none, modi, reign, india,..."


In [5]:
# Create bigrams for the data

# Fit the phrase model on the stop-word-filtered token lists, then wrap it in a
# Phraser that is applied to each document.
bigram = gensim.models.Phrases(df['data'])
bigram_model = gensim.models.phrases.Phraser(bigram)

# Apply the phraser document by document; detected pairs come back joined with
# an underscore (e.g. "stay_home" in the preview that follows).
df['data_bigrams'] = df['data'].map(lambda tokens: bigram_model[tokens])

In [6]:
# Preview the frame with the newly added data_bigrams column.
df

Unnamed: 0,id,processed_text,tokenized_data,data,data_bigrams
0,1242602093501800448,yeah missing freedom life covid19,"[yeah, missing, freedom, life, covid19]","[yeah, missing, freedom, life]","[yeah, missing, freedom, life]"
1,1242602237571919872,contribute cm relief fund help delhi govt figh...,"[contribute, cm, relief, fund, help, delhi, go...","[contribute, cm, relief, fund, help, delhi, go...","[contribute, cm_relief, fund, help, delhi, gov..."
2,1242602411962912769,bhai assalamualaikum possible please call bhai...,"[bhai, assalamualaikum, possible, please, call...","[bhai, assalamualaikum, possible, please, call...","[bhai, assalamualaikum, possible, please, call..."
3,1242602425023787008,bold adress nation activity banned except esse...,"[bold, adress, nation, activity, banned, excep...","[bold, adress, nation, activity, banned, excep...","[bold, adress, nation, activity, banned, excep..."
4,1242602501284585472,please understand important stay home responsi...,"[please, understand, important, stay, home, re...","[please, understand, important, stay, home, re...","[please, understand, important, stay_home, res..."
...,...,...,...,...,...
582685,1272317773230886917,URL,[URL],[],[]
582686,1272317779409076226,covid,[covid],[],[]
582687,1272317806269390849,italy face two new coronavirus outbreak,"[italy, face, two, new, coronavirus, outbreak]","[italy, face, two, new, outbreak]","[italy, face, two, new, outbreak]"
582688,1272317871318634497,india become top none modi reign india became ...,"[india, become, top, none, modi, reign, india,...","[india, become, top, none, modi, reign, india,...","[india, become, top, none, modi, reign, india,..."


In [7]:
# Create Dictionary of the data: maps every distinct token to an integer id.
data_dict = corpora.Dictionary(df['data_bigrams'])

# Create bag of words: one list of (token id, count) pairs per document.
corpus = list(map(data_dict.doc2bow, df['data_bigrams']))


In [8]:
# Sample of data in first row: translate the (token id, count) pairs of the
# first document back into readable (token, count) pairs.
[[(data_dict[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('freedom', 1), ('life', 1), ('missing', 1), ('yeah', 1)]]

# Building the topic model

In [9]:
# Build LDA model

# Hyper-parameters gathered in one place so they are easy to read and tune.
lda_params = dict(
    corpus=corpus,
    id2word=data_dict,
    num_topics=10,
    random_state=100,      # fixed seed
    update_every=0,
    chunksize=5000,
    passes=5,
    alpha='auto',
    per_word_topics=True,  # the dominant-topic cell indexes row[0] of each result
)

lda_model = gensim.models.ldamodel.LdaModel(**lda_params)


In [10]:
# List the fitted topics as (topic id, 'weight*"word" + ...') pairs.
lda_model.print_topics()

[(0,
  '0.009*"one" + 0.009*"people" + 0.008*"time" + 0.007*"please" + 0.005*"go" + 0.005*"u" + 0.005*"need" + 0.004*"also" + 0.004*"due" + 0.004*"govt"'),
 (1,
  '0.015*"people" + 0.011*"india" + 0.006*"virus" + 0.006*"one" + 0.006*"govt" + 0.005*"please" + 0.005*"day" + 0.004*"state" + 0.004*"country" + 0.004*"time"'),
 (2,
  '0.006*"u" + 0.006*"due" + 0.006*"fight" + 0.005*"time" + 0.005*"india" + 0.005*"people" + 0.004*"today" + 0.004*"crisis" + 0.004*"need" + 0.004*"life"'),
 (3,
  '0.019*"case" + 0.011*"people" + 0.008*"india" + 0.006*"virus" + 0.006*"today" + 0.005*"government" + 0.004*"u" + 0.004*"death" + 0.004*"new_case" + 0.003*"delhi"'),
 (4,
  '0.011*"time" + 0.008*"day" + 0.008*"india" + 0.007*"virus" + 0.004*"case" + 0.004*"world" + 0.004*"u" + 0.004*"country" + 0.003*"good" + 0.003*"thank"'),
 (5,
  '0.009*"people" + 0.008*"u" + 0.008*"please" + 0.007*"sir" + 0.006*"time" + 0.005*"government" + 0.005*"fight" + 0.004*"app" + 0.004*"one" + 0.004*"due"'),
 (6,
  '0.016*"u"

In [13]:
# Save the model

# The context manager guarantees the file is flushed and closed once the dump
# finishes (or fails); a bare open() left the handle open.
with open("LDA Model", 'wb') as model_file:
    pickle.dump(lda_model, model_file)

## Finding dominant topics

In [49]:
topic_data = []
topic_perc_contrib = []

# Label (comma-joined top words) of each topic, built the first time the topic
# is seen. There are only a few topics, so caching avoids calling show_topic
# and re-joining the words once per document.
topic_labels = {}

for row in lda_model[corpus]:
    # row[0] is the document's list of (topic id, probability) pairs;
    # keep the pair with the highest probability.
    dominant_topic, contribution = max(row[0], key=lambda pair: pair[1])
    if dominant_topic not in topic_labels:
        topic_labels[dominant_topic] = ", ".join(
            word for word, _ in lda_model.show_topic(dominant_topic)
        )
    topic_data.append(topic_labels[dominant_topic])
    topic_perc_contrib.append(contribution)

# NOTE(review): documents whose token list is empty still receive a topic here
# (the preview rows with an empty data_bigrams list show a contribution of
# 0.122207). Consider excluding them before counting dominant topics.
df['topic_data'] = topic_data
df['topic_perc_contrib'] = topic_perc_contrib

In [50]:
# Preview the frame with the newly added topic_data and topic_perc_contrib columns.
df

Unnamed: 0,id,processed_text,tokenized_data,data,data_bigrams,topic_data,topic_perc_contrib
0,1242602093501800448,yeah missing freedom life covid19,"[yeah, missing, freedom, life, covid19]","[yeah, missing, freedom, life]","[yeah, missing, freedom, life]","people, india, virus, one, govt, please, day, ...",0.983685
1,1242602237571919872,contribute cm relief fund help delhi govt figh...,"[contribute, cm, relief, fund, help, delhi, go...","[contribute, cm, relief, fund, help, delhi, go...","[contribute, cm_relief, fund, help, delhi, gov...","govt, home, people, one, fight, india, work, c...",0.995637
2,1242602411962912769,bhai assalamualaikum possible please call bhai...,"[bhai, assalamualaikum, possible, please, call...","[bhai, assalamualaikum, possible, please, call...","[bhai, assalamualaikum, possible, please, call...","u, day, may, india, due, get, people, n, sir, ...",0.561373
3,1242602425023787008,bold adress nation activity banned except esse...,"[bold, adress, nation, activity, banned, excep...","[bold, adress, nation, activity, banned, excep...","[bold, adress, nation, activity, banned, excep...","u, day, may, india, due, get, people, n, sir, ...",0.588830
4,1242602501284585472,please understand important stay home responsi...,"[please, understand, important, stay, home, re...","[please, understand, important, stay, home, re...","[please, understand, important, stay_home, res...","people, u, please, sir, time, government, figh...",0.991580
...,...,...,...,...,...,...,...
582685,1272317773230886917,URL,[URL],[],[],"u, day, may, india, due, get, people, n, sir, ...",0.122207
582686,1272317779409076226,covid,[covid],[],[],"u, day, may, india, due, get, people, n, sir, ...",0.122207
582687,1272317806269390849,italy face two new coronavirus outbreak,"[italy, face, two, new, coronavirus, outbreak]","[italy, face, two, new, outbreak]","[italy, face, two, new, outbreak]","case, people, india, virus, today, government,...",0.986889
582688,1272317871318634497,india become top none modi reign india became ...,"[india, become, top, none, modi, reign, india,...","[india, become, top, none, modi, reign, india,...","[india, become, top, none, modi, reign, india,...","india, people, patient, like, day, sir, delhi,...",0.992623


Here you can see that most of the topics are directly related to the data.

We can also see what percentage of each piece of data is related to its topic.

A value closer to 1 shows that the data is directly related to the topic.

A value closer to 0 shows that the data is vague and only slightly resembles the topic.

Here, a topic is shown as a set of words that can directly describe the data.

For example, the topic "people, india, virus, one, govt, please, day, ..." shows the problems of the people in India who are affected by the lockdown, while the topic "people, u, please, sir, time, government, figh..." shows the steps that people should follow to take care of themselves during the lockdown phase.

In [84]:
# Number of documents assigned to each topic label, most frequent first.
topic_numbers = df['topic_data'].value_counts()

# Share of all documents per topic, rounded to four decimals.
topic_perc_docs = topic_numbers.div(topic_numbers.sum()).round(4)

# Assemble the summary table directly with its final column names:
# one row per topic, numbered from 0.
df_dominant_topics = pd.DataFrame({
    'Topics': topic_numbers.index,
    'Num_of_Documents': topic_numbers.to_numpy(),
    'Perc_of_Documents': topic_perc_docs.to_numpy(),
})

In [85]:
# Show the per-topic document counts and shares.
df_dominant_topics

Unnamed: 0,Topics,Num_of_Documents,Perc_of_Documents
0,"u, day, may, india, due, get, people, n, sir, ...",71683,0.123
1,"people, india, virus, one, govt, please, day, ...",70827,0.1216
2,"case, people, india, virus, today, government,...",65366,0.1122
3,"india, people, patient, like, day, sir, delhi,...",64399,0.1105
4,"one, people, time, please, go, u, need, also, ...",60083,0.1031
5,"people, u, please, sir, time, government, figh...",58618,0.1006
6,"u, due, fight, time, india, people, today, cri...",52680,0.0904
7,"govt, home, people, one, fight, india, work, c...",48213,0.0827
8,"time, day, india, virus, case, world, u, count...",46346,0.0795
9,"help, u, india, virus, people, fight, sir, man...",44475,0.0763


In [87]:
# Remove unnecessary data before saving

# errors='ignore' makes the cell safe to re-run: once the intermediate columns
# are gone, a second run is a no-op instead of raising KeyError.
df = df.drop(columns=['tokenized_data', 'data', 'data_bigrams'], errors='ignore')

# Fixed seed so the preview shows the same five rows on every run.
df.sample(5, random_state=100)

Unnamed: 0,id,processed_text,topic_data,topic_perc_contrib
195060,1249751158328983557,here indian state kerala covid19 hot spot flat...,"help, u, india, virus, people, fight, sir, man...",0.986262
44201,1243942410796687361,take part latest discussion way help contain c...,"one, people, time, please, go, u, need, also, ...",0.994751
316810,1255763476213088257,day mobile discnnctd network really hard pay h...,"time, day, india, virus, case, world, u, count...",0.995904
544705,1271484435356155904,youre looking big picture poverty dwarf police...,"one, people, time, please, go, u, need, also, ...",0.997036
503255,1269174747838984192,weekendreading lot people lucky enough family ...,"one, people, time, please, go, u, need, also, ...",0.993759


In [88]:
#Save the data

# Both frames are written as tab-separated UTF-8 files without the index column.
for frame, path in (
    (df, "Topic Modelled Data.csv"),
    (df_dominant_topics, "Dominant Topics.csv"),
):
    frame.to_csv(path, sep='\t', encoding='utf-8', index=False)