# Topic Modelling

## Dependencies

In [21]:
import pandas as pd
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models

from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer

import warnings, re
warnings.simplefilter('ignore')
from itertools import chain



stop_words = set(stopwords.words('english'))


import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Patil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Patil\AppData\Roaming\nltk_data...


In [3]:
data = pd.read_excel('metoo_tweets_dec2017.xlsx')
data.head()

Unnamed: 0,column_a,text,favorited,favoritecount,replytosn,created,truncated,replytosid,id,replytouid,statussource,screenname,retweetcount,isretweet,retweeted,longitude,latitude,location
0,1,American Harem.. #MeToo https://t.co/HjExLJdGuF,0.0,0,,2017-11-29T23:59:00,0.0,,9.36e+17,,"<a href=""http://instagram.com"" rel=""nofollow"">...",ahmediaTV,0.0,0.0,0.0,,,
1,2,@johnconyersjr @alfranken why have you guys ...,0.0,0,johnconyersjr,2017-11-29T23:59:00,0.0,,9.36e+17,266149840.0,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",JesusPrepper74,0.0,0.0,0.0,,,
2,3,Watched Megan Kelly ask Joe Keery this A.M. if...,0.0,0,,2017-11-29T23:59:00,1.0,,9.36e+17,,"<a href=""http://twitter.com/download/android"" ...",DemerisePotvin,0.0,0.0,0.0,,,
3,4,Women have been talking about this crap the en...,0.0,0,,2017-11-29T23:59:00,0.0,,9.36e+17,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",TheDawnStott,0.0,0.0,0.0,,,
4,5,.@BetteMidler please speak to this sexual assa...,0.0,15,,2017-11-29T23:59:00,0.0,,9.36e+17,,"<a href=""http://twitter.com/#!/download/ipad"" ...",scottygirl2014,11.0,0.0,0.0,,,


In [49]:


# Removing the false and nanvalues
filtered_data = data[data["text"].apply(lambda x: type(x) == str)]

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'RT[\s]+', '', text)
    text = text.lower()
    text = re.sub("#[A-Za-z0-9_]+","", text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text =  ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word.lower() not in stop_words])
    # text = ' '.join([lemma.lemmatize(word) for word in text.split()])
    return text.split()

filtered_data['clean_text'] = filtered_data['text'].apply(clean_text)

# filtered_data.head()

In [51]:
df = filtered_data[0:1000]
# create dictionary'
dictionary = corpora.Dictionary(df['clean_text'])
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print(dictionary.num_nnz)


# Create document term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['clean_text'] ]
print(len(doc_term_matrix))

# Instantiate LDA model
lda = gensim.models.ldamodel.LdaModel

num_topics=3
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

# Print the topics identified by LDA model

ldamodel.print_topics(num_topics=num_topics)


# Visualize the LDA model results


lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)





7868
1000
CPU times: total: 8.02 s
Wall time: 16.9 s


In [52]:
# Find which articles were marked in which cluster

# Assigns the topics to the documents in corpus
lda_corpus = ldamodel[doc_term_matrix]

# [doc for doc in lda_corpus]



# cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
# cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
# cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
# # cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
# # cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]

# print(len(cluster1))
# print(len(cluster2))
# print(len(cluster3))
# # print(len(cluster4))
# # print(len(cluster5))

# df.iloc[cluster1]




In [53]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)

0.3333333341293037


In [54]:
cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
# cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
# cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]

print(len(cluster1))
print(len(cluster2))
print(len(cluster3))

379
297
386


In [55]:
filtered_data.iloc[cluster1]

Unnamed: 0,column_a,text,favorited,favoritecount,replytosn,created,truncated,replytosid,id,replytouid,statussource,screenname,retweetcount,isretweet,retweeted,longitude,latitude,location,clean_text
4,5,.@BetteMidler please speak to this sexual assa...,0.0,15,,2017-11-29T23:59:00,0.0,,9.360000e+17,,"<a href=""http://twitter.com/#!/download/ipad"" ...",scottygirl2014,11.0,0.0,0.0,,,,"[please, speak, sexual, assault, interview]"
6,7,Jay-Z is saying what I've been saying. DJT's j...,0.0,3,,2017-11-29T23:59:00,1.0,,9.360000e+17,,"<a href=""http://twitter.com/download/android"" ...",silveriaalison,3.0,0.0,0.0,,,,"[jayz, saying, ive, saying, djts, jobis, done,..."
7,8,Where in the world is @MattLauer Celebrate #MeToo,0.0,0,,2017-11-29T23:59:00,0.0,,9.360000e+17,,"<a href=""http://twitter.com/download/iphone"" r...",calrican,0.0,0.0,0.0,,,,"[world, celebrate]"
12,13,@RepKathleenRice @RepJayapal Calling out @RepJ...,0.0,0,RepKathleenRice,2017-11-29T23:58:00,1.0,9.360000e+17,9.360000e+17,2970462034,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",_standeliver,0.0,0.0,0.0,,,,"[calling, right, arc, historyx000dx000dyou, kn..."
14,15,Why are not the men haters from #Metoo marchin...,0.0,0,,2017-11-29T23:58:00,0.0,,9.360000e+17,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",european_4,0.0,0.0,0.0,,,,"[men, hater, marching, congress, ask, x000dare..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,1013,"@MargaretTrucks @nbc Yes, because they're all ...",0.0,1,MargaretTrucks,2017-11-29T22:03:00,1.0,9.360000e+17,9.360000e+17,518248535,"<a href=""https://mobile.twitter.com"" rel=""nofo...",torqueflite,2.0,0.0,0.0,,,,"[yes, theyre, powerful, controlling, money, al..."
1038,1016,Disappointed by today's news._x000D__x000D_#Re...,0.0,1,,2017-11-29T22:02:00,0.0,,9.360000e+17,,"<a href=""http://twitter.com/download/android"" ...",TracyWashington,0.0,0.0,0.0,,,,"[disappointed, today, newsx000dx000d]"
1041,1019,Another one. There will be more. #MattLauer #m...,0.0,1,,2017-11-29T22:02:00,0.0,,9.360000e+17,,"<a href=""http://twitter.com/download/android"" ...",FredForTrump,1.0,0.0,0.0,,,,"[another, one]"
1042,1020,@StlGal_36 @JenDeerinwater Matt Lauer. And ano...,0.0,0,NolanHack,2017-11-29T22:02:00,1.0,9.360000e+17,9.360000e+17,323020327,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",NolanHack,2.0,0.0,0.0,,,,"[36, matt, lauer, another, one, bite, dust, pl..."
