In [60]:
import pandas as pd

# Reading Data

In [64]:
# Load the raw tweets. Malformed CSV rows are skipped rather than aborting the
# read. `on_bad_lines='skip'` is the replacement for `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv('Demo.csv', on_bad_lines='skip', encoding='unicode_escape')

# Keep only the tweet text. The explicit .copy() makes `data_text` an
# independent frame, so adding a column below does not write into a slice of
# `data` (which raises SettingWithCopyWarning).
data_text = data[['tweet']].copy()

# Store the row label as an ordinary column so a tweet can be looked up by it.
data_text['index'] = data_text.index
documents = data_text

In [65]:
# Number of tweets loaded (row count of the frame).
documents.shape[0]

4006

In [66]:
# Peek at the first five rows.
documents.head(5)

Unnamed: 0,tweet,index
0,RT @rssurjewala: Critical question: Was PayTM ...,0
1,RT @Hemant_80: Did you vote on #Demonetization...,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",2
3,RT @ANI_news: Gurugram (Haryana): Post office ...,3
4,RT @satishacharya: Reddy Wedding! @mail_today ...,4


In [67]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [68]:
import nltk

# Defining stemmer

In [69]:
# English Snowball stemmer; the preprocessing helpers defined later in the
# notebook read this module-level name.
stemmer = SnowballStemmer('english')

# A small set of inflected words used to illustrate what stemming does.
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]

# Stem every word, then show each original next to its stem.
singles = list(map(stemmer.stem, original_words))
pd.DataFrame(data={'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


# Preprocessing

In [70]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the resulting lemma.

    The lemma comes from NLTK's ``WordNetLemmatizer`` with ``pos='v'``; the
    stem is produced by the module-level ``stemmer``.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens worth keeping.

    Tokens are produced by gensim's ``simple_preprocess``. A token is kept
    only when it is longer than three characters and is not one of gensim's
    ``STOPWORDS``; each kept token is passed through ``lemmatize_stemming``.

    Returns a list of strings in the order the tokens appeared.
    """
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in gensim.parsing.preprocessing.STOPWORDS
    ]

# Previewing the preprocessing function

In [71]:
# Pick one tweet to illustrate the preprocessing. The 'tweet' column is
# selected by name, so the lookup does not depend on the order of the
# frame's columns.
doc_sample = documents.loc[documents['index'] == 4000, 'tweet'].iloc[0]

print('original document: ')
# str.split already returns the list of words; no manual accumulation needed.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['RT', '@ModiBharosa:', 'Putting', 'Nation', 'over', 'Party', 'Politics', '#nitishkumar', 'supports', 'PM', '@narendramodi', 'on', '#Demonetization', 'https://t.co/UodwXdPMmG']


 tokenized and lemmatized document: 
['modibharosa', 'put', 'nation', 'parti', 'polit', 'nitishkumar', 'support', 'narendramodi', 'demonet', 'https', 'uodwxdpmmg']


# Applying preprocessor on tweets

In [73]:
# Run the preprocessor over every tweet. Values are cast to str first, so
# every entry handed to preprocess() is a string.
processed_docs = (
    documents['tweet']
    .astype(str)
    .map(preprocess)
)

In [74]:
import gensim
from gensim import corpora, models

# Creating the doc2bow dictionary for words in the dataset and applying the LDA model from gensim

In [77]:
# Map every distinct token to an integer id, then encode each tweet as a
# bag-of-words via doc2bow.
wordDict = corpora.Dictionary(processed_docs)
docTermMatrix = [wordDict.doc2bow(doc) for doc in processed_docs]

# LDA training is stochastic. Fixing random_state makes the learned topics
# reproducible across kernel restarts; without it, every run yields different
# topics and any saved output goes stale.
NUM_TOPICS = 10
RANDOM_SEED = 42

Lda = gensim.models.ldamodel.LdaModel
model = Lda(
    docTermMatrix,
    num_topics=NUM_TOPICS,
    id2word=wordDict,
    passes=100,
    random_state=RANDOM_SEED,
)
print(model.print_topics(num_topics=NUM_TOPICS, num_words=10))

[(0, '0.112*"demonet" + 0.106*"anilkoh" + 0.105*"parti" + 0.102*"suffer" + 0.101*"loss" + 0.013*"https" + 0.010*"corrupt" + 0.009*"read" + 0.009*"hilari" + 0.009*"video"'), (1, '0.090*"demonet" + 0.082*"reaction" + 0.036*"https" + 0.025*"call" + 0.024*"benefit" + 0.023*"join" + 0.021*"spread" + 0.021*"walk" + 0.021*"nationalist" + 0.021*"joydeep_"'), (2, '0.118*"bank" + 0.117*"demonet" + 0.115*"lakh" + 0.110*"terrorist" + 0.110*"kishtwar" + 0.110*"loot" + 0.110*"incid" + 0.110*"gauravcsaw" + 0.003*"minist" + 0.002*"prime"'), (3, '0.083*"demonet" + 0.066*"https" + 0.020*"cash" + 0.017*"modi" + 0.017*"peopl" + 0.015*"take" + 0.013*"time" + 0.012*"poor" + 0.011*"watch" + 0.011*"support"'), (4, '0.085*"demonet" + 0.082*"https" + 0.081*"support" + 0.079*"parti" + 0.077*"narendramodi" + 0.077*"polit" + 0.076*"nation" + 0.075*"nitishkumar" + 0.074*"modibharosa" + 0.074*"uodwxdpmmg"'), (5, '0.083*"demonet" + 0.082*"clear" + 0.079*"critic" + 0.078*"paytm" + 0.075*"requir" + 0.075*"question" + 0

In [76]:
# Show the ten highest-weighted words of topic 3.
# NOTE(review): this cell's execution count (76) is lower than the training
# cell's (77), so the saved output was produced by an earlier model — re-run
# after training to refresh it.
model.print_topic(3, topn=10)

'0.080*"demonet" + 0.051*"say" + 0.034*"explain" + 0.031*"tweet" + 0.029*"drgpradhan" + 0.029*"prob" + 0.029*"have" + 0.028*"minimathur" + 0.028*"bibi" + 0.027*"money"'