## Load Data

In [None]:
import pandas as pd

data = pd.read_csv("data.csv")
data = data["content"].drop_duplicates().dropna()[:37000]
print(data['content'].info())

## Removing Stopwords

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


stop_words = stopwords.words('english')

# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([i for i in textArr if i not in stop_words])
    return rem_text

# remove stopwords from the text
data=data.apply(remove_stopwords)

## Lemmatization

In [29]:
import spacy
from gensim import corpora

# nlp = en_core_web_sm.load()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']): 
       output = []
       for sent in texts:
             doc = nlp(sent) 
             output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output
from humanize import intword
tokenized_reviews = lemmatization(data.tolist())
print(intword(sum(len(x) for x in tokenized_reviews)), " Tokenizations")

dictionary = corpora.Dictionary(tokenized_reviews)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_reviews]

<class 'pandas.core.series.Series'>
RangeIndex: 50000 entries, 0 to 49999
Series name: content
Non-Null Count  Dtype 
--------------  ----- 
50000 non-null  object
dtypes: object(1)
memory usage: 390.8+ KB
None


## Model Training

In [None]:
import gensim
# Creating the object for LDA model using gensim library
Lda = gensim.models.LdaMulticore

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(corpus=doc_term_matrix, id2word=dictionary, num_topics=25, passes=10)
print("-"*10, "\n", "Topics\n", ldamodel.print_topics(num_words=10))

## Evaluation

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models
%matplotlib inline

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
print('\nPerplexity: ', ldamodel.log_perplexity(doc_term_matrix,total_docs=len(data.index)))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=ldamodel, texts=tokenized_reviews, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -11.239377666616827

Coherence Score:  0.5377841348657011
