## Load Data

In [4]:
import pandas as pd

data = pd.read_csv("data.csv")
data = data["content"].drop_duplicates().dropna()[:37000]
print(data.info())

<class 'pandas.core.series.Series'>
Int64Index: 37000 entries, 0 to 37044
Series name: content
Non-Null Count  Dtype 
--------------  ----- 
37000 non-null  object
dtypes: object(1)
memory usage: 578.1+ KB
None


## Removing Stopwords

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('words')

stop_words = set(nltk.corpus.stopwords.words('english'))
corp = set(nltk.corpus.words.words())

# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([word for word in textArr if word not in stop_words and word in corp])
    return rem_text

# remove stopwords from the text
data=data.apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /home/tawfik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/tawfik/nltk_data...
[nltk_data]   Package words is already up-to-date!


## Lemmatization

In [6]:
import spacy
from gensim import corpora

# nlp = en_core_web_sm.load()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']): 
       output = []
       for sent in texts:
             doc = nlp(sent) 
             output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output
tokenized_reviews = lemmatization(data.tolist())

# print number of tokenization
from importlib.util import find_spec as isModule
if(isModule('humanize') != None):
      from humanize import intword
      print(intword(sum(len(x) for x in tokenized_reviews)), " Tokenizations")
else:
      print(sum(len(x) for x in tokenized_reviews), " Tokenizations")

dictionary = corpora.Dictionary(tokenized_reviews)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_reviews]

3.8 million  Tokenizations


## Model Training

In [7]:
import gensim
# Creating the object for LDA model using gensim library
Lda = gensim.models.LdaMulticore

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(corpus=doc_term_matrix, id2word=dictionary, num_topics=25, passes=10)
print("-"*10, "\n", "Topics\n", ldamodel.print_topics(num_words=10))

---------- 
 Topics
 [(12, '0.029*"company" + 0.015*"business" + 0.012*"border" + 0.011*"new" + 0.009*"mexican" + 0.008*"executive" + 0.008*"financial" + 0.007*"chief" + 0.007*"food" + 0.007*"money"'), (19, '0.036*"immigration" + 0.024*"illegal" + 0.020*"order" + 0.018*"executive" + 0.015*"government" + 0.013*"country" + 0.013*"border" + 0.012*"administration" + 0.012*"ban" + 0.011*"new"'), (5, '0.024*"white" + 0.019*"people" + 0.017*"black" + 0.013*"man" + 0.010*"gay" + 0.008*"social" + 0.007*"actor" + 0.007*"time" + 0.006*"film" + 0.005*"woman"'), (24, '0.053*"police" + 0.034*"gun" + 0.026*"people" + 0.014*"law" + 0.012*"violence" + 0.010*"enforcement" + 0.010*"city" + 0.009*"fire" + 0.009*"man" + 0.008*"protest"'), (11, '0.042*"percent" + 0.015*"poll" + 0.013*"game" + 0.013*"last" + 0.013*"first" + 0.012*"team" + 0.010*"second" + 0.009*"good" + 0.009*"time" + 0.008*"final"'), (10, '0.017*"political" + 0.014*"man" + 0.011*"government" + 0.010*"many" + 0.009*"people" + 0.007*"power" +

## Evaluation

In [8]:
import pyLDAvis
import pyLDAvis.gensim_models
%matplotlib inline

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [9]:
print('\nPerplexity: ', ldamodel.log_perplexity(doc_term_matrix,total_docs=len(data.index)))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=ldamodel, texts=tokenized_reviews, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Perplexity:  -7.699343065161069
