## Load Data

In [15]:
import pandas as pd

# Upper bound on the number of unique, non-null articles kept for the analysis
MAX_DOCS = 37000

# Keep only the article text: drop repeated and missing entries, then take the
# first MAX_DOCS rows by position (.iloc makes the positional intent explicit)
data = pd.read_csv("data.csv")
data = data["content"].drop_duplicates().dropna().iloc[:MAX_DOCS]

# Series.info() prints its summary itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
data.info()
print(data.head())

<class 'pandas.core.series.Series'>
Int64Index: 37000 entries, 0 to 37044
Series name: content
Non-Null Count  Dtype 
--------------  ----- 
37000 non-null  object
dtypes: object(1)
memory usage: 578.1+ KB
None
0    WASHINGTON  —   Congressional Republicans have...
1    After the bullet shells get counted, the blood...
2    When Walt Disney’s “Bambi” opened in 1942, cri...
3    Death may be the great equalizer, but it isn’t...
4    SEOUL, South Korea  —   North Korea’s leader, ...
Name: content, dtype: object


## Removing Stopwords

In [16]:
import nltk
# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# Shared helpers used by remove_stopwords: a lookup set of English stopwords and
# a tokenizer that extracts runs of word characters
english_stopword_list = nltk.corpus.stopwords.words('english')
stop_words = set(english_stopword_list)
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
def remove_stopwords(text : str):
    """Return ``text`` with stopwords removed.

    The text is split with the module-level ``tokenizer``; every token whose
    lowercase form appears in ``stop_words`` is dropped. Surviving tokens keep
    their original casing and order and are joined with single spaces.
    """
    kept_tokens = []
    for token in tokenizer.tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Run the stopword filter over every article, replacing the raw text
data = data.apply(remove_stopwords)

# Preview the first few cleaned articles
cleaned_preview = data.head()
print(cleaned_preview)

[nltk_data] Downloading package stopwords to /home/tawfik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    WASHINGTON Congressional Republicans new fear ...
1    bullet shells get counted blood dries votive c...
2    Walt Disney Bambi opened 1942 critics praised ...
3    Death may great equalizer necessarily evenhand...
4    SEOUL South Korea North Korea leader Kim said ...
Name: content, dtype: object


## Lemmatization

In [17]:
import spacy
from gensim import corpora

# The lemmatization step only reads token.pos_ and token.lemma_, so the
# dependency parser and named-entity recognizer are switched off at load time
unused_components = ['parser', 'ner']
nlp = spacy.load('en_core_web_sm', disable=unused_components)

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatize documents, keeping only tokens with the requested POS tags.

    Args:
        texts: Iterable of document strings to process with the module-level
            spaCy pipeline ``nlp``.
        allowed_postags: Collection of coarse POS tags (compared against
            ``token.pos_``) to keep. Defaults to nouns and adjectives. The
            default is a tuple so it cannot be mutated between calls.

    Returns:
        A list with one entry per input document; each entry is the list of
        lemmas of the tokens whose POS tag is in ``allowed_postags``, in
        document order.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which spaCy
    # recommends over calling nlp() once per document on a large corpus
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(texts)
    ]
tokenized_reviews = lemmatization(data.tolist())

# Report the total number of lemmas kept across all documents (computed once)
total_tokens = sum(len(doc_tokens) for doc_tokens in tokenized_reviews)

# humanize is optional: use it for a readable figure when it is installed,
# otherwise fall back to the raw integer
from importlib.util import find_spec as isModule
if isModule('humanize') is not None:
    from humanize import intword
    token_count_label = intword(total_tokens)
else:
    token_count_label = str(total_tokens)

# A single f-string avoids the doubled space that print(value, " Tokenizations")
# produced (print's separator plus the literal's leading space)
print(f"{token_count_label} Tokenizations")

print(tokenized_reviews[:2])

In [None]:
# Build the vocabulary (token <-> integer id mapping) from the lemmatized documents
dictionary = corpora.Dictionary(tokenized_reviews)

# Encode every document as a bag-of-words list of (token_id, count) pairs
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_reviews))

# Preview the encoding of the first two documents
first_two_bows = doc_term_matrix[:2]
print(first_two_bows)

4.8 million  Tokenizations
[[(0, 1), (1, 9), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 4), (15, 6), (16, 3), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 5), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 11), (59, 1), (60, 1), (61, 1), (62, 1), (63, 5), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 3), (74, 1), (75, 1), (76, 2), (77, 1), (78, 1), (79, 1), (80, 2), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 2), (87, 2), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 2), (100, 2), (101, 1), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 2), 

## Model Training

In [None]:
import gensim
# Training hyperparameters, gathered in one place so they are easy to tune
NUM_TOPICS = 30
PASSES = 10
RANDOM_SEED = 42  # seeds the model's random state so reruns are comparable

# Alias for gensim's multi-core LDA implementation
Lda = gensim.models.LdaMulticore

# Train the LDA model on the document-term matrix
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=PASSES,
    random_state=RANDOM_SEED,
)

# print_topics() defaults to num_topics=20, which would silently hide 10 of the
# 30 trained topics; request all of them explicitly
print("-"*10, "\n", "Topics\n", ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=10))

KeyboardInterrupt: 

## Evaluation

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# log_perplexity() returns the per-word likelihood bound, not the perplexity.
# A higher (less negative) bound is better; gensim defines the perplexity as
# 2 ** (-bound), and for that derived value lower is better.
per_word_bound = ldamodel.log_perplexity(doc_term_matrix, total_docs=len(data.index))
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score (c_v) from the tokenized documents and the dictionary
coherence_model_lda = CoherenceModel(model=ldamodel, texts=tokenized_reviews, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

## Visualization

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models
%matplotlib inline

# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()

# Prepare the visualisation data from the trained model, corpus and dictionary;
# leaving `vis` as the cell's last expression displays it
vis = pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)
vis