In [None]:
import os
import re

import gensim
import nltk
import numpy as np
import pandas as pd
import spacy
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text
from sklearn.model_selection import GridSearchCV

In [None]:
# Location of the NPR articles CSV. Set the NPR_CSV_PATH environment variable
# to point at your own copy; the fallback is the path this notebook was
# originally authored with.
NPR_CSV_PATH = os.environ.get('NPR_CSV_PATH', '/home/maximus1/Downloads/npr.csv')
data = pd.read_csv(NPR_CSV_PATH)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Clean the raw article text.
# Missing articles become empty strings so that re.sub never receives a
# float NaN, and so the resulting list stays row-aligned with `data`.
raw_articles = data['Article'].fillna('').astype(str).tolist()
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space.
list_data = [re.sub(r'\s+', ' ', article) for article in raw_articles]
# Show the first *cleaned* article (the DataFrame itself is untouched by the
# cleaning above, so printing it would not reflect the result).
print(list_data[:1])

In [None]:
# tokenising each sentence into words after ignoring puncts
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences`` into a list of words.

    Every item is coerced with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; per the gensim
    documentation that flag strips accent marks, while the tokeniser itself
    lowercases the text and discards punctuation.

    Yields
    ------
    list of str
        The tokens of one input item, in input order.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

# Materialise the generator: one token list per cleaned article.
words_list = [tokens for tokens in sent_to_words(list_data)]

In [None]:
def lemmatise(texts, allowed_tags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of iterable of str
        One token sequence per document; the tokens are re-joined with
        spaces before being handed to the module-level ``nlp`` callable.
    allowed_tags : collection of str
        Tags compared against ``token.pos_``; tokens whose tag is not in
        this collection are dropped.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.

    Notes
    -----
    Relies on a global ``nlp`` being defined before this function is called.
    """
    out = []
    for tokens in texts:
        doc = nlp(" ".join(tokens))
        # Drop the '-PRON-' placeholder lemma entirely (older spaCy releases
        # emit it for pronouns) instead of substituting an empty string,
        # which would leave stray double spaces in the joined output.
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_tags and token.lemma_ != '-PRON-'
        ]
        out.append(" ".join(lemmas))
    return out

In [None]:
# Load the small English spaCy pipeline with the 'parser' and 'ner'
# components disabled.
spacy_pipes_to_skip = ['parser', 'ner']
nlp = spacy.load('en_core_web_sm', disable=spacy_pipes_to_skip)

In [None]:
# Lemmatise every document, keeping nouns and verbs only.
pos_whitelist = ['NOUN', 'VERB']
lemma_words = lemmatise(words_list, allowed_tags=pos_whitelist)

In [None]:
# Bag-of-words counts over the lemmatised documents.
# Float values for min_df / max_df are document-frequency proportions (per
# the scikit-learn docs): terms in fewer than 10% or more than 90% of the
# documents are discarded, along with the built-in English stop words.
count_vectorizer = text.CountVectorizer(
    stop_words='english',
    min_df=0.1,
    max_df=0.9,
)
count_vector_matrix = count_vectorizer.fit_transform(lemma_words)

In [None]:
# Baseline topic model: 20 topics, online variational Bayes.
lda_settings = dict(
    n_components=20,
    learning_method='online',
    max_iter=20,
    batch_size=128,
    evaluate_every=-1,
    n_jobs=-1,
    random_state=54,
)
LDA = LatentDirichletAllocation(**lda_settings)

# NOTE: fit() returns the fitted estimator, so `lda_output` refers to the
# same object as `LDA` (it is not a document-topic matrix).
lda_output = LDA.fit(count_vector_matrix)

In [None]:
# Score the baseline model on the same matrix it was fitted on.
baseline_log_likelihood = LDA.score(count_vector_matrix)
print(f"Log Likelihood score :: {baseline_log_likelihood}")

# Perplexity of the baseline model on the same matrix.
baseline_perplexity = LDA.perplexity(count_vector_matrix)
print(f"Perplexity :: {baseline_perplexity}")

In [None]:
# Hyper-parameter search with GridSearchCV over the number of topics and the
# online learning decay.

# Base estimator whose parameters the search will vary.
LDA = LatentDirichletAllocation(
    learning_method='online',
    max_iter=10,
    random_state=42,
)

# NOTE(review): the scikit-learn docs recommend learning_decay in (0.5, 1.0]
# for asymptotic convergence -- confirm that 0.1 and 0.4 are intentional.
params_dict = {
    'n_components': [10, 20],
    'learning_decay': [0.1, 0.4, 0.9],
}

# Exhaustive search over every combination in params_dict.
model = GridSearchCV(estimator=LDA, param_grid=params_dict)

model.fit(count_vector_matrix)

In [None]:
# Report the winner of the grid search.
best_model = model.best_estimator_

# Parameter combination selected by the search.
chosen_params = model.best_params_
print("Model's Params:", chosen_params)

# Best score recorded by the search.
best_cv_score = model.best_score_
print(f"Best Log Likelihood score :: {best_cv_score}")

# Perplexity of the selected model on the full count matrix.
best_perplexity = best_model.perplexity(count_vector_matrix)
print(f"Perplexity {best_perplexity}")

In [None]:
# method to predict top n keywords for each topic

def show_topics(vectorizer=None, lda_model=None, n_words=20):
    """Return the ``n_words`` highest-weighted vocabulary terms of each topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer, optional
        Must expose ``get_feature_names_out()`` (or the legacy
        ``get_feature_names()``). Defaults to the global ``count_vectorizer``.
    lda_model : fitted topic model, optional
        Must expose ``components_`` with one row of term weights per topic.
        Defaults to the global ``best_model``.
    n_words : int
        Number of top terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from highest to lowest weight.
    """
    # Resolve the defaults at call time rather than definition time, so that
    # re-running the vectorizer / grid-search cells is picked up without
    # having to re-run this definition as well.
    if vectorizer is None:
        vectorizer = count_vectorizer
    if lda_model is None:
        lda_model = best_model

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.asarray(vectorizer.get_feature_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so that argsort's ascending order puts the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [None]:
# Top 20 terms for every topic of the selected model.
topic_keywords = show_topics(
    vectorizer=count_vectorizer,
    lda_model=best_model,
    n_words=20,
)

In [None]:
# Tabulate the topic keywords: one row per topic, one column per rank.
df_topic_keywords = pd.DataFrame(topic_keywords)
topic_count, word_count = df_topic_keywords.shape
df_topic_keywords.columns = [f'Word {rank}' for rank in range(word_count)]
df_topic_keywords.index = [f'Topic {topic}' for topic in range(topic_count)]
df_topic_keywords