In [None]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools
 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Colab-only step: make Google Drive available to the notebook.
from google.colab import drive 
# Mount point is /content/gdrive; the dataset is read from under it.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the scraped news articles from the mounted Google Drive.
# The path is anchored at the '/content/gdrive' mount point passed to drive.mount(...),
# so the load no longer depends on the kernel's current working directory.
df = pd.read_csv('/content/gdrive/My Drive/final_fixed_scrapped.csv')

In [None]:
# Show the loaded DataFrame to check its columns and contents.
df

Unnamed: 0.1,Unnamed: 0,Title,Article,totalwords
0,0,Rs 400-cr ‘bungling’ by Amar Singh: HC seeks E...,The Allahabad High Court has directed the Enfo...,341
1,1,J&K HC orders FIR against teacher who fails to...,The J&K High Court on Friday asked a teacher t...,335
2,2,Crime in Cyberia: an incomplete list of offenders,An unknown Indian hacker has been charged with...,382
3,3,450 cannabis plants found on govt land in Mundhwa,The interrogation of two marijuana peddlers ab...,372
4,4,"Nagaland: 9 bullet-riddled bodies found,Karbi-...",Nine highly decomposed bodies were found stack...,449
...,...,...,...,...
7838,7838,"13 dead, one million evacuated in China after ...",At least 18 people were killed due to a landsl...,397
7839,7839,"Book Karnataka, Maharashtra CMs for culpable h...",With the flood situation in the state continui...,500
7840,7840,Romanian researcher develops disc-shaped flyin...,"Razan Sabie, an aerodynamicist who operates fr...",346
7841,7841,Is asteroid Apophis capable of triggering a ts...,It was around a few months back that Iain McDo...,412


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Data Cleaning

In [None]:
def _clean_article(text):
    """Strip e-mail-like tokens, squeeze whitespace and drop single quotes from one article."""
    # Any token containing '@', plus at most one whitespace character after it
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Every whitespace run (newlines included) becomes a single space
    text = re.sub(r'\s+', ' ', text)
    # Single-quote characters are removed outright
    return re.sub(r"\'", "", text)


# Article bodies as a plain list of cleaned strings, in DataFrame row order
data = [_clean_article(article) for article in df.Article.values.tolist()]
pprint(data[:1])

['The Allahabad High Court has directed the Enforcement Directorate (ED) to '
 'file a status report, along with an affidavit, in connection with the '
 'investigation carried out in connection with the allegations of financial '
 'misappropriation to the tune over Rs 400 crore by former Samajwadi Party '
 'leader Amar Singh. The court has fixed October 8 for next hearing.A Division '
 'Bench of Chief Justice D Y Chandrachud and Justice Yashwant Varma passed the '
 'order on Thursday, while hearing a PIL filed by Ashok Kumar Singh, a '
 'resident of Kanpur, through his counsel Ashok Pandey.The petitioner, who was '
 'at one point district chief of SP in Kanpur, had pointed out that a '
 'three-judge bench had, in its order on August 1, 2013, directed further '
 'investigation by the ED in connection with the allegations against Amar '
 'Singh. It was alleged that as the head of the UP development council, he had '
 'misappropriated funds to the amount of Rs 400 crore. An FIR in this re

# Tokenization

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Args:
        sentences: iterable of documents; every item is passed through ``str()``
            before tokenizing.

    Yields:
        One token list per input document, in input order.

    Note:
        ``deacc=True`` asks gensim to strip accent marks from tokens; it is the
        tokenizer itself, not this flag, that discards punctuation.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_text in sentences:
        yield tokenize(str(raw_text), deacc=True)
# Materialise the generator so the token lists can be indexed and reused later
data_words = [*sent_to_words(data)]
# Peek at the first tokenized article
print(data_words[:1])

[['the', 'allahabad', 'high', 'court', 'has', 'directed', 'the', 'enforcement', 'directorate', 'ed', 'to', 'file', 'status', 'report', 'along', 'with', 'an', 'affidavit', 'in', 'connection', 'with', 'the', 'investigation', 'carried', 'out', 'in', 'connection', 'with', 'the', 'allegations', 'of', 'financial', 'to', 'the', 'tune', 'over', 'rs', 'crore', 'by', 'former', 'samajwadi', 'party', 'leader', 'amar', 'singh', 'the', 'court', 'has', 'fixed', 'october', 'for', 'next', 'hearing', 'division', 'bench', 'of', 'chief', 'justice', 'chandrachud', 'and', 'justice', 'yashwant', 'varma', 'passed', 'the', 'order', 'on', 'thursday', 'while', 'hearing', 'pil', 'filed', 'by', 'ashok', 'kumar', 'singh', 'resident', 'of', 'kanpur', 'through', 'his', 'counsel', 'ashok', 'pandey', 'the', 'petitioner', 'who', 'was', 'at', 'one', 'point', 'district', 'chief', 'of', 'sp', 'in', 'kanpur', 'had', 'pointed', 'out', 'that', 'three', 'judge', 'bench', 'had', 'in', 'its', 'order', 'on', 'august', 'directed',

# Lemmatization

In [None]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of documents, each an iterable of token strings.
        allowed_postags: coarse part-of-speech tags (compared against
            ``token.pos_``) whose tokens are kept. The default is a tuple so it
            cannot be mutated between calls.

    Returns:
        list[str]: one string per document, made of the kept lemmas joined by
        single spaces.

    Note:
        Uses the module-level ``nlp`` pipeline, which must be defined before
        this function is called.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        # Skip the '-PRON-' placeholder lemma entirely; emitting '' for it would
        # leave stray leading/double spaces in the joined string.
        kept_lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ != '-PRON-'
        ]
        texts_out.append(" ".join(kept_lemmas))
    return texts_out

In [None]:
# Load spaCy's small English pipeline with the parser and NER components disabled,
# since lemmatization below only needs part-of-speech tags and lemmas.
# The bare 'en' shortcut link is not available in spaCy 3.x; the full package name
# loads on both 2.x and 3.x.
# Install once from a terminal: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Lemmatize every tokenized article, keeping only nouns and verbs
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB'])
# Peek at the first two lemmatized articles
print(data_lemmatized[:2])

['court direct enforcement directorate file status report affidavit connection investigation carry connection allegation tune party leader court fix hearing division bench pass order hear file resident counsel pandey petitioner point district chief point judge order direct investigation ed connection allegation allege head misappropriate fund amount fir regard register station relate psr prayer file chargesheet report case judge rule report chargesheet ed take investigation conclusion trial can complete request court take cognizance report matter direct cbi investigate case say pandey petitioner make prayer seek probe report donation make singh violation rbi rule pass order court say stage court should apprise ed step take judgement bench court shall file affidavit explain position shall produce status report perusal court', 'court ask teacher write essay cow solve class iv math problem check credential teacher teacher fail court order fir register direct government screen degree teach

In [None]:
# Preview a handful of lemmatized documents instead of dumping the whole corpus
# into the cell output.
data_lemmatized[:5]

['court direct enforcement directorate file status report affidavit connection investigation carry connection allegation tune party leader court fix hearing division bench pass order hear file resident counsel pandey petitioner point district chief point judge order direct investigation ed connection allegation allege head misappropriate fund amount fir regard register station relate psr prayer file chargesheet report case judge rule report chargesheet ed take investigation conclusion trial can complete request court take cognizance report matter direct cbi investigate case say pandey petitioner make prayer seek probe report donation make singh violation rbi rule pass order court say stage court should apprise ed step take judgement bench court shall file affidavit explain position shall produce status report perusal court',
 'court ask teacher write essay cow solve class iv math problem check credential teacher teacher fail court order fir register direct government screen degree teac

In [None]:
# Number of lemmatized documents.
len(data_lemmatized)

7843

# Vectorization

In [None]:
# Bag-of-words settings:
#   min_df=10       -> drop terms that appear in fewer than 10 documents
#   stop_words      -> filter scikit-learn's built-in English stop-word list
#   lowercase=True  -> lowercase the text before tokenizing
#   token_pattern   -> a token is a run of at least 3 ASCII letters/digits
#   max_features is left unset, so the vocabulary size is not capped
vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern='[a-zA-Z0-9]{3,}',
    lowercase=True,
    stop_words='english',
    min_df=10,
)
# Learn the vocabulary and build the sparse document-term count matrix in one step
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [None]:
# Show the document-term matrix summary (shape, dtype, stored elements).
data_vectorized

<7843x6966 sparse matrix of type '<class 'numpy.int64'>'
	with 793296 stored elements in Compressed Sparse Row format>

In [None]:
## NOTE: TF-IDF vectorization still needs to be added

In [None]:
# Build the LDA model from a single settings mapping
lda_settings = {
    'n_components': 5,            # number of topics
    'max_iter': 50,               # max learning iterations
    'learning_method': 'online',
    'batch_size': 128,            # n docs in each learning iteration
    'evaluate_every': -1,         # perplexity is not evaluated during training
    'random_state': 100,          # fixed seed for repeatable topics
    'n_jobs': -1,                 # use all available CPUs
}
lda_model = LatentDirichletAllocation(**lda_settings)

# NOTE(review): fit() returns the estimator, so `lda_output` refers to the fitted
# model rather than a document-topic matrix — confirm fit_transform was not intended.
lda_output = lda_model.fit(data_vectorized)

print(lda_model)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=5, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [None]:
# Fit quality of `lda_model` on the matrix it was trained on.
# Log-likelihood: higher is better.
train_log_likelihood = lda_model.score(data_vectorized)
# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
train_perplexity = lda_model.perplexity(data_vectorized)

print("Log Likelihood: ", train_log_likelihood)
print("Perplexity: ", train_perplexity)
# Hyper-parameters the model was built with
pprint(lda_model.get_params())

Log Likelihood:  -9284239.649798643
Perplexity:  1455.7410546219894
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 50,
 'mean_change_tol': 0.001,
 'n_components': 5,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [None]:
# Candidate hyper-parameters: topic counts 10..20 crossed with three learning-decay rates
search_params = {
    'n_components': list(range(10, 21)),
    'learning_decay': [.5, .7, .9],
}
# Base estimator: a short (5-iteration) online LDA with a fixed seed, to keep the search affordable
lda = LatentDirichletAllocation(
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
# Cross-validated search over the grid; candidates are ranked by the estimator's own score()
model = GridSearchCV(estimator=lda, param_grid=search_params)
# Run the search (the fitted search object is displayed as the cell output)
model.fit(data_vectorized)


GridSearchCV(cv=None, error_score=nan,
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=50.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=5,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1, random_state=0,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
                               

In [None]:
# NOTE(review): this cell looks like a pasted GridSearchCV repr rather than working code.
# It builds a new, unfitted GridSearchCV whose result is not assigned to any name, so the
# `model` fitted in the grid-search cell above is unaffected by it.
# Its param_grid key is 'n_topics', whereas the search actually run uses 'n_components',
# and its estimator settings (max_iter=10, learning_offset=10.0, learning_method=None)
# differ from the `lda` estimator searched above — TODO confirm this cell can be removed.
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
       iid=True, n_jobs=1,
       param_grid={'n_topics': [10,11,12,13,14,15,16,17,18,19,20], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method=None,
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=1,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_

In [None]:
# NOTE(review): this repeats the earlier metrics cell and reports on `lda_model`,
# not on the grid-search object `model`. The recorded output below shows
# n_components=15 although the LDA cell sets n_components=5, so `lda_model` was
# presumably re-fitted with other settings before this ran — confirm on a fresh run.
# Log Likelihood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -9219437.608841859
Perplexity:  1383.5869842094062
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 50,
 'mean_change_tol': 0.001,
 'n_components': 15,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [None]:
# Estimator selected by the grid search
best_lda_model = model.best_estimator_
# Perplexity of the selected estimator on the full document-term matrix
best_perplexity = best_lda_model.perplexity(data_vectorized)

# Winning parameter combination, its cross-validated score, and the perplexity above
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
print("Model Perplexity: ", best_perplexity)

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -1981826.2301763124
Model Perplexity:  1398.5063156911185


In [None]:
# Per-document topic weights from the selected model (one column per topic)
mat = best_lda_model.transform(data_vectorized)
# Column-wise mean, then column-wise standard deviation, across all documents
for column_stat in (mat.mean(axis=0), mat.std(axis=0)):
    print(column_stat)

[0.04385157 0.19063828 0.0943938  0.0421854  0.12315375 0.00240472
 0.10817332 0.12763425 0.07806598 0.18949893]
[0.10875464 0.28483943 0.2022578  0.13902581 0.25135118 0.00819101
 0.19721116 0.22771614 0.16444467 0.25733357]


In [None]:

# Pull the grid-search scores apart by learning_decay value

def _scores_for_decay(cv_results, decay):
    """Rounded mean test scores of the grid points whose learning_decay equals `decay`, in grid order."""
    return [
        round(mean_score)
        for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params'])
        if params['learning_decay'] == decay
    ]


log_likelyhoods_5 = _scores_for_decay(model.cv_results_, 0.5)
log_likelyhoods_7 = _scores_for_decay(model.cv_results_, 0.7)
log_likelyhoods_9 = _scores_for_decay(model.cv_results_, 0.9)

In [None]:
# Rounded mean CV scores for learning_decay=0.5, one per n_components candidate
# (presumably in grid order 10..20 — confirm against model.cv_results_['params']).
log_likelyhoods_5

[-1988193,
 -1994435,
 -1989054,
 -1990827,
 -1994794,
 -1995250,
 -1994599,
 -1992023,
 -1995512,
 -1996729,
 -1997643]

In [None]:
# Rounded mean CV scores for learning_decay=0.7, one per n_components candidate
# (presumably in grid order 10..20 — confirm against model.cv_results_['params']).
log_likelyhoods_7

[-1981826,
 -1987823,
 -1986862,
 -1989505,
 -1985606,
 -1982252,
 -1991189,
 -1988074,
 -1991187,
 -1994575,
 -1997434]

In [None]:
# Rounded mean CV scores for learning_decay=0.9, one per n_components candidate
# (presumably in grid order 10..20 — confirm against model.cv_results_['params']).
log_likelyhoods_9

[-1985752,
 -1993055,
 -1994216,
 -1995243,
 -1999930,
 -2006778,
 -2012340,
 -2017014,
 -2022822,
 -2020671,
 -2024928]