## CA-2

In [69]:
# importing libraries
import pandas as pd
import time

In [70]:
# Helper functions

def time_elapsed_start():
    """Start a stopwatch and return its start reading.

    The reading comes from ``time.perf_counter()``, a monotonic clock, so it
    is only meaningful when handed to ``time_elapsed_stop``; it is not a
    wall-clock timestamp.
    """
    return time.perf_counter()

def time_elapsed_stop(start):
    """Print how many seconds have passed since ``start``.

    Parameters
    ----------
    start : float
        Reading previously returned by ``time_elapsed_start``.
    """
    # perf_counter never runs backwards; time.time() follows system clock
    # adjustments and can report negative or inflated durations.
    print(f"Execution took {time.perf_counter() - start} seconds")

In [71]:
# Load at most ROWS_TO_READ rows of the questions CSV into a DataFrame
ROWS_TO_READ = 200_000
df = pd.read_csv(
    'quora_questions.csv',
    nrows=ROWS_TO_READ,
)

In [72]:
# Preview five random rows; the seed keeps the preview identical across re-runs
df.sample(5, random_state=3)

Unnamed: 0,question
162665,What are some major events that happened in 1994?
184379,I am a fourth year student of BA LLB from Indi...
22851,How do I recover from failure?
150260,"Objectively speaking, what have been some of P..."
77232,How did the Commercial Revolution impact Europ...


In [73]:
# DataFrame.size is the total element count (rows x columns)
total_cells = df.size
print(f"Dataframe size: {total_cells}")

Dataframe size: 200000


### Pre-processing

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: English stop words removed, plus document-frequency
# bounds (max_df=0.90, min_df=2) to drop overly common and very rare terms.
count_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.90,
)

In [75]:
# Fit the vectorizer on the 'question' column and build the document-term
# matrix in one step; the helper pair prints how long the call took.
start = time_elapsed_start()
term_matrix = count_vectorizer.fit_transform(df['question'])
time_elapsed_stop(start)

Execution took 3.0968377590179443 seconds


In [76]:
# Show the sparse matrix summary (documents x vocabulary terms, stored elements)
term_matrix

<200000x27884 sparse matrix of type '<class 'numpy.int64'>'
	with 981746 stored elements in Compressed Sparse Row format>

The vectorizer has built a vocabulary of 27,884 words from 200,000 questions (rows).

# Applying LDA

In [77]:
from sklearn.decomposition import LatentDirichletAllocation

In [78]:
# Baseline LDA model: 13 topics, with a fixed seed for repeatable fits
lda = LatentDirichletAllocation(
    n_components=13,
    random_state=3,
)

In [28]:
# Fit the LDA model on the document-term matrix and report the elapsed time
start = time_elapsed_start()
lda.fit(term_matrix)
time_elapsed_stop(start)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=13, n_jobs=None,
                          perp_tol=0.1, random_state=1, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

Checking log likelihood, perplexity

In [31]:
# Evaluate the fitted model on the same matrix it was trained on; both calls
# are covered by a single timing measurement.
start = time_elapsed_start()
log_likelihood = lda.score(term_matrix)
perplexity = lda.perplexity(term_matrix)
time_elapsed_stop(start)

In [37]:
# Report the two evaluation metrics computed in the previous cell
print(f"Log likelihood: {log_likelihood}")
print(f"Perplexity: {perplexity}")

Log likelihood: -8516737.535031516
Perplexity: 4612.6439753800405


In [62]:
# Creating a metrics dataframe for comparison later.
# The hyperparameters are read from the current `lda` object instead of being
# retyped by hand, so the recorded row always describes that model.
lda_metrics_df = pd.DataFrame([{
    'log_likelihood': log_likelihood,
    'perplexity': perplexity,
    'n_components': lda.n_components,
    'learning_decay': lda.learning_decay,
}])

In [63]:
# Display the metrics table collected so far
lda_metrics_df

Unnamed: 0,log_likelihood,perplexity
0,-8507502.0,4570.6395


## Applying GridSearch for finding out best parameters for LDA

In [43]:
# importing GridSearchCV from sklearn.model_selection
from sklearn.model_selection import GridSearchCV

In [44]:
# Hyperparameter grid for the LDA search: every combination of topic count
# and learning decay listed here is a candidate.
params = {
    'n_components': [5, 7, 9, 10, 12, 14],
    'learning_decay': [0.3, 0.5, 0.7, 0.9],
}

In [46]:
# Grid search over `params` for an LDA estimator.
# The base estimator is seeded (same seed as the other LDA models in this
# notebook) so the search is reproducible, and it is passed inline so that
# `eval_lda` only ever names the GridSearchCV object.
eval_lda = GridSearchCV(
    LatentDirichletAllocation(random_state=3),
    param_grid=params,
)

In [None]:
# Fitting the grid search on the document-term matrix.
# This searches the candidate settings in `params` for the best-scoring one.
# Note: this process consumes a significant amount of time and resources,
# so the elapsed time is printed when it finishes.
start = time_elapsed_start()
eval_lda.fit(term_matrix)
time_elapsed_stop(start)

In [None]:
# Best model: the estimator from the fitted grid search (`eval_lda`) that
# achieved the highest score
best_lda_model = eval_lda.best_estimator_

# Parameter setting that produced the best score
print("Best Model's Params: ", eval_lda.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", eval_lda.best_score_)

# Perplexity of the best model on the document-term matrix built above
print("Model Perplexity: ", best_lda_model.perplexity(term_matrix))

In [48]:
# Second LDA model: 10 topics, same seed as the baseline model
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=3,
)

In [49]:
# Fit the 10-topic model on the document-term matrix
# (a stray `print` token after the call made this cell a syntax error)
lda.fit(term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=3, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [51]:
# Evaluate the refitted model on the training matrix; both calls are covered
# by a single timing measurement.
start = time_elapsed_start()
log_likelihood = lda.score(term_matrix)
perplexity = lda.perplexity(term_matrix)
time_elapsed_stop(start)

In [52]:
# Report the two evaluation metrics computed in the previous cell
print(f"Log likelihood: {log_likelihood}")
print(f"Perplexity: {perplexity}")

Log likelihood: -8507502.497878691
Perplexity: 4570.639499601198


In [67]:
# storing metrics in metrics dataframe
# Build the new row as its own one-row frame and prepend it with concat, so
# the newest result sits at index 0 without the loc[-1] / index-shift / sort
# sequence. Hyperparameters are read from the current `lda` object rather
# than retyped by hand.
latest_metrics = pd.DataFrame(
    [[log_likelihood, perplexity, lda.n_components, lda.learning_decay]],
    columns=['log_likelihood', 'perplexity', 'n_components', 'learning_decay'],
)
lda_metrics_df = pd.concat([latest_metrics, lda_metrics_df], ignore_index=True)

In [68]:
# Display the metrics table with the newly added row
lda_metrics_df

Unnamed: 0,log_likelihood,perplexity
0,-8507502.0,4570.6395
1,,
2,-8507502.0,4570.6395


## Exploring topics and their words

We have 27,884 words in the vocabulary.

In [None]:
# Shape of the fitted topic-word matrix
lda.components_.shape

10 topics with 27,884 words