In [1]:
import pandas as pd
import numpy as np

import re
import string

import spacy

import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw reviews. A forward slash is used as the separator because
# "\R" is an invalid string escape (it warns on recent Python versions) and a
# backslash is only a path separator on Windows.
review_data = pd.read_csv("Dataset/Reviews.csv")
review_data.head(2)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [3]:
def clean_text(text):
    """Normalise a review string for topic modelling.

    Every character in ``string.punctuation`` is removed, the result is split
    on whitespace, tokens that are purely numeric or have three characters or
    fewer are discarded, and the surviving tokens are re-joined with single
    spaces and lower-cased.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lower-cased text (possibly empty).
    """
    # Translation table that deletes all ASCII punctuation characters.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = text.translate(strip_punctuation).split()

    kept = []
    for token in tokens:
        # Skip pure numbers and very short words.
        if token.isdigit() or len(token) <= 3:
            continue
        kept.append(token)

    return ' '.join(kept).lower()

In [4]:
# Drop rows with any missing value, then normalise the review text.
review_data.dropna(axis = 0, how ='any',inplace=True)
review_data['Text'] = review_data['Text'].apply(clean_text)
# Number of whitespace-separated words left in each cleaned review.
review_data['Num_words_text'] = review_data['Text'].apply(lambda x:len(str(x).split()))

print('-------Dataset --------')
print(review_data['Score'].value_counts())
print(len(review_data))
print('-------------------------')
max_review_data_sentence_length  = review_data['Num_words_text'].max()

# Keep only reviews with at least 20 and fewer than 100 words.
mask = (review_data['Num_words_text'] < 100) & (review_data['Num_words_text'] >=20)
df_short_reviews = review_data[mask]

# Sample at most MAX_REVIEWS_PER_SCORE reviews per score. The request is capped
# at the group size because DataFrame.sample raises ValueError when n exceeds
# the number of rows and replace=False (the default).
MAX_REVIEWS_PER_SCORE = 20000
df_sampled = (
    df_short_reviews
    .groupby('Score')
    .apply(lambda x: x.sample(n=min(len(x), MAX_REVIEWS_PER_SCORE)))
    .reset_index(drop = True)
)

print('No of Short reviews')
print(len(df_short_reviews))


-------Dataset --------
Score
5    339
4     70
3     37
1     36
2     18
Name: count, dtype: int64
500
-------------------------
No of Short reviews
325


***
Let us pre-process the data
***

In [5]:
from Method import preprocess_text

# Iterate over the column's values instead of looking rows up by label with
# review_data.Text[i] for i in range(n): after dropna() the index can have
# gaps, so label lookups may raise KeyError and would skip rows whose label
# is >= n. Iterating preserves row order and visits every remaining review.
tokenized_reviews = [preprocess_text(text) for text in review_data['Text']]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Build the token <-> integer-id vocabulary from the tokenized reviews.
dictionary = corpora.Dictionary(tokenized_reviews)
# Convert every tokenized review to its bag-of-words representation.
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_reviews))

In [7]:
# Alias for gensim's LSI model class, used here as the LSA model.
LSA = gensim.models.LsiModel
# Fit a 10-topic LSA model on the bag-of-words corpus.
lsa_model = LSA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=10,
    chunksize=1000,
)

In [8]:
# Show each topic as a signed, weighted combination of its terms.
lsa_model.print_topics()

[(0,
  '0.552*"chip" + 0.277*"flavor" + 0.227*"like" + 0.181*"brand" + 0.178*"food" + 0.174*"taste" + 0.164*"good" + 0.161*"kettle" + 0.151*"also" + 0.141*"great"'),
 (1,
  '-0.569*"chip" + 0.364*"food" + 0.204*"like" + -0.179*"kettle" + 0.150*"good" + 0.148*"sugar" + 0.125*"taste" + 0.125*"product" + 0.124*"coffee" + -0.120*"potato"'),
 (2,
  '0.648*"food" + 0.238*"cat" + -0.188*"taste" + 0.183*"change" + -0.182*"sugar" + -0.155*"coffee" + -0.127*"make" + 0.125*"chip" + -0.122*"drink" + -0.121*"good"')]

In [9]:
# Perplexity check left disabled by the original author:
# print('\nPerplexity: ', lsa_model.log_perplexity(doc_term_matrix,total_docs=10000))  # a measure of how good the model is. the lower the better.

# Score the LSA model with the c_v coherence measure.
from gensim.models.coherencemodel import CoherenceModel

coherence_model_lsa = CoherenceModel(
    model=lsa_model,
    texts=tokenized_reviews,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lsa = coherence_model_lsa.get_coherence()
print('\nCoherence Score: ', coherence_lsa)


Coherence Score:  0.3829488269098329


***
Method to find the optimal number of topics
***

In [10]:
def compute_coherence_values(dictionary, corpus, texts, end, start=2, step=3, random_state=None):
    """
    Train one LDA model per candidate topic count and score it with c_v coherence.

    The candidate topic counts are ``range(start, end, step)``, so ``end`` is
    exclusive. If that range is empty, two empty lists are returned and no
    model is trained.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus (bag-of-words documents)
    texts : List of tokenized input texts
    end : Upper bound (exclusive) on the number of topics
    start : Smallest number of topics to try (default 2)
    step : Increment between successive topic counts (default 3)
    random_state : Optional seed forwarded to LdaModel so runs are reproducible;
        None (the default) leaves the seeding to gensim

    Returns:
    -------
    model_list : List of LDA topic models, one per topic count
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, end, step):
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )
        model_list.append(model)
        # CoherenceModel is expected to be imported in an earlier cell.
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [11]:
# Sweep num_topics over range(2, 50): one LDA model is trained and scored for
# each of the 48 values, so this cell can take a long time to finish.
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=tokenized_reviews, start=2, end=50, step=1)

KeyboardInterrupt: 

In [None]:
# Show graph
# These bounds must match the ones passed to compute_coherence_values so that
# x lines up one-to-one with coherence_values.
end=50; start=2; step=1
# The original referenced an undefined name `limit` here; the bound is `end`.
x = range(start, end, step)
# Label the line directly: passing the bare string ("coherence_values") to
# plt.legend makes matplotlib treat it as a sequence of one-character labels.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# List the coherence score obtained for every candidate topic count.
for n_topics, score in zip(x, coherence_values):
    rounded_score = round(score, 4)
    print("Num Topics =", n_topics, " has Coherence Value of", rounded_score)

In [None]:
# Select the model and print the topics
# Pick the model with the highest coherence score rather than a hard-coded
# position: a fixed index is unrelated to the scores and raises IndexError
# whenever the sweep yields fewer models than expected.
best_index = max(range(len(coherence_values)), key=coherence_values.__getitem__)
optimal_model = model_list[best_index]
model_topics = optimal_model.show_topics(formatted=False)
optimal_model.print_topics(num_words=10)