# Imports

In [1]:
#Basic libraries
import numpy as np
import pandas as pd
import gzip
import json
import sklearn

# NLP / text-processing libraries (NLTK, regex, word clouds, sentiment, gensim)
import nltk
import re
import string
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.stem.porter import PorterStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem.porter import *
from gensim import corpora, models

#Visualization libraries
import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import iplot
%matplotlib inline

#Metrics libraries
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Others
from collections import defaultdict
from collections import Counter

In [2]:
# Fetch the NLTK resources used by this notebook: the stop-word lists and the
# 'punkt' tokenizer data (presumably what nltk.word_tokenize relies on later).
# The final expression keeps the cell displaying the last download's result.
for nltk_resource in ('stopwords', 'punkt'):
    download_ok = nltk.download(nltk_resource)
download_ok

[nltk_data] Downloading package stopwords to /home/divyaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/divyaj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Relative path of the directory that holds the project's CSV files; later
# cells prepend it to a file name when reading data.
csv_path = '../Data/csv_files/'

In [5]:
# Load the pre-cleaned reviews table and replace every missing value with an
# empty string, so text columns can be processed without NaN checks.
# NOTE(review): csv_path already ends with '/', so the joined path contains a
# doubled separator ('//'); harmless on common filesystems but worth tidying.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index written into the CSV - confirm whether it should be an index.
cleaned_reviews_file = csv_path + '/cleaned_reviews.csv'
reviews_df = pd.read_csv(cleaned_reviews_file).fillna('')
reviews_df.head()

Unnamed: 0,reviewerID,asin,reviewText,summary,helpful,overall,reviewText_cleaned
0,ADZPIG9QOCDG5,5019281,This is a charming version of the classic Dick...,good version of a classic,"[0, 0]",4.0,charming version classic dickens tale henry wi...
1,A35947ZP82G7JH,5019281,It was good but not as emotionally moving as t...,Good but not as moving,"[0, 0]",3.0,good but not emotionally moving christmas caro...
2,A3UORV8A9D5L2E,5019281,"Don't get me wrong, Winkler is a wonderful cha...",Winkler's Performance was ok at best!,"[0, 0]",3.0,dont get wrong winkler wonderful character act...
3,A1VKW06X1O2X7V,5019281,Henry Winkler is very good in this twist on th...,It's an enjoyable twist on the classic story,"[0, 0]",5.0,henry winkler good twist classic story not con...
4,A3R27T4HADWFFJ,5019281,This is one of the best Scrooge movies out. H...,Best Scrooge yet,"[0, 0]",4.0,one best scrooge movies henry winkler outdoes ...


In [None]:
# Topic modeling - run only once
#
# Trains an LDA topic model on the stemmed, cleaned review text and saves it
# to '../Models/lda.model'. Training is expensive, so this cell is meant to be
# executed a single time; afterwards the saved model can simply be loaded.

# Training hyper-parameters, named here so they are easy to find and tune.
LDA_NUM_TOPICS = 10
LDA_PASSES = 20
# LDA training is stochastic: without a fixed seed every run yields different
# topics, so the saved model (and any topic listing derived from it) could not
# be reproduced.
LDA_RANDOM_STATE = 42

ps = PorterStemmer()

# One list of Porter stems per review: lower-case, tokenize, then stem.
sws_punc_stem = [
    [ps.stem(word) for word in nltk.word_tokenize(text.lower())]
    for text in reviews_df['reviewText_cleaned']
]

# Build the vocabulary, then drop stems that occur in fewer than 5 reviews or
# in more than 30% of all reviews, and re-number the remaining ids.
dictionary = corpora.Dictionary(sws_punc_stem)
dictionary.filter_extremes(no_below=5, no_above=0.3)
dictionary.compactify()

# Bag-of-words representation of every review over the filtered vocabulary.
corpus = [dictionary.doc2bow(text) for text in sws_punc_stem]

ldamodel_final = models.ldamodel.LdaModel(
    corpus,
    num_topics=LDA_NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_RANDOM_STATE,
)
ldamodel_final.save('../Models/lda.model')

In [7]:
# Reload the LDA model from disk (the same path the training cell saves to),
# so the topics can be inspected without retraining.
saved_lda_path = '../Models/lda.model'
ldamodel = models.LdaModel.load(saved_lda_path)

In [12]:
# Display 10 topics with the 10 highest-weighted terms of each, as
# (topic_id, 'weight*"term" + ...') pairs. Terms show up as stems
# (e.g. "famili", "comedi") - presumably because the model was trained on
# Porter-stemmed tokens.
ldamodel.print_topics(num_topics=10, num_words=10)

[(0,
  '0.010*"famili" + 0.010*"get" + 0.009*"life" + 0.007*"find" + 0.007*"young" + 0.007*"girl" + 0.007*"man" + 0.007*"take" + 0.006*"live" + 0.006*"friend"'),
 (1,
  '0.027*"match" + 0.019*"vs" + 0.010*"move" + 0.008*"back" + 0.008*"get" + 0.008*"work" + 0.008*"use" + 0.007*"team" + 0.006*"show" + 0.006*"time"'),
 (2,
  '0.019*"music" + 0.017*"comedi" + 0.012*"song" + 0.011*"love" + 0.011*"funni" + 0.011*"perform" + 0.009*"play" + 0.007*"show" + 0.007*"laugh" + 0.006*"sing"'),
 (3,
  '0.035*"watch" + 0.034*"love" + 0.033*"great" + 0.029*"seri" + 0.029*"season" + 0.028*"show" + 0.019*"episod" + 0.019*"enjoy" + 0.017*"stori" + 0.013*"see"'),
 (4,
  '0.016*"origin" + 0.016*"action" + 0.016*"horror" + 0.014*"effect" + 0.011*"anim" + 0.011*"special" + 0.009*"new" + 0.008*"fan" + 0.008*"stori" + 0.007*"monster"'),
 (5,
  '0.027*"dvd" + 0.018*"set" + 0.018*"releas" + 0.010*"bluray" + 0.009*"featur" + 0.009*"video" + 0.009*"version" + 0.009*"disc" + 0.009*"qualiti" + 0.008*"get"'),
 (6,
  '