In [16]:
# dataset Location
# https://snap.stanford.edu/data/web-Movies.html

In [17]:
import numpy as np
import pandas as pd
import seaborn as sb
from gensim import corpora, models
import nltk
import json
from glob import glob
import networkx as nx
from gensim import corpora, models
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
import string

In [18]:
# File Location
path = 'movies-by-year/movies-1999.json'
files = []

In [19]:
df = pd.read_json(path)

In [20]:
for each in glob('movies-by-year/*.json'):
    files.append(each)

In [21]:
files[:4]

['movies-by-year/movies-1999.json',
 'movies-by-year/movies-2006.json',
 'movies-by-year/movies-2012.json',
 'movies-by-year/movies-2000.json']

## Concatenate Multiple Movie Files into DataFrame

In [22]:
# for each in files[:4]: # I am getting errors on my machine trying to concatenate many more files than this
#     if each != path:
#         temp_df = pd.read_json(each)
#         df = pd.concat([temp_df, df])

In [23]:
len(df['userId'])

78977

In [24]:
G = nx.from_pandas_edgelist(df, "userId", "productId")

In [25]:
# nx.draw(G) # ran for more than 8 hrs

## Prep for Sentiment Analysis

In [26]:
df.head()

Unnamed: 0,productId,userId,profileName,helpfulness,score,time,summary,text,year,month,day
539,0790747324,A1Y6LC4JFBCGZN,Michael Butkus,8/8,4,924480000000,Time-less classic sci-fi that bears up to repe...,Although quite different in tone and content f...,1999,4,19
1652,B00004CQTP,A2QE8OSHVY9EJ5,"George A. Hinkson ""spidude""",0/0,5,926553600000,Killer!,I would have to disagree that this is better t...,1999,5,13
1702,6305508569,AK2AQIULQDFS5,Bradley Tobin,6/7,5,945043200000,Every self-respecting action fan MUST own it.,"action, Action, ACTION, and... Yes, thats righ...",1999,12,13
2243,6304286961,A34KBX6VF28QYN,Joseph Jordan,5/9,3,940464000000,Stone almost ruins Kilmer's finest hour,Jim Morrison was a self-indulgent hack of a so...,1999,10,21
2287,6304286961,A1O40PJC4U0J4Q,J7173@Hotmail.com,0/2,2,917568000000,"It was OK, but...",I have read about a dozen books on The Doors. ...,1999,1,29


In [27]:
sws = set(stopwords.words('english'))
exclude = set(string.punctuation)
df['sent_token'] = df.apply(lambda row: [ch for ch in word_tokenize(row['text'].lower()) if ch not in exclude], axis=1)
df['sent_no_stop'] = df['sent_token'].apply(lambda x: ' '.join([word for word in x if word not in (sws) and len(word) > 2 and (not word.startswith("http"))]))

## Sentiment Analysis

In [28]:
analyzer = SentimentIntensityAnalyzer()
df['compound'] = [analyzer.polarity_scores(x)['compound'] for x in df['sent_no_stop']]
df['neg'] = [analyzer.polarity_scores(x)['neg'] for x in df['sent_no_stop']]
df['neu'] = [analyzer.polarity_scores(x)['neu'] for x in df['sent_no_stop']]
df['pos'] = [analyzer.polarity_scores(x)['pos'] for x in df['sent_no_stop']]

## Visuals for Sentiment Analysis Results

In [29]:
pos_hist = px.box(df, x='pos').show()
neg_hist = px.box(df, x='neg').show()
neu_hist = px.box(df, x='neu').show()
compound_hist = px.box(df, x='compound', title="Entire dataset compound score boxplot").show()

## Prep for LDA

In [30]:
df['lda_token'] = df.apply(lambda row: ''.join([ch for ch in ' '.join(word_tokenize(row['text'].lower())) if ch not in exclude]).split(), axis=1)
df['lda_no_stop'] = df['lda_token'].apply(lambda x: [word for word in x if word not in (sws) and len(word) > 2 and (not word.startswith("http"))])
full_sws_removed = df['lda_no_stop'].values
full_dictionary = corpora.Dictionary(full_sws_removed)
full_dictionary.filter_extremes(no_below=5, no_above=0.3)
full_dictionary.compactify()
full_corpus = [full_dictionary.doc2bow(text) for text in full_sws_removed]
df['full_corpus'] = df.apply(lambda row: full_dictionary.doc2bow(row['lda_no_stop']), axis=1)

## LDA

In [31]:
full_ldamodel = models.ldamodel.LdaModel(full_corpus, num_topics=3, id2word=full_dictionary, passes=20)

In [32]:
full_ldamodel.print_topics(num_topics=5, num_words=5)

[(0,
  '0.009*"great" + 0.009*"like" + 0.008*"see" + 0.007*"good" + 0.007*"time"'),
 (1,
  '0.016*"dvd" + 0.008*"version" + 0.008*"great" + 0.006*"like" + 0.006*"good"'),
 (2,
  '0.004*"story" + 0.003*"man" + 0.003*"films" + 0.003*"best" + 0.003*"like"')]

In [33]:
for i in range(0, 5, 1):  
    full_topics = full_ldamodel.get_document_topics(full_corpus[i])
    full_topics = sorted(full_topics, key=lambda x: -x[1])
    print("{}: {}\n".format(full_topics, " ".join(full_sws_removed[i])))

[(2, 0.65082526), (1, 0.3443094)]: although quite different tone content original novel wells george pal oscar winner special effects time machine convincingly conveys wonder mystery time travel rod taylor highly believable intense portrayal time traveler major strength film even though special effects quot dated quot reveal times quot effects quot low budget manage hold repeated viewings much like pal war worlds victorian atmosphere time machine vivid strangeness world 802701ad spite genre film manages effectively portray need preserving humanness throughout time obvious adherence hollywood comic book rescueadventure romance given poetic license film still unique entertaining worth one film library

[(1, 0.58984673), (0, 0.38298538), (2, 0.027167903)]: would disagree better original saying bad would blasphemy orginal still remains favorite

[(2, 0.37361822), (1, 0.3700569), (0, 0.2563249)]: action action action yes thats right hardhitting action arnie really knows audience wants revie

In [34]:
df['lda_topic'] = df.apply(lambda row: sorted(full_ldamodel.get_document_topics(row['full_corpus']), key=lambda x: -x[1])[0][0], axis=1)

## Visuals looking at LDA and Sentiment Analysis

In [35]:
comp_hist_0 = px.box(df[df['lda_topic']==0], x='compound', title="Topic 0 compound score boxplot").show()
comp_hist_1 = px.box(df[df['lda_topic']==1], x='compound', title="Topic 1 compound score boxplot").show()
comp_hist_2 = px.box(df[df['lda_topic']==2], x='compound', title="Topic 2 compound score boxplot").show()