# Import necessary dependencies

In [1]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import warnings
import nltk

# Install a blanket "ignore" filter so warnings raised later in the session
# do not clutter the cell outputs.
warnings.filterwarnings(action="ignore")

# Load and normalize data

In [3]:
# Number of leading rows used for training; every remaining row is held out
# for testing. Defined once so the review and sentiment slices cannot drift
# apart.
train_size = 35000

# Stop words that must NOT be stripped during normalization.
# NOTE(review): presumably kept because they carry negation/contrast that
# matters for sentiment -- confirm with the author.
retained_stop_words = {'no', 'but', 'not'}

dataset = pd.read_csv(r'movie_reviews.csv')

# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets (positional split in file order, no shuffling)
train_reviews, test_reviews = reviews[:train_size], reviews[train_size:]
train_sentiments, test_sentiments = sentiments[:train_size], sentiments[train_size:]

# normalize datasets
# Filter into a new list instead of calling list.remove() repeatedly:
# remove() raises ValueError when a word is missing from the stop word list.
stop_words = [word for word in nltk.corpus.stopwords.words('english')
              if word not in retained_stop_words]

norm_train_reviews = tn.normalize_corpus(train_reviews, stopwords=stop_words)
norm_test_reviews = tn.normalize_corpus(test_reviews, stopwords=stop_words)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


# Extract features from positive and negative reviews

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Both vectorizers are built from this one settings dict so the positive and
# negative feature spaces are constructed with identical parameters.
tfidf_settings = dict(use_idf=True, min_df=0.02, max_df=0.75,
                      ngram_range=(1, 2), sublinear_tf=True)

# consolidate all normalized reviews (train followed by test, matching the
# order of `sentiments`) and pair each one with its label
norm_reviews = norm_train_reviews + norm_test_reviews
labelled_reviews = list(zip(norm_reviews, sentiments))

# split the corpus by sentiment label
positive_reviews = [text for text, label in labelled_reviews if label == 'positive']
negative_reviews = [text for text, label in labelled_reviews if label == 'negative']

# get tf-idf features for only positive reviews
ptvf = TfidfVectorizer(**tfidf_settings)
ptvf_features = ptvf.fit_transform(positive_reviews)

# get tf-idf features for only negative reviews
ntvf = TfidfVectorizer(**tfidf_settings)
ntvf_features = ntvf.fit_transform(negative_reviews)

# view feature set dimensions
print(ptvf_features.shape, ntvf_features.shape)

(25000, 933) (25000, 925)


# Topic Modeling on Reviews

In [5]:
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import topic_model_utils as tmu

# number of topics requested from each NMF model built below
total_topics = 10
# let pyLDAvis visualizations display inside the notebook
pyLDAvis.enable_notebook()

## Display and visualize topics for positive reviews

In [6]:
# build topic model on positive sentiment review features
pos_nmf = NMF(n_components=total_topics, solver='cd', max_iter=500,
              random_state=42, alpha=.1, l1_ratio=.85)
pos_nmf.fit(ptvf_features)
# extract features and component weights
# (feature names are wrapped in an array so they can be indexed with the
# integer index rows produced by argsort below)
pos_feature_names = np.array(ptvf.get_feature_names())
pos_weights = pos_nmf.components_
# negate the weights so argsort ranks the heaviest terms first, then keep the
# 15 strongest terms of every topic row
feature_idxs = np.argsort(-pos_weights)[:, :15]
topics = [pos_feature_names[idx] for idx in feature_idxs]
# display topics and their components, numbering topics from 1
for topic_no, topic in enumerate(topics, start=1):
    print('Topic #{}:'.format(topic_no))
    print(', '.join(topic))
    print()

Topic #1:
but, one, make, no, take, way, even, get, seem, like, much, scene, may, character, go

Topic #2:
movie, watch, see, like, think, really, good, but, see movie, great, movie not, would, get, enjoy, say

Topic #3:
show, episode, series, tv, season, watch, dvd, television, first, good, see, would, air, great, remember

Topic #4:
family, old, young, year, life, child, father, mother, son, year old, man, friend, kid, boy, girl

Topic #5:
performance, role, actor, play, great, cast, good, well, excellent, character, story, star, also, give, acting

Topic #6:
film, see, see film, film not, watch, good film, watch film, dvd, great film, film but, film see, release, year, film make, great

Topic #7:
love, love movie, story, love story, fall love, fall, beautiful, song, wonderful, music, heart, romantic, romance, favorite, character

Topic #8:
funny, laugh, hilarious, joke, humor, moment, fun, guy, get, but, line, show, lot, time, scene

Topic #9:
ever, ever see, movie ever, one good, o

In [7]:
# Prepare the pyLDAvis view of the positive-review topic model and leave it as
# the cell's last expression so the notebook displays it.
pos_topic_vis = pyLDAvis.sklearn.prepare(pos_nmf, ptvf_features, ptvf, mds='mmds')
pos_topic_vis

## Display and visualize topics for negative reviews

In [8]:
# build topic model on negative sentiment review features
neg_nmf = NMF(n_components=total_topics, solver='cd', max_iter=500,
              random_state=42, alpha=.1, l1_ratio=.85)
neg_nmf.fit(ntvf_features)
# extract features and component weights
# (feature names are wrapped in an array so they can be indexed with the
# integer index rows produced by argsort below)
neg_feature_names = np.array(ntvf.get_feature_names())
neg_weights = neg_nmf.components_
# negate the weights so argsort ranks the heaviest terms first, then keep the
# 15 strongest terms of every topic row
feature_idxs = np.argsort(-neg_weights)[:, :15]
topics = [neg_feature_names[idx] for idx in feature_idxs]
# display topics and their components, numbering topics from 1
for topic_no, topic in enumerate(topics, start=1):
    print('Topic #{}:'.format(topic_no))
    print(', '.join(topic))
    print()

Topic #1:
but, one, character, get, go, like, no, scene, seem, take, show, much, time, would, play

Topic #2:
movie, watch, good, bad, think, like, but, see, would, make, even, movie not, could, really, watch movie

Topic #3:
film, film not, good, bad, make, bad film, acting, film but, but, actor, watch film, see film, script, watch, see

Topic #4:
horror, budget, low, low budget, horror movie, horror film, gore, flick, zombie, blood, scary, killer, monster, kill, genre

Topic #5:
effect, special, special effect, fi, sci, sci fi, acting, bad, look, look like, cheesy, terrible, cheap, creature, space

Topic #6:
funny, comedy, joke, laugh, not funny, show, humor, stupid, try, hilarious, but, fun, suppose, episode, moment

Topic #7:
ever, ever see, bad, bad movie, movie ever, see, one bad, ever make, one, movie, film ever, bad film, make, horrible, movie bad

Topic #8:
waste, waste time, time, not waste, money, complete, hour, spend, life, talent, please, crap, total, plot, minute

Topic 

In [9]:
# Prepare the pyLDAvis view of the negative-review topic model and leave it as
# the cell's last expression so the notebook displays it.
neg_topic_vis = pyLDAvis.sklearn.prepare(neg_nmf, ntvf_features, ntvf, mds='mmds')
neg_topic_vis