In [1]:
import pandas as pd

data = pd.read_csv('IMDB-Dataset.csv', error_bad_lines=False);

# split positive and negative sentiment reviews
pos_reviews = data[data.sentiment == "positive"]
neg_reviews = data[data.sentiment == "negative"]

pos_data = pos_reviews[['review']]
pos_data['index'] = pos_data.index
pos_documents = pos_data

neg_data = neg_reviews[['review']]
neg_data['index'] = neg_data.index
neg_documents = neg_data

# showing that the reviews were correctly split by sentiment
print(len(pos_documents))
print(pos_documents[:5])
print(len(neg_documents))
print(neg_documents[:5])

25000
                                              review  index
0  One of the other reviewers has mentioned that ...      0
1  A wonderful little production. <br /><br />The...      1
2  I thought this was a wonderful way to spend ti...      2
4  Petter Mattei's "Love in the Time of Money" is...      4
5  Probably my all-time favorite movie, a story o...      5
25000
                                               review  index
3   Basically there's a family where a little boy ...      3
7   This show was an amazing, fresh & innovative i...      7
8   Encouraged by the positive comments about this...      8
10  Phil the Alien is one of those quirky films wh...     10
11  I saw this movie when I was about 12 when it c...     11


In [2]:
from nltk.corpus import stopwords
import re

stop_words = list(set(stopwords.words('english')))

'''Positive Data'''
# Remove punctuation using regular expresssion
pos_documents['review_processed'] = pos_documents['review'].map(lambda x: re.sub('[,\.!?]', '', x))
# Lowercase the words using regulatr expresssion
pos_documents['review_processed'] = pos_documents['review'].map(lambda x: x.lower())
'''Negative Data'''
# Remove punctuation using regular expresssion
neg_documents['review_processed'] = neg_documents['review'].map(lambda x: re.sub('[,\.!?]', '', x))
# Lowercase the words using regulatr expresssion
neg_documents['review_processed'] = neg_documents['review'].map(lambda x: x.lower())

In [3]:
from wordcloud import WordCloud

print("Word Cloud generated from positive reviews with minimal preprocessing:")
long_string = " ".join(pos_documents.review_processed)

wordcloud = WordCloud().generate(long_string)
image = wordcloud.to_image()
image.show()

print("Word Cloud generated from negative reviews with minimal preprocessing:")
long_string = " ".join(neg_documents.review_processed)
wordcloud = WordCloud().generate(long_string)
image = wordcloud.to_image()
image.show()

Word Cloud generated from positive reviews with minimal preprocessing:
Word Cloud generated from negative reviews with minimal preprocessing:


In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk

stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [5]:
pos_processed_documents = pos_documents['review_processed'].map(preprocess)
print("Positive reviews after lemmatizing and stemming:")
print(len(pos_processed_documents))
print(pos_processed_documents[:10])

neg_processed_documents = neg_documents['review_processed'].map(preprocess)
print("\nNegative reviews after lemmatizing and stemming:")
print(len(neg_processed_documents))
print(neg_processed_documents[:10])

Positive reviews after lemmatizing and stemming:
25000
0     [review, mention, watch, episod, hook, right, ...
1     [wonder, littl, product, film, techniqu, unass...
2     [think, wonder, spend, time, summer, weekend, ...
4     [petter, mattei, love, time, money, visual, st...
5     [probabl, time, favorit, movi, stori, selfless...
6     [sure, like, resurrect, date, seahunt, seri, t...
9     [like, origin, wrench, laughter, like, movi, y...
14    [fantast, movi, prison, famous, actor, georg, ...
16    [film, simpli, remak, film, fail, captur, flav...
18    [rememb, film, film, watch, cinema, pictur, da...
Name: review_processed, dtype: object

Negative reviews after lemmatizing and stemming:
25000
3     [basic, famili, littl, jake, think, zombi, clo...
7     [amaz, fresh, innov, idea, air, year, brillian...
8     [encourag, posit, comment, film, look, forward...
10    [phil, alien, quirki, film, humour, base, odd,...
11    [movi, come, recal, scariest, scene, bird, eat...
12    [boll

In [6]:
pos_dictionary = gensim.corpora.Dictionary(pos_processed_documents)
count = 0
for k, v in pos_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
        
neg_dictionary = gensim.corpora.Dictionary(neg_processed_documents)
count = 0
for k, v in neg_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 accustom
1 agenda
2 agreement
3 appeal
4 aryan
5 audienc
6 away
7 bitch
8 brutal
9 call
10 cell
0 argu
1 basic
2 boogeyman
3 closet
4 decid
5 descent
6 dialog
7 divorc
8 drama
9 expect
10 famili


In [7]:
pos_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
neg_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
pos_bow_corpus = [pos_dictionary.doc2bow(doc) for doc in neg_processed_documents]
neg_bow_corpus = [neg_dictionary.doc2bow(doc) for doc in pos_processed_documents]
pos_lda_model = gensim.models.LdaMulticore(pos_bow_corpus, num_topics=10, id2word=pos_dictionary, passes=2, workers=2)
neg_lda_model = gensim.models.LdaMulticore(neg_bow_corpus, num_topics=10, id2word=neg_dictionary, passes=2, workers=2)

In [8]:
print("Ten Random Positive Review Topics:")
for idx, topic in pos_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    
print("\nTen Random Negative Review Topics")
for idx, topic in neg_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Ten Random Positive Review Topics:
Topic: 0 
Words: 0.015*"like" + 0.010*"charact" + 0.009*"think" + 0.008*"good" + 0.007*"stori" + 0.006*"actor" + 0.006*"watch" + 0.006*"look" + 0.005*"come" + 0.005*"time"
Topic: 1 
Words: 0.013*"like" + 0.012*"time" + 0.011*"good" + 0.011*"watch" + 0.010*"act" + 0.010*"scene" + 0.008*"charact" + 0.008*"actor" + 0.007*"look" + 0.006*"think"
Topic: 2 
Words: 0.011*"charact" + 0.009*"like" + 0.009*"stori" + 0.006*"time" + 0.005*"scene" + 0.005*"good" + 0.005*"play" + 0.004*"work" + 0.004*"director" + 0.004*"look"
Topic: 3 
Words: 0.013*"good" + 0.011*"see" + 0.010*"stori" + 0.009*"watch" + 0.009*"act" + 0.008*"like" + 0.006*"plot" + 0.006*"better" + 0.006*"time" + 0.006*"worst"
Topic: 4 
Words: 0.009*"look" + 0.008*"horror" + 0.008*"like" + 0.007*"scene" + 0.006*"kill" + 0.005*"get" + 0.005*"time" + 0.005*"thing" + 0.005*"good" + 0.005*"girl"
Topic: 5 
Words: 0.018*"like" + 0.012*"peopl" + 0.006*"think" + 0.006*"time" + 0.006*"look" + 0.005*"play" + 0.0

In [9]:
#extracting positive topic words
x = pos_lda_model.show_topics(num_topics=12, num_words=900,formatted=False)
topics = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
topics = [x[1] for x in topics]
pos_topic_words = []
for words in topics:
    for word in words:
        pos_topic_words.append(word)
        
#extracting negative topic words
x = neg_lda_model.show_topics(num_topics=12, num_words=900,formatted=False)
topics = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
topics = [x[1] for x in topics]
neg_topic_words = []
for words in topics:
    for word in words:
        neg_topic_words.append(word)  

In [10]:
# Removing punctuation using regular expresssion
pos_topic_words = [re.sub('[,\.!?]', '', word) for word in pos_topic_words]
neg_topic_words = [re.sub('[,\.!?]', '', word) for word in neg_topic_words]
# showing that there are now root words in our lists
print('Positive Topics: {}'.format(len(pos_topic_words)))
print(pos_topic_words[:5])
print('\nNegative Topics: {}'.format(len(neg_topic_words)))
print(neg_topic_words[:5])

Positive Topics: 9000
['like', 'charact', 'think', 'good', 'stori']

Negative Topics: 9000
['great', 'play', 'best', 'good', 'time']
