## Load pretrained models or train from scratch?

In [25]:
import os
import pickle
use_pickled = input("Load pickled files/models? (y/n, defaults to y): ")
if use_pickled == 'n':
    use_pickled = False
else:
    use_pickled = True

pickled_dir = "pickled"


## Load the IMDB movie review dataset
This dataset is already separated into 25,000 negative and 25,000 positive reviews for a total of 50,000 reviews.

In [2]:
import pandas as pd

data = pd.read_csv('IMDB-Dataset.csv', error_bad_lines=False);

# split positive and negative sentiment reviews
pos_reviews = data[data.sentiment == "positive"]
neg_reviews = data[data.sentiment == "negative"]

pos_data = pos_reviews[['review']]
pos_data['index'] = pos_data.index
pos_documents = pos_data

neg_data = neg_reviews[['review']]
neg_data['index'] = neg_data.index
neg_documents = neg_data

# showing that the reviews were correctly split by sentiment
print(len(pos_documents))
print(pos_documents[:5])
print(len(neg_documents))
print(neg_documents[:5])

25000
                                              review  index
0  One of the other reviewers has mentioned that ...      0
1  A wonderful little production. <br /><br />The...      1
2  I thought this was a wonderful way to spend ti...      2
4  Petter Mattei's "Love in the Time of Money" is...      4
5  Probably my all-time favorite movie, a story o...      5
25000
                                               review  index
3   Basically there's a family where a little boy ...      3
7   This show was an amazing, fresh & innovative i...      7
8   Encouraged by the positive comments about this...      8
10  Phil the Alien is one of those quirky films wh...     10
11  I saw this movie when I was about 12 when it c...     11


## Initial Preprocessing: remove punctuation and convert everything to lowercase

In [3]:
# from nltk.corpus import stopwords
import re

# stop_words = list(set(stopwords.words('english')))

'''Positive Data'''
# Remove punctuation using regular expresssion
pos_documents['review_processed'] = pos_documents['review'].map(lambda x: re.sub('[,\.!?]', '', x))
# Lowercase the words using regular expresssion
pos_documents['review_processed'] = pos_documents['review'].map(lambda x: x.lower())
'''Negative Data'''
# Remove punctuation using regular expresssion
neg_documents['review_processed'] = neg_documents['review'].map(lambda x: re.sub('[,\.!?]', '', x))
# Lowercase the words using regular expresssion
neg_documents['review_processed'] = neg_documents['review'].map(lambda x: x.lower())

## Generate World Cloud

Either loads the pickled objects from disk, or genereate a new word cloud objects and save it to disk.

In [4]:
from wordcloud import WordCloud

# load from pickled/wordcloud_pos and pickled/worldcloud_neg
if use_pickled:
    with open(os.path.join(pickled_dir, "wordcloud_pos"), 'rb') as f:
        wordcloud_pos = pickle.load(f)
    with open(os.path.join(pickled_dir, "wordcloud_neg"), 'rb') as f:
        wordcloud_neg = pickle.load(f)
else:
    # create a new wordcloud object for positive reviews, then write it to disk
    long_string_pos = " ".join(pos_documents.review_processed)
    wordcloud_pos = WordCloud().generate(long_string_pos)
    with open(os.path.join(pickled_dir, "wordcloud_pos"), 'wb+') as f:
        pickle.dump(wordcloud_pos, f)
    
    # create a new wordcloud object for negative reviews, then write it to disk
    long_string_neg = " ".join(neg_documents.review_processed)
    wordcloud_neg = WordCloud().generate(long_string_neg)
    with open(os.path.join(pickled_dir, "wordcloud_neg"), 'wb+') as f:
        pickle.dump(wordcloud_neg, f)

## Postitive Word Cloud

In [5]:
image = wordcloud_pos.to_image()
image.show()


## Negative Word Cloud

In [6]:
image = wordcloud_neg.to_image()
image.show()

## Stemming and Lematizing

## Lemmataizing and Stemming

In [24]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk

stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

pos_proc_docs_fname = os.path.join(pickled_dir, "pos_processed_docs")
neg_proc_docs_fname = os.path.join(pickled_dir, "neg_processed_docs")
if use_pickled:
    with open(pos_proc_docs_fname, 'rb') as f1:
        pos_processed_documents = pickle.load(f1)
    with open(neg_proc_docs_fname, 'rb') as f2:
        neg_processed_documents = pickle.load(f2)
else:
    pos_processed_documents = pos_documents['review_processed'].map(preprocess)
    neg_processed_documents = neg_documents['review_processed'].map(preprocess)
    with open(pos_proc_docs_fname, 'wb+') as f1:
        pickle.dump(pos_processed_documents, f1)
    with open(neg_proc_docs_fname, 'wb+') as f2:
        pickle.dump(neg_processed_documents, f2)

print("Positive reviews after lemmatizing and stemming:")
print(len(pos_processed_documents))
print(pos_processed_documents[:10])

print("\nNegative reviews after lemmatizing and stemming:")
print(len(neg_processed_documents))
print(neg_processed_documents[:10])

Positive reviews after lemmatizing and stemming:


TypeError: object of type 'WordCloud' has no len()

In [9]:
# Making Positive and Negative Dictionaries
pos_dictionary = gensim.corpora.Dictionary(pos_processed_documents)
count = 0
print("\nSome random positive words in our dictionary: ")
for k, v in pos_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break    
neg_dictionary = gensim.corpora.Dictionary(neg_processed_documents)
count = 0
print("\nSome random negative words in our dictionary: ")
for k, v in neg_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break


Some random positive words in our dictionary: 
0 accustom
1 agenda
2 agreement
3 appeal
4 aryan
5 audienc
6 away
7 bitch
8 brutal
9 call
10 cell

Some random negative words in our dictionary: 
0 argu
1 basic
2 boogeyman
3 closet
4 decid
5 descent
6 dialog
7 divorc
8 drama
9 expect
10 famili


In [10]:
# Making Postiive and Negative LDA Models
pos_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
pos_bow_corpus = [pos_dictionary.doc2bow(doc) for doc in pos_processed_documents]
pos_lda_model = gensim.models.LdaMulticore(pos_bow_corpus, num_topics=150, id2word=pos_dictionary, passes=10, workers=2)

neg_dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
neg_bow_corpus = [neg_dictionary.doc2bow(doc) for doc in neg_processed_documents]
neg_lda_model = gensim.models.LdaMulticore(neg_bow_corpus, num_topics=150, id2word=neg_dictionary, passes=10, workers=2)

In [11]:
# Viewing the LDA Model Topic Results
print("\nFirst Ten Positive Review Topics:")
for idx, topic in pos_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    if idx>=9:
        break
print("\nFirst Ten Negative Review Topics")
for idx, topic in neg_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    if idx>=9:
        break


First Ten Positive Review Topics:
Topic: 0 
Words: 0.032*"team" + 0.019*"like" + 0.019*"play" + 0.018*"sport" + 0.017*"role" + 0.017*"footbal" + 0.015*"actor" + 0.013*"basebal" + 0.012*"great" + 0.012*"good"
Topic: 1 
Words: 0.018*"urban" + 0.013*"charact" + 0.012*"luci" + 0.011*"virgin" + 0.010*"earli" + 0.010*"right" + 0.009*"play" + 0.009*"offens" + 0.008*"style" + 0.008*"like"
Topic: 2 
Words: 0.022*"charact" + 0.013*"scene" + 0.010*"work" + 0.008*"director" + 0.007*"stori" + 0.007*"make" + 0.007*"point" + 0.006*"perform" + 0.005*"effect" + 0.005*"emot"
Topic: 3 
Words: 0.079*"releas" + 0.053*"comment" + 0.049*"christma" + 0.018*"review" + 0.017*"spanish" + 0.017*"print" + 0.015*"holiday" + 0.014*"see" + 0.014*"titl" + 0.014*"classic"
Topic: 4 
Words: 0.024*"play" + 0.023*"richard" + 0.020*"jean" + 0.018*"role" + 0.017*"joan" + 0.016*"cagney" + 0.013*"gangster" + 0.013*"warner" + 0.011*"star" + 0.010*"bull"
Topic: 5 
Words: 0.038*"king" + 0.028*"british" + 0.017*"media" + 0.015*"g

In [12]:
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPositive Perplexity: ', pos_lda_model.log_perplexity(pos_bow_corpus))  # a measure of how good the model is. lower the better.
# Compute Coherence Score
pos_coherence_model_lda = CoherenceModel(model=pos_lda_model, texts=pos_processed_documents, dictionary=pos_dictionary, coherence='c_v')
pos_coherence_lda = pos_coherence_model_lda.get_coherence()
print('Positive Coherence Score: ', pos_coherence_lda)
'''Negative Topics'''
# Compute Perplexity
print('\nNegative Perplexity: ', neg_lda_model.log_perplexity(neg_bow_corpus))  # a measure of how good the model is. lower the better.
# Compute Coherence Score
neg_coherence_model_lda = CoherenceModel(model=neg_lda_model, texts=neg_processed_documents, dictionary=neg_dictionary, coherence='c_v')
neg_coherence_lda = neg_coherence_model_lda.get_coherence()
print('Negative Coherence Score: ', neg_coherence_lda)


Positive Perplexity:  -8.092771066060543
Positive Coherence Score:  0.31947427660032335

Negative Perplexity:  -8.165978130504575
Negative Coherence Score:  0.3061946201591925


In [13]:
#import gensim

#'''Positive'''
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
#mallet_path = r'C:/MALLET/bin/mallet.bat' # update this path
#pos_ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=pos_bow_corpus, num_topics=30, id2word=pos_dictionary)

# Show Topics
#print(pos_ldamallet.show_topics(formatted=False))
# Compute Coherence Score
#pos_coherence_model_ldamallet = CoherenceModel(model=pos_ldamallet, texts=pos_processed, dictionary=pos_dictionary, coherence='c_v')
#pos_coherence_ldamallet = pos_coherence_model_ldamallet.get_coherence()
#print('\nCoherence Score: ', pos_coherence_ldamallet)

#'''Negative'''
#neg_ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=neg_bow_corpus, num_topics=30, id2word=neg_dictionary)

# Show Topics
#print(neg_ldamallet.show_topics(formatted=False))
# Compute Coherence Score
#neg_coherence_model_ldamallet = CoherenceModel(model=neg_ldamallet, texts=neg_processed, dictionary=neg_dictionary, coherence='c_v')
#neg_coherence_ldamallet = neg_coherence_model_ldamallet.get_coherence()
#print('\nCoherence Score: ', neg_coherence_ldamallet)

In [14]:
import pyLDAvis
import pyLDAvis.gensim

# Visualize positive topic words
pyLDAvis.enable_notebook()
pos_vis = pyLDAvis.gensim.prepare(pos_lda_model, pos_bow_corpus, pos_dictionary)
pos_vis

In [15]:
# Visualize negative topic words
pyLDAvis.enable_notebook()
neg_vis = pyLDAvis.gensim.prepare(neg_lda_model, neg_bow_corpus, neg_dictionary)
neg_vis

In [16]:
def get_sentiment(text):
    count = 0
    pos_score = 0
    neg_score = 0
    #get what positive topics might be related
    bow_vector = pos_dictionary.doc2bow(preprocess(text))
    
    for idx, score in sorted(pos_lda_model[bow_vector], key=lambda tup:-1*tup[1]):
        count+=1
        pos_score += score
        if count > 2:
            break
    pos_score = pos_score/3
    pos_score *= 100
    
    count = 0
    #get what negative topics might be related
    bow_vector = neg_dictionary.doc2bow(preprocess(text))
    for idx, score in sorted(neg_lda_model[bow_vector], key=lambda tup:-1*tup[1]):    
        neg_score += score
        count+=1
        if count > 2:
            break
    neg_score = neg_score/3
    neg_score *= 100
    
    result = 0
    if pos_score>neg_score:
        result = (pos_score - neg_score)/(pos_score + neg_score)
        result *= 100
        result = 50 + (result*2)
        return result
    else:
        result = (neg_score - pos_score)/(pos_score + neg_score)
        result *= 100
        result = 50+(result*2)
        return result

In [17]:
def get_general_sentiment(text):
    count = 0
    pos_score = 0
    neg_score = 0
    #get what positive topics might be related
    bow_vector = pos_dictionary.doc2bow(preprocess(text))
    
    for idx, score in sorted(pos_lda_model[bow_vector], key=lambda tup:-1*tup[1]):
        count+=1
        pos_score += score
        if count > 2:
            break
    pos_score = pos_score/3
    pos_score *= 100
    
    count = 0
    #get what negative topics might be related
    bow_vector = neg_dictionary.doc2bow(preprocess(text))
    for idx, score in sorted(neg_lda_model[bow_vector], key=lambda tup:-1*tup[1]):    
        neg_score += score
        count+=1
        if count > 2:
            break
    neg_score = neg_score/3
    neg_score *= 100

    if pos_score>neg_score:
        return "positive"
    else:
        return "negative"

In [20]:
unseen_movie_description = input("Please enter a movie description to analyze: ")
result = round(get_sentiment(unseen_movie_description), 3)
print("\nWe predict that opinions on this movie are generally {}.".format(get_general_sentiment(unseen_movie_description)))
print("\nWe predict that this movie has a rating of ~{}%.".format(result))


We predict that opinions on this movie are generally negative.


ZeroDivisionError: float division by zero