# NLP: LDA & Sentiment Analysis on Movie Reviews

In [None]:
import os
import pickle
# Ask whether previously pickled documents/models should be reused.
# Only an explicit "n"/"no" (case-insensitive, surrounding whitespace ignored)
# turns loading off; any other answer, including an empty one, keeps the
# advertised default of "y".
answer = input("Load pickled files/models? (y/n, defaults to y): ")
use_pickled = answer.strip().lower() not in ("n", "no")

# Directory that holds the pickled documents and the saved LDA models.
pickled_dir = "pickled"


In [3]:
import pandas as pd

# Load the IMDB reviews, skipping malformed rows. `error_bad_lines` was
# deprecated in pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`,
# so use the current keyword and fall back only on older installations that
# reject it with a TypeError.
try:
    data = pd.read_csv('IMDB-Dataset.csv', on_bad_lines='skip')
except TypeError:
    data = pd.read_csv('IMDB-Dataset.csv', error_bad_lines=False)

# split positive and negative sentiment reviews
pos_reviews = data[data.sentiment == "positive"]
neg_reviews = data[data.sentiment == "negative"]

# Take explicit copies of the 'review' column: new columns are added to these
# frames below, and assigning into a slice of `data` would raise pandas'
# SettingWithCopyWarning.
pos_data = pos_reviews[['review']].copy()
pos_data['index'] = pos_data.index
pos_documents = pos_data

neg_data = neg_reviews[['review']].copy()
neg_data['index'] = neg_data.index
neg_documents = neg_data


# from nltk.corpus import stopwords
import re

# stop_words = list(set(stopwords.words('english')))

def _normalize_review(text):
    """Return `text` with the characters , . ! ? removed and lowercased."""
    return re.sub(r'[,\.!?]', '', text).lower()


# Both cleaning steps run in a single pass over the raw 'review' column so
# that lowercasing is applied to the punctuation-free text rather than
# replacing it.

# Positive data
pos_documents['review_processed'] = pos_documents['review'].map(_normalize_review)
# Negative data
neg_documents['review_processed'] = neg_documents['review'].map(_normalize_review)

## Stemming and Lemmatizing
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk

# English Snowball stemmer shared by every call to lemmatize_stemming.
stemmer = SnowballStemmer("english")


def lemmatize_stemming(text):
    """Lemmatize `text` as a verb with WordNet, then stem the resulting lemma."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)


def preprocess(text):
    """Tokenize `text` and return the lemmatized, stemmed tokens worth keeping.

    Tokens come from gensim's `simple_preprocess`. A token is kept only when
    it is longer than three characters and is not in gensim's STOPWORDS; each
    kept token is passed through `lemmatize_stemming`.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if len(tok) > 3 and tok not in stop_words
    ]

# Locations of the cached, preprocessed review token lists.
pos_proc_docs_fname = os.path.join(pickled_dir, "pos_processed_docs")
neg_proc_docs_fname = os.path.join(pickled_dir, "neg_processed_docs")
if use_pickled:
    # NOTE: pickle.load can execute arbitrary code while deserializing; only
    # load files that this notebook produced itself.
    with open(pos_proc_docs_fname, 'rb') as f1:
        pos_processed_documents = pickle.load(f1)
    with open(neg_proc_docs_fname, 'rb') as f2:
        neg_processed_documents = pickle.load(f2)
else:
    pos_processed_documents = pos_documents['review_processed'].map(preprocess)
    neg_processed_documents = neg_documents['review_processed'].map(preprocess)
    # The cache directory may not exist yet on a fresh checkout; create it so
    # the dumps below do not fail after the expensive preprocessing is done.
    os.makedirs(pickled_dir, exist_ok=True)
    with open(pos_proc_docs_fname, 'wb') as f1:
        pickle.dump(pos_processed_documents, f1)
    with open(neg_proc_docs_fname, 'wb') as f2:
        pickle.dump(neg_processed_documents, f2)
        
import gensim

def _build_dictionary_and_corpus(processed_documents):
    """Return (dictionary, bag-of-words corpus) for a series of token lists.

    The dictionary is built from the documents and then trimmed with
    filter_extremes(no_below=15, no_above=0.5, keep_n=100000) before every
    document is converted to its bag-of-words form.
    """
    dictionary = gensim.corpora.Dictionary(processed_documents)
    # Drop the extreme cases of words before building the corpus
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_documents]
    return dictionary, bow_corpus


# Dictionary and corpus for topics that are seen as positive
pos_dictionary, pos_bow_corpus = _build_dictionary_and_corpus(pos_processed_documents)
# Dictionary and corpus for topics that are seen as negative
neg_dictionary, neg_bow_corpus = _build_dictionary_and_corpus(neg_processed_documents)

## Create the LDA Models

In [5]:
num_topics = int(input("Number of topics to generate: "))

# The topic count is part of each file name, so models trained with different
# topic counts are stored side by side.
pos_lda_model_fname = os.path.join(pickled_dir, 'pos_lda_model_{}_topics'.format(num_topics))
neg_lda_model_fname = os.path.join(pickled_dir, 'neg_lda_model_{}_topics'.format(num_topics))


def _train_lda(bow_corpus, dictionary):
    """Train an LdaMulticore model with `num_topics` topics (2 passes, 2 workers)."""
    return gensim.models.LdaMulticore(
        corpus=bow_corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=2,
        workers=2,
    )


if use_pickled:
    # Reuse the models saved by an earlier run with the same topic count.
    pos_lda_model = gensim.models.LdaMulticore.load(pos_lda_model_fname)
    neg_lda_model = gensim.models.LdaMulticore.load(neg_lda_model_fname)
else:
    # Train both models first, then persist them for later runs.
    pos_lda_model = _train_lda(pos_bow_corpus, pos_dictionary)
    neg_lda_model = _train_lda(neg_bow_corpus, neg_dictionary)

    pos_lda_model.save(pos_lda_model_fname)
    neg_lda_model.save(neg_lda_model_fname)

Number of topics to generate: 300


In [None]:
import pyLDAvis
import pyLDAvis.gensim

# Interactive view of the positive-review topics and their words.
pyLDAvis.enable_notebook()
pos_vis = pyLDAvis.gensim.prepare(
    topic_model=pos_lda_model,
    corpus=pos_bow_corpus,
    dictionary=pos_dictionary,
    sort_topics=False,
)
pos_vis

In [None]:
# Interactive view of the negative-review topics and their words.
pyLDAvis.enable_notebook()
neg_vis = pyLDAvis.gensim.prepare(
    topic_model=neg_lda_model,
    corpus=neg_bow_corpus,
    dictionary=neg_dictionary,
    sort_topics=False,
)
neg_vis

In [28]:
def get_topic(text):
    """Describe the single topic, from either model, that best matches `text`.

    The text is preprocessed once and scored against both the positive and the
    negative LDA model. The highest-scoring topic of each model is compared and
    the positive one wins only when its score is strictly greater.

    Args:
        text: Free-text description to analyze.

    Returns:
        A string of the form "Topic: <1-based id>\\nWords: <top 15 words>".

    Raises:
        ValueError: If neither model assigns any topic to the text. A model
            returns an empty list when every topic probability falls below its
            minimum threshold, e.g. for text with no in-vocabulary tokens.
    """
    tokens = preprocess(text)

    def _best(model, dictionary):
        # Highest-probability (topic_id, score) pair, or None when the model
        # reports no topic at all for this text.
        candidates = model[dictionary.doc2bow(tokens)]
        return max(candidates, key=lambda pair: pair[1], default=None)

    pos_best = _best(pos_lda_model, pos_dictionary)
    neg_best = _best(neg_lda_model, neg_dictionary)

    if pos_best is None and neg_best is None:
        raise ValueError("No topic could be determined for the given text.")

    # A missing side can never win; otherwise positive needs a strictly
    # higher score than negative.
    if neg_best is None or (pos_best is not None and pos_best[1] > neg_best[1]):
        (idx, _), model = pos_best, pos_lda_model
    else:
        (idx, _), model = neg_best, neg_lda_model

    return "Topic: {}\nWords: {}".format(idx+1, model.print_topic(idx, 15))
    
    
def get_mult_topics(text):
    """Print every topic both models assign to `text`, best score first.

    Each model is queried with a bag-of-words vector built from its own
    dictionary; token ids from one dictionary are meaningless to the other
    model.

    Args:
        text: Free-text description to analyze.

    Returns:
        None. The topics are written to standard output.
    """
    tokens = preprocess(text)
    sections = (
        ("\nPositive Topics:", pos_lda_model, pos_dictionary),
        ("\nNegative Topics:", neg_lda_model, neg_dictionary),
    )

    for heading, model, dictionary in sections:
        print(heading)
        bow_vector = dictionary.doc2bow(tokens)
        for idx, score in sorted(model[bow_vector], key=lambda tup: -tup[1]):
            print('Topic: {} Score: {}\nWords: {}'.format(idx+1, score, model.print_topic(idx, 15)))
    

def get_sentiment(text):
    """Estimate a 0-100 rating for `text` from the two LDA models.

    The best topic score of the positive model is compared with the best topic
    score of the negative model:

    * positive strictly higher: ``100 - 30 * (neg / pos)``, which lies in
      (70, 100] for non-negative scores;
    * otherwise: ``60 * (pos / neg)``, which lies in [0, 60].

    A model that reports no topic for the text contributes a score of 0.

    Args:
        text: Free-text description to analyze.

    Returns:
        The estimated rating as a number.

    Raises:
        ValueError: If neither model assigns a topic with a positive score, so
            no ratio can be formed.
    """
    tokens = preprocess(text)

    def _best_score(model, dictionary):
        # Top topic probability, or 0.0 when the model returns no topics
        # (every probability fell below the model's minimum threshold).
        topic_scores = model[dictionary.doc2bow(tokens)]
        return max((score for _, score in topic_scores), default=0.0)

    # get what positive topics might be related
    pos_score = _best_score(pos_lda_model, pos_dictionary)
    # get what negative topics might be related
    neg_score = _best_score(neg_lda_model, neg_dictionary)

    if pos_score <= 0 and neg_score <= 0:
        raise ValueError("No topic could be determined for the given text.")

    if pos_score > neg_score:
        # pos_score is > 0 here, so the division is safe.
        return 100 - (30 * (neg_score / pos_score))
    # neg_score is > 0 here: it is >= pos_score and not both are <= 0.
    return (pos_score / neg_score) * 60
    
    
def get_general_sentiment(text):
    """Classify `text` as "positive" or "negative" using the two LDA models.

    The best topic score of the positive model is compared with the best topic
    score of the negative model; the label is "positive" only when the positive
    score is strictly higher. A model that reports no topic for the text
    contributes a score of 0.

    Args:
        text: Free-text description to analyze.

    Returns:
        The string "positive" or "negative".

    Raises:
        ValueError: If neither model assigns any topic to the text, so there
            is nothing to compare.
    """
    tokens = preprocess(text)

    def _best_score(model, dictionary):
        # Top topic probability, or None when the model returns no topics
        # (every probability fell below the model's minimum threshold).
        topic_scores = model[dictionary.doc2bow(tokens)]
        return max((score for _, score in topic_scores), default=None)

    # get what positive topics might be related
    pos_score = _best_score(pos_lda_model, pos_dictionary)
    # get what negative topics might be related
    neg_score = _best_score(neg_lda_model, neg_dictionary)

    if pos_score is None and neg_score is None:
        raise ValueError("No topic could be determined for the given text.")

    # A side without any topic counts as zero evidence.
    pos_score = 0.0 if pos_score is None else pos_score
    neg_score = 0.0 if neg_score is None else neg_score

    return "positive" if pos_score > neg_score else "negative"

In [40]:
# Read a free-text movie description from the user and report predictions.
unseen_movie_description = input("Please enter a movie description to analyze: ")
# The rating is computed up front (rounded to 3 decimals) but printed last,
# after the general sentiment and the best-matching topic.
result = round(get_sentiment(unseen_movie_description), 3)
print("\nWe predict that opinions on this movie are generally {}.".format(get_general_sentiment(unseen_movie_description)))
print("\nWe predict that this movie relates to the following topic:\n{}".format(get_topic(unseen_movie_description)))
# Disabled alternative: list every matching topic instead of only the best one.
#print("\nWe predict that this movie relates to the following topics:")
#get_mult_topics(unseen_movie_description)
print("\nWe predict that this movie has a rating of ~{}%.".format(result))

Please enter a movie description to analyze: Johnny is a successful banker who lives happily in a San Francisco townhouse with his fiancée, Lisa. One day, inexplicably, she gets bored of him and decides to seduce Johnny's best friend, Mark. From there, nothing will be the same again.

We predict that opinions on this movie are generally negative.

We predict that this movie relates to the following topic:
Topic: 184
Words: 0.021*"peter" + 0.016*"time" + 0.014*"falk" + 0.014*"bergman" + 0.014*"father" + 0.012*"work" + 0.011*"great" + 0.011*"be" + 0.008*"main" + 0.008*"stori" + 0.008*"friend" + 0.008*"stefan" + 0.008*"tell" + 0.008*"german" + 0.007*"afterward"

We predict that this movie has a rating of ~42.49%.
