# IMDB Sentiment Analysis

Sentiment Aanlysis with gensim and TFlearn

In [1]:
# Import packages
import os
import re
import tflearn
import numpy as np
import tensorflow as tf
from random import shuffle
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# Define directories
train_pos_dir = 'aclImdb/train/pos/'
train_neg_dir = 'aclImdb/train/neg/'
train_unsup_dir = 'aclImdb/train/unsup/'
test_pos_dir = 'aclImdb/test/pos/'
test_neg_dir = 'aclImdb/test/neg/'

# Define dataset sizes
labeled_set_size = 12500
unlabeled_set_size = 50000

In [27]:
# Read and explore data
train_pos = []
for file_name in os.listdir(train_pos_dir):
    with open(train_pos_dir + file_name, 'r') as myfile:
        train_pos.append(myfile.read())
        
train_neg = []
for file_name in os.listdir(train_neg_dir):
    with open(train_neg_dir + file_name, 'r') as myfile:
        train_neg.append(myfile.read())
        
train_unsup = []
for file_name in os.listdir(train_unsup_dir):
    with open(train_unsup_dir + file_name, 'r') as myfile:
        train_unsup.append(myfile.read())
        
test_pos = []
for file_name in os.listdir(test_pos_dir):
    with open(test_pos_dir + file_name, 'r') as myfile:
        test_pos.append(myfile.read())
        
test_neg = []
for file_name in os.listdir(test_neg_dir):
    with open(test_neg_dir + file_name, 'r') as myfile:
        test_neg.append(myfile.read())     
        
print '%i positive train reviews:' % len(train_pos)
print train_pos[0][:100]
print '\n%i negative train reviews:' % len(train_neg)
print train_neg[0][:100]
print '\n%i unlabeled train reviews:' % len(train_unsup)
print train_unsup[0][:100]
print '\n%i positive test reviews:' % len(test_pos)
print test_pos[0][:100]
print '\n%i negative test reviews:' % len(test_neg)
print test_neg[0][:100]   

12500 positive train reviews:
This anime was underrated and still is. Hardly the dorky kids movie as noted, i still come back to t

12500 negative train reviews:
Whoever wrote the script for this movie does not deserve to work in Hollywood at all (not even live 

50000 unlabeled train reviews:
The movie Contagion was a well thought out story that had below average acting and corny elements. T

12500 positive test reviews:
I'm not sure what version of the film I saw, but it was very entertaining.<br /><br />I did not know

12500 negative test reviews:
I think there's a reason this film never came close to hitting theaters. It was probably my neighbor


In [28]:
# Preprocess data so reviews can be embedded
stop_words = set(stopwords.words("english"))

def parse_html(data):
    data = BeautifulSoup(data, 'lxml').get_text() # Remove markup
    data = re.sub("[^a-zA-Z]"," ", data) # Remove all non-alphanumeric characters
    data = [x for x in data.lower().split() if not x in stop_words] # Remove stopwords
    return data
    
for i in xrange(labeled_set_size):
    train_pos[i] = TaggedDocument(parse_html(train_pos[i]), ['train_pos_' + str(i)])
    train_neg[i] = TaggedDocument(parse_html(train_neg[i]), ['train_neg_' + str(i)])
    test_pos[i] = TaggedDocument(parse_html(test_pos[i]), ['test_pos_' + str(i)])
    test_neg[i] = TaggedDocument(parse_html(test_neg[i]), ['test_neg_' + str(i)])
    
for i in xrange(unlabeled_set_size):
    train_unsup[i] = TaggedDocument(parse_html(train_unsup[i]), ['train_unsup_' + str(i)])
    
print '%i positive train reviews:' % len(train_pos)
print train_pos[0][0][:8]
print '\n%i negative train reviews:' % len(train_neg)
print train_neg[0][0][:8]
print '\n%i unlabeled train reviews:' % len(train_unsup)
print train_unsup[0][0][:8]
print '\n%i positive test reviews:' % len(test_pos)
print test_pos[0][0][:8]
print '\n%i negative test reviews:' % len(test_neg)
print test_neg[0][0][:8]

12500 positive train reviews:
[u'anime', u'underrated', u'still', u'hardly', u'dorky', u'kids', u'movie', u'noted']

12500 negative train reviews:
[u'whoever', u'wrote', u'script', u'movie', u'deserve', u'work', u'hollywood', u'even']

50000 unlabeled train reviews:
[u'movie', u'contagion', u'well', u'thought', u'story', u'average', u'acting', u'corny']

12500 positive test reviews:
[u'sure', u'version', u'film', u'saw', u'entertaining', u'know', u'twins', u'gillian']

12500 negative test reviews:
[u'think', u'reason', u'film', u'never', u'came', u'close', u'hitting', u'theaters']


In [4]:
# Embed documents using doc2vec
if os.path.isfile('d2v'):
    d2v = Doc2Vec.load('d2v')
else:
    workers = 8 # Number of virtual CPU cores on machine
    window = 16 # Skip-gram window
    min_count = 30 # 30 is the max number of reviews per movie in the dataset

    all_reviews = train_pos + train_neg + train_unsup + test_pos + test_neg
    d2v = Doc2Vec(window=window, workers=workers, min_count=min_count)
    d2v.build_vocab(all_reviews)

    for i in range(10):
        shuffle(all_reviews) # Make sure to shuffle each epoch
        d2v.train(all_reviews)
        print 'epoch %i complete' % (i + 1)
        
    d2v.save('d2v')

epoch 1 complete
epoch 2 complete
epoch 3 complete
epoch 4 complete
epoch 5 complete
epoch 6 complete
epoch 7 complete
epoch 8 complete
epoch 9 complete
epoch 10 complete


In [49]:
# Examine embedding
print 'Most similar to man: '
print d2v.most_similar('man')[:3]
print '\nMost similar to movie: '
print d2v.most_similar('movie')[:3]

Most similar to man: 
[(u'woman', 0.8909039497375488), (u'guy', 0.8078601360321045), (u'girl', 0.8069558143615723)]

Most similar to movie: 
[(u'film', 0.9861048460006714), (u'show', 0.8519033193588257), (u'flick', 0.8308141827583313)]


In [5]:
# Get train and test embedded vectors for classification
x_vector_size = 300
y_vector_size = 2

train_x = np.ndarray([2 * labeled_set_size, x_vector_size])
train_y = np.ndarray([2 * labeled_set_size, y_vector_size])
test_x = np.ndarray([2 * labeled_set_size, x_vector_size])
test_y = np.ndarray([2 * labeled_set_size, y_vector_size])

for i in xrange(labeled_set_size):
    train_x[i] = d2v.docvecs['train_pos_' + str(i)]
    train_y[i] = [1, 0]
    test_x[i] = d2v.docvecs['test_pos_' + str(i)]    
    test_y[i] = [1, 0]
    
for i in xrange(labeled_set_size):
    train_x[i + labeled_set_size] = d2v.docvecs['train_neg_' + str(i)]
    train_y[i + labeled_set_size] = [0, 1]
    test_x[i + labeled_set_size] = d2v.docvecs['test_neg_' + str(i)]    
    test_y[i + labeled_set_size] = [0, 1]

In [38]:
# DNN model
epochs = 10

with tf.Graph().as_default():
    net = tflearn.input_data(shape=[None, x_vector_size])
    net = tflearn.fully_connected(net, 128, 'relu')
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam')

    model = tflearn.DNN(net, tensorboard_dir='tensorboard')
    model.fit(train_x, train_y, epochs, (test_x, test_y), True)
    dnn_score = model.evaluate(test_x, test_y)

Training Step: 3910  | total loss: [1m[32m0.27173[0m[0m
| Adam | epoch: 010 | loss: 0.27173 - acc: 0.9006 | val_loss: 0.33722 - val_acc: 0.8545 -- iter: 25000/25000
Training Step: 3910  | total loss: [1m[32m0.27173[0m[0m
| Adam | epoch: 010 | loss: 0.27173 - acc: 0.9006 | val_loss: 0.33722 - val_acc: 0.8545 -- iter: 25000/25000
--


In [31]:
# LSTM model
epochs = 10
timesteps = 10
train_x_lstm = np.stack([train_x] * timesteps, 1)
test_x_lstm = np.stack([test_x] * timesteps, 1)

with tf.Graph().as_default():
    net = tflearn.input_data([None, timesteps, x_vector_size])
    net = tflearn.lstm(net, x_vector_size)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam')

    model = tflearn.DNN(net, tensorboard_dir='tensorboard')
    model.fit(train_x_lstm, train_y, epochs, (test_x_lstm, test_y), True)
    lstm_score = model.evaluate(test_x_lstm, test_y)

Training Step: 3910  | total loss: [1m[32m0.12703[0m[0m
| Adam | epoch: 010 | loss: 0.12703 - acc: 0.9610 | val_loss: 0.49553 - val_acc: 0.8387 -- iter: 25000/25000
Training Step: 3910  | total loss: [1m[32m0.12703[0m[0m
| Adam | epoch: 010 | loss: 0.12703 - acc: 0.9610 | val_loss: 0.49553 - val_acc: 0.8387 -- iter: 25000/25000
--


In [39]:
# Performance
print 'DNN model test set accuracy ' + '{:.2%}'.format(dnn_score[0])
print 'LSTM model test set accuracy ' + '{:.2%}'.format(lstm_score[0])

DNN model test set accuracy 85.45%
LSTM model test set accuracy 83.87%


# TODO

* Implement more regularization to combat overfitting
* Compare tf-idf performance to doc2vec
* Compare performance with stopwords
* Create TSNE visualization for document embeddings
* Develop CNN classification model
