# IMDB Sentiment Analysis

Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

In [1]:
# TODO: tune min_count, stop_words, classifier architecture

# Import packages
import os
import re
import tflearn
import numpy as np
import pandas as pd
import tensorflow as tf
from random import shuffle
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# Every split directory sits under the same extracted-dataset root; each path
# keeps its trailing slash because file names are appended to it directly.
data_root = 'aclImdb/'
train_pos_dir = data_root + 'train/pos/'
train_neg_dir = data_root + 'train/neg/'
train_unsup_dir = data_root + 'train/unsup/'
test_pos_dir = data_root + 'test/pos/'
test_neg_dir = data_root + 'test/neg/'

# Review counts used as loop bounds by the later cells: one value shared by
# the four labeled splits, and one for the unlabeled split.
labeled_set_size = 12500
unlabeled_set_size = 50000

In [2]:
def read_reviews(directory):
    """Return the contents of every file in `directory` as a list of strings.

    Files are read in whatever order `os.listdir` reports them, so the index
    of a given review in the returned list is not guaranteed to be stable
    across machines.

    Args:
        directory: path of a folder that contains one review per file.

    Returns:
        A list with one string (the whole file content) per file.
    """
    reviews = []
    for file_name in os.listdir(directory):
        with open(os.path.join(directory, file_name), 'r') as review_file:
            reviews.append(review_file.read())
    return reviews


# Load the raw text of each split.
train_pos = read_reviews(train_pos_dir)
train_neg = read_reviews(train_neg_dir)
train_unsup = read_reviews(train_unsup_dir)
test_pos = read_reviews(test_pos_dir)
test_neg = read_reviews(test_neg_dir)

# Show the size of each split together with its first review as a sanity check.
print('%i positive train reviews:' % len(train_pos))
print(train_pos[0])
print('\n%i negative train reviews:' % len(train_neg))
print(train_neg[0])
print('\n%i unlabeled train reviews:' % len(train_unsup))
print(train_unsup[0])
print('\n%i positive test reviews:' % len(test_pos))
print(test_pos[0])
print('\n%i negative test reviews:' % len(test_neg))
print(test_neg[0])

12500 positive train reviews:
This anime was underrated and still is. Hardly the dorky kids movie as noted, i still come back to this 10 years after i first saw it. One of the better movies released.<br /><br />The animation while not perfect is good, camera tricks give it a 3D feel and the story is still as good today even after i grew up and saw ground-breakers like Neon Genesis Evangelion and RahXephon. It has nowhere near the depth obviously but try to see it from a lighthearted view. It's a story to entertain, not to question.<br /><br />Still one of my favourites I come back too when i feel like a giggle on over more lighthearted animes. Not to say its a childish movies, there are surprisingly sad moments in this and you need a sense of humour to see it all.

12500 negative train reviews:
Whoever wrote the script for this movie does not deserve to work in Hollywood at all (not even live there), and those actors need to find another job. The most dreadful hour and some minutes of 

In [3]:
# Preprocess reviews
stop_words = set(stopwords.words("english"))

def parse_html(data):
    """Turn one raw review string into a list of lowercase word tokens.

    The text content is extracted with BeautifulSoup (lxml parser), every
    character that is not an ASCII letter is replaced by a space, the result
    is lowercased and split on whitespace, and tokens found in `stop_words`
    are dropped.

    Args:
        data: raw review text, possibly containing HTML markup.

    Returns:
        The remaining tokens, in their original order.
    """
    plain_text = BeautifulSoup(data, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]"," ", plain_text)
    tokens = []
    for word in letters_only.lower().split():
        if word not in stop_words:
            tokens.append(word)
    return tokens
    
# Replace every raw review, in place, with a TaggedDocument whose single tag
# is '<split>_<index>'. Each list is walked over its own length, so no entry
# is left as an untokenized string and no index can run past the end of a
# shorter list.
for tag_prefix, reviews in (('train_pos', train_pos),
                            ('train_neg', train_neg),
                            ('test_pos', test_pos),
                            ('test_neg', test_neg),
                            ('train_unsup', train_unsup)):
    for i, review in enumerate(reviews):
        # Entries that were already converted are skipped, which lets this
        # cell be re-run without feeding a TaggedDocument to parse_html.
        if isinstance(review, TaggedDocument):
            continue
        reviews[i] = TaggedDocument(parse_html(review), [tag_prefix + '_' + str(i)])

# Show the size of each split together with its first tagged document.
print('%i positive train reviews:' % len(train_pos))
print(train_pos[0])
print('\n%i negative train reviews:' % len(train_neg))
print(train_neg[0])
print('\n%i unlabeled train reviews:' % len(train_unsup))
print(train_unsup[0])
print('\n%i positive test reviews:' % len(test_pos))
print(test_pos[0])
print('\n%i negative test reviews:' % len(test_neg))
print(test_neg[0])

12500 positive train reviews:
TaggedDocument([u'anime', u'underrated', u'still', u'hardly', u'dorky', u'kids', u'movie', u'noted', u'still', u'come', u'back', u'years', u'first', u'saw', u'one', u'better', u'movies', u'released', u'animation', u'perfect', u'good', u'camera', u'tricks', u'give', u'feel', u'story', u'still', u'good', u'today', u'even', u'grew', u'saw', u'ground', u'breakers', u'like', u'neon', u'genesis', u'evangelion', u'rahxephon', u'nowhere', u'near', u'depth', u'obviously', u'try', u'see', u'lighthearted', u'view', u'story', u'entertain', u'question', u'still', u'one', u'favourites', u'come', u'back', u'feel', u'like', u'giggle', u'lighthearted', u'animes', u'say', u'childish', u'movies', u'surprisingly', u'sad', u'moments', u'need', u'sense', u'humour', u'see'], ['train_pos_0'])

12500 negative train reviews:
TaggedDocument([u'whoever', u'wrote', u'script', u'movie', u'deserve', u'work', u'hollywood', u'even', u'live', u'actors', u'need', u'find', u'another', u'job'

In [4]:
workers = 8
min_count = 30 # 30 is the max number of reviews per movie in the dataset
# Dimensionality of the document vectors. It is set explicitly instead of
# relying on the library default because the feature matrices
# (x_vector_size) and the classifier input are built for 300-dim vectors.
# NOTE(review): gensim's default size may differ between releases - confirm
# against the installed version before removing this.
doc_vector_size = 300

# Doc2Vec is trained on every review (labeled, unlabeled and test) so that
# each tagged document ends up with a learned vector.
all_reviews = train_pos + train_neg + train_unsup + test_pos + test_neg
d2v = Doc2Vec(size=doc_vector_size, workers=workers, min_count=min_count)
d2v.build_vocab(all_reviews)

# Ten passes over the corpus, reshuffling the documents before each pass.
# NOTE(review): the learning rate is not adjusted between train() calls -
# confirm that this matches the intended schedule for the installed gensim.
for i in range(10):
    shuffle(all_reviews)
    d2v.train(all_reviews)
    print('epoch %i complete' % (i + 1))

epoch 1 complete
epoch 2 complete
epoch 3 complete
epoch 4 complete
epoch 5 complete
epoch 6 complete
epoch 7 complete
epoch 8 complete
epoch 9 complete
epoch 10 complete


In [5]:
# Get train and test vectors
x_vector_size = 300
y_vector_size = 2

# Rows [0, labeled_set_size) hold the positive reviews and rows
# [labeled_set_size, 2 * labeled_set_size) hold the negative ones.
n_rows = 2 * labeled_set_size
train_x = np.empty((n_rows, x_vector_size))
train_y = np.empty((n_rows, y_vector_size))
test_x = np.empty((n_rows, x_vector_size))
test_y = np.empty((n_rows, y_vector_size))

# Copy the learned document vector of every labeled review into its row.
for i in range(labeled_set_size):
    suffix = str(i)
    neg_row = i + labeled_set_size
    train_x[i] = d2v.docvecs['train_pos_' + suffix]
    test_x[i] = d2v.docvecs['test_pos_' + suffix]
    train_x[neg_row] = d2v.docvecs['train_neg_' + suffix]
    test_x[neg_row] = d2v.docvecs['test_neg_' + suffix]

# One-hot labels: [1, 0] for positive rows, [0, 1] for negative rows.
train_y[:labeled_set_size] = [1, 0]
test_y[:labeled_set_size] = [1, 0]
train_y[labeled_set_size:] = [0, 1]
test_y[labeled_set_size:] = [0, 1]

In [None]:
# Graph definition
with tf.Graph().as_default():
    # The input and output widths are read from the data matrices instead of
    # being repeated as literals, so the network cannot drift out of step
    # with the document-vector size or the number of classes.
    net = tflearn.input_data(shape=[None, train_x.shape[1]])
    net = tflearn.fully_connected(net, 1024, activation='relu')
    net = tflearn.fully_connected(net, 128, activation='relu')
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, train_y.shape[1], activation='softmax')
    net = tflearn.regression(net, optimizer='adam')

    # Model training
    model = tflearn.DNN(net, tensorboard_dir='tensorboard')
    model.fit(train_x, train_y, n_epoch=10)
    # Class scores for the test set; a later cell takes the argmax of these.
    predictions = model.predict(test_x)

    # evaluate() returns a list of scores; the first entry is reported here.
    # NOTE(review): presumably that entry is accuracy - confirm against the
    # metric tflearn.regression uses by default.
    print('Train set accuracy ' + '{:.2%}'.format(model.evaluate(train_x, train_y)[0]))
    print('Test set accuracy ' + '{:.2%}'.format(model.evaluate(test_x, test_y)[0]))

Training Step: 10  | total loss: 0.68755
| Adam | epoch: 000 | loss: 0.68755 -- iter: 00640/25000


In [7]:
# Predicted class index per test review (0 = positive, 1 = negative, following
# the [1, 0] / [0, 1] one-hot labels). Computed once and reused below.
predicted_labels = np.argmax(predictions, 1)

# How many predictions to display from each end of the test set. Positive
# reviews occupy the first rows and negative reviews the last rows, so the
# tail is sliced relative to the end instead of using an absolute offset.
sample_size = 999

print('Predictions for positive test reviews (should be all 0s):')
print(predicted_labels[:sample_size])
print('\nPredictions for negative test reviews (should be all 1s):')
print(predicted_labels[-sample_size:])

Preditions for positive test reviews (shold be all 0s):
[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0
 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0
 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1
 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0