In [1]:
# TODO: tune min_count, stop_words, classifier architecture

# Import packages
import re
import tflearn
import numpy as np
import pandas as pd
import tensorflow as tf
from random import shuffle
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

In [2]:
# Explore Data
# All three files are tab separated and read with quoting=3, the same settings
# for each, so they are loaded in one pass.
labeled_train, unlabeled_train, test = [
    pd.read_csv(path, delimiter='\t', quoting=3)
    for path in ('labeledTrainData.tsv', 'unlabeledTrainData.tsv', 'testData.tsv')
]

# Report the shape and the first rows of every frame.
for heading, frame in (
        ('Labeled train data shape: ', labeled_train),
        ('\nUnlabeled train data shape: ', unlabeled_train),
        ('\nTest data shape: ', test)):
    print(heading + str(frame.shape))
    print(frame.head())

Labeled train data shape: (25000, 3)
         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3  "3630_4"          0  "It must be assumed that those who praised thi...
4  "9495_8"          1  "Superbly trashy and wondrously unpretentious ...

Unlabeled train data shape: (50000, 2)
          id                                             review
0   "9999_0"  "Watching Time Chasers, it obvious that it was...
1  "45057_0"  "I saw this film about 20 years ago and rememb...
2  "15561_0"  "Minor Spoilers<br /><br />In New York, Joan B...
3   "7161_0"  "I went to see this film with a great deal of ...
4  "43971_0"  "Yes, I agree with everyone on this site this ...

Test data shape: (25000, 2)
           id                                             review
0

In [3]:
# Preprocess reviews
stop_words = set(stopwords.words("english"))

def parse_html(data):
    """Turn one raw review string into a list of lowercase word tokens.

    Markup is stripped with BeautifulSoup (lxml parser), every character that
    is not an ASCII letter is replaced by a space, the text is lowercased and
    split on whitespace, and tokens contained in ``stop_words`` are dropped.
    """
    text = BeautifulSoup(data, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]"," ", text)
    tokens = []
    for word in letters_only.lower().split():
        if word not in stop_words:
            tokens.append(word)
    return tokens
    
def _tag_reviews(frame, prefix):
    """Tokenize every review in ``frame`` and wrap each in a TaggedDocument.

    The review at row position ``i`` receives the single tag ``prefix + str(i)``.

    frame  -- DataFrame with a 'review' column of raw review strings
    prefix -- string prepended to the row position to form the document tag
    Returns a list with one TaggedDocument per row, in row order.
    """
    # Iterating the column directly avoids a label-based lookup per row and
    # keeps the tag numbering positional regardless of the frame's index.
    return [TaggedDocument(parse_html(review), [prefix + str(position)])
            for position, review in enumerate(frame['review'])]

labeled_train_reviews = _tag_reviews(labeled_train, 'labeled_train_')
unlabeled_train_reviews = _tag_reviews(unlabeled_train, 'unlabeled_train_')
test_reviews = _tag_reviews(test, 'test_')
    
# Show how many documents each collection holds, followed by its first entry.
for caption, documents in (
        ('Labeled train reviews shape: ', labeled_train_reviews),
        ('\nUnlabeled train reviews shape: ', unlabeled_train_reviews),
        ('\nTest reviews shape: ', test_reviews)):
    print(caption + str(len(documents)))
    print(documents[0])

Labeled train reviews shape: 25000
TaggedDocument([u'stuff', u'going', u'moment', u'mj', u'started', u'listening', u'music', u'watching', u'odd', u'documentary', u'watched', u'wiz', u'watched', u'moonwalker', u'maybe', u'want', u'get', u'certain', u'insight', u'guy', u'thought', u'really', u'cool', u'eighties', u'maybe', u'make', u'mind', u'whether', u'guilty', u'innocent', u'moonwalker', u'part', u'biography', u'part', u'feature', u'film', u'remember', u'going', u'see', u'cinema', u'originally', u'released', u'subtle', u'messages', u'mj', u'feeling', u'towards', u'press', u'also', u'obvious', u'message', u'drugs', u'bad', u'kay', u'visually', u'impressive', u'course', u'michael', u'jackson', u'unless', u'remotely', u'like', u'mj', u'anyway', u'going', u'hate', u'find', u'boring', u'may', u'call', u'mj', u'egotist', u'consenting', u'making', u'movie', u'mj', u'fans', u'would', u'say', u'made', u'fans', u'true', u'really', u'nice', u'actual', u'feature', u'film', u'bit', u'finally', u's

In [4]:
workers = 8

# Train one Doc2Vec model on every review (labeled, unlabeled and test) so that
# each tagged document ends up with a learned vector.
all_reviews = labeled_train_reviews + unlabeled_train_reviews + test_reviews

# The vector width is stated explicitly: the cells that consume the vectors
# allocate 300 columns, so relying on the library default only works while
# that default happens to be 300.
# `size` is the keyword name in the pre-4.0 gensim API (renamed `vector_size` in 4.0).
d2v = Doc2Vec(size=300, workers=workers)
d2v.build_vocab(all_reviews)

# NOTE(review): in older gensim releases each train() call appears to restart
# the learning-rate schedule from the initial alpha -- confirm against the
# installed version before changing the number of passes.
for i in range(10):
    # Shuffles in place; all_reviews is a new list built by concatenation above,
    # so the three source lists keep their order.
    shuffle(all_reviews)
    d2v.train(all_reviews)
    print('epoch %i complete' % (i + 1))

epoch 1 complete
epoch 2 complete
epoch 3 complete
epoch 4 complete
epoch 5 complete
epoch 6 complete
epoch 7 complete
epoch 8 complete
epoch 9 complete
epoch 10 complete


In [5]:
# Get train and test vectors
def _stack_docvecs(model, prefix, count):
    """Stack the document vectors tagged ``prefix + '0'`` .. ``prefix + str(count - 1)``.

    model  -- trained model whose ``docvecs`` is indexed by document tag
    prefix -- tag prefix of the documents to fetch
    count  -- number of documents carrying that prefix
    Returns a float64 array with one row per document; the column count is
    taken from the vectors themselves instead of being hard-coded.
    """
    return np.array([model.docvecs[prefix + str(i)] for i in range(count)],
                    dtype=np.float64)

train_x = _stack_docvecs(d2v, 'labeled_train_', len(labeled_train))

# One-hot targets: [1, 0] where sentiment == 0, [0, 1] for any other value.
# Row k of the 2x2 identity matrix is exactly the one-hot vector for class k.
is_positive = (labeled_train['sentiment'].values != 0).astype(int)
train_y = np.eye(2)[is_positive]

test_x = _stack_docvecs(d2v, 'test_', len(test))

In [10]:
# Graph definition
with tf.Graph().as_default():
    # 300 inputs -> 1024 relu -> 128 relu -> dropout -> 2-way softmax
    net = tflearn.input_data(shape=[None, 300])
    for n_units in (1024, 128):
        net = tflearn.fully_connected(net, n_units, 'relu')
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam')

    # Model training, then prediction on the test vectors while the graph is
    # still the default one.
    model = tflearn.DNN(net, tensorboard_dir='tensorboard')
    model.fit(train_x, train_y, n_epoch=10, show_metric=True)
    predictions = model.predict(test_x)

# The predicted class is the index of the larger of the two softmax outputs.
predicted_labels = np.argmax(predictions, axis=1).tolist()
output = pd.DataFrame(data={"id": test["id"], "sentiment": predicted_labels})
output.to_csv("imdb_sentiment_analysis.csv", index=False, quoting=3)

Training Step: 3910  | total loss: [1m[32m0.01524[0m[0m
| Adam | epoch: 010 | loss: 0.01524 - acc: 0.9956 -- iter: 25000/25000
Training Step: 3910  | total loss: [1m[32m0.01524[0m[0m
| Adam | epoch: 010 | loss: 0.01524 - acc: 0.9956 -- iter: 25000/25000
--
