In [43]:
import numpy as np
import pandas as pd
import re
import os
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split

import keras
import tensorflow
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

os.environ['KERAS_BACKEND']='tensorflow'

## Clean data

In [2]:
# Load data
data_train = pd.read_csv('./data/raw/labeledTrainData.tsv', sep='\t')
print data_train.shape

(25000, 3)


In [3]:
data_train.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [5]:
# define helpers

def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [7]:
texts = []
labels = []

for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx], 'lxml')
    texts.append(clean_str(text.get_text().encode('ascii','ignore')))
    labels.append(data_train.sentiment[idx])

In [17]:
for i in range(3):
    print('-------------')
    print('Label:', labels[i])
    print('Text: ', texts[i])

-------------
('Label:', 1)
('Text: ', 'with all this stuff going down at the moment with mj ive started listening to his music, watching the odd documentary here and there, watched the wiz and watched moonwalker again. maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. some of it has subtle messages about mjs feeling towards the press and also the obvious message of drugs are bad mkay.visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring. some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him.the actual feature film bit when it

## Vectorize words in movie reviews

In [8]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000

In [20]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 80562 unique tokens.


In [21]:
print(sequences[0])
# Note: this is just a vectorized form of the first movie review in the dataset

[15, 29, 10, 536, 165, 177, 30, 1, 559, 15, 10067, 198, 643, 2620, 5, 23, 223, 144, 1, 1022, 656, 128, 2, 46, 291, 1, 19356, 2, 291, 11563, 169, 275, 9, 40, 180, 5, 75, 3, 806, 2621, 80, 10, 226, 34, 9, 193, 12, 62, 636, 7, 1, 4263, 40, 5, 275, 93, 52, 57, 325, 723, 26, 6, 2514, 39, 1346, 11563, 6, 168, 5053, 168, 778, 18, 59, 9, 373, 165, 5, 63, 30, 1, 432, 50, 8, 12, 1820, 622, 45, 4, 8, 44, 1296, 3464, 41, 544, 943, 1, 3527, 2, 78, 1, 577, 744, 4, 1653, 22, 73, 2012, 1154, 17, 4, 259, 10, 6, 29, 41, 485, 1874, 35, 894, 21, 2593, 37, 10067, 7, 553, 90, 21, 22, 165, 5, 780, 10, 2, 164, 8, 355, 45, 199, 678, 10067, 32, 14, 5, 1, 227, 4, 10, 16, 17, 10067, 2, 87, 4, 23, 444, 58, 130, 11, 26, 89, 8, 14, 1, 444, 59, 43, 277, 6, 62, 323, 4, 86, 1, 776, 778, 18, 222, 50, 8, 412, 513, 6, 61, 19, 14, 886, 229, 39, 35, 16070, 1, 3507, 1670, 715, 2, 904, 11301, 6, 1076, 13, 3, 11900, 29, 974, 1392, 1624, 133, 26, 487, 10067, 341, 35, 73, 6, 720, 68, 84, 10067, 23, 2454, 13854, 904, 104, 11, 26,

## Preprocess data before feeding into neural network

In [36]:
# Zeropad sequences, because Keras expects vectors of a fixed shape
print('Before padding:')
print('Length of sequences list:', len(sequences))
print('Shape of labels list:', len(labels))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))

print('After padding:')
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Before padding:
('Length of sequences list:', 25000)
('Shape of labels list:', 25000)
After padding:
('Shape of data tensor:', (25000, 1000))
('Shape of label tensor:', (25000, 2))


In [39]:
# Split data into train and tes sets
x_train, x_val, y_train, y_val = train_test_split(data, labels, random_state=0)

In [40]:
print('Number of positive and negative reviews in traing and validation set ')
print y_train.sum(axis=0)
print y_val.sum(axis=0)

Number of positive and negative reviews in traing and validation set 
[ 9345.  9405.]
[ 3155.  3095.]


# Build and train the model

In [None]:
# build the model
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=32, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(100))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

In [42]:
# Train the model
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 32)          640000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 693,402
Trainable params: 693,402
Non-trainable params: 0
_________________________________________________________________
None
Train on 18750 samples, validate on 6250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a25855a90>

3 epoches

training accuracy   : 09163

validation accuracy : 0.8523  


### Benchmark models

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(random_state=0)
rfc.fit(x_train, y_train)


y_pred = rfc.predict(data)

print("Accuracy:")
print(accuracy_score(labels, y_pred))

print("Precision and recall:")
print(classification_report(labels, y_pred))


Accuracy:
0.82916
Precision and recall:
             precision    recall  f1-score   support

          0       0.89      0.83      0.86     12500
          1       0.89      0.83      0.86     12500

avg / total       0.89      0.83      0.86     25000



With a vanilla random forest model, we can get up to 82% accuracy, 89% precision and 83% recall rate

In [None]:
# TODO: Vectorize a string ('this movie is the worst') and use the model to predict sentiment