In [1]:
import numpy as np
import pandas as pd
import re
import os
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

os.environ['KERAS_BACKEND']='tensorflow'

Using TensorFlow backend.


## Load data

In [3]:
# Load data
data_train = pd.read_csv('./data/raw/labeledTrainData.tsv', sep='\t')
print(data_train.shape)

(25000, 3)


In [4]:
data_train.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [5]:
texts = []
labels = []

for idx in range(data_train.review.shape[0]):
    texts.append(data_train.review[idx])
    labels.append(data_train.sentiment[idx])

In [6]:
for i in range(3):
    print('-------------')
    print('Label:', labels[i])
    print('Text: ', texts[i])

-------------
('Label:', 1)
('Text: ', "With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actu

## Vectorize words in movie reviews

In [7]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000

In [8]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='\"\'\\')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 216667 unique tokens.


In [9]:
print(sequences[0])
# Note: this is just a vectorized form of the first movie review in the dataset

[17, 37, 10, 716, 166, 211, 31, 1, 678, 17, 11172, 9, 125, 613, 2853, 5, 24, 1311, 155, 1, 1096, 782, 188, 3, 1007, 279, 1, 3, 279, 16594, 665, 291, 9, 40, 177, 5, 72, 2, 729, 2998, 73, 10, 244, 35, 9, 201, 14, 62, 794, 7, 1, 7057, 40, 5, 291, 91, 65, 58, 427, 657, 23, 6, 2701, 41, 17295, 16594, 6, 187, 187, 900, 25, 64, 9, 375, 166, 5, 67, 31, 1, 596, 52, 8, 14, 1975, 4723, 46, 4, 8, 43, 1451, 4015, 44, 11172, 12, 517, 861, 1, 4273, 3, 84, 1, 578, 857, 4, 2482, 26, 96, 135, 13, 1429, 18, 4, 431, 10, 6, 37, 44, 469, 1811, 39, 950, 20, 2515, 38, 11172, 7, 3055, 97, 20, 26, 166, 5, 783, 10, 3, 160, 8, 1601, 46, 190, 615, 11172, 34, 16, 5, 1, 241, 4, 10, 22, 18, 11172, 3, 79, 4, 24, 518, 55, 149, 11, 23, 95, 8, 16, 1, 518, 64, 47, 303, 6, 62, 312, 4, 2349, 13, 93, 681, 900, 25, 218, 52, 8, 449, 460, 6, 59, 21, 16, 972, 281, 41, 39, 1, 4246, 1907, 816, 3, 879, 16595, 6, 1274, 15, 2, 14364, 37, 1068, 1481, 15991, 145, 23, 441, 11172, 412, 39, 96, 6, 668, 467, 77, 11172, 24, 879, 16595, 12, 

## Preprocess data before feeding into neural network

In [10]:
# Zeropad sequences, because Keras expects vectors of a fixed shape
print('Before padding:')
print('Length of sequences list:', len(sequences))
print('Shape of labels list:', len(labels))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))

print('After padding:')
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Before padding:
('Length of sequences list:', 25000)
('Shape of labels list:', 25000)
After padding:
('Shape of data tensor:', (25000, 1000))
('Shape of label tensor:', (25000, 2))


In [11]:
# Split data into train and tes sets
x_train, x_val, y_train, y_val = train_test_split(data, labels, random_state=0)

In [12]:
print('Number of positive and negative reviews in traing and validation set ')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in traing and validation set 
[ 9345.  9405.]
[ 3155.  3095.]


# Build and train the model

In [None]:
# build the model
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=32, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(100))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 32)          640000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 693,402
Trainable params: 693,402
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train the model
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=3, batch_size=64)

Train on 18750 samples, validate on 6250 samples
Epoch 1/3
  896/18750 [>.............................] - ETA: 666s - loss: 0.6934 - acc: 0.5017

3 epoches

training accuracy   : 09163

validation accuracy : 0.8523  


### Benchmark models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(random_state=0)
rfc.fit(x_train, y_train)


y_pred = rfc.predict(data)

print("Accuracy:")
print(accuracy_score(labels, y_pred))

print("Precision and recall:")
print(classification_report(labels, y_pred))


With a vanilla random forest model, we can get up to 82% accuracy, 89% precision and 83% recall rate

In [None]:
# TODO: Vectorize a string ('this movie is the worst') and use the model to predict sentiment