In [None]:
import numpy as np
import pandas as pd
import re
import os
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

# Request the TensorFlow backend for Keras through the KERAS_BACKEND environment variable.
# NOTE(review): this assignment runs after the `keras` imports above. Multi-backend Keras
# presumably reads the variable when the package is first imported, so setting it here may
# have no effect — confirm, and move it above the keras imports if it is meant to matter.
os.environ['KERAS_BACKEND']='tensorflow'

## Load data

In [None]:
# Read the tab-separated training file into a DataFrame and report its (rows, columns) shape
train_path = './data/raw/labeledTrainData.tsv'
data_train = pd.read_csv(train_path, sep='\t')
print(data_train.shape)

In [None]:
# Show the first ten rows as a quick sanity check of the loaded columns
data_train.head(n=10)

In [None]:
# Extract the review text and the sentiment label columns as plain Python lists.
# Series.tolist() converts each column in a single pass and works by position, so the
# result does not depend on the DataFrame's index labels being exactly 0..n-1.
texts = data_train.review.tolist()
labels = data_train.sentiment.tolist()

In [None]:
# Print the first three (label, review) pairs, each preceded by a separator line
for sample_idx in range(3):
    print('-------------')
    print('Label:', labels[sample_idx])
    print('Text: ', texts[sample_idx])

## Vectorize words in movie reviews

In [None]:
# Vocabulary cap handed to the tokenizer (its `num_words` argument)
MAX_NB_WORDS = 20_000
# Fixed length every review is padded to before it is fed to the network
MAX_SEQUENCE_LENGTH = 1_000

In [None]:
# Fit a word-level tokenizer on all reviews, then map every review to a sequence of word ids.
# NOTE(review): `filters` only lists the double quote, single quote and backslash characters,
# so any other punctuation or markup in the reviews stays attached to tokens — confirm intended.
tokenizer = Tokenizer(filters='\"\'\\', num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# word -> integer id mapping learned from the corpus
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

vocab_size = len(word_index)
print('Found %s unique tokens.' % vocab_size)

In [None]:
# The vectorized form of the first movie review in the dataset
first_sequence = sequences[0]
print(first_sequence)

## Preprocess data before feeding into neural network

In [None]:
# Keras expects fixed-shape inputs: bring every sequence to MAX_SEQUENCE_LENGTH with zero
# padding, and turn the integer labels into one-hot rows.
print('Before padding:')
print('Length of sequences list:', len(sequences))
print('Shape of labels list:', len(labels))

label_ids = np.asarray(labels)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# `labels` is rebound here from a list of class ids to a one-hot array
labels = to_categorical(label_ids)

print('After padding:')
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# Split the padded reviews and their one-hot labels into training and validation sets
# (fixed random_state so the split is reproducible)
x_train, x_val, y_train, y_val = train_test_split(
    data,
    labels,
    random_state=0,
)

In [None]:
# Class balance of each split. The labels are one-hot rows, so summing over the sample
# axis yields the number of reviews per class, ordered by class index (label 0, label 1).
print('Number of positive and negative reviews in training and validation set')
print('Training   [label 0, label 1]:', y_train.sum(axis=0))
print('Validation [label 0, label 1]:', y_val.sum(axis=0))

## Build and train the model

In [None]:
# Build the model: embedding -> LSTM -> two-way softmax classifier
model = Sequential()
# The embedding table must cover every word id the tokenizer can emit, so its size is
# tied to MAX_NB_WORDS rather than repeating the literal.
model.add(Embedding(input_dim=MAX_NB_WORDS, output_dim=32, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(100))
# The targets are one-hot rows over two mutually exclusive classes, so the output layer is
# a softmax trained with categorical cross-entropy. With independent sigmoids and
# binary cross-entropy the 'accuracy' metric is computed per output unit instead of
# per review, which does not measure classification accuracy.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
# Fit the network for three epochs in mini-batches of 64, evaluating on the
# validation split at the end of every epoch
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=3,
    validation_data=(x_val, y_val),
)

3 epochs

training accuracy   : 0.9163

validation accuracy : 0.8523  


### Benchmark models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Baseline: a default random forest trained on the same padded word-id sequences.
# The one-hot targets are collapsed back to 1-D class ids so the forest solves a single
# two-class problem instead of two independent output columns.
y_train_ids = y_train.argmax(axis=1)
y_val_ids = y_val.argmax(axis=1)

rfc = RandomForestClassifier(random_state=0)
rfc.fit(x_train, y_train_ids)

# Score on the held-out validation split only; predicting on the full dataset would
# include the rows the forest was trained on and overstate every metric.
y_pred = rfc.predict(x_val)

print("Accuracy:")
print(accuracy_score(y_val_ids, y_pred))

print("Precision and recall:")
print(classification_report(y_val_ids, y_pred))


With a vanilla random forest model, we can get up to 82% accuracy, 89% precision and 83% recall rate

In [None]:
# TODO: Vectorize a string ('this movie is the worst') and use the model to predict sentiment