# Sentiment analysis on IMDB large movie review dataset

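Get the dataset from [here](http://ai.stanford.edu/~amaas/data/sentiment/) and extract it so that the `aclImdb` folder is in the notebook's working directory (the code below reads from `./aclImdb`).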

In [106]:
from glob import glob
import pandas as pd
import io
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import models, layers

## Prepare dataset

In [1]:
def parse_text(text):
    """Normalise a raw review: lowercase it, then trim surrounding whitespace."""
    lowered = text.lower()
    return lowered.strip()

def read_and_append_files(folder, label, root='./aclImdb'):
    """Read every ``*.txt`` file of one dataset folder and tag it with a label.

    Parameters
    ----------
    folder : str
        Sub-folder of ``root`` to read, e.g. ``'train/pos'``.
    label : any
        Value attached to every review read from ``folder``.
    root : str, optional
        Directory that contains the dataset. Defaults to ``'./aclImdb'``.

    Returns
    -------
    list of (str, label) tuples
        One ``(parsed_text, label)`` pair per file, in sorted path order.
    """
    pattern = '{0}/{1}/*.txt'.format(root, folder)

    texts = []
    # Sort so the result does not depend on the file system's listing order.
    for path in sorted(glob(pattern)):
        # Decode explicitly as UTF-8 instead of relying on the platform default.
        with io.open(path, 'r', encoding='utf-8') as f:
            texts.append((parse_text(f.read()), label))

    return texts

def make_df(pos, neg, random_state=None):
    """Combine positive and negative samples into one shuffled DataFrame.

    Parameters
    ----------
    pos, neg : iterable of (review, label) pairs
        Samples to combine; either may be empty.
    random_state : optional
        Passed to ``DataFrame.sample`` so the shuffle can be made reproducible.
        The default (``None``) keeps the shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        Columns ``['review', 'label']``, rows shuffled, index reset to 0..n-1.
    """
    rows = list(pos) + list(neg)
    # Naming the columns in the constructor also works when there are no rows,
    # whereas assigning ``df.columns`` afterwards raises on an empty frame.
    df = pd.DataFrame(rows, columns=['review', 'label'])

    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [3]:
# Load the labelled reviews of each split: 1 = positive, 0 = negative.
train_pos, train_neg = (
    read_and_append_files('train/pos', 1),
    read_and_append_files('train/neg', 0),
)

test_pos, test_neg = (
    read_and_append_files('test/pos', 1),
    read_and_append_files('test/neg', 0),
)

In [4]:
# One shuffled (review, label) frame per split.
train_df, test_df = (
    make_df(train_pos, train_neg),
    make_df(test_pos, test_neg),
)

In [5]:
# Peek at the first five shuffled training rows.
train_df.head(n=5)

Unnamed: 0,review,label
0,i bought this at tower records after seeing th...,0
1,a novel by remarque. a cast that looks great o...,0
2,i enjoyed watching this well acted movie very ...,1
3,this is an excellent film and one should not b...,1
4,i really like 101 dalmations when it came out ...,1


In [6]:
# Peek at the first five shuffled test rows.
test_df.head(n=5)

Unnamed: 0,review,label
0,this is by far the worst movie i have ever see...,0
1,why do i hate this? let me list the ways:<br /...,0
2,nothing will ever top komodo with the lovely j...,0
3,"i recently rented this video after seeing ""fin...",1
4,"i was attracted to this film by its offbeat, l...",0


## Embeddings

I use [fastText pre-trained word embeddings](https://fasttext.cc/docs/en/english-vectors.html).

In [89]:
# Build the vocabulary from the training reviews only.
t = Tokenizer()
t.fit_on_texts(texts=train_df['review'])

In [159]:
def load_vectors(fname, word_index):
    """Build an embedding matrix for ``word_index`` from a text ``.vec`` file.

    The file must start with a header line ``"<count> <dimension>"``, followed
    by one line per word: the word, then its vector components, separated by
    single spaces.

    Parameters
    ----------
    fname : str
        Path to the vector file.
    word_index : dict
        Maps each word to its integer row in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, dimension)``. Rows whose word
        does not appear in the file stay all-zero.
    """
    # The context manager closes the file even if a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Size the matrix from the header instead of assuming 300 dimensions.
        _, dim = map(int, fin.readline().split())
        embedding_matrix = np.zeros((len(word_index) + 1, dim))

        for line in fin:
            # Split off the word first so out-of-vocabulary lines are skipped
            # without parsing their components.
            word, _, values = line.rstrip().partition(' ')
            if word in word_index:
                row = word_index[word]
                embedding_matrix[row] = np.fromiter(map(float, values.split(' ')),
                                                    dtype=float)

    return embedding_matrix

In [160]:
%time vectors = load_vectors('/home/eric/Downloads/wiki-news-300d-1M.vec', t.word_index)

CPU times: user 14 s, sys: 404 ms, total: 14.4 s
Wall time: 14.3 s


In [166]:
# Embedding layer initialised with the pre-trained vectors and fine-tuned in training.
# Both dimensions come from `vectors`, so this cell does not depend on `x_train`,
# which is only created in a later cell. The weight matrix must have shape
# (input_dim, output_dim), which this guarantees.
embedding_layer = layers.Embedding(input_dim=vectors.shape[0],
                                   output_dim=vectors.shape[1],
                                   weights=[vectors],
                                   trainable=True)

## Machine Learning with Keras

In [135]:
# Encode each review as one row with one column per vocabulary entry
# ('binary' is the default mode of texts_to_matrix).
x_train, y_train = t.texts_to_matrix(train_df['review'], mode='binary'), train_df['label']

x_test, y_test = t.texts_to_matrix(test_df['review'], mode='binary'), test_df['label']

In [149]:
# Check that features and labels line up for both splits.
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

((25000, 88566), (25000,), (25000, 88566), (25000,))

### Logistic regression

In [181]:
# Logistic regression: a single sigmoid unit applied directly to the input features.
bow_input = layers.Input(shape=x_train.shape[1:])
probability = layers.Dense(units=1, activation='sigmoid')(bow_input)
model = models.Model(inputs=[bow_input], outputs=[probability])

model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

In [182]:
# Train for five epochs, using the test split for validation after each epoch.
model.fit(x_train,
          y_train,
          epochs=5,
          validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fefce95d898>

### Using embeddings and a multi-layer perceptron

In [183]:
# Multi-layer perceptron on top of the pre-trained embedding layer.
i = layers.Input(shape=(x_train.shape[1],))
h = embedding_layer(i)
# The embedding emits one vector per input position (a 3-D tensor). Average over
# the positions so the network ends in a single probability per review.
h = layers.GlobalAveragePooling1D()(h)
h = layers.Dense(128, activation='relu')(h)
h = layers.Dropout(0.3)(h)
h = layers.Dense(64, activation='relu')(h)
h = layers.Dropout(0.3)(h)
h = layers.Dense(32, activation='relu')(h)
h = layers.Dropout(0.3)(h)
h = layers.Dense(1, activation='sigmoid')(h)

# Build a new model from this graph. Without this, `model` would still be the
# previously compiled model and compile/fit would silently retrain that one.
model = models.Model(inputs=[i], outputs=[h])

# NOTE(review): x_train is produced by Tokenizer.texts_to_matrix, while an
# Embedding layer looks up integer word indices. Consider feeding padded index
# sequences (texts_to_sequences + pad_sequences) instead. TODO confirm.
model.compile(loss='binary_crossentropy',
              optimizer='sgd',
              metrics=['binary_accuracy'])

In [184]:
# Train for five epochs, using the test split for validation after each epoch.
model.fit(x_train,
          y_train,
          epochs=5,
          validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fefcf5b0160>