# Sentiment analysis on IMDB large movie review dataset

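Download the dataset from the [Large Movie Review Dataset page](http://ai.stanford.edu/~amaas/data/sentiment/) and extract it so that the `aclImdb/` folder sits next to this notebook.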

In [1]:
from glob import glob
import pandas as pd
import io
import numpy as np
from IPython.display import SVG

from keras import models, layers
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot

import os
# GPU configuration for this process (see issue #152): enumerate devices in
# PCI bus order and leave the visible-device list empty — presumably so the
# run stays on CPU; confirm against the CUDA environment-variable docs.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "",
})

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Prepare dataset

In [2]:
def parse_text(text):
    """Normalise a raw review: lower-case it, then trim surrounding whitespace."""
    lowered = text.lower()
    return lowered.strip()

def read_and_append_files(folder, label):
    """Read every ``.txt`` file under ``./aclImdb/<folder>`` and label it.

    Parameters
    ----------
    folder : str
        Sub-folder of ``./aclImdb``, e.g. ``'train/pos'``.
    label : object
        Value paired with every text read from ``folder``.

    Returns
    -------
    list of tuple
        ``(text, label)`` pairs in sorted file-path order, where ``text`` has
        been normalised by ``parse_text``.
    """
    texts = []
    # glob() returns paths in arbitrary order; sorting makes repeated runs
    # produce the same list.
    for path in sorted(glob('./aclImdb/{0}/*.txt'.format(folder))):
        # Decode explicitly as UTF-8 (as load_vectors does) instead of relying
        # on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            texts.append((parse_text(f.read()), label))

    return texts

def make_df(pos, neg, random_state=None):
    """Combine positive and negative samples into one shuffled DataFrame.

    Parameters
    ----------
    pos, neg : list of tuple
        ``(review, label)`` records for each class.
    random_state : int or numpy random state, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. The default ``None`` keeps the shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` and ``label``, rows shuffled, index reset to
        ``0..n-1``.
    """
    df = (
        pd.concat([pd.DataFrame(pos), pd.DataFrame(neg)])
        # frac=1 draws every row once, i.e. a full shuffle.
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    df.columns = ['review', 'label']

    return df

In [3]:
# Load each split/class folder; label 1 is used for pos/, 0 for neg/.
train_pos = read_and_append_files(folder='train/pos', label=1)
train_neg = read_and_append_files(folder='train/neg', label=0)

test_pos = read_and_append_files(folder='test/pos', label=1)
test_neg = read_and_append_files(folder='test/neg', label=0)

In [4]:
# One shuffled frame per split, each mixing both classes.
train_df = make_df(pos=train_pos, neg=train_neg)
test_df = make_df(pos=test_pos, neg=test_neg)

In [5]:
# Preview the first rows of the shuffled training frame.
train_df.head(n=5)

Unnamed: 0,review,label
0,i've been trying to find out about this series...,1
1,secret sunshine marks the return of director l...,1
2,the clouded yellow is a compact psychological ...,1
3,"maybe i'm biased to foxes, fox stories and all...",1
4,every kid has that movie that he pops into vhs...,1


In [6]:
# Preview the first rows of the shuffled test frame.
test_df.head(n=5)

Unnamed: 0,review,label
0,"ah, here it is! a movie, which is said by peop...",0
1,after watch this movie i was surprised that so...,0
2,william shakespeare probably didn't envision s...,1
3,"""four daughters,"" a sentimental story of a sol...",1
4,great movie. i was laughing all time through. ...,1


## Embeddings

This notebook uses the [fastText pre-trained English word vectors](https://fasttext.cc/docs/en/english-vectors.html); download `wiki-news-300d-1M.vec` and place it next to this notebook.

In [7]:
# Build the vocabulary from the training reviews only, so the word index
# never sees test text.
t = Tokenizer()
t.fit_on_texts(texts=train_df['review'])

In [8]:
def load_vectors(fname, word_index):
    """Build an embedding matrix for ``word_index`` from a ``.vec`` text file.

    The file must start with a header line ``"<count> <dim>"``, followed by
    one line per word: the word, then its space-separated float components.

    Parameters
    ----------
    fname : str
        Path to the vectors file.
    word_index : dict
        Maps each word to its integer row in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, dim)``, where ``dim`` comes
        from the file header. Rows for words absent from the file stay zero.
    """
    # The with-block closes the file even if parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header: vector count (unused) and dimensionality. The matrix width
        # follows the file instead of a hard-coded 300.
        _, dim = map(int, fin.readline().split())
        embedding_matrix = np.zeros((len(word_index) + 1, dim))

        for line in fin:
            tokens = line.rstrip().split(' ')
            if tokens[0] in word_index:
                row = word_index[tokens[0]]
                embedding_matrix[row] = np.fromiter(map(float, tokens[1:]), 'float')

    return embedding_matrix

In [9]:
%time vectors = load_vectors('./wiki-news-300d-1M.vec', t.word_index)

CPU times: user 13.5 s, sys: 422 ms, total: 13.9 s
Wall time: 13.9 s


## Machine Learning with Keras

In [10]:
# Bag-of-words features: one row per review, one column per vocabulary index
# ('binary' is the tokenizer's default mode, spelled out here).
x_train = t.texts_to_matrix(train_df['review'], mode='binary')
x_test = t.texts_to_matrix(test_df['review'], mode='binary')

# Targets are the label columns of the corresponding frames.
y_train = train_df['label']
y_test = test_df['label']

In [11]:
# Sanity check: features and targets must agree on the number of rows.
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

((25000, 88566), (25000,), (25000, 88566), (25000,))

### Logistic regression

In [12]:
# Logistic regression written as a network: the feature vector feeds a single
# sigmoid unit.
i = layers.Input(shape=x_train.shape[1:])
h = layers.Dense(1, activation='sigmoid')(i)
model = models.Model(inputs=[i], outputs=[h])

model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

# Optional: uncomment to render the model graph inline.
# SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

In [13]:
# Train for 5 epochs; the test split is passed as validation data.
model.fit(x_train, y_train,
          epochs=5,
          validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f951967d908>

### Embeddings and a multi-layer perceptron

In [16]:
# Encode each review as a sequence of word indices, padded or truncated to a
# fixed length of 500 so the reviews stack into one 2-D array.
x_train = pad_sequences(
    sequences=t.texts_to_sequences(train_df['review']),
    maxlen=500,
)
x_test = pad_sequences(
    sequences=t.texts_to_sequences(test_df['review']),
    maxlen=500,
)

# Targets are unchanged: the label column of each frame.
y_train = train_df['label']
y_test = test_df['label']

In [17]:
# Sanity check: sequences are now fixed-width and aligned with the targets.
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

((25000, 500), (25000,), (25000, 500), (25000,))

In [26]:
# Frozen pre-trained embeddings -> mean over the sequence -> small MLP.
i = layers.Input(shape=(x_train.shape[1],))

# `vectors` initialises the embedding table; trainable=False keeps it fixed
# during training.
embedding_layer = layers.Embedding(input_dim=vectors.shape[0],
                                   output_dim=vectors.shape[1],
                                   weights=[vectors],
                                   trainable=False)

h = embedding_layer(i)
# Built-in pooling layer in place of Lambda(lambda r: K.mean(r, axis=1)):
# the same mean over the sequence axis, without embedding an anonymous
# Python function in the model.
# NOTE: no masking is configured, so the mean runs over every position of
# the padded sequence, padding included.
h = layers.GlobalAveragePooling1D()(h)
h = layers.Dense(128, activation='relu')(h)
h = layers.Dropout(0.3)(h)
h = layers.Dense(32, activation='relu')(h)
h = layers.Dropout(0.3)(h)
h = layers.Dense(1, activation='sigmoid')(h)

model = models.Model(inputs=[i], outputs=[h])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])

In [27]:
# Train for 5 epochs; the test split is passed as validation data.
model.fit(x_train, y_train,
          epochs=5,
          validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f950242f240>

### Embeddings and GRU

In [28]:
# Recurrent variant: the frozen embedding sequence is summarised by a GRU
# rather than averaged.
i = layers.Input(shape=x_train.shape[1:])

embedding_layer = layers.Embedding(
    input_dim=vectors.shape[0],
    output_dim=vectors.shape[1],
    weights=[vectors],
    trainable=False,
)

h = embedding_layer(i)
h = layers.GRU(units=128)(h)
h = layers.Dropout(rate=0.3)(h)
h = layers.Dense(units=32, activation='relu')(h)
h = layers.Dropout(rate=0.3)(h)
h = layers.Dense(units=1, activation='sigmoid')(h)

model = models.Model(inputs=[i], outputs=[h])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

In [None]:
# Train for 5 epochs; the test split is passed as validation data.
model.fit(x_train, y_train,
          epochs=5,
          validation_data=(x_test, y_test))