# Recurrent neural network: IMDB example

In [192]:
import numpy as np
import pandas as pd
import matplotlib as plt

from keras.utils import plot_model
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import SimpleRNN, Embedding, Input
from keras.layers.recurrent import LSTM, GRU
from typing import List

In [193]:
# For reproducibility: seed NumPy's global random number generator.
# NOTE(review): this only seeds NumPy; the Keras backend presumably keeps its
# own random state, so runs may still differ - TODO confirm.
np.random.seed(1234)

In [194]:
import os

# Directory that receives the rendered model diagrams (see display_model).
img_path = './img'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing os.path.isdir() before calling os.mkdir().
os.makedirs(img_path, exist_ok=True)

## Global parameters and helper functions

In [213]:
# Input dimension handed to the Embedding layers.
size_vocabulary = 10000
# Number of most frequent words kept when loading the IMDB data.
num_words = 500
# Length every review is brought to by pad_sequences.
max_sentence_len = 200
# Size of the embedding vectors; also reused as the number of recurrent units.
embedding_dim = 128
# Mini-batch size used by train_and_eval.
batch_size = 128

In [214]:
# TODO: Change the default number of epochs to 4
def train_and_eval(model, input_train, y_train, input_test, y_test,
                   epochs=1, validation_split=0.2):
    """Fit `model` on the training data and print its test performance.

    Args:
        model: A compiled Keras model. Its `evaluate` call must return
            exactly two values (loss and one metric).
        input_train: Training inputs passed to `model.fit`.
        y_train: Training labels passed to `model.fit`.
        input_test: Test inputs passed to `model.evaluate`.
        y_test: Test labels passed to `model.evaluate`.
        epochs: Number of training epochs (default 1, the former fixed value).
        validation_split: Fraction of the training data that `model.fit`
            holds out for validation (default 0.2, the former fixed value).

    The mini-batch size is read from the module-level `batch_size`.
    Prints the test score and the test accuracy; returns nothing.
    """
    model.fit(input_train,
              y_train,
              epochs=epochs,
              batch_size=batch_size,
              validation_split=validation_split)

    # Unpacking into two names relies on the model having a single metric.
    score, acc = model.evaluate(input_test, y_test)

    print('Test score:', score)
    print('Test accuracy:', acc)

In [215]:
def display_model(model):
    """Render `model` as a PNG diagram and return it for inline display.

    The diagram is written to ./img/<model.name>.png with `plot_model`; the
    ./img directory is expected to exist already. The returned IPython
    `Image` refers to that file by its relative path.
    """
    # Imported locally so IPython is only required when a diagram is shown.
    from IPython.display import Image

    path = './img/{}.png'.format(model.name)
    plot_model(model, to_file=path)

    return Image(url=path)

## The data

The data set consists of movie reviews from IMDB, the Internet Movie Database. It contains 25,000 reviews for training and another 25,000 reviews for testing, each labeled by sentiment (positive/negative). 

Fortunately the data has already been preprocessed: each review is encoded as a sequence of word indexes (integers). The word index itself is the position of a word in the list of all unique words contained in all 25,000 movies ordered by their frequency (from most frequent words down to the most uncommon ones). So the word index "3" encodes the 3rd most frequent word in the data, which happens to be "a" like in "a movie".

By convention, "0" does not stand for a specific word. Instead it is used to encode any unknown word and can be used for padding (see below).

Here are some example reviews from IMDB:

We load the data set and split it into training and test sets. Notice that we only keep the `num_words` most common words by setting `num_words=num_words`:

In [216]:
# Load both splits, keeping only the `num_words` most frequent words.
# NOTE(review): with index_from=0 the word indexes are presumably not shifted,
# so Keras' default start marker (1) and out-of-vocabulary marker (2) would
# share their values with real word indexes - TODO confirm against the
# imdb.load_data documentation.
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=num_words, 
                                                              index_from=0)

Let's get an overview of how big the sets are. As it turns out, the data is split 50% for training and 50% for testing. (Note: We cannot specify `test_split` for this data set in the `load_data` function.) As promised, there are two labels.

In [217]:
# Number of reviews in each split.
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

# Shapes of the inputs and labels of both splits.
print('Shape train sequences:', input_train.shape)
print('Shape test sequences:', input_test.shape)
print('Shape train labels:', y_train.shape)
print('Shape test labels:', y_test.shape)
# Count the distinct label values in the test split.
print('Number of classes:', len(np.unique(y_test)))

25000 train sequences
25000 test sequences
Shape train sequences: (25000,)
Shape test sequences: (25000,)
Shape train labels: (25000,)
Shape test labels: (25000,)
Number of classes: 2


We can use Pandas to have a look at the first training samples. As said each of them is a list of word indexes.

In [218]:
# Wrap the training reviews in a one-column DataFrame and show its first rows.
pd.DataFrame(input_train, columns=["word_index_sequence"]).head()

Unnamed: 0,word_index_sequence
0,"[1, 11, 19, 13, 40, 2, 2, 2, 2, 62, 455, 2, 63..."
1,"[1, 191, 2, 191, 2, 75, 225, 2, 3, 2, 2, 2, 13..."
2,"[1, 11, 44, 5, 27, 28, 4, 1, 246, 105, 4, 1, 2..."
3,"[1, 1, 2, 2, 30, 2, 1, 2, 429, 108, 150, 100, ..."
4,"[1, 246, 2, 4, 58, 110, 7, 7, 10, 2, 11, 17, 5..."


The following function loads the word index of the IMDB data set and allows us to translate a word index vector into a human-readable format.

In [227]:
def decode(word_indizes: List[int]):
    """Translate a sequence of word indexes into a space-separated string.

    Indexes without an entry in the IMDB word index are rendered as
    '<missing-word>'.
    """
    # imdb.get_word_index() maps word -> index; flip it to index -> word.
    index_to_word = {}
    for word, index in imdb.get_word_index().items():
        index_to_word[index] = word

    return ' '.join(index_to_word.get(idx, '<missing-word>')
                    for idx in word_indizes)

The obvious counterpart for encoding a review as a word index vector:

In [228]:
def encode(review: str):
    """Translate a review text into a list of IMDB word indexes.

    The text is lower-cased and split on single spaces; tokens without an
    entry in the IMDB word index are encoded as 0.
    """
    vocabulary = imdb.get_word_index()
    tokens = review.lower().split(" ")

    indexes = []
    for token in tokens:
        indexes.append(vocabulary.get(token, 0))
    return indexes

Let's check that everything works fine by encoding and decoding an example:

In [237]:
# Round trip: encode a sentence into word indexes and decode it again.
# encode() splits on spaces only, so "off." keeps its full stop and is not
# found in the word index.
review = "Even those from the era should be turned off."
decode(encode(review)) 

'even those from the era should be turned <missing-word>'

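Here is a real training example. (Note: Some words are clearly missing. Remember that we limited ourselves to the `num_words = 500` most frequent words when loading the data via `imdb.load_data` above; every other word was replaced by a placeholder index, which shows up as "and" in the decoded text.) The review was favorable and thus gets a sentiment score of 1.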

In [239]:
# Decode one training review back into words and print it with its label.
print('Review: "{}"\n'.format(decode(input_train[10])))
print('Sentiment: ', y_train[10])

Review: "and and and and to and well throughout director and and and a great and for the film the fact that most of it takes place and the and and and and that the film and very and and this and and the and idea of the and and to use and to and out of the and it's very and to get behind them it's often said that the and is the thing that really and people and this film and that as the director and that we can never really be sure of and what is and the and and this and to and that and actually does and to be quite and the film is and for a lot of and and the and plot the characters are all very interesting in their own way and the fact that the book itself almost takes on its own character is very well done anyone and that the film and and by the end and be and either as the ending both makes sense and and to be quite and overall and is a truly great horror film and one of the best of the and and and and"

Sentiment:  1


## Preprocessing: Padding

In [240]:
# Bring every review to exactly max_sentence_len word indexes so that each
# split becomes one 2-D array.
input_train = sequence.pad_sequences(input_train, maxlen=max_sentence_len)
input_test = sequence.pad_sequences(input_test, maxlen=max_sentence_len)

In [241]:
# Shapes of both splits after padding.
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

input_train shape: (25000, 200)
input_test shape: (25000, 200)


In [242]:
# Inspect the first padded training review.
input_train[0]

array([  2,  22,  97,  40,   2, 109,  47,   2,   2,   6,  32, 477, 281,
         2, 147,   1, 169, 109, 164,   2, 333, 382,  36,   1, 169,   2,
         2,  14,   2,  35,  10, 444,   1, 189,  47,  13,   3, 144,   2,
        16,  11,  19,   1,   2,   2, 466,   1,  19,  68,  84,   9,  13,
        40,   2,  35,  73,  12,  10,   2,   1,  19,  14,   2,  14,   9,
        13,   2,  15,   2,   2,  59, 383,   9,   5, 313,   5, 103,   2,
         1,   2,   2,  13, 477,  63,   2,  30,   1, 127,   9,  13,  35,
         2,   2,  22, 121,  48,  33, 132,  45,  22,   2,  30,   3,  19,
         9, 212,  25,  74,  49,   2,  11, 404,  13,  79,   2,   5,   1,
       104, 114,   2,  12, 253,   1,   2,   4,   2,   2,   2,  33,  68,
        40,   2, 473,  23, 397, 314,  43,   4,   1,   2,   2,  10, 101,
        85,   1, 378,  12, 294,  95,  29,   2,  53,  23, 138,   3, 191,
         2,  15,   1, 223,  19,  18, 131, 473,  23, 477,   2, 141,  27,
         2,  15,  48,  33,  25, 221,  89,  22, 101,   1, 223,  6

## Simple RNN using the Sequential API

In [243]:
# Embedding -> SimpleRNN -> single sigmoid unit, given as a list of layers.
# The recurrent layer reuses embedding_dim as its number of units.
model = Sequential(
    [
        Embedding(size_vocabulary, embedding_dim, name="embedding_layer"),
        SimpleRNN(units=embedding_dim, name="simple_rnn_layer"),
        Dense(1, activation='sigmoid', name="dense_layer"),
    ],
    name="simple_rnn_sequential",
)

# Binary sentiment labels: binary cross-entropy loss, accuracy as the metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, None, 128)         1280000   
_________________________________________________________________
simple_rnn_layer (SimpleRNN) (None, 128)               32896     
_________________________________________________________________
dense_layer (Dense)          (None, 1)                 129       
Total params: 1,313,025
Trainable params: 1,313,025
Non-trainable params: 0
_________________________________________________________________


In [244]:
# Draw the architecture of the Sequential SimpleRNN model built above.
display_model(model)

In [14]:
# Train the Sequential SimpleRNN model and evaluate it on the test split.
train_and_eval(model, input_train, y_train, input_test, y_test)

Train on 20000 samples, validate on 5000 samples
Epoch 1/1
Test score: 0.6672063104057312
Test accuracy: 0.58304


## Simple RNN using the Functional API

In [27]:
# Same layers as the Sequential SimpleRNN model, wired by calling each layer
# on the output of the previous one.
# The input is a fixed-length sequence of max_sentence_len word indexes.
inputs = Input(shape=(max_sentence_len,), name="input_layer")
embedding_layer = Embedding(size_vocabulary, embedding_dim, name="embedding_layer")(inputs)
simple_rnn_layer = SimpleRNN(units=embedding_dim, name="simple_rnn_layer")(embedding_layer)
predictions = Dense(1, activation='sigmoid', name="dense_layer")(simple_rnn_layer)

# The Model is defined by its input tensor and its output tensor.
model = Model(inputs=inputs, 
              outputs=predictions,
              name="simple_rnn_functional")

# Binary sentiment labels: binary cross-entropy loss, accuracy as the metric.
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 500)               0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 500, 128)          1280000   
_________________________________________________________________
simple_rnn_layer (SimpleRNN) (None, 128)               32896     
_________________________________________________________________
dense_layer (Dense)          (None, 1)                 129       
Total params: 1,313,025
Trainable params: 1,313,025
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Draw the architecture of the functional SimpleRNN model built above.
display_model(model)

In [16]:
# Train the functional SimpleRNN model and evaluate it on the test split.
train_and_eval(model, input_train, y_train, input_test, y_test)

Train on 20000 samples, validate on 5000 samples
Epoch 1/1
Test score: 0.6479414934921265
Test accuracy: 0.62892


## LSTM using the Sequential API

In [29]:
# Embedding -> LSTM -> single sigmoid unit, given as a list of layers.
# The recurrent layer reuses embedding_dim as its number of units.
model = Sequential(
    [
        Embedding(size_vocabulary, embedding_dim, name="embedding_layer"),
        LSTM(units=embedding_dim, name="lstm_layer"),
        Dense(1, activation='sigmoid', name="dense_layer"),
    ],
    name="lstm_sequential",
)

# Binary sentiment labels: binary cross-entropy loss, accuracy as the metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, None, 128)         1280000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 128)               131584    
_________________________________________________________________
dense_layer (Dense)          (None, 1)                 129       
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Draw the architecture of the LSTM model built above.
display_model(model)

In [None]:
# Train the LSTM model and evaluate it on the test split.
train_and_eval(model, input_train, y_train, input_test, y_test)

## GRU using the Sequential API

In [31]:
# Embedding -> GRU -> single sigmoid unit; the same layout as the LSTM model,
# with a GRU as the recurrent layer.
model = Sequential(name="gru_sequential")
model.add(Embedding(size_vocabulary, embedding_dim, name="embedding_layer"))
# Named "gru_layer" (not "lstm_layer") so that the summary and the diagram
# label the recurrent layer correctly.
model.add(GRU(units=embedding_dim, name="gru_layer"))
model.add(Dense(1, activation='sigmoid', name="dense_layer"))

# Binary sentiment labels: binary cross-entropy loss, accuracy as the metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, None, 128)         1280000   
_________________________________________________________________
lstm_layer (GRU)             (None, 128)               98688     
_________________________________________________________________
dense_layer (Dense)          (None, 1)                 129       
Total params: 1,378,817
Trainable params: 1,378,817
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Draw the architecture of the GRU model built above.
display_model(model)