In [1]:
# import packages
import tensorflow as tf
from tensorflow import keras
import numpy as np

import matplotlib.pyplot as plt

tf.__version__

  return f(*args, **kwds)


'1.8.0'

In [2]:
# keep a short handle to the Keras IMDB dataset module; it is used below to
# load the reviews (load_data) and the word -> code dictionary (get_word_index)
movie_review_data = keras.datasets.imdb

In [3]:
# load the dataset (vocabulary capped at 100000 words), then unpack each
# split into its reviews and its labels
train_split, test_split = movie_review_data.load_data(num_words=100000)
train_review, train_labels = train_split
test_review, test_labels = test_split

The argument `num_words=100000` caps the vocabulary at the 100,000 most frequent words, which keeps the data small; without it, the full vocabulary is loaded.

In [4]:
# let's view our data: every split should have exactly one label per review
for reviews, labels in ((train_review, train_labels), (test_review, test_labels)):
    print(len(reviews), len(labels))

25000 25000
25000 25000


In [5]:
# show the first training review (a list of integer word codes) followed by its label
print(train_review[0], train_labels[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32] 1


Don't worry, these are integers denoting different words.

The approach we are taking here uses word embeddings. With word embeddings, each review is converted into a sequence of integer codes, where every code maps to one word of an n-word vocabulary (100,000 in our case, as seen above). Words not present in the vocabulary are replaced by an out-of-vocabulary code. The reviews still differ in length, so we use extra padding to make all reviews the same length.

These codes are then fed to the network, where an embedding layer looks up a vector for each word. The word vectors are then averaged into a single 1D vector of fixed length.
We use ReLU and sigmoid as activation functions.

In [6]:
# compare the number of word codes in the first two training reviews
print(len(train_review[0]), len(train_review[1]))

218 189


Notice that the reviews are not all the same length. We need to make every review the same length.

In [7]:
# now let's get the word -> index mapping so that we can recover the actual
# review text from the integer codes
word_index_map = movie_review_data.get_word_index()

In [8]:
print(len(word_index_map)) # number of entries in the word -> index dictionary loaded above

88584


Now we decode the codes to get the review text back. This is only for illustration; our model will use a built-in embedding layer.

In [9]:
# Push every code up by 3 places to free indices 0-3 for the custom entries below.
# The shifted mapping is rebuilt from the dataset's own word index rather than
# from word_index_map itself, so re-running this cell never applies the +3
# shift a second time (which would silently corrupt decoding).
word_index_map = {word: code + 3 for word, code in movie_review_data.get_word_index().items()}

# add custom indices
word_index_map["<PAD>"] = 0 # extra symbol added to make all reviews the same length
word_index_map["<START>"] = 1 # start of the review
word_index_map["<UNK>"] = 2  # unknown
word_index_map["<UNUSED>"] = 3 # un-used

# now reverse the dict so that the transform index -> word is a simple lookup
reverse_word_index_map = {code: word for word, code in word_index_map.items()}

In [10]:
# now we need to decode the reviews:
# look up the word for every code and join the words with a space character
def decode_review(text):
    """Return the review text for a sequence of integer word codes.

    Each code is looked up in reverse_word_index_map; a code that has no
    entry there is rendered as '?'.
    """
    words = []
    for code in text:
        words.append(reverse_word_index_map.get(code, '?'))
    return ' '.join(words)

# test a sample: print the raw codes first, then the decoded text
sample_codes = train_review[0]
print(sample_codes)
print(decode_review(sample_codes))

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
<START> this film was just brilliant

Now we can see that we have our reviews back. The next step is to prepare the data.

The reviews—the arrays of integers—must be converted to tensors before being fed into the neural network. The tensors need to be of the same length, so the first thing we do is make all the reviews the same length.

In [11]:
# use padding to make the length of every review the same
# maxlen limits each review to 256 word codes (codes, not characters)
# "post" padding appends the <PAD> code at the end of shorter reviews
pad_options = dict(value=word_index_map["<PAD>"], maxlen=256, padding="post")

train_review = keras.preprocessing.sequence.pad_sequences(train_review, **pad_options)
# we do the same for test data
test_review = keras.preprocessing.sequence.pad_sequences(test_review, **pad_options)

In [12]:
# after padding, the first two reviews should report the same length
print(len(train_review[0]), len(train_review[1]))

256 256


As we can see, the reviews are now all the same length.

In [13]:
# display the first review after padding
train_review[0]

array([    1,    14,    22,    16,    43,   530,   973,  1622,  1385,
          65,   458,  4468,    66,  3941,     4,   173,    36,   256,
           5,    25,   100,    43,   838,   112,    50,   670, 22665,
           9,    35,   480,   284,     5,   150,     4,   172,   112,
         167, 21631,   336,   385,    39,     4,   172,  4536,  1111,
          17,   546,    38,    13,   447,     4,   192,    50,    16,
           6,   147,  2025,    19,    14,    22,     4,  1920,  4613,
         469,     4,    22,    71,    87,    12,    16,    43,   530,
          38,    76,    15,    13,  1247,     4,    22,    17,   515,
          17,    12,    16,   626,    18, 19193,     5,    62,   386,
          12,     8,   316,     8,   106,     5,     4,  2223,  5244,
          16,   480,    66,  3785,    33,     4,   130,    12,    16,
          38,   619,     5,    25,   124,    51,    36,   135,    48,
          25,  1415,    33,     6,    22,    12,   215,    28,    77,
          52,     5,

As you can see extra padding is added at the end of the review.

Now we can build our model.

In [14]:
# create the validation set and shrink the training set accordingly:
# the first 10'000 reviews (with their labels) are held out for validation,
# everything after them stays for training
# NOTE: train_review / train_labels are rebound here, so re-running this cell
# would cut another 10'000 samples off the training data
val_review, train_review = train_review[:10000], train_review[10000:]
val_labels, train_labels = train_labels[:10000], train_labels[10000:]

In [15]:
vocab_size = 100000 # number of rows in the embedding table: one per possible word code
embedding_dim = 16 # length of the vector learned for every word
hidden_units = 16 # width of the hidden dense layer

# we develop a sequential model: the layers are stacked in the order listed
layer_stack = [
    keras.layers.Embedding(vocab_size, embedding_dim), # look up the embedding of each word code
    keras.layers.GlobalAvgPool1D(), # average over the words -> fixed-length 1D vector
    keras.layers.Dense(hidden_units, activation=tf.nn.relu), # ReLU keeps values between 0 and infinity
    keras.layers.Dense(1, activation=tf.nn.sigmoid), # sigmoid squashes the output into the range 0-1
]
model = keras.Sequential(layer_stack)

# get the model architecture
model.summary()

# compile the model: attach the optimizer, the loss and the metric to report
model.compile(
    optimizer=tf.train.AdamOptimizer(),
    loss=keras.losses.binary_crossentropy,
    metrics=["accuracy"],
)

# fit our model to the training data
fit_options = dict(
    epochs=20, # the whole training set is passed through 20 times
    batch_size=64, # 64 reviews at a time
    validation_data=(val_review, val_labels), # check performance on the validation dataset
)
model.fit(train_review, train_labels, **fit_options)

# evaluate the model on the test data
model.evaluate(test_review, test_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          1600000   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 1,600,289
Trainable params: 1,600,289
Non-trainable params: 0
_________________________________________________________________
Train on 15000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 1

[0.3299652925682068, 0.87596]

The evaluation returns two values: loss and accuracy, respectively. As we can see, our training accuracy is much higher than the validation and testing accuracy. This gap shows that the model is overfitting: it is learning unimportant patterns from the training data and not generalizing to new data.