In [2]:
# load packages
import tensorflow as tf
from tensorflow import keras
import numpy as np

tf.__version__

'1.10.0'

In [3]:
# alias for the Keras IMDB dataset module; the cells below call its
# load_data() and get_word_index() functions
movie_review_data = keras.datasets.imdb

In [4]:
# split the IMDB reviews into (reviews, labels) pairs for training and testing;
# num_words restricts the vocabulary to the 100,000 most frequent words
(train_review, train_labels), (test_review, test_labels) = movie_review_data.load_data(
    num_words=100000
)

In [5]:
# one line per split: number of reviews, number of labels (train first, then test)
for reviews, labels in ((train_review, train_labels), (test_review, test_labels)):
    print(len(reviews), len(labels))

25000 25000
25000 25000


In [6]:
# peek at the first three reviews: each one is a list of integer word codes
train_review[:3]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [7]:
# labels of the same three reviews (binary, 0 or 1)
train_labels[:3]

array([1, 0, 0])

In [8]:
# first training review (its sequence of word codes) followed by its label
print(train_review[0], train_labels[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32] 1


Each integer is the code of a word. Only the 100,000 most frequent words are kept; all others are neglected.

In [9]:
# raw reviews are not all the same length: compare the first two
print(*(len(review) for review in train_review[:2]))

218 189


We use post-padding to make all reviews the same length; if a review is already longer than the set length, we neglect the words after that length.

In [10]:
# fetch the dataset's word -> integer code mapping (a dict keyed by word)
word_index_map = movie_review_data.get_word_index()

The reviews—the arrays of integers—must be converted to tensors before being fed into the neural network. Tensors need to be of the same length, so the first thing we do now is make all of the reviews the same length.

In [11]:
# Make room for the special tokens at the start of the mapping.
# load_data() shifts every word code by its default index_from=3 and reserves
# the low codes (0 = padding, 1 = start of review, 2 = out-of-vocabulary word),
# so the mapping has to be shifted by 3 as well to line up with the encoded
# reviews.
# The "<PAD>" guard keeps this cell idempotent: re-running it must not shift
# the codes a second time.
if "<PAD>" not in word_index_map:
    word_index_map = {word: code + 3 for word, code in word_index_map.items()}
    word_index_map["<PAD>"] = 0  # padding value used by pad_sequences below
    word_index_map["<START>"] = 1
    word_index_map["<UNK>"] = 2  # unknown / out-of-vocabulary word
    word_index_map["<UNUSED>"] = 3

In [12]:
# Bring every review to exactly 256 word codes.
# - padding = "post": shorter reviews get the <PAD> code appended at the end
# - truncating = "post": longer reviews lose the words after position 256.
#   pad_sequences defaults to truncating="pre", which would instead drop the
#   beginning of a long review, contrary to what the notebook text describes,
#   so the option is passed explicitly.
train_review = keras.preprocessing.sequence.pad_sequences(train_review,
                                                          value = word_index_map["<PAD>"],
                                                          maxlen = 256,
                                                          padding = "post",
                                                          truncating = "post"
                                                         )
test_review = keras.preprocessing.sequence.pad_sequences(test_review,
                                                         value = word_index_map["<PAD>"],
                                                         maxlen = 256,
                                                         padding = "post",
                                                         truncating = "post"
                                                        )

In [13]:
# after padding, the first two reviews should report the same length
print(*(len(review) for review in train_review[:2]))

256 256


Voilà!!

In [14]:
# review representation as seen by our network: the word codes of the review,
# followed by the <PAD> code (0) up to the fixed length of 256
train_review[0]

array([    1,    14,    22,    16,    43,   530,   973,  1622,  1385,
          65,   458,  4468,    66,  3941,     4,   173,    36,   256,
           5,    25,   100,    43,   838,   112,    50,   670, 22665,
           9,    35,   480,   284,     5,   150,     4,   172,   112,
         167, 21631,   336,   385,    39,     4,   172,  4536,  1111,
          17,   546,    38,    13,   447,     4,   192,    50,    16,
           6,   147,  2025,    19,    14,    22,     4,  1920,  4613,
         469,     4,    22,    71,    87,    12,    16,    43,   530,
          38,    76,    15,    13,  1247,     4,    22,    17,   515,
          17,    12,    16,   626,    18, 19193,     5,    62,   386,
          12,     8,   316,     8,   106,     5,     4,  2223,  5244,
          16,   480,    66,  3785,    33,     4,   130,    12,    16,
          38,   619,     5,    25,   124,    51,    36,   135,    48,
          25,  1415,    33,     6,    22,    12,   215,    28,    77,
          52,     5,

In [15]:
# Carve the validation set out of the test data: the first 10,000 examples
# become the validation set, the remainder stays as the test set.
# NOTE: this cell reassigns test_review / test_labels, so running it a second
# time slices the already-shrunk test set again; run it once per kernel.
n_val = 10000
val_review, test_review = test_review[:n_val], test_review[n_val:]
val_labels, test_labels = test_labels[:n_val], test_labels[n_val:]

# sanity check: reviews and labels of the validation set must have equal counts
print(len(val_review), len(val_labels))

10000 10000


In [16]:
# number of distinct word codes the embedding layer must handle;
# same value as the num_words limit passed to load_data()
vocab_size = 10 ** 5

In [17]:
# early-stopping callback shared by every training run below: it watches the
# validation loss and halts training once 3 epochs pass without an improvement
cbk = [keras.callbacks.EarlyStopping(patience=3, monitor="val_loss")]

In [18]:
# baseline: embedding -> average over the words -> Dense(16) -> sigmoid
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 32))  # 32-dim vector per word code
model.add(keras.layers.GlobalAvgPool1D())  # average over the sequence -> one vector per review
model.add(keras.layers.Dense(16, activation=tf.nn.relu))  # hidden layer, 16 relu units
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))  # single sigmoid output unit

# print the layer / parameter overview
model.summary()

# binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric
model.compile(
    optimizer=tf.train.AdamOptimizer(),
    loss=keras.losses.binary_crossentropy,
    metrics=["accuracy"],
)

# train for at most 100 epochs on mini-batches of 512 reviews;
# the callbacks in cbk may end training earlier
model.fit(
    train_review,
    train_labels,
    batch_size=512,
    epochs=100,
    validation_data=(val_review, val_labels),
    callbacks=cbk,
)

# loss and accuracy on the held-out test reviews
model.evaluate(test_review, test_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          3200000   
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                528       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 3,200,545
Trainable params: 3,200,545
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Ep

[0.2938540789842606, 0.8829333333333333]

The model is overfitting. Let us create a bigger model.

In [29]:
# wider variant: same architecture as the baseline, hidden layer grown to 64 units
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 32))  # 32-dim vector per word code
model.add(keras.layers.GlobalAvgPool1D())  # average over the sequence -> one vector per review
model.add(keras.layers.Dense(64, activation=tf.nn.relu))  # hidden layer, 64 relu units
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))  # single sigmoid output unit

# print the layer / parameter overview
model.summary()

# binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric
model.compile(
    optimizer=tf.train.AdamOptimizer(),
    loss=keras.losses.binary_crossentropy,
    metrics=["accuracy"],
)

# train for at most 100 epochs on mini-batches of 512 reviews;
# the callbacks in cbk may end training earlier
model.fit(
    train_review,
    train_labels,
    batch_size=512,
    epochs=100,
    validation_data=(val_review, val_labels),
    callbacks=cbk,
)

# loss and accuracy on the held-out test reviews
model.evaluate(test_review, test_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 32)          3200000   
_________________________________________________________________
global_average_pooling1d_11  (None, 32)                0         
_________________________________________________________________
dense_27 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 65        
Total params: 3,202,177
Trainable params: 3,202,177
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Ep

[0.2936652056773504, 0.8821333333651225]

No improvement. Let's add more layers.

In [30]:
# deeper variant: two hidden layers of 64 units each
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 32))  # 32-dim vector per word code
model.add(keras.layers.GlobalAvgPool1D())  # average over the sequence -> one vector per review
model.add(keras.layers.Dense(64, activation=tf.nn.relu))  # first hidden layer, 64 relu units
model.add(keras.layers.Dense(64, activation=tf.nn.relu))  # second hidden layer, 64 relu units
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))  # single sigmoid output unit

# print the layer / parameter overview
model.summary()

# binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric
model.compile(
    optimizer=tf.train.AdamOptimizer(),
    loss=keras.losses.binary_crossentropy,
    metrics=["accuracy"],
)

# train for at most 100 epochs on mini-batches of 512 reviews;
# the callbacks in cbk may end training earlier
model.fit(
    train_review,
    train_labels,
    batch_size=512,
    epochs=100,
    validation_data=(val_review, val_labels),
    callbacks=cbk,
)

# loss and accuracy on the held-out test reviews
model.evaluate(test_review, test_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 32)          3200000   
_________________________________________________________________
global_average_pooling1d_12  (None, 32)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_30 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_31 (Dense)             (None, 1)                 65        
Total params: 3,206,337
Trainable params: 3,206,337
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/10

[0.3218144570748011, 0.8778666666666667]

Nope, nothing. We go back to the simpler model and try to overcome overfitting using dropout.

In [33]:
# baseline architecture again, now with dropout between the hidden and output layers
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 32))  # 32-dim vector per word code
model.add(keras.layers.GlobalAvgPool1D())  # average over the sequence -> one vector per review
model.add(keras.layers.Dense(16, activation=tf.nn.relu))  # hidden layer, 16 relu units
model.add(keras.layers.Dropout(0.4))  # dropout with rate 0.4
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))  # single sigmoid output unit

# print the layer / parameter overview
model.summary()

# binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric
model.compile(
    optimizer=tf.train.AdamOptimizer(),
    loss=keras.losses.binary_crossentropy,
    metrics=["accuracy"],
)

# train for at most 100 epochs on mini-batches of 512 reviews;
# the callbacks in cbk may end training earlier
model.fit(
    train_review,
    train_labels,
    batch_size=512,
    epochs=100,
    validation_data=(val_review, val_labels),
    callbacks=cbk,
)

# loss and accuracy on the held-out test reviews
model.evaluate(test_review, test_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, None, 32)          3200000   
_________________________________________________________________
global_average_pooling1d_14  (None, 32)                0         
_________________________________________________________________
dense_34 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_10 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 17        
Total params: 3,200,545
Trainable params: 3,200,545
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/10

[0.28850850370724995, 0.8842000000317891]

In [48]:
# dropout model with a larger embedding (128 dimensions) and smaller batches (128)
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 128))  # 128-dim vector per word code
model.add(keras.layers.GlobalAvgPool1D())  # average over the sequence -> one vector per review
model.add(keras.layers.Dense(16, activation=tf.nn.relu))  # hidden layer, 16 relu units
model.add(keras.layers.Dropout(0.4))  # dropout with rate 0.4
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))  # single sigmoid output unit

# print the layer / parameter overview
model.summary()

# binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric
model.compile(
    optimizer=tf.train.AdamOptimizer(),
    loss=keras.losses.binary_crossentropy,
    metrics=["accuracy"],
)

# train for at most 100 epochs on mini-batches of 128 reviews;
# the callbacks in cbk may end training earlier
model.fit(
    train_review,
    train_labels,
    batch_size=128,
    epochs=100,
    validation_data=(val_review, val_labels),
    callbacks=cbk,
)

# loss and accuracy on the held-out test reviews
model.evaluate(test_review, test_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, None, 128)         12800000  
_________________________________________________________________
global_average_pooling1d_29  (None, 128)               0         
_________________________________________________________________
dense_72 (Dense)             (None, 16)                2064      
_________________________________________________________________
dropout_32 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_73 (Dense)             (None, 1)                 17        
Total params: 12,802,081
Trainable params: 12,802,081
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/

KeyboardInterrupt: 