In [None]:
# IMPORT PACKAGES
import tensorflow as tf
from tensorflow import keras
import numpy as np

import matplotlib.pyplot as plt

# BARE EXPRESSION: THE TENSORFLOW VERSION STRING BECOMES THE CELL OUTPUT WHEN RUN
tf.__version__

In [2]:
# LOAD DATA FROM KERAS DATASETS
# BIND THE IMDB DATASET MODULE TO A SHORT NAME; THE REVIEWS THEMSELVES ARE
# FETCHED LATER THROUGH ITS load_data() AND get_word_index() FUNCTIONS
movie_review_data = keras.datasets.imdb

In [3]:
# SPLIT THE DATASET INTO A TRAIN BUCKET AND A TEST BUCKET
# num_words=100000 LIMITS THE ENCODING TO THE TOP 100,000 WORDS
train_split, test_split = movie_review_data.load_data(num_words=100000)

# EACH BUCKET IS A (REVIEWS, LABELS) PAIR
train_review, train_labels = train_split
test_review, test_labels = test_split

In [4]:
# REPORT HOW MANY REVIEWS AND LABELS EACH SPLIT HOLDS (TRAIN FIRST, THEN TEST)
for reviews, labels in ((train_review, train_labels), (test_review, test_labels)):
    print(len(reviews), len(labels))

25000 25000
25000 25000


## PREPARE DATA

In [5]:
# PEEK AT THE FIRST TRAINING EXAMPLE: ITS ENCODED REVIEW FOLLOWED BY ITS LABEL
first_review, first_label = train_review[0], train_labels[0]
print(first_review, first_label)

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32] 1


EACH INTEGER HERE ENCODES ONE WORD OF THE REVIEW, DRAWN FROM THE TOP 100,000 WORDS.

In [6]:
# REVIEWS ARE FREE TEXT, SO THEIR LENGTHS MAY OR MAY NOT BE THE SAME
# COMPARE THE LENGTHS OF THE FIRST TWO TRAINING REVIEWS
first_len = len(train_review[0])
second_len = len(train_review[1])
print(first_len, second_len)

218 189


AS EXPECTED! WE NEED TO MAKE ALL REVIEWS THE SAME LENGTH SO THAT OUR NETWORK CAN HANDLE THEM. WE USE A METHOD CALLED POST PADDING: IF A REVIEW IS SHORTER THAN A SET LENGTH WE ADD EXTRA PADDING AT ITS END, AND IF IT IS LONGER WE TRUNCATE IT TO THAT LENGTH.

In [7]:
# GET WORD MAP WHICH IS INBUILT IN KERAS DATASETS
# THE RESULT IS USED BELOW AS A DICTIONARY (len(), KEY LOOKUP AND KEY ASSIGNMENT)
word_index_map = movie_review_data.get_word_index()

In [8]:
# NUMBER OF ENTRIES IN THE WORD -> CODE DICTIONARY
vocabulary_count = len(word_index_map)
print(vocabulary_count)

88584


The reviews—the arrays of integers—must be converted to tensors before being fed into the neural network. Tensors need to be of the same length, so the first thing we do is make all of the reviews the same length.

In [9]:
# ALIGN THE WORD MAP WITH THE ENCODED REVIEWS
# load_data() WAS CALLED WITH ITS DEFAULT index_from=3, SO A WORD WHOSE CODE IN
# get_word_index() IS r APPEARS IN THE REVIEWS AS r + 3; CODES 0-2 STAY RESERVED
# (0 = PADDING, 1 = START OF REVIEW, 2 = OUT-OF-VOCABULARY WORD)
INDEX_OFFSET = 3

# ONLY SHIFT A FRESHLY LOADED MAP, SO RE-RUNNING THIS CELL DOES NOT SHIFT IT TWICE
if "<PAD>" not in word_index_map:
    word_index_map = {word: code + INDEX_OFFSET for word, code in word_index_map.items()}
    # ADD NEW PADDING SYMBOL WITH VALUE 0
    word_index_map["<PAD>"] = 0

# CHECK FOR LENGTH OF THE MAPPING
len(word_index_map)

88585

In [10]:
# BRING EVERY REVIEW TO THE SAME LENGTH OF 256 WORDS
# SHORTER REVIEWS GET THE <PAD> CODE APPENDED AT THE END (padding = "post")
# LONGER REVIEWS ARE CUT DOWN TO 256 WORDS
# NOTE(REVIEW): NO `truncating` ARGUMENT IS PASSED, SO THE LIBRARY DEFAULT DECIDES
# WHICH END OF AN OVER-LONG REVIEW IS DROPPED - CONFIRM THIS MATCHES THE INTENT
padding_options = dict(value = word_index_map["<PAD>"], maxlen = 256, padding = "post")
pad_to_fixed_length = keras.preprocessing.sequence.pad_sequences

train_review = pad_to_fixed_length(train_review, **padding_options)
# DO THE SAME FOR TEST
test_review = pad_to_fixed_length(test_review, **padding_options)

In [11]:
# CONFIRM THAT THE FIRST TWO REVIEWS NOW HAVE THE SAME LENGTH
first_len, second_len = (len(train_review[position]) for position in (0, 1))
print(first_len, second_len)

256 256


VOILA!!

In [12]:
# REVIEW REPRESENTATION AS SEEN BY OUR NETWORK
# BARE EXPRESSION: THE FIRST TRAINING REVIEW (AFTER PADDING) BECOMES THE CELL OUTPUT
train_review[0]

array([    1,    14,    22,    16,    43,   530,   973,  1622,  1385,
          65,   458,  4468,    66,  3941,     4,   173,    36,   256,
           5,    25,   100,    43,   838,   112,    50,   670, 22665,
           9,    35,   480,   284,     5,   150,     4,   172,   112,
         167, 21631,   336,   385,    39,     4,   172,  4536,  1111,
          17,   546,    38,    13,   447,     4,   192,    50,    16,
           6,   147,  2025,    19,    14,    22,     4,  1920,  4613,
         469,     4,    22,    71,    87,    12,    16,    43,   530,
          38,    76,    15,    13,  1247,     4,    22,    17,   515,
          17,    12,    16,   626,    18, 19193,     5,    62,   386,
          12,     8,   316,     8,   106,     5,     4,  2223,  5244,
          16,   480,    66,  3785,    33,     4,   130,    12,    16,
          38,   619,     5,    25,   124,    51,    36,   135,    48,
          25,  1415,    33,     6,    22,    12,   215,    28,    77,
          52,     5,

EXTRA PADDING IS ADDED AT THE END OF EACH REVIEW. NOW WE ARE READY TO TRAIN OUR MODEL.

In [13]:
# HOLD OUT THE FIRST 5000 TRAINING EXAMPLES AS A VALIDATION SET
# AND KEEP THE REMAINDER AS THE TRAINING SET
# NOTE: train_review / train_labels ARE OVERWRITTEN, SO RUNNING THIS CELL AGAIN
# WOULD CARVE ANOTHER 5000 EXAMPLES OUT OF THE ALREADY REDUCED TRAINING SET
val_review, train_review = train_review[:5000], train_review[5000:]
val_labels, train_labels = train_labels[:5000], train_labels[5000:]

print(len(val_review), len(val_labels))

5000 5000


## MODEL

In [52]:
# CONSIDER ONLY TOP 100,000 WORDS AS SPECIFIED BEFORE
# SAME VALUE AS num_words PASSED TO load_data(); USED AS THE EMBEDDING INPUT SIZE
vocab_size = 100000

In [56]:
# EARLY STOPPING CALLBACK (BUILT INTO KERAS): STOP TRAINING ONCE THE
# VALIDATION LOSS HAS NOT IMPROVED FOR 3 EPOCHS IN A ROW
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)
cbk = [early_stopping]

In [57]:
# DESIGN SEQUENTIAL MULTI-LAYER PERCEPTRON NETWORK
# START WITH A SIMPLE NETWORK
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 32), # LEARN A 32-DIMENSIONAL EMBEDDING PER WORD CODE
    keras.layers.GlobalAvgPool1D(), # AVERAGE THE WORD EMBEDDINGS INTO ONE VECTOR PER REVIEW

    keras.layers.Dense(32, activation=tf.nn.relu), # HIDDEN LAYER WITH 32 NODES

    keras.layers.Dense(1, activation=tf.nn.sigmoid) # SINGLE SIGMOID OUTPUT FOR THE BINARY LABEL
])

# GET THE MODEL CONFIGURATION
model.summary()

# COMPILE THE MODEL
# THE STRING "adam" SELECTS THE KERAS ADAM OPTIMIZER WITH ITS DEFAULT SETTINGS;
# IT REPLACES tf.train.AdamOptimizer(), WHICH ONLY EXISTS IN TENSORFLOW 1.x
model.compile(optimizer="adam", # USING ADAPTIVE LEARNING RATE
              loss = keras.losses.binary_crossentropy, # BINARY (TWO-CLASS) LOSS FUNCTION
              metrics = ["accuracy"]) # ACCURACY AS OUR METRIC FOR EVALUATION

# FIT MODEL TO TRAIN DATA
model.fit(train_review,
          train_labels,
          epochs = 100, # SET MAX EPOCH; EARLY STOPPING IN cbk MAY END TRAINING SOONER
          batch_size = 32, # NUMBER OF REVIEWS AT ONCE TO TRAIN
          validation_data = (val_review, val_labels),
          callbacks = cbk)

# EVALUATE MODEL ON TEST DATA
# BARE EXPRESSION: THE RESULT (LOSS FOLLOWED BY ACCURACY) BECOMES THE CELL OUTPUT
model.evaluate(test_review, test_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_41 (Embedding)     (None, None, 32)          3200000   
_________________________________________________________________
global_average_pooling1d_41  (None, 32)                0         
_________________________________________________________________
dense_111 (Dense)            (None, 32)                1056      
_________________________________________________________________
dense_112 (Dense)            (None, 1)                 33        
Total params: 3,201,089
Trainable params: 3,201,089
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


[0.3218516930294037, 0.87828]

DECENT PERFORMANCE, BUT THE MODEL IS OVERFITTING.

In [68]:
# DESIGN SEQUENTIAL MULTI-LAYER PERCEPTRON NETWORK
# WIDER EMBEDDING, TWO HIDDEN LAYERS AND DROPOUT TO COUNTER OVERFITTING
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 64), # LEARN A 64-DIMENSIONAL EMBEDDING PER WORD CODE
    keras.layers.GlobalAvgPool1D(), # AVERAGE THE WORD EMBEDDINGS INTO ONE VECTOR PER REVIEW

    keras.layers.Dense(32, activation=tf.nn.relu), # FIRST HIDDEN LAYER WITH 32 NODES
    keras.layers.Dropout(0.7), # DROPOUT WITH RATE 0.7 AFTER THE FIRST HIDDEN LAYER

    keras.layers.Dense(32, activation=tf.nn.relu), # SECOND HIDDEN LAYER WITH 32 NODES
    keras.layers.Dropout(0.7), # DROPOUT WITH RATE 0.7 AFTER THE SECOND HIDDEN LAYER

    keras.layers.Dense(1, activation=tf.nn.sigmoid) # SINGLE SIGMOID OUTPUT FOR THE BINARY LABEL
])

# GET THE MODEL CONFIGURATION
model.summary()

# COMPILE THE MODEL
# THE STRING "adam" SELECTS THE KERAS ADAM OPTIMIZER WITH ITS DEFAULT SETTINGS;
# IT REPLACES tf.train.AdamOptimizer(), WHICH ONLY EXISTS IN TENSORFLOW 1.x
model.compile(optimizer="adam", # USING ADAPTIVE LEARNING RATE
              loss = keras.losses.binary_crossentropy, # BINARY (TWO-CLASS) LOSS FUNCTION
              metrics = ["accuracy"]) # ACCURACY AS OUR METRIC FOR EVALUATION

# FIT MODEL TO TRAIN DATA
model.fit(train_review,
          train_labels,
          epochs = 100, # SET MAX EPOCH; EARLY STOPPING IN cbk MAY END TRAINING SOONER
          batch_size = 128, # NUMBER OF REVIEWS AT ONCE TO TRAIN
          validation_data = (val_review, val_labels),
          callbacks = cbk)

# EVALUATE MODEL ON TEST DATA
# BARE EXPRESSION: THE RESULT (LOSS FOLLOWED BY ACCURACY) BECOMES THE CELL OUTPUT
model.evaluate(test_review, test_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_51 (Embedding)     (None, None, 64)          6400000   
_________________________________________________________________
global_average_pooling1d_51  (None, 64)                0         
_________________________________________________________________
dense_131 (Dense)            (None, 32)                2080      
_________________________________________________________________
dropout_67 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_132 (Dense)            (None, 32)                1056      
_________________________________________________________________
dropout_68 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_133 (Dense)            (None, 1)                 33        
Total para

[0.42884352968215944, 0.8726]