# Analyzing IMDB Data in Keras

In [1]:
# Imports
import keras
import numpy as np
from keras.datasets import imdb
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
%matplotlib inline

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): this alone may not make Keras/TensorFlow training fully
# deterministic — confirm if exact reproducibility matters.
np.random.seed(42)

Using TensorFlow backend.


## 1. Loading the data
This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment.

In [2]:
# Pull the IMDB reviews that ship with Keras, capping the vocabulary at
# 1000 words via num_words, and split them into train/test pairs.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

# Show how many reviews landed in each split (one shape per line).
print(x_train.shape, x_test.shape, sep='\n')

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
(25000,)
(25000,)


## 2. Examining the data
Notice that the data have already been pre-processed: every word has been assigned a number, and each review comes in as a vector of the numbers of the words it contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then the number 1 appears in the corresponding vector.

The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative.

In [3]:
# Show the first training review (a list of word indices) and its label.
print(x_train[0], y_train[0], sep='\n')

[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
1


## 3. One-hot encoding the input
Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1.

In [4]:
# Encode the inputs: each review (a list of word indices) becomes a
# 0/1 vector of length 1000 marking which words it contains.
tokenizer = Tokenizer(num_words=1000)
x_train, x_test = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (x_train, x_test)
)
print(x_train[0])

[ 0.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.  1.
  1.  1.  0.  1.  1.  0.  0.  1.  1.  0.  1.  0.  1.  0.  1.  1.  0.  1.
  1.  0.  1.  1.  0.  0.  0.  1.  0.  0.  1.  0.  1.  0.  1.  1.  1.  0.
  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.  1.  0.
  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  1.  1.  0.  1.  1.
  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  1.  0.  0.
  1.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

And we'll also one-hot encode the output.

In [5]:
# One-hot encode the labels so each one becomes a row of num_classes columns.
num_classes = 2
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)
print(y_train.shape, y_test.shape, sep='\n')

(25000, 2)
(25000, 2)


## 4. Building the model architecture
Build a model here using the Keras `Sequential` API. Feel free to experiment with different layers and sizes! Also, experiment with adding dropout to reduce overfitting.

In [38]:
# Build and compile a fully connected classifier of configurable depth.
def build_model(
    input_shape, nodes, dropouts, activations, optimizer='adam'):
    """Build, compile and summarise a dense feed-forward Keras model.

    One Dense layer is created per entry of ``nodes``. Every layer except
    the last is followed by a Dropout layer, so the depth of the network
    is driven by ``len(nodes)`` instead of being fixed at four layers.

    Args:
        input_shape: Number of input features; passed to the first Dense
            layer as ``input_shape=(input_shape,)``.
        nodes: Units per Dense layer, in order; the last entry is the
            output layer.
        dropouts: Dropout rate applied after each non-final Dense layer;
            needs at least ``len(nodes) - 1`` entries (extras are ignored).
        activations: Activation for each Dense layer; needs at least
            ``len(nodes)`` entries (extras are ignored).
        optimizer: Optimizer passed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model (its summary is printed first).

    Raises:
        ValueError: If ``nodes`` is empty, or ``dropouts``/``activations``
            are too short for the requested number of layers.
    """
    layer_count = len(nodes)
    if layer_count == 0:
        raise ValueError('nodes must contain at least one layer size')
    if len(activations) < layer_count:
        raise ValueError(
            'expected at least %d activations, got %d'
            % (layer_count, len(activations)))
    if len(dropouts) < layer_count - 1:
        raise ValueError(
            'expected at least %d dropout rates, got %d'
            % (layer_count - 1, len(dropouts)))

    model = Sequential()
    for index in range(layer_count):
        # Only the first layer declares the shape of the incoming data.
        extra = {'input_shape': (input_shape,)} if index == 0 else {}
        model.add(Dense(
            nodes[index], activation=activations[index], **extra))
        # No dropout after the final (output) layer.
        if index < layer_count - 1:
            model.add(Dropout(dropouts[index]))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.summary()
    return model

# Fit an already-compiled model and report its train/test accuracy.
def train_model(model,
                X_train,
                Y_train,
                X_test,
                Y_test,
                epochs,
                batch_size=256,
                verbose=1):
    """Train ``model`` and evaluate it on the training and test data.

    Args:
        model: Object exposing ``fit`` and ``evaluate`` (a compiled Keras
            model in this notebook).
        X_train: Training inputs passed to ``fit`` and ``evaluate``.
        Y_train: Training targets.
        X_test: Test inputs passed to ``evaluate``.
        Y_test: Test targets.
        epochs: Number of epochs passed to ``fit``.
        batch_size: Batch size passed to ``fit``.
        verbose: Verbosity passed to ``fit`` and to both ``evaluate``
            calls, so ``verbose=0`` also silences evaluation progress.

    Returns:
        Tuple ``(score, test_score)`` holding the values returned by
        ``model.evaluate`` on the training and test data respectively;
        index 1 of each is printed as the accuracy.
    """
    model.fit(X_train,
              Y_train,
              epochs=epochs,
              batch_size=batch_size,
              verbose=verbose)
    # Forward verbose so evaluation honours the caller's setting too.
    score = model.evaluate(X_train, Y_train, verbose=verbose)
    print('\nTraining Accuracy:', score[1])
    test_score = model.evaluate(X_test, Y_test, verbose=verbose)
    print('\nTest Accuracy:', test_score[1])
    return score, test_score

## 5. Training the model
Run the model here. Experiment with different batch sizes and numbers of epochs!

In [40]:
# Baseline run: three 1000-unit ReLU layers with dropout rate 0, followed
# by a 2-unit softmax output. Adjust batch_size and epochs to experiment.
nodes = [1000] * 3 + [2]
dropouts = [0] * 3
activations = ['relu'] * 3 + ['softmax']
optimizer = 'adam'
epochs = 10
batch_size = 256

# The input width is the number of columns of the encoded training matrix.
mod = build_model(
    x_train.shape[1], nodes, dropouts, activations, optimizer=optimizer)

# res is the (train_score, test_score) pair returned by train_model.
res = train_model(
    mod, x_train, y_train, x_test, y_test,
    epochs=epochs, batch_size=batch_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dropout_17 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_31 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dropout_18 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_32 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dropout_19 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_33 (Dense)             (None, 2)                 2002      
Total para

In [44]:
# Record the previous run as the baseline to beat. res is the
# (train_score, test_score) pair from train_model, and index 1 of a score
# is the value train_model prints as accuracy, so res[1][1] is test accuracy.
baseline = res[1][1]
best_so_far = baseline
best_params = {'nodes': nodes,
               'dropouts': dropouts,
               'activations': activations,
               'optimizer': optimizer,
               'epochs': epochs,
               'batch_size': batch_size}

# Second experiment: same layer sizes, now with non-zero dropout rates.
nodes = [1000, 1000, 1000, 2]
dropouts = [0.5, 0.3, 0.1]
activations = ['relu', 'relu', 'relu', 'softmax']
optimizer = 'adam'
epochs = 10
batch_size = 256

mod = build_model(input_shape=x_train.shape[1],
                  nodes=nodes,
                  dropouts=dropouts,
                  activations=activations,
                  optimizer=optimizer)

res = train_model(
    mod, x_train, y_train, x_test, y_test, epochs, batch_size)

test_acc = res[1][1]
if test_acc > best_so_far:
    print('New Best Model!')
    # Store the test accuracy that was just compared; the earlier code
    # assigned the undefined name train_acc here.
    best_so_far = test_acc
    best_params = {'nodes': nodes,
                   'dropouts': dropouts,
                   'activations': activations,
                   'optimizer': optimizer,
                   'epochs': epochs,
                   'batch_size': batch_size}

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_41 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dropout_25 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_42 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dropout_26 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_43 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dropout_27 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_44 (Dense)             (None, 2)                 2002      
Total para

In [45]:
# Report the best test accuracy seen so far and the settings behind it.
print(best_so_far)
for setting in best_params.items():
    # Each item is a (name, value) pair that fills the two %s slots.
    print('%s: %s' % setting)

0.8528
nodes: [1000, 1000, 1000, 2]
dropouts: [0.5, 0.3, 0.1]
activations: ['relu', 'relu', 'relu', 'softmax']
optimizer: adam
epochs: 10
batch_size: 256


## 6. Evaluating the model
This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?

In [None]:
# DONE ABOVE
#score = model.evaluate(x_test, y_test, verbose=0)
#print("Accuracy: ", score[1])