# Analyzing IMDB Data in Keras

In [1]:
# Imports
import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(42)

Using TensorFlow backend.


## 1. Loading the data
This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment.

In [2]:
# Loading the data (it's preloaded in Keras)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

print(x_train.shape)
print(x_test.shape)

(25000,)
(25000,)


## 2. Examining the data
Notice that the data has been already pre-processed, where all the words have numbers, and the reviews come in as a vector with the words that the review contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then there is a 1 in the corresponding vector.

The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative.

In [3]:
print(x_train[0])
print(y_train[0])

[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
1


## 3. One-hot encoding the output
Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1.

In [4]:
# One-hot encoding the output into vector mode, each of length 1000
tokenizer = Tokenizer(num_words=1000)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print(x_train[0])

[ 0.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.  1.
  1.  1.  0.  1.  1.  0.  0.  1.  1.  0.  1.  0.  1.  0.  1.  1.  0.  1.
  1.  0.  1.  1.  0.  0.  0.  1.  0.  0.  1.  0.  1.  0.  1.  1.  1.  0.
  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.  1.  0.
  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  1.  1.  0.  1.  1.
  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  1.  0.  0.
  1.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

And we'll also one-hot encode the output.

In [5]:
# One-hot encoding the output
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(y_train.shape)
print(y_test.shape)

(25000, 2)
(25000, 2)


## 4. Building the  model architecture
Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting.

In [None]:
# TODO: Build the model architecture
sa_model = Sequential()

sa_model.add(Dense(64, input_dim=x_train.shape[1]))
sa_model.add(Activation('relu'))
sa_model.add(Dropout(0.2))

sa_model.add(Dense(64, input_dim=x_train.shape[1]))
sa_model.add(Activation('relu'))
sa_model.add(Dropout(0.2))

sa_model.add(Dense(num_classes))
sa_model.add(Activation('softmax'))

# TODO: Compile the model using a loss function and an optimizer.
sa_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

sa_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                64064     
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 130       
__________

## 5. Training the model
Run the model here. Experiment with different batch_size, and number of epochs!

In [None]:
# TODO: Run the model. Feel free to experiment with different batch sizes and number of epochs.
sa_model.fit(x_train, y_train, epochs=500, batch_size=50, validation_data=(x_test,y_test), verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/500
 - 3s - loss: 0.4095 - acc: 0.8120 - val_loss: 0.3266 - val_acc: 0.8583
Epoch 2/500
 - 3s - loss: 0.3131 - acc: 0.8687 - val_loss: 0.3234 - val_acc: 0.8577
Epoch 3/500
 - 3s - loss: 0.2820 - acc: 0.8813 - val_loss: 0.3322 - val_acc: 0.8543
Epoch 4/500
 - 3s - loss: 0.2483 - acc: 0.9003 - val_loss: 0.3434 - val_acc: 0.8513
Epoch 5/500
 - 3s - loss: 0.1997 - acc: 0.9230 - val_loss: 0.4117 - val_acc: 0.8436
Epoch 6/500
 - 3s - loss: 0.1529 - acc: 0.9438 - val_loss: 0.4264 - val_acc: 0.8468
Epoch 7/500
 - 3s - loss: 0.1129 - acc: 0.9590 - val_loss: 0.4845 - val_acc: 0.8402
Epoch 8/500
 - 4s - loss: 0.0942 - acc: 0.9658 - val_loss: 0.5339 - val_acc: 0.8377
Epoch 9/500
 - 4s - loss: 0.0770 - acc: 0.9724 - val_loss: 0.6048 - val_acc: 0.8343
Epoch 10/500
 - 4s - loss: 0.0692 - acc: 0.9751 - val_loss: 0.6016 - val_acc: 0.8358
Epoch 11/500
 - 4s - loss: 0.0648 - acc: 0.9773 - val_loss: 0.6670 - val_acc: 0.8391
Epoch 12/500
 - 4s - los

Epoch 97/500
 - 3s - loss: 0.0119 - acc: 0.9960 - val_loss: 1.1076 - val_acc: 0.8405
Epoch 98/500
 - 3s - loss: 0.0120 - acc: 0.9958 - val_loss: 1.1136 - val_acc: 0.8410
Epoch 99/500
 - 3s - loss: 0.0123 - acc: 0.9960 - val_loss: 1.1565 - val_acc: 0.8394
Epoch 100/500
 - 3s - loss: 0.0141 - acc: 0.9953 - val_loss: 1.0671 - val_acc: 0.8418
Epoch 101/500
 - 3s - loss: 0.0122 - acc: 0.9960 - val_loss: 1.1247 - val_acc: 0.8412
Epoch 102/500
 - 3s - loss: 0.0118 - acc: 0.9964 - val_loss: 1.1337 - val_acc: 0.8393
Epoch 103/500
 - 3s - loss: 0.0103 - acc: 0.9964 - val_loss: 1.1781 - val_acc: 0.8397
Epoch 104/500
 - 3s - loss: 0.0140 - acc: 0.9949 - val_loss: 1.1544 - val_acc: 0.8381
Epoch 105/500
 - 3s - loss: 0.0139 - acc: 0.9956 - val_loss: 1.0894 - val_acc: 0.8390
Epoch 106/500
 - 3s - loss: 0.0122 - acc: 0.9960 - val_loss: 1.1111 - val_acc: 0.8393
Epoch 107/500
 - 3s - loss: 0.0127 - acc: 0.9957 - val_loss: 1.1278 - val_acc: 0.8350
Epoch 108/500
 - 3s - loss: 0.0144 - acc: 0.9955 - val_lo

Epoch 193/500
 - 3s - loss: 0.0092 - acc: 0.9971 - val_loss: 1.2646 - val_acc: 0.8379
Epoch 194/500
 - 3s - loss: 0.0088 - acc: 0.9969 - val_loss: 1.2962 - val_acc: 0.8398
Epoch 195/500
 - 3s - loss: 0.0080 - acc: 0.9977 - val_loss: 1.2700 - val_acc: 0.8362
Epoch 196/500
 - 3s - loss: 0.0092 - acc: 0.9971 - val_loss: 1.2712 - val_acc: 0.8376
Epoch 197/500
 - 3s - loss: 0.0079 - acc: 0.9974 - val_loss: 1.2642 - val_acc: 0.8370
Epoch 198/500
 - 3s - loss: 0.0089 - acc: 0.9970 - val_loss: 1.1876 - val_acc: 0.8361
Epoch 199/500
 - 3s - loss: 0.0082 - acc: 0.9978 - val_loss: 1.2637 - val_acc: 0.8371
Epoch 200/500
 - 3s - loss: 0.0072 - acc: 0.9976 - val_loss: 1.3454 - val_acc: 0.8359
Epoch 201/500
 - 3s - loss: 0.0098 - acc: 0.9971 - val_loss: 1.2517 - val_acc: 0.8372
Epoch 202/500
 - 3s - loss: 0.0085 - acc: 0.9975 - val_loss: 1.2381 - val_acc: 0.8347
Epoch 203/500
 - 3s - loss: 0.0101 - acc: 0.9973 - val_loss: 1.1908 - val_acc: 0.8381
Epoch 204/500
 - 3s - loss: 0.0075 - acc: 0.9976 - val

## 6. Evaluating the model
This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?

In [None]:
score = sa_model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: ", score[1])