In [1]:
import numpy as np
import pandas as pd

import keras
from keras import datasets
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Vocabulary cap handed to the loader as `num_words`; also reused as the
# Embedding layer's input dimension further down.
vocab_size = 10000

# The IMDB reviews ship already tokenized and pre-split into train / test.
imdb_train, imdb_test = datasets.imdb.load_data(num_words=vocab_size)
X_train, y_train = imdb_train
X_test, y_test = imdb_test

# Data preprocessing

In [5]:
# Bring every review to one common length so each split becomes a
# rectangular (n_reviews, max_len) array.
max_len = 200
X_train_padded, X_test_padded = (
    pad_sequences(reviews, maxlen=max_len) for reviews in (X_train, X_test)
)

# Sanity check: print the resulting shape of each split.
for padded in (X_train_padded, X_test_padded):
    print(padded.shape)

(25000, 200)
(25000, 200)


# Modeling

In [8]:
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

In [9]:
# Hyperparameters
embedding_dim = 256   # size of each learned word vector
dropout_rate = 0.3    # shared by both Dropout layers
num_filters = 256     # number of Conv1D filters
kernel_size = 3       # Conv1D window width, in tokens
hidden_units = 128    # width of the dense layer before the output

# Fix the random seeds before any layer is created so that weight
# initialisation, dropout masks and batch shuffling are repeatable when the
# notebook is re-run from a fresh kernel.
seed = 42
keras.utils.set_random_seed(seed)

# 1D-CNN text classifier:
# embedding -> dropout -> conv -> global max pool -> dense -> dropout -> sigmoid
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Dropout(rate=dropout_rate),
    Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'),
    GlobalMaxPooling1D(),  # collapses the sequence axis: one value per filter
    Dense(units=hidden_units, activation='relu'),
    Dropout(rate=dropout_rate),
    Dense(units=1, activation='sigmoid'),  # single probability for the binary label
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         2560000   
                                                                 
 dropout (Dropout)           (None, None, 256)         0         
                                                                 
 conv1d (Conv1D)             (None, None, 256)         196864    
                                                                 
 global_max_pooling1d (Glob  (None, 256)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                        

In [10]:
# Directory and file the best checkpoint is written to. The file name is the
# same one the evaluation cell loads, so it is kept exactly as-is.
model_path = "../model/"
checkpoint_file = model_path + 'imbd_model_cnn.h5'

# Single sigmoid output -> binary cross-entropy. The accuracy metric is
# registered as 'acc', so its validation counterpart is logged as 'val_acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Stop training once validation loss has gone 3 epochs without improving.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)

# Save the model only on epochs that reach a new best validation accuracy.
# NOTE(review): this tracks 'val_acc' while early stopping tracks 'val_loss',
# so training can stop on an epoch where the checkpoint metric still improved.
mc = ModelCheckpoint(
    filepath=checkpoint_file,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Train with 20% of the training data held out for validation.
history = model.fit(
    X_train_padded,
    y_train,
    validation_split=0.2,
    batch_size=128,
    epochs=10,
    callbacks=[es, mc],
)

Epoch 1/10
Epoch 1: val_acc improved from -inf to 0.86160, saving model to ../model/imbd_model_cnn.h5
Epoch 2/10
  2/157 [..............................] - ETA: 11s - loss: 0.2717 - acc: 0.8945

  saving_api.save_model(


Epoch 2: val_acc improved from 0.86160 to 0.88900, saving model to ../model/imbd_model_cnn.h5
Epoch 3/10
Epoch 3: val_acc did not improve from 0.88900
Epoch 4/10
Epoch 4: val_acc did not improve from 0.88900
Epoch 5/10
Epoch 5: val_acc improved from 0.88900 to 0.89200, saving model to ../model/imbd_model_cnn.h5
Epoch 5: early stopping


In [12]:
# Reload the checkpoint written by the ModelCheckpoint callback and score it
# on the test split; the cell displays the [loss, acc] pair from evaluate().
best_checkpoint = model_path + 'imbd_model_cnn.h5'
loaded_model = load_model(best_checkpoint)
loaded_model.evaluate(X_test_padded, y_test)



[0.37286871671676636, 0.8870000243186951]