# Classification of Movie Reviews with Recurrent Neural Networks

In [1]:
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU
from keras.datasets import imdb
from keras.optimizers import RMSprop
from sklearn.metrics import roc_auc_score

Using Theano backend.
Using gpu device 0: GeForce GT 750M (CNMeM is disabled, cuDNN 5004)

Couldn't import dot_parser, loading of dot files will not be possible.





## Hyperparameters for data processing and the batch size for modeling

In [9]:
batch_size = 32
# The maximum number of words allowed in the corpus
max_features = 5000 
# cut texts after this number of words (among top max_features most common words)
maxlen = 80  

# Loading the IMDB Dataset

In [10]:
print('Loading data...')

(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
X_train shape: (25000, 80)
X_test shape: (25000, 80)


# What does the data look like?

In [32]:
windx = imdb.get_word_index()
rwindx = dict((k,v) for v,k in windx.items())

print("The numeric tokens look like:")
print( ' '.join([str(x) for x in X_train[0]]) )

print("\nWhich maps to these words:")
print('"'+' '.join([rwindx[x] for x in X_train[0]]) + '"')

print("\nClass labels are either", list(set(y_train)) )

The numeric tokens look like:
15 256 4 2 7 3766 5 723 36 71 43 530 476 26 400 317 46 7 4 2 1029 13 104 88 4 381 15 297 98 32 2071 56 26 141 6 194 2 18 4 226 22 21 134 476 26 480 5 144 30 2 18 51 36 28 224 92 25 104 4 226 65 16 38 1334 88 12 16 283 5 16 4472 113 103 32 15 16 2 19 178 32

Which maps to these words:
"for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought and but of script you not while history he heart to real at and but when from one bit then have two of script their with her nobody most that with wasn't to with armed acting watch an for with and film want an"

Class labels are either [0, 1]


In [21]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(1))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Build model...


In [22]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1,
          validation_data=(X_test, y_test), verbose=0)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)

Train...


In [23]:
print('Test binary cross entropy:', score)
print('Test accuracy:', acc)
ypred_tst = model.predict(X_test)
# Pretty good AUC
print('Test AUC:', roc_auc_score(y_test, ypred_tst))

Test binary cross entropy: 0.437651659021
Test accuracy: 0.80804
Test AUC: 0.9040442848
