In [1]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
# Load the IMDB reviews, keeping only the `top_words` most frequent words.
# Rarer words are not dropped or zeroed: they show up as the out-of-vocabulary
# index (the repeated 2s in the printed sample review).
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=top_words,
)

In [3]:
# Show the first training review as loaded: a variable-length list of integer word indices.
print(X_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]


In [4]:
# Bring every review to exactly `max_review_length` tokens: longer reviews are
# truncated and shorter ones are padded with 0 (the padded sample printed
# afterwards starts with a run of zeros).
max_review_length = 500
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)

In [5]:
# Inspect the first padded review followed by the first ten labels,
# each on its own line.
print(X_train[0], y_train[:10], sep="\n")

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [6]:
# Build the classifier: embedding -> LSTM -> single sigmoid unit.
# (The misspelt name `embedding_vecor_length` is kept as-is because it is part
# of the notebook's namespace.)
embedding_vecor_length = 32
model = Sequential()
# Map each of the `top_words` word indices to a dense vector of length 32,
# for input sequences of `max_review_length` tokens.
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
# 10 LSTM units summarise the whole sequence into one vector.
model.add(LSTM(10))
# One sigmoid output for the binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 10)                1720      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 161,731
Trainable params: 161,731
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Train for three epochs in mini-batches of 64.
# NOTE(review): the test split is passed as validation data, so the per-epoch
# validation metrics come from the same samples used for the final evaluation.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=64,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f62da8d4a90>

In [8]:
# Score the trained model on the test split without progress output.
# scores[1] is the 'accuracy' metric requested in compile(); report it as a percentage.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (100 * scores[1]))

Accuracy: 87.49%


In [19]:
# Grab the first weight array of the final (Dense) layer and print it, so the
# values can be compared with the ones obtained after the model is reloaded.
W = model.layers[-1].get_weights()[0]
print(W)

[[-1.1598136 ]
 [-1.2236058 ]
 [-0.42562348]
 [-1.2016828 ]
 [-0.9543209 ]
 [ 0.43749776]
 [-0.99669063]
 [ 0.03122998]
 [ 1.1871601 ]
 [ 0.9906302 ]]


In [20]:
# Write the trained model to an HDF5 file, then drop the in-memory object so
# that the later reload is known to come from disk.
model.save(filepath='my_model.h5')
del model

In [21]:
whos

Variable                 Type       Data/Info
---------------------------------------------
Dense                    type       <class 'keras.layers.core.Dense'>
Embedding                type       <class 'keras.layers.embeddings.Embedding'>
LSTM                     type       <class 'keras.layers.recurrent.LSTM'>
Sequential               type       <class 'keras.engine.sequential.Sequential'>
W                        ndarray    10x1: 10 elems, type `float32`, 40 bytes
X_test                   ndarray    25000x500: 12500000 elems, type `int32`, 50000000 bytes (47.6837158203125 Mb)
X_train                  ndarray    25000x500: 12500000 elems, type `int32`, 50000000 bytes (47.6837158203125 Mb)
embedding_vecor_length   int        32
imdb                     module     <module 'keras.datasets.i<...>/keras/datasets/imdb.py'>
max_review_length        int        500
numpy                    module     <module 'numpy' from '/ho<...>kages/numpy/__init__.py'>
scores                   list      

In [23]:
# Reload the model saved to 'my_model.h5' and print the final layer's first
# weight array again; it should match the values printed before saving.
# (`load_model` is imported in the notebook's top import cell.)
model = load_model('my_model.h5')
W = model.layers[-1].get_weights()[0]
print(W)

[[-1.1598136 ]
 [-1.2236058 ]
 [-0.42562348]
 [-1.2016828 ]
 [-0.9543209 ]
 [ 0.43749776]
 [-0.99669063]
 [ 0.03122998]
 [ 1.1871601 ]
 [ 0.9906302 ]]


In [24]:
# Re-score the reloaded model on the same test split; the accuracy should
# equal the figure reported before the model was saved.
scores = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100,))

Accuracy: 87.49%
