In [1]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import model_from_json

import numpy as np
print(np.__version__)
import math

1.17.3


In [2]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

## Exploring Data

In [3]:
# [print('{}\n'.format(X_train[i])) for i in range(len(X_train)) if i < 5]

print('=== Data ====\n{}\n\n=== Sentiment ====\n{}\n\n{}'.format(
    X_train[5], y_train[5], type(X_train)))

=== Data ====
[1, 778, 128, 74, 12, 630, 163, 15, 4, 1766, 7982, 1051, 2, 32, 85, 156, 45, 40, 148, 139, 121, 664, 665, 10, 10, 1361, 173, 4, 749, 2, 16, 3804, 8, 4, 226, 65, 12, 43, 127, 24, 2, 10, 10]

=== Sentiment ====
0

<class 'numpy.ndarray'>


In [4]:
# Default value in load_data
INDEX_FROM = 3

# Download word index and prepare word id
word2id = imdb.get_word_index()
word2id = {word:(word_id + INDEX_FROM) for (word, word_id) in word2id.items()}
# Labelling predefined value to prevent error
word2id["<PAD>"] = 0
word2id["<START>"] = 1
word2id["<UNK>"] = 2
word2id["<UNUSED>"] = 3

# Prepare id to word
id2word = {value:key for key, value in word2id.items()}

print('=== Tokenized sentences words ===')
print(' '.join(id2word[word_id] for word_id in X_train[5]))

=== Tokenized sentences words ===
<START> begins better than it ends funny that the russian submarine crew <UNK> all other actors it's like those scenes where documentary shots br br spoiler part the message <UNK> was contrary to the whole story it just does not <UNK> br br


## Train Model

In [5]:
pad_size = 1000
X_train_pad = pad_sequences(X_train, maxlen=pad_size)
X_test_pad = pad_sequences(X_test, maxlen=pad_size)

In [6]:
vocab_size=len(word2id)
embedding_size=32
output_units = 1

model=Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size,
    input_length=pad_size))
model.add(LSTM(units=100))
model.add(Dense(output_units, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 32)          2834816   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 2,888,117
Trainable params: 2,888,117
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
train_size = math.ceil(0.8 * len(X_train))

X_val, y_val = X_train_pad[:train_size], y_train[:train_size]
X, y = X_train_pad[train_size:], y_train[train_size:]

In [8]:
batch_size = 64
epochs = 5

model.fit(X, y, validation_data=(X_val, y_val), batch_size=batch_size,
          epochs=epochs, shuffle=True)

Train on 5000 samples, validate on 20000 samples
Epoch 1/5

KeyboardInterrupt: 

In [None]:
# Saving structure and weights
model_structure = model.to_json()
with open('model_structure.json', 'w') as f:
    f.write(model_structure)
    
model.save_weights('model_weights.h5')

In [None]:
# Load and compile model

with open('model_structure.json', 'r') as f:
    loaded_model_json = f.read()
    loaded_model = model_from_json(loaded_model_json)

loaded_model.load_weights('model_weights.h5')
loaded_model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

In [None]:
scores = loaded_model.evaluate(X_test_pad, y_test, verbose=0)
print('Model Accuracy:', scores[1])