In [1]:
%matplotlib inline
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

In [2]:
data = keras.datasets.imdb

Get 10000 words of data

In [3]:
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=88000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


Get words for IMDB dataset

In [14]:
word_index = data.get_word_index()

In [20]:
word_index = {k:(v+3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

In [21]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [22]:
def decode_review(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [23]:
print(decode_review(test_data[0]))

&lt;START murder both in have ? easily of of &lt;UNK&gt; &lt;UNK&gt; ? ? boring the ? again marries understand dead ? over a odd odd odd of of br how where first lead spiral make you cross in have movie not convict are role dark and where in true director and old just ? not last i lot &lt;UNK&gt; an he film spiral based both in ? easily


Shapes of data in data_test are not the same and we have to fix that

In [25]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding="post", maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding="post", maxlen=250)

In [26]:
print(decode_review(test_data[0]))

&lt;START murder both in have ? easily of of &lt;UNK&gt; &lt;UNK&gt; ? ? boring the ? again marries understand dead ? over a odd odd odd of of br how where first lead spiral make you cross in have movie not convict are role dark and where in true director and old just ? not last i lot &lt;UNK&gt; an he film spiral based both in ? easily &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &

Define model

In [29]:
model = keras.Sequential()
model.add(keras.layers.Embedding(10000, 16)) #group words in simmilar way. We create 10000 vectors. Data to vectors
model.add(keras.layers.GlobalAveragePooling1D()) #vectors to 1D
model.add(keras.layers.Dense(16, activation="relu")) #16 inner nevrons
model.add(keras.layers.Dense(1, activation="sigmoid")) #0 = bad review, 1 0 good review

In [30]:
model.summary()

Model: &quot;sequential_2&quot;
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [31]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

Train and validation data

In [32]:
x_val = train_data[:10000]
x_train = train_data[10000:]

y_val = train_labels[:10000]
y_train = train_labels[10000:]

In [33]:
fitModel = model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val), verbose=1) #batch_size - how many time will load on review

Train on 15000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [34]:
results = model.evaluate(test_data, test_labels)
print(results)

test_review = test_data[0]
predict = model.predict([test_review])
print("Review: ")
print(decode_review(test_review))
print("Prediction: " + str(predict[0]))
print("Actual: " + str(test_labels[0]))
print(results)

[0.31690297874450685, 0.87336]
Review: 
&lt;START murder both in have ? easily of of &lt;UNK&gt; &lt;UNK&gt; ? ? boring the ? again marries understand dead ? over a odd odd odd of of br how where first lead spiral make you cross in have movie not convict are role dark and where in true director and old just ? not last i lot &lt;UNK&gt; an he film spiral based both in ? easily &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&gt; &lt;PAD&g