In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
data = keras.datasets.imdb

In [3]:
((train_data, train_labels), (test_data, test_labels)) = data.load_data(num_words=10000)

In [4]:
word_index = data.get_word_index()

In [5]:
# shifts all numbers by three required for the next cell 0 is already free only
word_index = {k:(v+3) for k,v in word_index.items()} 

In [6]:
# this is in accordance with the keras.datasets.imdb

word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

In [7]:
reversed_word_index = dict([(val, key) for (key, val) in word_index.items()])
#  or {val: key for key, val in word_index.items()}

In [8]:
def decode_review(text):
    return " ".join([reversed_word_index.get(i, "?") for i in text])

# example
print(test_data[5])
print(decode_review(test_data[0]))

[1, 146, 427, 5718, 14, 20, 218, 112, 2962, 32, 37, 119, 14, 20, 144, 9493, 910, 5, 8817, 4, 4659, 18, 12, 3403, 853, 28, 8, 2225, 12, 95, 474, 818, 4651, 18, 1462, 13, 124, 285, 5, 1462, 11, 14, 20, 122, 6, 52, 292, 5, 13, 774, 2626, 46, 138, 910, 1481, 276, 14, 20, 23, 288, 42, 23, 1856, 11, 2364, 5687, 33, 222, 13, 774, 110, 101, 4651, 14, 9, 6, 3799, 52, 20, 5, 144, 30, 110, 34, 32, 4, 362, 11, 4, 162, 2248, 92, 79, 8, 67, 12, 5, 13, 104, 36, 144, 12, 144, 33, 222, 30, 276, 145, 23, 4, 1308, 14, 20, 152, 1833, 6, 706, 2, 12, 1015, 4, 147, 155, 146, 98, 150, 14, 20, 80, 30, 23, 288]
<START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss


In [9]:
# notice difference in lenghts of different review's lengths

print(len(test_data[0]))
print(len(test_data[1]))
print(len(test_data[2]))
print(len(test_data[3]))

68
260
603
181


In [10]:
# using padding length as 250 (can be set to max len(test_data[i])) also

# you can also do bottom with simple for loops
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value = word_index["<PAD>"], padding = "post", maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value = word_index["<PAD>"], padding = "post", maxlen=250)

# now length same
print(len(test_data[0]))
print(len(test_data[1]))
print(len(test_data[2]))
print(len(test_data[3]))

250
250
250
250


In [11]:
# model

model = keras.Sequential()
model.add(keras.layers.Embedding(10000,16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

x_val = train_data[:10000]
x_train = train_data[10000:]


y_val = train_labels[:10000]
y_train = train_labels[10000:]

In [14]:
fitModel = model.fit(x_train, y_train, epochs=50, batch_size=512, validation_data=(x_val, y_val), verbose = 1)

Train on 15000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
results = model.evaluate(test_data, test_labels)



In [17]:
results

[0.37322977763652804, 0.86604]

In [27]:
k=0
l=1000

for i in range(l):
    predict = model.predict([test_data[i]])

#     print("Review: " + decode_review(test_data[i]))
#     print("Prediction: " +str(predict[i]))
#     print("Actual: "+str(test_labels[i]))
    
    if predict[0] == test_labels[i]:
        k+=1

In [28]:
print(k/l)

0.44
