In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Short alias for the Keras IMDB dataset module; load_data() and
# get_word_index() are called on it in the following cells.
data = keras.datasets.imdb

In [3]:
# Load the train and test splits as (sequences, labels) pairs.
# num_words=10000 caps the vocabulary size; the same 10000 is used as the
# Embedding input size when the model is built.
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=10000)

In [4]:
# train_data[0] is a list of integers.

In [5]:
# Retrieve the dict that maps each word to its integer index in the IMDB dataset.
word_index = data.get_word_index()

In [6]:
# Build the shifted mapping from the dataset's own dict rather than from the
# current `word_index`, so re-running this cell never shifts the indices twice.
# Every word index is moved up by 3 to free indices 0-3 for the special tokens
# below (this matches the index_from=3 default that load_data applies to the
# encoded reviews).
word_index = {word: index + 3 for word, index in data.get_word_index().items()}
word_index["<PAD>"] = 0  # padding value used to bring shorter reviews up to a common length
word_index["<START>"] = 1  # every encoded review begins with 1, so it decodes to "<START>"
word_index["<UNK>"] = 2  # 2 marks unknown (out-of-vocabulary) words; decoded as "<UNK>"
word_index["<UNUSED>"] = 3  # reserved index that is not assigned to any word

In [7]:
#Reversing the dict to get a dict that is mapping the index' to their key
# Invert word_index (word -> index) into index -> word for decoding reviews.
reverse_word_index = {index: word for word, index in word_index.items()}

In [8]:
def decode_review(text):
    """Decode a sequence of word indices into a readable, space-separated string.

    Each index is looked up in ``reverse_word_index``; an index that has no
    entry there is rendered as "?".
    """
    words = []
    for index in text:
        words.append(reverse_word_index.get(index, "?"))
    return " ".join(words)

In [9]:
# Decode the first test review to sanity-check the index -> word mapping.
print(decode_review(test_data[0]))

<START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss


In [10]:
# The first two test reviews have different lengths before padding.
len(test_data[0]), len(test_data[1])

(68, 260)

#### The network needs inputs of one fixed length, so we either find the max length of the texts or pick a length ourselves and pad/truncate train_data and test_data to it

In [11]:
# Lengths of the first two train and first two test reviews before padding.
len(train_data[0]), len(train_data[1]), len(test_data[0]), len(test_data[1])

(218, 189, 68, 260)

##### By setting maxlen=250, texts longer than 250 tokens in train_data and test_data are truncated to 250 tokens (they are shortened, not removed); by setting value=word_index["<PAD>"], we add 0's to the texts that are shorter than 250; and by setting padding="post", we add the 0's to the "end" of the text

In [12]:
# Bring every review to exactly 250 tokens: shorter ones get the <PAD> index
# appended at the end ("post"), longer ones are cut down to 250.
pad_options = dict(value=word_index["<PAD>"], padding="post", maxlen=250)
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

In [13]:
# After pad_sequences every review has the same length (maxlen).
len(train_data[0]), len(train_data[1]), len(test_data[0]), len(test_data[1])

(250, 250, 250, 250)

### Creating the model

In [14]:
# Empty Sequential container for the layer stack.
model = keras.Sequential()

In [15]:
# Build the whole network in this cell so that re-running it starts from a
# fresh model instead of appending a second copy of every layer to `model`.
model = keras.Sequential([
    # One trainable 16-dimensional vector for each of the 10000 vocabulary
    # indices (10000 matches num_words passed to load_data). During training,
    # words used in similar contexts tend to get vectors that point in similar
    # directions, i.e. the angle between them is small.
    keras.layers.Embedding(10000, 16),
    # Averages the word vectors over the sequence axis, reducing each review
    # to a single 16-dimensional vector (lower-dimensional representation).
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation="relu"),
    # Single sigmoid unit: one output in (0, 1) per review.
    keras.layers.Dense(1, activation="sigmoid"),
])

In [16]:
# Print the per-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Two-class target with a single sigmoid output, hence binary cross-entropy;
# accuracy is tracked as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [18]:
# Hold out the first 10000 training reviews for validation; train on the rest.
x_val, x_train = train_data[:10000], train_data[10000:]

In [19]:
# Split the labels at the same position (10000) so they stay aligned with the
# x_val / x_train split of the reviews.
y_val, y_train = train_labels[:10000], train_labels[10000:]

In [20]:
# batch_size: number of movie reviews fed to the model at a time.
# The held-out (x_val, y_val) pair is evaluated alongside training.
fitModel = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=40,
    verbose=1,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [21]:
# Evaluate the trained model on the padded test set; `results` holds whatever
# model.evaluate returns for the compiled loss and metrics.
results = model.evaluate(test_data, test_labels)

