In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Alias for the Keras IMDB dataset module; its load_data() and get_word_index()
# are called in the cells below.
data = keras.datasets.imdb

In [4]:
# Load the train/test splits. num_words=10000 restricts the vocabulary to the
# 10,000 most frequent words (per the Keras docs); rarer words show up as the
# out-of-vocabulary index, which decodes to <UNK> further down.
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=10000)

In [5]:
# train_data[0] is a list of integers

In [6]:
word_index = data.get_word_index() # Raw dict mapping each word to its integer index in the IMDB dataset (the +3 offset is applied in the next cell).

In [7]:
# Build the word -> index lookup from a fresh copy of the raw Keras index, so that
# re-running this cell never applies the +3 offset a second time.
# dict.items() yields (key, value) pairs; every value is shifted by 3 because the
# encoded reviews reserve indices 0-3 for the special tokens defined below.
word_index = {word: index + 3 for word, index in data.get_word_index().items()}
word_index["<PAD>"] = 0  # padding value, used to bring shorter reviews up to a common length
word_index["<START>"] = 1  # every encoded review starts with 1, so "<START>" is decoded first
word_index["<UNK>"] = 2  # 2 stands for unknown (out-of-vocabulary) words
word_index["<UNUSED>"] = 3  # 3 is a reserved index that is not assigned to any word

In [8]:
# Invert word_index so that an integer index can be looked up to recover its word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [9]:
# Decode an encoded review back into readable words.
def decode_review(text):
    """Return the review as a single space-separated string of words.

    Each index in ``text`` is looked up in ``reverse_word_index``; an index
    that has no entry there is rendered as "?".
    """
    words = []
    for index in text:
        words.append(reverse_word_index.get(index, "?"))
    return " ".join(words)

In [10]:
# Sanity check: decode the first test review back into text.
print(decode_review(test_data[0]))

<START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss


In [11]:
len(test_data[0]), len(test_data[1]) # the first two test reviews have different lengths

(68, 260)

#### The reviews have different lengths, but the network needs inputs of one fixed length. We either have to find the max length of the texts and size the input to it, or pick a fixed length and pad/truncate train_data and test_data to match.

In [12]:
# Lengths of the first two train and test reviews before padding: they all differ.
len(train_data[0]), len(train_data[1]), len(test_data[0]), len(test_data[1])

(218, 189, 68, 260)

In [13]:
# Bring every review to the same fixed length so the reviews can be batched together.
# padding="post" appends <PAD> (0) to the end of short reviews. Reviews longer than
# MAX_REVIEW_LEN are cut with pad_sequences' default truncating="pre", i.e. their
# earliest words are the ones dropped.
MAX_REVIEW_LEN = 250  # single source of truth for the padded length of both splits
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data, value=word_index["<PAD>"], padding="post", maxlen=MAX_REVIEW_LEN
)
test_data = keras.preprocessing.sequence.pad_sequences(
    test_data, value=word_index["<PAD>"], padding="post", maxlen=MAX_REVIEW_LEN
)

In [14]:
# Same check after pad_sequences: every review now has the padded length (maxlen=250).
len(train_data[0]), len(train_data[1]), len(test_data[0]), len(test_data[1])

(250, 250, 250, 250)

### Creating the model

In [15]:
# Start from an empty Sequential model; the layers are added in the next cell.
model = keras.Sequential()

In [16]:
# Define the whole network in a single assignment so that re-running this cell
# rebuilds the model from scratch instead of appending a second copy of the
# layers to the already-populated model.
model = keras.Sequential([
    # 10000 = vocabulary size (matches num_words passed to load_data);
    # each word index is mapped to a 16-dimensional vector.
    keras.layers.Embedding(10000, 16),
    # Average the word vectors over the sequence axis -> one 16-d vector per review.
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation="relu"),
    # Single sigmoid unit: outputs a value between 0 and 1
    # (presumably read as the positive/negative sentiment score -- confirm against the labels).
    keras.layers.Dense(1, activation="sigmoid"),
])