In [41]:
#setup
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

import tensorflow_datasets as tfds
tfds.disable_progress_bar()
print(tf.__version__)

2.2.0


In [42]:
# Download the IMDB reviews dataset (subword-encoded variant) together with
# its metadata.

(train_data, test_data), info = tfds.load(
    name='imdb_reviews/subwords8k',  # text pre-encoded with an ~8k subword vocabulary
    split=(tfds.Split.TRAIN, tfds.Split.TEST),  # one dataset per requested split
    as_supervised=True,  # yield (example, label) pairs rather than feature dicts
    with_info=True,  # additionally return the dataset `info` structure
)




In [43]:
# The dataset metadata carries the text encoder used to produce the integer
# sequences; keep a handle on it and report how large its vocabulary is.
text_feature = info.features["text"]
encoder = text_feature.encoder
vocab_report = "Vocabulary size: {}".format(encoder.vocab_size)
print(vocab_report)


Vocabulary size: 8185


In [44]:
# The encoder is reversible: decoding an encoded string gives back the
# original text.

sample_string = "Hello TensorFlow"

encoded_string = encoder.encode(sample_string)
encoded_report = 'Encoded string is {}'.format(encoded_string)
print(encoded_report)

original_string = encoder.decode(encoded_string)
decoded_report = 'The original string. "{}"'.format(original_string)
print(decoded_report)

# Guard the round trip: fail loudly if decoding does not reproduce the input.
assert sample_string == original_string

Encoded string is [4025, 222, 6307, 2327, 4043, 2120]
The original string. "Hello TensorFlow"


In [45]:
# Decode each token id on its own to show which text fragment it stands for.
for token_id in encoded_string:
    fragment = encoder.decode([token_id])
    print('{} ----- {}'.format(token_id, fragment))

4025 ----- Hell
222 ----- o 
6307 ----- Ten
2327 ----- sor
4043 ----- Fl
2120 ----- ow


In [46]:
# The dataset comes preprocessed: each example is a sequence of integer
# token ids encoding one movie review, and each label is the integer 0 or 1.
# Peek at the first training example (only its first ten ids are printed).

first_pair = train_data.take(1)
for train_example, train_label in first_pair:
    leading_ids = train_example[:10].numpy()
    print("Encoded text:", leading_ids)
    print("Label:", train_label.numpy())

Encoded text: [  62   18   41  604  927   65    3  644 7968   21]
Label: 0


In [47]:
# Decode the whole review back into readable text. `train_example` is the
# loop variable left bound by the preceding `for ... in train_data.take(1)`
# cell, so that cell has to be run before this one.
encoder.decode(train_example)

"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."

In [48]:
# Prepare the data for training.
# Reviews have different lengths, so `padded_batch` is used: called without
# explicit shapes it pads every batch to that batch's longest sequence.

BUFFER_SIZE = 1000
BATCH_SIZE = 32

# Only the training split is shuffled; the test split keeps its order.
shuffled_train = train_data.shuffle(BUFFER_SIZE)
train_batches = shuffled_train.padded_batch(BATCH_SIZE)
test_batches = test_data.padded_batch(BATCH_SIZE)

# Each batch has shape (batch_size, sequence_length); the sequence length
# varies from batch to batch because of the per-batch padding.
sample_batches = train_batches.take(2)
for example_batch, label_batch in sample_batches:
    print("Batch shape: ", example_batch.shape)
    print("Label shape: ", label_batch.shape)
    


Batch shape:  (32, 770)
Label shape:  (32,)
Batch shape:  (32, 1043)
Label shape:  (32,)


In [49]:
# Build the model: embed each token id, average the embeddings over the
# sequence axis, then map the pooled vector to a single output unit.
EMBEDDING_DIM = 16

embedding_layer = keras.layers.Embedding(encoder.vocab_size, EMBEDDING_DIM)
pooling_layer = keras.layers.GlobalAveragePooling1D()
# No activation on the output unit, so the model emits a raw score (logit).
output_layer = keras.layers.Dense(1)

model = keras.Sequential([embedding_layer, pooling_layer, output_layer])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 16)          130960    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 130,977
Trainable params: 130,977
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile the model.
#
# The final Dense(1) layer has no activation, so the model outputs logits and
# the loss is configured with `from_logits=True`. The accuracy metric has to
# match: the string "accuracy" resolves to binary accuracy with a 0.5
# threshold, which is only right for probabilities. For logits the decision
# boundary is 0.0 (sigmoid(0) == 0.5), so the metric is built explicitly.
# It is named "accuracy" so the history keys stay 'accuracy' / 'val_accuracy'.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name="accuracy")],
)

# Train the model.
# `validation_steps=30` limits validation to 30 batches of `test_batches`
# at the end of each epoch instead of the whole test split.
history = model.fit(
    train_batches,
    epochs=10,
    validation_data=test_batches,
    validation_steps=30,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

In [None]:
# Evaluate on the full test split; the result unpacks into the loss followed
# by the accuracy metric configured at compile time.
evaluation = model.evaluate(test_batches)
loss, accuracy = evaluation

print("Loss", loss)
print("Accuracy", accuracy)

In [None]:
# `model.fit` returned a History object; its `.history` attribute is a dict
# of the values recorded during training. Show which series are available.

history_dict = history.history
history_dict.keys()

In [None]:
# The history holds four series: loss and accuracy, each recorded for both
# training and validation. Plot training vs. validation loss for comparison.
# (`plt` is imported in the setup cell at the top of the notebook.)

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
# NOTE: `loss` rebinds the scalar test loss produced by the evaluation cell;
# from here on it is the list of per-epoch training losses.
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# One x-axis tick per recorded epoch, counted from 1.
epochs = range(1, len(acc) + 1)

# Draw on an explicit figure/axes pair rather than pyplot's implicit state.
fig, ax = plt.subplots()
# "bo" is for "blue dot"
ax.plot(epochs, loss, 'bo', label='Training loss')
# "b" is for "solid blue line"
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

plt.show()

In [None]:
# Plot training vs. validation accuracy per epoch.
plt.clf()   # start from an empty current figure

acc_ax = plt.gca()
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

plt.show()