In [1]:
%load_ext autoreload
%autoreload 2
import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil
from functools import partial
from project import *

## Setup input pipeline

In [2]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64
MAX_LENGTH = 200

In [3]:
train_examples, test_examples = get_datasets()

In [4]:
tokenizer = create_tokenizer(train_examples)

Subwords found. Loading.


In [5]:
train_dataset, test_dataset = transform_datasets(train_examples, test_examples, tokenizer, BATCH_SIZE, MAX_LENGTH, BUFFER_SIZE)

## Set hyperparameters

To keep this example small and relatively fast, the values for *num_layers, d_model, and dff* have been reduced. 

The values used in the base model of transformer were; *num_layers=6*, *d_model = 512*, *dff = 2048*. See the [paper](https://arxiv.org/abs/1706.03762) for all the other versions of the transformer.

Note: By changing the values below, you can get the model that achieved state of the art on many tasks.

In [6]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = tokenizer.vocab_size
dropout_rate = 0.1

## Optimizer

In [7]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

## Loss and metrics

In [8]:
loss_function = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [9]:
train_loss = tf.keras.metrics.BinaryCrossentropy(name='train_loss', from_logits=True)
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

## Training and checkpointing

In [10]:
transformer = TransformerEncoderClassifier(num_layers, d_model, num_heads, dff,
                          input_vocab_size,
                          pe_input=input_vocab_size, 
                          rate=dropout_rate)

In [11]:
ckpt_manager = create_checkpoint_manager(transformer, optimizer)

In [12]:
EPOCHS = 10

In [13]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

# train_step_signature = [
#     tf.TensorSpec(shape=(None, None), dtype=tf.int64),
#     tf.TensorSpec(shape=(None,), dtype=tf.int64),
# ]

# @tf.function #(input_signature=train_step_signature)
def train_step(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    with tf.GradientTape() as tape:
        prediction_logits, _ = transformer(inp, True, enc_padding_mask)
        
        loss = loss_function(tar, prediction_logits)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(tar, prediction_logits)
    
    predictions = tf.sigmoid(prediction_logits)
    train_accuracy(tar, predictions)

## Training Loop

In [14]:
for epoch in range(EPOCHS):
    start = time.time()
  
    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> review, tar -> sentiment
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 25 == 0:
            print (f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
      
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print (f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')
    
    print (f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print (f'Time taken for 1 epoch: {time.time() - start} secs\n')

Epoch 1 Batch 0 Loss 53.2097 Accuracy 0.5625
Epoch 1 Batch 25 Loss 33.3003 Accuracy 0.5391
Epoch 1 Batch 50 Loss 23.2261 Accuracy 0.5325
Epoch 1 Batch 75 Loss 17.4384 Accuracy 0.5296
Epoch 1 Batch 100 Loss 14.2478 Accuracy 0.5309
Epoch 1 Batch 125 Loss 12.3562 Accuracy 0.5315
Epoch 1 Loss 11.8819 Accuracy 0.5303
Time taken for 1 epoch: 48.982946157455444 secs

Epoch 2 Batch 0 Loss 8.3148 Accuracy 0.4531
Epoch 2 Batch 25 Loss 3.9008 Accuracy 0.5559
Epoch 2 Batch 50 Loss 4.3877 Accuracy 0.5533
Epoch 2 Batch 75 Loss 4.1177 Accuracy 0.5592
Epoch 2 Batch 100 Loss 3.7869 Accuracy 0.5676
Epoch 2 Batch 125 Loss 3.7255 Accuracy 0.5672
Epoch 2 Loss 3.8189 Accuracy 0.5652
Time taken for 1 epoch: 21.753007411956787 secs

Epoch 3 Batch 0 Loss 10.4740 Accuracy 0.4219
Epoch 3 Batch 25 Loss 4.4264 Accuracy 0.5613
Epoch 3 Batch 50 Loss 3.2283 Accuracy 0.5910
Epoch 3 Batch 75 Loss 3.2163 Accuracy 0.5874
Epoch 3 Batch 100 Loss 3.3606 Accuracy 0.5774
Epoch 3 Batch 125 Loss 3.3141 Accuracy 0.5827
Epoch 3 L

## Testing loop

In [21]:
start = time.time()

test_loss = tf.keras.metrics.BinaryCrossentropy(name='test_loss', from_logits=True)
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

test_loss.reset_states()
test_accuracy.reset_states()

# inp -> review, tar -> sentiment
for (batch, (inp, tar)) in enumerate(test_dataset):
    enc_padding_mask = create_padding_mask(inp)
    prediction_logits, _ = transformer(inp, True, enc_padding_mask)

    loss = loss_function(tar, prediction_logits)
    
    test_loss(tar, prediction_logits)
    
    predictions = tf.sigmoid(prediction_logits)
    test_accuracy(tar, predictions)
    
    if batch % 25 == 0:
        print (f'Batch {batch} Loss {test_loss.result():.4f} Accuracy {test_accuracy.result():.4f}')

print (f'Loss {test_loss.result():.4f} Accuracy {test_accuracy.result():.4f}')

print (f'Time taken for testing: {time.time() - start} secs\n')

Batch 0 Loss 0.7206 Accuracy 0.8906
Batch 25 Loss 1.3223 Accuracy 0.8245
Batch 50 Loss 1.3630 Accuracy 0.8263
Batch 75 Loss 1.3919 Accuracy 0.8269
Batch 100 Loss 1.3931 Accuracy 0.8238
Batch 125 Loss 1.3747 Accuracy 0.8271
Loss 1.3807 Accuracy 0.8268
Time taken for testing: 36.14819073677063 secs



## Evaluate

### Examples of sentiment analysis

In [15]:
sent = partial(sentiment, tokenizer=tokenizer, transformer=transformer)

In [16]:
sent("This was not a very good movie. It dragged on for far too long and we couldn't wait to leave the theater.")

Input: This was not a very good movie. It dragged on for far too long and we couldn't wait to leave the theater.
Predicted sentiment: neg


In [17]:
neg_review = 'Just because someone is under the age of 10 does not mean they are stupid. If your child likes this film you\'d better have him/her tested. I am continually amazed at how so many people can be involved in something that turns out so bad. This "film" is a showcase for digital wizardry AND NOTHING ELSE. The writing is horrid. I can\'t remember when I\'ve heard such bad dialogue. The songs are beyond wretched. The acting is sub-par but then the actors were not given much. Who decided to employ Joey Fatone? He cannot sing and he is ugly as sin.<br /><br />The worst thing is the obviousness of it all. It is as if the writers went out of their way to make it all as stupid as possible. Great children\'s movies are wicked, smart and full of wit - films like Shrek and Toy Story in recent years, Willie Wonka and The Witches to mention two of the past. But in the continual dumbing-down of American more are flocking to dreck like Finding Nemo (yes, that\'s right), the recent Charlie & The Chocolate Factory and eye-crossing trash like Red Riding Hood.'

In [18]:
sent(neg_review)
print ("Real translation: and my neighboring homes heard about this idea .")

Input: Just because someone is under the age of 10 does not mean they are stupid. If your child likes this film you'd better have him/her tested. I am continually amazed at how so many people can be involved in something that turns out so bad. This "film" is a showcase for digital wizardry AND NOTHING ELSE. The writing is horrid. I can't remember when I've heard such bad dialogue. The songs are beyond wretched. The acting is sub-par but then the actors were not given much. Who decided to employ Joey Fatone? He cannot sing and he is ugly as sin.<br /><br />The worst thing is the obviousness of it all. It is as if the writers went out of their way to make it all as stupid as possible. Great children's movies are wicked, smart and full of wit - films like Shrek and Toy Story in recent years, Willie Wonka and The Witches to mention two of the past. But in the continual dumbing-down of American more are flocking to dreck like Finding Nemo (yes, that's right), the recent Charlie & The Chocol

In [19]:
sent("Este não foi um filme muito bom.") # neg 

Input: Este não foi um filme muito bom.
Predicted sentiment: pos


You can pass different layers and attention blocks of the decoder to the `plot` parameter.

In [20]:
sent("Este foi um filme incrível!") # Pos

Input: Este foi um filme incrível!
Predicted sentiment: pos


## Summary

In this tutorial, you learned about positional encoding, multi-head attention, the importance of masking and how to create a transformer.

Try using a different dataset to train the transformer. You can also create the base transformer or transformer XL by changing the hyperparameters above. You can also use the layers defined here to create [BERT](https://arxiv.org/abs/1810.04805) and train state of the art models. Futhermore, you can implement beam search to get better predictions.