In [1]:
import os
import time
import datetime
import numpy as np
import tensorflow as tf
# import tensorflow_datasets as tfds

  from ._conv import register_converters as _register_converters


In [2]:
from src.main import prepare_data as prep_demo_data
from src.config import Config
from src.transformer import Transformer

from src.utils import CustomSchedule
from src.utils import LossObject
from src.utils import create_masks
from src.reids_data_loader.data_generator import DataGenerator

Using Jupyter Imports


In [36]:
def prepare_i2a_data(config=None):
    """
    Placeholder for preparing the "I2A" dataset (not implemented yet).

    PARAMS:
    config: configuration object forwarded by prepare_data(). Accepted so that
        the existing call ``prepare_i2a_data(config)`` matches this signature.

    RAISES:
    NotImplementedError: always, until the I2A pipeline is written. Failing
        here is clearer than returning None and breaking the caller's
        four-way tuple unpacking.
    """
    raise NotImplementedError(
        "prepare_i2a_data() is not implemented yet; use dataset_type='PT2EN'.")
    
def prepare_data(config, dataset_type="PT2EN"):
    """
    Build the training/validation datasets and tokenizers for one dataset.

    PARAMS:
    config: configuration object, forwarded unchanged to the dataset-specific
        preparation function.
    dataset_type (str): which dataset to prepare, one of ["I2A", "PT2EN"]

    RETURNS:
    tuple: (train_dataset, val_dataset, tokenizer_in, tokenizer_out) exactly
        as produced by the dataset-specific preparation function.
        NOTE(review): callers in this notebook bind the third element as the
        English tokenizer and the fourth as the Portuguese one, so for PT2EN
        the in/out names here may be misleading - confirm against
        prep_demo_data.

    RAISES:
    ValueError: if dataset_type is not one of the supported names.
    """
    if dataset_type == "I2A":
        (train_dataset, val_dataset,
         tokenizer_in, tokenizer_out) = prepare_i2a_data(config)
        return train_dataset, val_dataset, tokenizer_in, tokenizer_out

    if dataset_type == "PT2EN":
        (train_dataset, val_dataset,
         tokenizer_in, tokenizer_out) = prep_demo_data(config,
                                                       "../data/en_tokenizer",
                                                       "../data/pt_tokenizer")
        return train_dataset, val_dataset, tokenizer_in, tokenizer_out

    # Previously an unknown name fell through and returned None, which only
    # surfaced later as an unpacking error in the caller.
    raise ValueError(
        "Unknown dataset_type {!r}; expected one of ['I2A', 'PT2EN']".format(
            dataset_type))
    
    
def train_transformer(model, config, data=None, epochs=1):
    """
    Transformer training code

    PARAMS:
    model: the Transformer to train. It is called as
        model(inp, tar_inp, True, enc_padding_mask, combined_mask,
        dec_padding_mask) and must return a pair whose first element is the
        predictions.
    config: configuration object. ``log_dir`` and ``d_model`` are read here;
        the whole object is forwarded to prepare_data() when data is None.
    data: optional tuple (train_dataset, input_vocab_size, target_vocab_size).
        Only train_dataset is used, because the model is passed in already
        built. When None, the dataset is created with prepare_data(config).
    epochs (int): number of passes over train_dataset. Defaults to 1, the
        value that used to be hard-coded; pass config.EPOCHS for a full run.

    SIDE EFFECTS:
    - restores the latest checkpoint under ./checkpoints/train if one exists
    - saves a checkpoint after every 5th epoch
    - writes TensorBoard events to <log_dir>/events/<timestamp>/train
    - prints progress every 50 batches and once per epoch
    """
    log_dir = config.log_dir
    # d_model is only needed to parameterise the learning-rate schedule.
    d_model = config.d_model

    if data is None:
        print("preparing data ...")
        train_dataset, _, _, _ = prepare_data(config)
    else:
        train_dataset, _, _ = data

    transformer = model

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')

    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    loss_function = LossObject().loss_function

    checkpoint_path = "./checkpoints/train"

    ckpt = tf.train.Checkpoint(transformer=transformer,
                               optimizer=optimizer)

    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        # expect_partial() silences warnings about checkpoint values that are
        # not matched by this model/optimizer pair.
        ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
        print('Latest checkpoint restored!!')

    ## set up logging of Tensorboard events
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(log_dir, "events", current_time, "train")
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Fixed signature so variable-length batches do not retrace the function.
    train_step_signature = [
        tf.TensorSpec(shape=(None, None), dtype=tf.int64),
        tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(inp, tar):
        # Teacher forcing: the decoder sees tar[:, :-1] and is scored
        # against the sequence shifted left by one, tar[:, 1:].
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        (enc_padding_mask,
         combined_mask,
         dec_padding_mask) = create_masks(inp, tar_inp)

        with tf.GradientTape() as tape:
            predictions, _ = transformer(inp, tar_inp,
                                         True,
                                         enc_padding_mask,
                                         combined_mask,
                                         dec_padding_mask)
            loss = loss_function(tar_real, predictions)

        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients,
                                      transformer.trainable_variables))

        train_loss(loss)
        train_accuracy(tar_real, predictions)

    for epoch in range(epochs):
        start = time.time()

        train_loss.reset_states()
        train_accuracy.reset_states()

        # inp -> portuguese, tar -> english
        for (batch, (inp, tar)) in enumerate(train_dataset):
            train_step(inp, tar)

            ## Tensorboard event logging
            # Use the optimizer's iteration counter as the step. Logging with
            # step=epoch put every batch of an epoch on the same x value, so
            # the per-batch curve collapsed onto one point per epoch.
            with train_summary_writer.as_default():
                tf.summary.scalar('loss',
                                  train_loss.result(),
                                  step=optimizer.iterations)
                tf.summary.scalar('accuracy',
                                  train_accuracy.result(),
                                  step=optimizer.iterations)

            if batch % 50 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                      epoch + 1, batch, train_loss.result(), train_accuracy.result()))

        # NOTE(review): with fewer than 5 epochs no checkpoint is ever saved.
        if (epoch + 1) % 5 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                                ckpt_save_path))

        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                            train_loss.result(),
                                                            train_accuracy.result()))

        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

    print("We got to here!!!")
    
    
def evaluate(inp_sentence):
    """
    Greedily decode a translation for one raw input sentence.

    Relies on the notebook-level names tokenizer_pt, tokenizer_en,
    transformer and MAX_LENGTH.

    PARAMS:
    inp_sentence (str): source sentence, encoded here with tokenizer_pt and
        wrapped in the start id (vocab_size) and end id (vocab_size + 1).

    RETURNS:
    tuple: (decoded ids with the batch axis squeezed out, beginning with the
        target start id; attention_weights from the last model call).
    """
    print("In evaluate()")
    pt_vocab = tokenizer_pt.vocab_size
    token_ids = [pt_vocab] + tokenizer_pt.encode(inp_sentence) + [pt_vocab + 1]
    print("input sentence encoded")

    # Add a leading batch axis of size one.
    encoder_input = tf.expand_dims(token_ids, 0)
    print("len(inp_sentence)", len(token_ids))
    print("shape of encoded input", encoder_input.shape)

    # The decoder is seeded with the target-side start id.
    output = tf.expand_dims([tokenizer_en.vocab_size], 0)

    for step in range(MAX_LENGTH):
        if step % 50 == 0:
            print("at step", step)

        enc_mask, combined, dec_mask = create_masks(encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input, output, False, enc_mask, combined, dec_mask)

        # Only the last position matters: (batch_size, 1, vocab_size)
        last_step = predictions[:, -1:, :]
        next_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

        # Stop as soon as the end id is predicted; it is not appended.
        if next_id == tokenizer_en.vocab_size + 1:
            break

        # Feed the prediction back in as the next decoder input.
        output = tf.concat([output, next_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights


def translate(sentence, plot=''):
    """
    Translate one sentence with evaluate() and print the result.

    PARAMS:
    sentence (str): source sentence passed to evaluate().
    plot (str): when non-empty, forwarded to plot_attention_weights() as its
        last argument. NOTE(review): plot_attention_weights is not defined in
        the visible part of this notebook - confirm it exists before using.
    """
    decoded_ids, attention_weights = evaluate(sentence)

    # Ids at or above vocab_size are the start/end markers; skip them.
    word_ids = []
    for token in decoded_ids:
        if token < tokenizer_en.vocab_size:
            word_ids.append(token)
    predicted_sentence = tokenizer_en.decode(word_ids)

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))

    if plot:
        plot_attention_weights(attention_weights, sentence, decoded_ids, plot)
        



# Prepare model, data and parameters

In [4]:
# Project configuration object; passed to prepare_data() and train_transformer() in later cells.
config = Config()

In [9]:
# Build the PT2EN datasets and tokenizers. The third and fourth returned
# elements are bound here as the English and Portuguese tokenizers.
train_dataset, val_dataset, tokenizer_en, tokenizer_pt = prepare_data(config, dataset_type="PT2EN")
print("Complete")

loading dataset
building tokenizers
filtering and encoding datasets
Complete


In [7]:
# Persist both tokenizers. These are the same path strings that
# prepare_data() passes to prep_demo_data() for the PT2EN dataset.
en_voc_path = "../data/en_tokenizer"
tokenizer_en.save_to_file(en_voc_path)

pt_voc_path = "../data/pt_tokenizer"
tokenizer_pt.save_to_file(pt_voc_path)


Using Jupyter Imports


In [12]:
# Copy the hyperparameters out of the config object. MAX_LENGTH is also read
# as a notebook global by evaluate() and _evaluate().
MAX_LENGTH = config.MAX_LENGTH
num_layers, d_model = config.num_layers, config.d_model
num_heads, dff = config.num_heads, config.dff
dropout_rate = config.dropout_rate

# Two extra ids per vocabulary: vocab_size and vocab_size + 1 are used as the
# start and end markers elsewhere in this notebook.
input_vocab_size = 2 + tokenizer_pt.vocab_size
target_vocab_size = 2 + tokenizer_en.vocab_size

print("creating Transformer")
transformer = Transformer(
    num_layers, d_model, num_heads, dff,
    input_vocab_size, target_vocab_size,
    pe_input=input_vocab_size,
    pe_target=target_vocab_size,
    rate=dropout_rate,
)

# Notebook-level metric, optimizer and loss objects. train_transformer()
# builds its own copies, so these are not the ones used during training.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_function = LossObject().loss_function

creating Transformer


# Examine data and encoder output

In [30]:
# evaluate() returns (decoded ids, attention_weights), so `output` is a tuple.
output = evaluate("do gato")
print(type(output))

In evaluate()
input sentence encoded
len(inp_sentence) 4
shape of encoded input (1, 4)
<class 'tuple'>


In [8]:
# output[0] is the decoded id tensor returned by evaluate().
print(output[0].numpy())

# Keep only ids below vocab_size before decoding; the decoder start id is
# exactly vocab_size, so it is filtered out here.
predicted_sentence = tokenizer_en.decode([i for i in output[0] 
                                            if i < tokenizer_en.vocab_size])
predicted_sentence

[8087  501]


'turns '

In [9]:
# Decode a few hand-picked token ids to see what text they map to.
tokenizer_en.decode([3, 1, 1037, 2053, 7777, 3, 82, ])

'the  , eat appearbootthe who '

In [10]:
# Show the element count and the input/target array shapes of the first six
# batches of train_dataset.
for ii, item in enumerate(train_dataset):
    if ii >= 6:
        break
    inp_shape = item[0].numpy().shape
    tar_shape = item[1].numpy().shape
    print(len(item), inp_shape, tar_shape, sep=" ")


2 (64, 60) (64, 62)
2 (64, 79) (64, 71)
2 (64, 80) (64, 84)
2 (64, 85) (64, 82)
2 (64, 85) (64, 82)
2 (64, 62) (64, 66)


In [11]:
# First row of the second tensor in sample_item. NOTE: `sample_item` is only
# assigned in a later cell, so this cell fails on a fresh top-to-bottom run.
sample_item[1].numpy()[0]

array([8087,   62,   12,  101,   10,   16,   13,   36, 1395,   52,  537,
          5,    3, 5396,   45,   11,    3,  135, 1973,  104,    4,  252,
       2264,  129,  137,    1,   16, 2437,    1,   16,  908,    1,   29,
        149,   16,  333,  798, 8088,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [12]:
# Drop the start/end marker ids (>= vocab_size) before decoding. Use the
# tokenizer's own vocab_size rather than a hard-coded 8087 so the filter
# stays correct if the vocabulary is rebuilt.
tokenizer_en.decode([xx for xx in sample_item[1].numpy()[0] if xx < tokenizer_en.vocab_size])

"`` i know that this is not easy . turn to the gentleman in the little suite and say : `` look here , this contract , this part , what does this mean ? '' ''"

In [23]:
# Fetch one element from train_dataset and check its Python type.
temp = next(iter(train_dataset))
type(temp)

tuple

In [26]:
# Shape of the first row of the first tensor in `temp`.
temp[0].numpy()[0].shape

(62,)

# Run a single example through the transformer

In [14]:
def _evaluate(inp_sentence, model):
    """
    Greedily decode from an already-encoded input sequence with a given model.

    Unlike evaluate(), no tokenizing or start/end wrapping is applied to the
    input. Relies on the notebook-level names tokenizer_en and MAX_LENGTH.

    PARAMS:
    inp_sentence: sequence of token ids; a batch axis of size one is added.
    model: callable with the Transformer call signature used below.

    RETURNS:
    tuple: (decoded ids with the batch axis squeezed out, beginning with the
        target start id; attention_weights from the last model call).
    """
    encoder_input = tf.expand_dims(inp_sentence, 0)
    print(encoder_input.shape)

    # Seed the decoder with the target-side start id.
    output = tf.expand_dims([tokenizer_en.vocab_size], 0)

    for step in range(MAX_LENGTH):
        if step % 20 == 0:
            print("at step {} of {}".format(step, MAX_LENGTH))

        enc_mask, combined, dec_mask = create_masks(encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = model(
            encoder_input, output, False, enc_mask, combined, dec_mask)

        # Only the last position matters: (batch_size, 1, vocab_size)
        last_step = predictions[:, -1:, :]
        next_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

        # Stop as soon as the end id is predicted; it is not appended.
        if next_id == tokenizer_en.vocab_size + 1:
            break

        # Feed the prediction back in as the next decoder input.
        output = tf.concat([output, next_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

In [26]:
# Take a single batch from train_dataset for the inspection cells.
sample_item = next(iter(train_dataset))


(<tf.Tensor: id=637054, shape=(64, 100), dtype=int64, numpy=
 array([[8214,    7, 2341, ...,    0,    0,    0],
        [8214,   18,  322, ...,    0,    0,    0],
        [8214,   22, 7189, ...,    0,    0,    0],
        ...,
        [8214, 8059,    1, ...,    0,    0,    0],
        [8214,  276,    1, ...,    0,    0,    0],
        [8214,    3,  496, ...,    0,    0,    0]])>,
 <tf.Tensor: id=637055, shape=(64, 87), dtype=int64, numpy=
 array([[8087,    3,  706, ...,    0,    0,    0],
        [8087,  368,   97, ...,    0,    0,    0],
        [8087,   11,    3, ...,    0,    0,    0],
        ...,
        [8087,  359,    1, ...,    0,    0,    0],
        [8087,   23,  145, ...,    0,    0,    0],
        [8087,    3,  486, ...,    0,    0,    0]])>)

In [28]:
# Shape of the first tensor in sample_item.
sample_item[0].numpy().shape

(64, 100)

In [24]:
# Pull one batch and split it into its input and target tensors.
sample_item = next(iter(train_dataset))
inp, tar_inp = sample_item
print(inp.shape, tar_inp.shape)

# Keep only the first sequence of each tensor.
inp = inp[0]
tar_inp = tar_inp[0]
print(inp.shape, tar_inp.shape)

# Decode from that single encoded input with the notebook-level model.
temp = _evaluate(inp, transformer)

print("Done")

(64, 65) (64, 61)
(65,) (61,)
(1, 65)
at step 0 of 400
Done


In [25]:
# Decoded ids: first element of the tuple returned by _evaluate().
temp[0].numpy()

array([8087], dtype=int32)

In [23]:
# Names of the attention-weight entries: second element of the returned tuple.
temp[1].keys()

dict_keys(['decoder_layer1_block1', 'decoder_layer1_block2', 'decoder_layer2_block1', 'decoder_layer2_block2', 'decoder_layer3_block1', 'decoder_layer3_block2', 'decoder_layer4_block1', 'decoder_layer4_block2'])

# Train Transformer

In [33]:
# train_transformer() takes data as (train_dataset, input_vocab_size, target_vocab_size).
data = (train_dataset, input_vocab_size, target_vocab_size)
train_transformer(transformer, config, data=data)

Latest checkpoint restored!!
Epoch 1 Batch 0 Loss 0.1181 Accuracy 0.3477


KeyboardInterrupt: 

In [41]:
# Translate a sample sentence, then print the reference translation for comparison.
translate("este é um problema que temos que resolver.")
print ("Real translation: this is a problem we have to solve .")
print()

In evaluate()
input sentence encoded
len(inp_sentence) 11
shape of encoded input (1, 11)
at step 0
Input: este é um problema que temos que resolver.
Predicted translation: this is one problem that we have to solve precisely by the time in data .
Real translation: this is a problem we have to solve .



In [42]:
# Translate a sample sentence, then print the reference translation for comparison.
translate("os meus vizinhos ouviram sobre esta ideia.")
print ("Real translation: and my neighboring homes heard about this idea .")
print()



In evaluate()
input sentence encoded
len(inp_sentence) 11
shape of encoded input (1, 11)
at step 0
Input: os meus vizinhos ouviram sobre esta ideia.
Predicted translation: my neighbors have heard about such a idea .
Real translation: and my neighboring homes heard about this idea .



In [43]:
# Translate a longer sample sentence, then print the reference translation for comparison.
translate("vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.")
print ("Real translation: so i 'll just share with you some stories very quickly of some magical things that have happened .")
print()

In evaluate()
input sentence encoded
len(inp_sentence) 20
shape of encoded input (1, 20)
at step 0
Input: vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.
Predicted translation: so i 'm very quickly going to share with you some stories of some magic things that happened so much as am .
Real translation: so i 'll just share with you some stories very quickly of some magical things that have happened .



In [45]:
# Translate a sample sentence, then print the reference translation for comparison.
# The trailing comment shows how to request an attention plot via `plot`.
translate("este é o primeiro livro que eu fiz.") #, plot='decoder_layer4_block2')
print ("Real translation: this is the first book i've ever done.")

In evaluate()
input sentence encoded
len(inp_sentence) 11
shape of encoded input (1, 11)
at step 0
Input: este é o primeiro livro que eu fiz.
Predicted translation: this is the first book i ever did .
Real translation: this is the first book i've ever done.
