In [17]:
%run DataandTokenizer.ipynb
%run Transformer.ipynb
import time

# Now that everything is set up all thats left is to train the model


In [47]:
#This are the parameters that define our transformers
layersnum = 4
modeldim = 128
dff = 512 #<- this is for the RELu layers between layers
num_heads = 4 
dropout_rate = 0.1

To fully complete whats described in the paper we will use the same variable learning rate and learning rate. This can be also found on the tensorflow post

In [48]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = modeldim
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
    

In [49]:
learning_rate = CustomSchedule(modeldim)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

Also since our target will have padding we need to apply a transformation to the loss function

In [50]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred) #we calculate the loss first 
    mask = tf.cast(mask, dtype=loss_.dtype) #if a False, means it was zero , which means it was padding
    
    
    
    #and padding shouldnt account for the loss so when multiplying its not counted.
    loss_ *= mask#we apply the mask to it
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)#sum all loses over the length of the tensor of mask


def accuracy_function(real, pred):
    accuracies = tf.equal(real, tf.argmax(pred, axis=2))
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)
    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)
    #its exactly the same thought but checking if its accurate or not 
    #get when its padding and then not acount it 
    
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [51]:
Embed=Embedding()
datos=Data()
tokenizers=Tokenizer()

def tokenize_pairs(es,pt):
    data2=[]
    data2.append(es)
    data2.append(pt)
    pt = Embed.tokenize(data2)[1]
    # Convert from ragged to dense, padding with zeros.
    es=Embed.tokenize(data2)[0]
    # Convert from ragged to dense, padding with zeros.
    return es,pt

maxnum=2
transformer = Transformer(
    num_layers=layersnum,
    d_model=modeldim,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=5000,
    target_vocab_size=5000,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate)


In [52]:
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

In [53]:


# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.


BUFFER_SIZE = 20000

BATCH_SIZE = 32
def train_step(inp, tar):
    tar_inp = tar[:,:-1]#(batch,tar_seq_len)
    tar_real = tar[:,1:]

    with tf.GradientTape() as tape:
        predictions, _ = transformer([inp, tar_inp],
                                 training = True)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)
    train_accuracy(accuracy_function(tar_real, predictions))
    
    #STUFF
examples, metadata = tfds.load('ted_hrlr_translate/es_to_pt', with_info=True,
                               as_supervised=True)


def make_batches(ds):
    return (
      ds
      .cache()
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
      .prefetch(tf.data.AUTOTUNE))
    
    
train_examples, val_examples = examples['train'], examples['validation']
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples) 
    
    
    
    

In [54]:
train_examples, val_examples = examples['train'], examples['validation']
train_batches=make_batches(train_examples)


In [80]:
EPOCHS=20  # 3done!

for epoch in range(EPOCHS):
    start = time.time()
    checkpoint_path = "./checkpoints/train"

    ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')
    train_loss.reset_states()
    train_accuracy.reset_states()
    for (batch, (inp, tar)) in enumerate(train_batches):
        train_step(inp, tar)

    
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

        if batch % 500 == 0:
            ckpt_save_path = ckpt_manager.save()
            print(f'Saving checkpoint for epoch {epoch+1} and for Batch {batch} at {ckpt_save_path}')

    if (epoch + 1) % 1 == 0:
        ckpt_save_path = ckpt_manager.save()
        print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

        print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

        print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Latest checkpoint restored!!
Epoch 1 Batch 0 Loss 2.1492 Accuracy 0.5591
Saving checkpoint for epoch 1 and for Batch 0 at ./checkpoints/train\ckpt-53


KeyboardInterrupt: 

In [104]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [68]:
class Translator(tf.Module):
    def __init__(self, embed, transformer):
        self.embed = embed
        self.transformer = transformer
        checkpoint_path = "./checkpoints/train"

        ckpt = tf.train.Checkpoint(transformer=transformer,
                               optimizer=optimizer)

        ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

        if ckpt_manager.latest_checkpoint:
            ckpt.restore(ckpt_manager.latest_checkpoint)
            print('Latest checkpoint restored!!')

    def __call__(self, sentence, max_length=20):
    # input sentence is portuguese, hence adding the start and end token
        data=[]
        dataes=[]
        datapt=[]
        dataes.append(sentence)
        data.append(dataes)
        data.append(datapt)
        sentence=self.embed.tokenize(data)
        encoder_input = sentence[0]
        print(encoder_input)
        data2=[]
        data2.append(encoder_input)
        data2.append([[2]])
        print((self.embed.detokenize(data2)[0].numpy())[0])
        # as the target is english, the first token to the transformer should be the
        # english start token.
        start = sentence[0][0][0][tf.newaxis]#WE USE THE IDS IN spanish BECAUSE THEY ARE THE SAME IN PT
        end = sentence[0][0][-1][tf.newaxis] #there could be some padding!
        print(start)
        print(end)
        # `tf.TensorArray` is required here (instead of a python list) so that the
        # dynamic-loop can be traced by `tf.function`.
        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        output_array = output_array.write(0, start)

        for i in tf.range(max_length):
            output = tf.transpose(output_array.stack())
            predictions, _ = self.transformer([encoder_input, output], training=False)

          # select the last token from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

            predicted_id = tf.argmax(predictions, axis=-1)
            print(predicted_id)
          # concatentate the predicted_id to the output which is given to the decoder
          # as its input.
            output_array = output_array.write(i+1, predicted_id[0])

            if predicted_id == end:
                break

        output = tf.transpose(output_array.stack())
    # output.shape (1, tokens)
        data=[]
        data.append([[2]])
        data.append(output[tf.newaxis][0])
        print(data)
        text = (self.embed.detokenize(data)[1].numpy())[0]
        print(text)# shape: ()

        # `tf.function` prevents us from using the attention_weights that were
        # calculated on the last iteration of the loop. So recalculate them outside
        # the loop.
        _, attention_weights = self.transformer([encoder_input, output[:,:-1]], training=False)

        return text, attention_weights

In [69]:
Embed=Embedding()
translator=Translator(Embed,transformer)

Latest checkpoint restored!!


In [70]:
def print_translation(sentence, text, ground_truth):
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Prediction":15s}: {text.decode("utf-8")}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [71]:
Datos=Data()
sentence = Datos.es[15]
ground_truth = "oi, como vai."

In [89]:

sentence = Datos.es[1954]
print(sentence)

sentence='no tengo ganas de hacer eso'
ground_truth = "oi, como vai."

un veterano de ted es mi amigo jonathan haidt .


In [90]:
translated_text, attention_weights = translator((sentence))
print_translation(sentence, translated_text, ground_truth)

tf.Tensor([[   2  197  380 3444  198  188  230  218    3]], shape=(1, 9), dtype=int64)
b'[START] no tengo ganas de hacer eso [END]'
tf.Tensor([2], shape=(1,), dtype=int64)
tf.Tensor([3], shape=(1,), dtype=int64)
tf.Tensor([[117]], shape=(1, 1), dtype=int64)
tf.Tensor([[263]], shape=(1, 1), dtype=int64)
tf.Tensor([[4763]], shape=(1, 1), dtype=int64)
tf.Tensor([[118]], shape=(1, 1), dtype=int64)
tf.Tensor([[112]], shape=(1, 1), dtype=int64)
tf.Tensor([[143]], shape=(1, 1), dtype=int64)
tf.Tensor([[135]], shape=(1, 1), dtype=int64)
tf.Tensor([[16]], shape=(1, 1), dtype=int64)
tf.Tensor([[3]], shape=(1, 1), dtype=int64)
[[[2]], <tf.Tensor: shape=(1, 10), dtype=int64, numpy=
array([[   2,  117,  263, 4763,  118,  112,  143,  135,   16,    3]],
      dtype=int64)>]
b'[START] n\xc3\xa3o tenho ganhas de fazer isso . [END]'
Input:         : no tengo ganas de hacer eso
Prediction     : [START] não tenho ganhas de fazer isso . [END]
Ground truth   : oi, como vai.
