In [1]:
%run DataandTokenizer.ipynb
%run Transformer.ipynb
import time

# Now that everything is set up all thats left is to train the model


In [2]:
#This are the parameters that define our transformers
layersnum = 4
modeldim = 128
dff = 512 #<- this is for the RELu layers between layers
num_heads = 4 
dropout_rate = 0.1

To fully complete whats described in the paper we will use the same variable learning rate and learning rate. This can be also found on the tensorflow post

In [3]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = modeldim
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
    

In [4]:
learning_rate = CustomSchedule(modeldim)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

Also since our target will have padding we need to apply a transformation to the loss function

In [5]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred) #we calculate the loss first 

    mask = tf.cast(mask, dtype=loss_.dtype) #if a False, means it was zero , which means it was padding
    #and padding shouldnt account for the loss so when multiplying its not counted.
    loss_ *= mask#we apply the mask to it

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)#sum all loses over the length of the tensor of mask


def accuracy_function(real, pred):
    accuracies = tf.equal(real, tf.argmax(pred, axis=2))
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)
    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)
    #its exactly the same thought but checking if its accurate or not 
    #get when its padding and then not acount it 
    
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [20]:
Embed=Embedding()
datos=Data()
tokenizers=Tokenizer()

def tokenize_pairs(es,pt):
    data2=[]
    data2.append(es)
    data2.append(pt)
    pt = Embed.tokenize(data2)[1]
    # Convert from ragged to dense, padding with zeros.
    es=Embed.tokenize(data2)[0]
    # Convert from ragged to dense, padding with zeros.
    return es,pt

shape1=(tf.shape(tokenize_pairs(datos.es,datos.pt)[0])[1])

shape2=(tf.shape(tokenize_pairs(datos.es,datos.pt)[1])[1])
maxnum=2
transformer = Transformer(
    num_layers=layersnum,
    d_model=modeldim,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=5000,
    target_vocab_size=5000,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate)


In [12]:

print(shape2)
print(shape1)

tf.Tensor(240, shape=(), dtype=int32)
tf.Tensor(215, shape=(), dtype=int32)


In [8]:
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

In [21]:


# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

BUFFER_SIZE = 20000

BATCH_SIZE = maxnum
@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    with tf.GradientTape() as tape:
        predictions, _ = transformer([inp, tar_inp],
                                 training = True)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(accuracy_function(tar_real, predictions))
    
    
    
    #STUFF
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                               as_supervised=True)


def make_batches(ds):
    return (
      ds
      .cache()
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
      .prefetch(tf.data.AUTOTUNE))
    
    
train_examples, val_examples = examples['train'], examples['validation']
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples) 
    
    
    
    

In [53]:
print(train_examples[0])

TypeError: 'PrefetchDataset' object is not subscriptable

In [None]:
EPOCHS=1  

for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    for (batch, (inp, tar)) in enumerate(train_batches):
        train_step(inp, tar)

    if batch % 50 == 0:
        print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
 
    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

In [21]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0
