<a href="https://colab.research.google.com/github/deathvadeR-afk/mini_Supervised_Unsupervised_Deep_Learning_GenAI_projects/blob/main/Building_a_transformer_model_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, LayerNormalization
from tensorflow.keras.models import Model
import numpy as np

# Positional Encoding

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Width of each encoding vector (columns).

    Returns:
        A float32 tensor of shape (1, position, d_model). Even feature
        columns hold sines and odd feature columns hold cosines.
    """
    positions = np.arange(position)[:, np.newaxis]
    feature_idx = np.arange(d_model)
    # Columns 2i and 2i+1 share the same frequency exponent 2i / d_model.
    exponents = (2 * (feature_idx // 2)) / np.float32(d_model)
    table = positions / np.power(10000, exponents)
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    # Leading axis of size 1 lets the table broadcast over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

# Multi-Head Attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected with a Dense layer of width
    d_model, split into num_heads heads of size d_model // num_heads,
    attended independently, merged back and passed through a final Dense
    layer.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Every head must receive an equal share of the model width.
        assert d_model % num_heads == 0

        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Attend from q over k/v and return a (batch, seq_q, d_model) tensor.

        Note the argument order: values, keys, queries, then the optional
        mask that is added to the attention logits.
        """
        batch_size = tf.shape(q)[0]

        q_heads = self.split_heads(self.wq(q), batch_size)
        k_heads = self.split_heads(self.wk(k), batch_size)
        v_heads = self.split_heads(self.wv(v), batch_size)

        # The attention weights are computed but not exposed by this layer.
        context, _ = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        # (batch, heads, seq, depth) -> (batch, seq, heads, depth) -> (batch, seq, d_model)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (batch_size, -1, self.d_model))
        return self.dense(merged)

    def scaled_dot_product_attention(self, q, k, v, mask):
        """Return (attention output, attention weights) for the given heads.

        Where mask is 1 the logit receives -1e9, so the softmax assigns that
        position a weight close to zero.
        """
        key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
        logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

        if mask is not None:
            logits = logits + (mask * -1e9)

        weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(weights, v), weights

# Feed Forward Network

In [None]:
class PositionwiseFeedforward(tf.keras.layers.Layer):
    """Two-layer feed-forward network applied to every position.

    Expands to dff units with a ReLU, then projects back to d_model units.
    """

    def __init__(self, d_model, dff):
        super().__init__()
        self.d_model = d_model
        self.dff = dff
        self.dense1 = Dense(dff, activation='relu')
        self.dense2 = Dense(d_model)

    def call(self, x):
        """Return dense2(dense1(x))."""
        hidden = self.dense1(x)
        return self.dense2(hidden)

# Transformer Block

In [None]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalized.
    """

    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.att = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedforward(d_model, dff)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, training, mask=None):
        """Run the block on x; mask is forwarded to the self-attention."""
        # Self-attention: x serves as values, keys and queries.
        attended = self.dropout1(self.att(x, x, x, mask), training=training)
        normed_attention = self.layernorm1(x + attended)

        fed_forward = self.dropout2(self.ffn(normed_attention), training=training)
        return self.layernorm2(normed_attention + fed_forward)


# Encoder


In [None]:
class Encoder(tf.keras.layers.Layer):
    """Transformer encoder: token embedding, positional encoding, N blocks.

    Args:
        num_layers: Number of stacked TransformerBlock layers.
        d_model: Embedding width and model width.
        num_heads: Attention heads per block.
        dff: Hidden width of each block's feed-forward network.
        input_vocab_size: Size of the input embedding table.
        maximum_position_encoding: Number of positions precomputed in the
            positional-encoding table.
        dropout_rate: Dropout rate for the embeddings and inside each block.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dropout = Dropout(dropout_rate)
        self.enc_layers = [TransformerBlock(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]

    def call(self, x, training, mask=None):
        """Encode a batch of token ids.

        Args:
            x: Integer token ids; axis 1 is the sequence axis.
            training: Forwarded to every dropout layer.
            mask: Optional mask forwarded to each block's self-attention.

        Returns:
            The output of the last TransformerBlock.
        """
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        # Scale the embeddings by sqrt(d_model) so the positional encoding
        # (values in [-1, 1]) does not swamp the small-initialised embeddings.
        x *= tf.math.sqrt(tf.cast(self.d_model, x.dtype))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for layer in self.enc_layers:
            x = layer(x, training=training, mask=mask)
        return x

# Decoder

In [None]:
class _DecoderBlock(tf.keras.layers.Layer):
    """One decoder layer: masked self-attention, encoder-decoder attention, feed-forward.

    Each sub-layer output goes through dropout, a residual connection and
    layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.self_att = MultiHeadAttention(d_model, num_heads)
        self.cross_att = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedforward(d_model, dff)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)
        self.dropout3 = Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask=None, padding_mask=None):
        # Masked self-attention over the target sequence.
        self_attn = self.self_att(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        out1 = self.layernorm1(x + self_attn)

        # Encoder-decoder attention. MultiHeadAttention.call takes
        # (v, k, q, mask): values and keys come from the encoder output,
        # queries from the decoder state.
        cross_attn = self.cross_att(enc_output, enc_output, out1, padding_mask)
        cross_attn = self.dropout2(cross_attn, training=training)
        out2 = self.layernorm2(out1 + cross_attn)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(out2 + ffn_output)


class Decoder(tf.keras.layers.Layer):
    """Transformer decoder: token embedding, positional encoding, N decoder blocks.

    Args:
        num_layers: Number of stacked decoder blocks.
        d_model: Embedding width and model width.
        num_heads: Attention heads per attention sub-layer.
        dff: Hidden width of each block's feed-forward network.
        target_vocab_size: Size of the target embedding table.
        maximum_position_encoding: Number of positions precomputed in the
            positional-encoding table.
        dropout_rate: Dropout rate for the embeddings and inside each block.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dropout = Dropout(dropout_rate)
        # Decoder blocks include encoder-decoder attention so that the
        # output actually depends on enc_output.
        self.dec_layers = [_DecoderBlock(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]

    def call(self, x, enc_output, training, look_ahead_mask=None, padding_mask=None):
        """Decode target token ids conditioned on the encoder output.

        Args:
            x: Integer target token ids; axis 1 is the sequence axis.
            enc_output: Encoder output attended to by every decoder block.
            training: Forwarded to every dropout layer.
            look_ahead_mask: Optional mask for the target self-attention.
            padding_mask: Optional mask over encoder positions, applied in
                the encoder-decoder attention.

        Returns:
            The output of the last decoder block.
        """
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        # Scale the embeddings by sqrt(d_model) so the positional encoding
        # (values in [-1, 1]) does not swamp the small-initialised embeddings.
        x *= tf.math.sqrt(tf.cast(self.d_model, x.dtype))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for layer in self.dec_layers:
            x = layer(
                x,
                enc_output,
                training=training,
                look_ahead_mask=look_ahead_mask,
                padding_mask=padding_mask,
            )
        return x

# Transformer Model

In [None]:
class Transformer(Model):
    """Encoder-decoder model ending in a Dense projection to vocabulary logits."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        # No activation: the model returns raw logits over the target vocabulary.
        self.final_layer = Dense(target_vocab_size)

    def call(self, inputs, targets, training, look_ahead_mask=None, padding_mask=None):
        """Return logits for targets given inputs.

        padding_mask is handed to both the encoder and the decoder;
        look_ahead_mask is handed to the decoder only.
        """
        encoded = self.encoder(inputs, training=training, mask=padding_mask)
        decoded = self.decoder(
            targets,
            encoded,
            training=training,
            look_ahead_mask=look_ahead_mask,
            padding_mask=padding_mask,
        )
        return self.final_layer(decoded)

# Parameters

In [None]:
# Model hyper-parameters passed to Transformer(...).
# NOTE: this cell reads tokenizer_pt / tokenizer_en, which are built in the
# "Preprocess the data: Tokenization" section further down; run that section
# first or this cell raises NameError.
num_layers = 4  # number of stacked blocks in the encoder and in the decoder
d_model = 128  # embedding / model width; must be divisible by num_heads
num_heads = 8  # attention heads per block
dff = 512  # hidden width of the position-wise feed-forward network
input_vocab_size = tokenizer_pt.vocab_size + 2  # +2 for the start and end ids added during preprocessing
target_vocab_size = tokenizer_en.vocab_size + 2 # +2 for START_TOKEN and END_TOKEN
pe_input = 1000  # positions precomputed for the encoder's positional encoding
pe_target = 1000  # positions precomputed for the decoder's positional encoding
dropout_rate = 0.1

# Create the Transformer model

In [None]:
# Keyword arguments make the mapping onto Transformer.__init__ explicit
# (dropout_rate is received there as `rate`).
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=pe_input,
    pe_target=pe_target,
    rate=dropout_rate,
)

# Define the optimizer, loss function, and metrics

In [None]:
# Adam with a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# The model's final Dense layer emits raw logits, hence from_logits=True.
# reduction='none' keeps one loss value per target token so that padding
# positions can be masked out before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)

def loss_function(real, pred):
    """Mean cross-entropy over the non-padding target tokens.

    Args:
        real: Integer target ids; id 0 is treated as padding.
        pred: Logits with a trailing vocabulary axis.

    Returns:
        Sum of the per-token losses at non-padding positions divided by
        the number of non-padding positions.
    """
    per_token = loss_object(real, pred)
    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    weights = tf.cast(is_real_token, dtype=per_token.dtype)
    masked_total = tf.reduce_sum(per_token * weights)
    return masked_total / tf.reduce_sum(weights)

# Accuracy metric for integer (sparse) target ids scored against the
# model's per-token predictions.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# No model.compile() / model.fit() here: training uses the custom loop in
# the "Training Step and Loop" section, which builds the attention masks
# for every batch.

# NOTE: train_accuracy is created again at the top of that training cell,
# so the instance above only matters if the metric is used before then.
# (An earlier draft used tf.keras.metrics.Mean for this metric.)

# Prepare sample data and masks

# Basic Training Loop

## 1. Load the dataset

In [None]:
import tensorflow_datasets as tfds

# Portuguese -> English TED-talk translation pairs. as_supervised=True
# yields (pt, en) tuples; with_info=True also returns the dataset metadata.
dataset, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)

train_dataset = dataset['train']
val_dataset = dataset['validation']

print("Dataset loaded successfully.")
print("Metadata:", metadata)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/incomplete.ZXWL8Z_1.0.0/ted_hrlr_translate-tra…

Generating validation examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/incomplete.ZXWL8Z_1.0.0/ted_hrlr_translate-val…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/incomplete.ZXWL8Z_1.0.0/ted_hrlr_translate-tes…

Dataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0. Subsequent calls will reuse this data.
Dataset loaded successfully.
Metadata: tfds.core.DatasetInfo(
    name='ted_hrlr_translate',
    full_name='ted_hrlr_translate/pt_to_en/1.0.0',
    description="""
    Data sets derived from TED talk transcripts for comparing similar language pairs
    where one is high resource and the other is low resource.
    """,
    config_description="""
    Translation dataset from pt to en in plain text.
    """,
    homepage='https://github.com/neulab/word-embeddings-for-nmt',
    data_dir='/root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0',
    file_format=tfrecord,
    download_size=124.94 MiB,
    dataset_size=10.89 MiB,
    features=Translation({
        'en': Text(shape=(), dtype=string),
        'pt': Text(shape=(), dtype=string),
    }),
    supervised_keys=('pt', 'en'),
    disable_shuffling=False,
    nondeterministic_orde

## 2. Preprocess the data: Tokenization

In [None]:
# Build one subword tokenizer per language from the training split.
# Each corpus is a lazy generator of raw byte strings; the dataset is
# iterated once per language. target_vocab_size is an approximate target.
pt_corpus = (pt_text.numpy() for pt_text, _ in train_dataset)
tokenizer_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    pt_corpus,
    target_vocab_size=2**13,
)

en_corpus = (en_text.numpy() for _, en_text in train_dataset)
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    en_corpus,
    target_vocab_size=2**13,
)

print("Tokenizers built.")
print("English vocabulary size:", tokenizer_en.vocab_size)
print("Portuguese vocabulary size:", tokenizer_pt.vocab_size)

Tokenizers built.
English vocabulary size: 8087
Portuguese vocabulary size: 8214


## 3. Preprocess the data: Tokenize, Pad, and Filter

In [None]:
# Longest sequence (including start/end ids) that is kept; shorter ones are
# padded up to this length.
MAX_LENGTH = 40

# The English start and end markers take the two ids immediately after the
# tokenizer's own vocabulary, so they cannot collide with a subword id.
START_TOKEN = tokenizer_en.vocab_size
END_TOKEN = START_TOKEN + 1

# Size of the decoder's embedding table and output layer: the tokenizer
# vocabulary plus the two marker ids above.
target_vocab_size = END_TOKEN + 1

# Encode one sentence pair and flag whether it fits within MAX_LENGTH
def tokenize_and_filter(pt, en):
    """Encode a (pt, en) pair to fixed-length id vectors plus a keep flag.

    Args:
        pt: Eager string tensor holding the Portuguese sentence.
        en: Eager string tensor holding the English sentence.

    Returns:
        (pt_ids, en_ids, keep): two int64 tensors of length MAX_LENGTH and a
        scalar bool tensor. When either encoded sentence (including its
        start/end ids) is longer than MAX_LENGTH, both id tensors are all
        zeros and keep is False.
    """
    pt_start = tokenizer_pt.vocab_size
    pt_ids = [pt_start, *tokenizer_pt.encode(pt.numpy()), pt_start + 1]
    en_ids = [START_TOKEN, *tokenizer_en.encode(en.numpy()), END_TOKEN]

    # Over-long pair: return correctly shaped placeholders so the output
    # signature stays fixed; the False flag lets a later filter drop them.
    if len(pt_ids) > MAX_LENGTH or len(en_ids) > MAX_LENGTH:
        placeholder_pt = tf.zeros(MAX_LENGTH, dtype=tf.int64)
        placeholder_en = tf.zeros(MAX_LENGTH, dtype=tf.int64)
        return placeholder_pt, placeholder_en, tf.constant(False, dtype=tf.bool)

    # Right-pad with id 0 up to MAX_LENGTH.
    pt_ids.extend([0] * (MAX_LENGTH - len(pt_ids)))
    en_ids.extend([0] * (MAX_LENGTH - len(en_ids)))
    return (
        tf.constant(pt_ids, dtype=tf.int64),
        tf.constant(en_ids, dtype=tf.int64),
        tf.constant(True, dtype=tf.bool),
    )


# The tokenizers are plain Python objects, so the encoding step runs through
# tf.py_function; one shared wrapper serves both splits.
def _encode_example(pt, en):
    """Run tokenize_and_filter on one (pt, en) pair inside a tf.data map."""
    return tf.py_function(
        func=tokenize_and_filter,
        inp=[pt, en],
        Tout=[tf.int64, tf.int64, tf.bool],
    )


train_dataset_processed = train_dataset.map(_encode_example)
val_dataset_processed = val_dataset.map(_encode_example)

print("Datasets preprocessed (tokenized, padded, and flagged for filtering).")

# Show one processed training example
for sample_pt, sample_en, sample_keep in train_dataset_processed.take(1):
    print("\nSample Portuguese sequence:", sample_pt)
    print("Sample English sequence:", sample_en)
    print("Should keep:", sample_keep)
    print("Shape of Portuguese sequence:", sample_pt.shape)
    print("Shape of English sequence:", sample_en.shape)

Datasets preprocessed (tokenized, padded, and flagged for filtering).

Sample Portuguese sequence: tf.Tensor(
[8214    6   40 4092   57    3 1687    1 6155   12    3  461 6770   19
 5227 1088   97    1    5    8    3 4213 3408 7256 1670    2 8215    0
    0    0    0    0    0    0    0    0    0    0    0    0], shape=(40,), dtype=int64)
Sample English sequence: tf.Tensor(
[8087    4   59   15 1792 6561 3060 7952    1   15  103  134  378    3
   47 6122    6 5311    1   91   13 1849  559 1609  894    2 8088    0
    0    0    0    0    0    0    0    0    0    0    0    0], shape=(40,), dtype=int64)
Should keep: tf.Tensor(True, shape=(), dtype=bool)
Shape of Portuguese sequence: (40,)
Shape of English sequence: (40,)


## 4. Create and prepare the TensorFlow Dataset

In [None]:
# Hyper-parameters for the input pipeline
BUFFER_SIZE = 20000  # shuffle buffer size
BATCH_SIZE = 64      # examples per batch


# Named functions instead of lambdas: AutoGraph cannot tell apart two
# lambdas with identical signatures defined on the same source line and
# emits "could not parse the source code of <lambda>" warnings for them.
def _is_flagged_keep(pt, en, should_keep):
    """Filter predicate: keep only examples whose flag is True."""
    return should_keep


def _drop_flag(pt, en, should_keep):
    """Strip the keep flag, leaving the (pt, en) pair."""
    return pt, en


def _kept_pairs(processed):
    """Filter on the keep flag, drop the flag, and cache the result."""
    return processed.filter(_is_flagged_keep).map(_drop_flag).cache()


# Training pipeline: shuffle, then batch, then prefetch.
train_dataset_tf = _kept_pairs(train_dataset_processed)
train_dataset_tf = train_dataset_tf.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
train_dataset_tf = train_dataset_tf.prefetch(tf.data.AUTOTUNE)

# Validation pipeline: same steps without shuffling.
val_dataset_tf = _kept_pairs(val_dataset_processed)
val_dataset_tf = val_dataset_tf.batch(BATCH_SIZE)
val_dataset_tf = val_dataset_tf.prefetch(tf.data.AUTOTUNE)

print("TensorFlow Datasets created and prepared for training.")

# Print the structure of the dataset
for element in train_dataset_tf.take(1):
    print("\nSample batch structure:")
    print("Input batch shape:", element[0].shape)
    print("Target batch shape:", element[1].shape)

Cause: could not parse the source code of <function <lambda> at 0x792aa2408ea0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda pt, en, should_keep: (pt, en)

Match 1:
lambda pt, en, should_keep: should_keep



Cause: could not parse the source code of <function <lambda> at 0x792aa2408ea0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda pt, en, should_keep: (pt, en)

Match 1:
lambda pt, en, should_keep: should_keep



Cause: could not parse the source code of <function <lambda> at 0x792aa240b100>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda pt, en, should_keep: (pt, en)

Match 1:
lambda pt, en, should_keep: should_keep



Cause: could not parse the source code of <function <lambda> at 0x792aa240b100>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda pt, en, should_keep: (pt, en)

Match 1:
lambda pt, en, should_keep: should_keep



Cause: could not parse the source code of <function <lambda> at 0x792aa2409300>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda pt, en, should_keep: (pt, en)

Match 1:
lambda pt, en, should_keep: should_keep



Cause: could not parse the source code of <function <lambda> at 0x792aa2409300>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda pt, en, should_keep: (pt, en)

Match 1:
lambda pt, en, should_keep: should_keep



Cause: could not parse the source code of <function <lambda> at 0x792aa2409260>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda pt, en, should_keep: (pt, en)

Match 1:
lambda pt, en, should_keep: should_keep



Cause: could not parse the source code of <function <lambda> at 0x792aa2409260>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda pt, en, should_keep: (pt, en)

Match 1:
lambda pt, en, should_keep: should_keep

TensorFlow Datasets created and prepared for training.

Sample batch structure:
Input batch shape: (64, 40)
Target batch shape: (64, 40)


## 5. Training Step and Loop

In [None]:
# Create masks
def create_padding_mask(seq):
    """Return a float mask that is 1.0 where seq equals 0 (padding).

    Two singleton axes are inserted after the first axis so that a
    (batch, seq_len) input yields a (batch, 1, 1, seq_len) mask that
    broadcasts against the attention logits.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    """Return a (size, size) mask with 1.0 strictly above the diagonal.

    Entry [i, j] is 1.0 when j > i, i.e. position j lies in the future of
    position i, and 0.0 otherwise.
    """
    # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
    visible = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - visible  # (seq_len, seq_len)

def create_masks(inp, tar):
    """Build the attention masks for one batch.

    Args:
        inp: Encoder input token ids.
        tar: Decoder input token ids.

    Returns:
        (enc_padding_mask, combined_mask, dec_padding_mask). The first and
        last both mark the padding positions of inp. combined_mask is the
        element-wise maximum of the target padding mask and the look-ahead
        mask, so it hides padded and future target positions.
    """
    # Padding of the source sentence: once for the encoder, and once for
    # masking the encoder outputs from the decoder side.
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    # A target position is hidden if it is padding or lies in the future.
    target_len = tf.shape(tar)[1]
    future_mask = create_look_ahead_mask(target_len)
    target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(target_padding_mask, future_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

# Metrics
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# Start every training run with a fresh optimizer that has the same
# configuration. A Keras optimizer is tied to the variables it was first
# applied to; reusing one after `transformer` has been re-created fails
# with "ValueError: Unknown variable ...".
optimizer = type(optimizer).from_config(optimizer.get_config())

# Training loop
EPOCHS = 10 # Adjust as needed

for epoch in range(EPOCHS):
    train_loss.reset_state()
    train_accuracy.reset_state()

    for batch, (inp, tar) in enumerate(train_dataset_tf):
        # Teacher forcing: the decoder sees tokens [0, n-1) and is trained
        # to predict tokens [1, n).
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        # The third mask returned by create_masks duplicates the encoder
        # padding mask and is not needed here.
        enc_padding_mask, combined_mask, _ = create_masks(inp, tar_inp)

        with tf.GradientTape() as tape:
            predictions = transformer(inp, tar_inp,
                                      training=True, # Pass training as keyword
                                      look_ahead_mask=combined_mask,
                                      padding_mask=enc_padding_mask)
            loss = loss_function(tar_real, predictions)

        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

        train_loss(loss)
        # Weight padding positions (id 0) by zero so the reported accuracy
        # covers real tokens only, matching what loss_function averages over.
        token_weights = tf.cast(
            tf.math.logical_not(tf.math.equal(tar_real, 0)), tf.float32)
        train_accuracy(tar_real, predictions, sample_weight=token_weights)

        if batch % 50 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Epoch {epoch+1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    # Validation can be added here: iterate val_dataset_tf, build the same
    # masks (the look-ahead mask is still required, because the decoder is
    # fed the shifted target sequence), call the model with training=False,
    # and skip the gradient step.

ValueError: Unknown variable: <Variable path=transformer_3/encoder_3/embedding_6/embeddings, shape=(8216, 128), dtype=float32, value=[[-0.00602344 -0.00865012 -0.00362698 ...  0.03263194  0.01404348
  -0.00496665]
 [ 0.04102465  0.04732454  0.03636659 ... -0.04499829 -0.00279299
   0.00185742]
 [ 0.01148772  0.00525052  0.02763403 ... -0.01427466  0.01298418
   0.03535299]
 ...
 [-0.00830592 -0.01542817 -0.04617376 ...  0.01114074  0.04713403
   0.01660562]
 [ 0.04759903 -0.0463377   0.0037526  ...  0.00894784  0.01321394
  -0.00713097]
 [ 0.02826972  0.01244457 -0.02089764 ...  0.04521878 -0.01835018
  -0.00840268]]>. This optimizer can only be called for the variables it was originally built with. When working with a new set of variables, you should recreate a new optimizer instance.

In [None]:
# Sanity check: the largest id in the kept English sequences must be a valid
# index into an embedding table with target_vocab_size rows.
max_target_token_id = 0
for _, en_ids, keep_flag in train_dataset_processed:
    # Rejected pairs are all-zero placeholders; skip them.
    if not keep_flag.numpy():
        continue
    max_target_token_id = max(max_target_token_id, tf.reduce_max(en_ids).numpy())

print(f"Maximum token ID in processed target sequences: {max_target_token_id}")
print(f"Calculated target_vocab_size: {target_vocab_size}")

# Valid ids run from 0 to target_vocab_size - 1.
if not (max_target_token_id < target_vocab_size):
    print("Warning: Maximum token ID is greater than or equal to the calculated target_vocab_size.")