<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/transformers2025_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* The Transformer is a novel network architecture that relies solely on attention mechanisms, avoiding recurrence and convolutions.  It has shown superior quality and parallelizability in machine translation tasks, requiring less training time.

* The Transformer outperforms existing models in English-to-German translation, achieving 28.4 BLEU, and sets a new single-model state-of-the-art BLEU score of 41.8 in English-to-French translation. Additionally, it generalizes well to English constituency parsing.

Attention Is All You Need: https://arxiv.org/abs/1706.03762

In [None]:
!pip install torch -q
!pip install torchtext -q
!pip install nltk -q
!pip install datasets -q
!pip install transformers -q

In [1]:
!nvidia-smi

Fri Apr 18 12:34:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P0             51W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from nltk.translate.bleu_score import corpus_bleu
from datasets import load_dataset
from transformers import AutoTokenizer
from warnings import filterwarnings
filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Data Loading and Preprocessing
# Load the datasets

#en_de_dataset = load_dataset("wmt14", "de-en")
#en_fr_dataset = load_dataset("wmt14", "fr-en")


#en_de_dataset = load_dataset("wmt14", "de-en", split="train[:1%]")  # Use 1% of the training data
#en_fr_dataset = load_dataset("wmt14", "fr-en", split="train[:1%]")  # Use 1% of the training data


#en_de_dataset = load_dataset("wmt14", "de-en", split={"train": "train[:1%]", "validation": "validation[:50%]", "test": "test[:50%]"})
#en_fr_dataset = load_dataset("wmt14", "fr-en", split={"train": "train[:1%]", "validation": "validation[:50%]", "test": "test[:50%]"})


en_de_dataset = load_dataset("wmt14", "de-en", split={"train": "train[:1%]", "validation": "validation[:100%]", "test": "test[:100%]"})
en_fr_dataset = load_dataset("wmt14", "fr-en", split={"train": "train[:1%]", "validation": "validation[:100%]", "test": "test[:100%]"})

# Tokenization, Vocabulary, and Batching
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets_de = [ex["de"] for ex in examples["translation"]]

    # Tokenize the inputs and targets
    model_inputs = tokenizer(inputs, padding=True, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets_de, padding=True, truncation=True)

    # Add the target language labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

en_de_dataset = en_de_dataset.map(preprocess_function, batched=True)

def preprocess_function_fr(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets_fr = [ex["fr"] for ex in examples["translation"]]

    # Tokenize the inputs and targets
    model_inputs = tokenizer(inputs, padding=True, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets_fr, padding=True, truncation=True)

    # Add the target language labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

en_fr_dataset = en_fr_dataset.map(preprocess_function_fr, batched=True)

# PyTorch Dataset
class TranslationDataset(Dataset):
    def __init__(self, data, lang):
        self.data = data
        self.lang = lang

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            "input_text": torch.tensor(self.data[idx]["input_ids"]),
            f"target_text_{self.lang}": torch.tensor(self.data[idx]["labels"])
        }

en_de_train_dataset = TranslationDataset(en_de_dataset["train"], "de")
en_de_val_dataset = TranslationDataset(en_de_dataset["validation"], "de")
en_de_test_dataset = TranslationDataset(en_de_dataset["test"], "de")

en_fr_train_dataset = TranslationDataset(en_fr_dataset["train"], "fr")
en_fr_val_dataset = TranslationDataset(en_fr_dataset["validation"], "fr")
en_fr_test_dataset = TranslationDataset(en_fr_dataset["test"], "fr")

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_texts = [item["input_text"] for item in batch]
    # Get the target language from the first item in the batch
    # Check if 'target_text_de' is in the keys. If so, the target language is 'de', otherwise it's 'fr'.
    target_lang = 'de' if 'target_text_de' in batch[0] else 'fr'

    # Use the correct target language key when creating 'target_texts'
    target_texts = [item[f"target_text_{target_lang}"] for item in batch]

    # Pad the sequences
    input_texts = pad_sequence(input_texts, batch_first=True, padding_value=pad_idx)
    target_texts = pad_sequence(target_texts, batch_first=True, padding_value=pad_idx)

    return {
        "input_text": input_texts,
        f"target_text_{target_lang}": target_texts
    }

def create_dataloaders(datasets, batch_size=32):
    dataloaders = {}
    for name, dataset in datasets.items():
        dataloaders[name] = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) # Pass the collate_fn to the DataLoader
    return dataloaders



batch_size = 64
en_de_dataloaders = create_dataloaders({
    "train": en_de_train_dataset,
    "val": en_de_val_dataset,
    "test": en_de_test_dataset
}, batch_size)

en_fr_dataloaders = create_dataloaders({
    "train": en_fr_train_dataset,
    "val": en_fr_val_dataset,
    "test": en_fr_test_dataset
}, batch_size)

# 2. Transformer Model Definition
class Transformer(nn.Module):
    def __init__(self, input_vocab_size, target_vocab_size, d_model, num_heads, num_layers, d_ff):
        super().__init__()
        self.encoder = Encoder(input_vocab_size, d_model, num_heads, num_layers, d_ff)
        self.decoder = Decoder(target_vocab_size, d_model, num_heads, num_layers, d_ff)
        self.generator = nn.Linear(d_model, target_vocab_size)

    def forward(self, src, tgt, src_mask, tgt_mask):
        memory = self.encoder(src, src_mask)
        output = self.decoder(tgt, memory, src_mask, tgt_mask)
        return self.generator(output)

# Encoder and Decoder
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])

    def forward(self, src, mask):
        x = self.embedding(src)
        for layer in self.layers:
            x = layer(x, mask)
        return x

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])

    def forward(self, tgt, memory, src_mask, tgt_mask):
        x = self.embedding(tgt)
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return x

# EncoderLayer and DecoderLayer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask):
        attn_output, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, memory, src_mask, tgt_mask):
        attn_output, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        cross_attn_output, _ = self.cross_attn(x, memory, memory, src_mask)
        x = self.norm2(x + self.dropout(cross_attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

# MultiHeadAttention and FeedForward
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_k = d_model // num_heads
        self.h = num_heads
        self.d_model = d_model

        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)

        self.scaled_dot_product_attention = ScaledDotProductAttention()

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        Q = self.W_Q(query).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        K = self.W_K(key).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        V = self.W_V(value).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        attn_output, attn_output_weights = self.scaled_dot_product_attention(Q, K, V, mask)

        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        output = self.W_O(attn_output)

        return output, attn_output_weights

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V, mask=None):
        d_k = K.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn = F.softmax(scores, dim=-1)
        output = torch.matmul(attn, V)
        return output, attn

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

# Masking
def create_masks(src, tgt, pad_idx):
    src_mask = (src != pad_idx).unsqueeze(1).unsqueeze(2)
    tgt_mask = (tgt != pad_idx).unsqueeze(1).unsqueeze(2) & subsequent_mask(tgt.size(1)).to(tgt.device)
    return src_mask, tgt_mask

def subsequent_mask(size):
    attn_shape = (1, size, size)
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return subsequent_mask == 0

# 3. Training

print('\n\n')
print(f"Number of training batches (en_de): {len(en_de_dataloaders['train'])}")
print(f"Number of validation batches (en_de): {len(en_de_dataloaders['val'])}")
print(f"Number of test batches (en_de): {len(en_de_dataloaders['test'])}")
print('\n')
print(f"Number of training batches (en_fr): {len(en_fr_dataloaders['train'])}")
print(f"Number of validation batches (en_fr): {len(en_fr_dataloaders['val'])}")
print(f"Number of test batches (en_fr): {len(en_fr_dataloaders['test'])}")
print('\n')

from tqdm import tqdm
def train_loop(model, optimizer, dataloader, loss_fn, device, pad_idx):
    model.train()
    total_loss = 0

    # Get the target language from the dataloader's dataset
    target_lang = dataloader.dataset.lang

    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1} - {target_lang.upper()}", leave=False):
        src = batch["input_text"].to(device)
        # Use the target_lang directly to access the target text
        tgt = batch[f"target_text_{target_lang}"].to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        src_mask, tgt_mask = create_masks(src, tgt_input, pad_idx)
        preds = model(src, tgt_input, src_mask, tgt_mask)
        loss = loss_fn(preds.reshape(-1, preds.size(-1)), tgt_output.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Model parameters
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
input_vocab_size_de = tokenizer.vocab_size
target_vocab_size_de = tokenizer.vocab_size
input_vocab_size_fr = tokenizer.vocab_size
target_vocab_size_fr = tokenizer.vocab_size
pad_idx = tokenizer.pad_token_id
max_len = 100
start_symbol = tokenizer.bos_token_id
end_symbol = tokenizer.eos_token_id

# Initialize models
transformer_de = Transformer(input_vocab_size_de, target_vocab_size_de, d_model, num_heads, num_layers, d_ff).to(device)
transformer_fr = Transformer(input_vocab_size_fr, target_vocab_size_fr, d_model, num_heads, num_layers, d_ff).to(device)

# Optimizers
optimizer_de = optim.Adam(transformer_de.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
optimizer_fr = optim.Adam(transformer_fr.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Loss function
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)


In [None]:
print('\n\n')
print(f"Number of training batches (en_de): {len(en_de_dataloaders['train'])}")
print(f"Number of validation batches (en_de): {len(en_de_dataloaders['val'])}")
print(f"Number of test batches (en_de): {len(en_de_dataloaders['test'])}")
print('\n')
print(f"Number of training batches (en_fr): {len(en_fr_dataloaders['train'])}")
print(f"Number of validation batches (en_fr): {len(en_fr_dataloaders['val'])}")
print(f"Number of test batches (en_fr): {len(en_fr_dataloaders['test'])}")
print('\n')

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    train_loss_de = train_loop(transformer_de, optimizer_de, en_de_dataloaders["train"], loss_fn, device, pad_idx)
    print(f"Epoch {epoch+1} - DE Train Loss: {train_loss_de:.4f}")

    train_loss_fr = train_loop(transformer_fr, optimizer_fr, en_fr_dataloaders["train"], loss_fn, device, pad_idx)
    print(f"Epoch {epoch+1} - FR Train Loss: {train_loss_fr:.4f}")

# 4. Evaluation
def translate_sentence(model, src, max_len, start_symbol, end_symbol, device):
    model.eval()
    with torch.no_grad():
        src_mask = (src != pad_idx).unsqueeze(1).unsqueeze(2).to(device)
        memory = model.encoder(src.to(device), src_mask.to(device))

        tgt = torch.zeros(1, 1).fill_(start_symbol).type(torch.long).to(device)

        for i in range(1, max_len):
            tgt_mask = subsequent_mask(tgt.size(1)).unsqueeze(0).to(device)
            output = model.decoder(tgt, memory, src_mask, tgt_mask)
            prob = model.generator(output[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            tgt = torch.cat([tgt, torch.zeros(1, 1).fill_(next_word).type(torch.long).to(device)], dim=1)
            if next_word == end_symbol:
                break

    return tgt[0].cpu().tolist()

def calculate_bleu(model, dataloader, target_lang, max_len, start_symbol, end_symbol, device, pad_idx):
    references = []
    predictions = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            src = batch["input_text"].to(device)
            tgt = batch[f"target_text_{target_lang}"].to(device)

            for i in range(src.size(0)):
                ref = [tgt[i].tolist()]
                pred = translate_sentence(model, src[i].unsqueeze(0), max_len, start_symbol, end_symbol, device)

                # Remove padding and special tokens from prediction and reference for BLEU calculation
                pred = [token for token in pred if token not in [pad_idx, start_symbol, end_symbol]]
                ref = [token for token in ref[0] if token not in [pad_idx, start_symbol, end_symbol]]

                predictions.append(pred)
                references.append([ref]) # Expected format for corpus_bleu: list of list of tokens


    # Calculate BLEU score using corpus_bleu, converting tokens to strings
    bleu_score = corpus_bleu(references, predictions, weights=(1, 0, 0, 0))

    return bleu_score

# Evaluation (after training)
bleu_de = calculate_bleu(transformer_de, en_de_dataloaders["test"], "de", max_len, start_symbol, end_symbol, device, pad_idx)
bleu_fr = calculate_bleu(transformer_fr, en_fr_dataloaders["test"], "fr", max_len, start_symbol, end_symbol, device, pad_idx)

print(f"BLEU (English-German): {bleu_de:.2f}")
print(f"BLEU (English-French): {bleu_fr:.2f}")




Number of training batches (en_de): 705
Number of validation batches (en_de): 47
Number of test batches (en_de): 47


Number of training batches (en_fr): 6381
Number of validation batches (en_fr): 47
Number of test batches (en_fr): 47






Epoch 1 - DE Train Loss: 5.1106


Epoch 1 - FR:  17%|█▋        | 1068/6381 [13:11<1:06:41,  1.33it/s]