In [1]:
# ==========================================
# STEP 1: SETUP, IMPORTS & DATA LOADING
# ==========================================
from google.colab import drive
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import math
import pandas as pd
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import spacy
from time import time
import matplotlib.pyplot as plt

# 1. Mount Drive
drive.mount('/content/drive')
work_dir = '/content/drive/My Drive/CENG543_Midterm_Q3'
if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# 2. Install Dependencies
!pip install -q portalocker nltk spacy torchmetrics
!python -m spacy download en_core_web_sm > /dev/null
!python -m spacy download de_core_news_sm > /dev/null

# 3. Random Seeds
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# 4. Tokenizers & Vocab
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def build_vocab(sentences, tokenizer, min_freq=2):
    counter = {}
    for sentence in sentences:
        tokens = tokenizer(sentence)
        for token in tokens:
            counter[token] = counter.get(token, 0) + 1
    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    sorted_words = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    for word, freq in sorted_words:
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

print("Loading Data...")
dataset = load_dataset("bentrevett/multi30k")
train_data, val_data, test_data = dataset['train'], dataset['validation'], dataset['test']

vocab_en = build_vocab([x['en'] for x in train_data], tokenize_en)
vocab_de = build_vocab([x['de'] for x in train_data], tokenize_de)

print(f"Vocab Sizes - EN: {len(vocab_en)}, DE: {len(vocab_de)}")

# 5. Collate Function (Batch First for Transformer)
def process_batch(batch):
    src_batch, trg_batch = [], []
    for item in batch:
        src = [vocab_en.get(t, vocab_en['<unk>']) for t in tokenize_en(item['en'])]
        trg = [vocab_de.get(t, vocab_de['<unk>']) for t in tokenize_de(item['de'])]
        src_batch.append(torch.tensor([vocab_en['<sos>']] + src + [vocab_en['<eos>']], dtype=torch.long))
        trg_batch.append(torch.tensor([vocab_de['<sos>']] + trg + [vocab_de['<eos>']], dtype=torch.long))

    src_batch = pad_sequence(src_batch, padding_value=vocab_en['<pad>'], batch_first=True)
    trg_batch = pad_sequence(trg_batch, padding_value=vocab_de['<pad>'], batch_first=True)
    return src_batch.to(device), trg_batch.to(device)

BATCH_SIZE = 128
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=process_batch)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=process_batch)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=process_batch)
print("Data Ready.")

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: cuda
Loading Data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Vocab Sizes - EN: 6191, DE: 8014
Data Ready.


In [2]:
# ==========================================
# STEP 2: TRANSFORMER ARCHITECTURE
# ==========================================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256, nhead=8,
                 num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=512, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout,
                                          batch_first=True)

        self.fc_out = nn.Linear(d_model, trg_vocab_size)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask.to(device)

    def create_mask(self, src, trg):
        src_seq_len = src.shape[1]
        trg_seq_len = trg.shape[1]
        trg_mask = self.generate_square_subsequent_mask(trg_seq_len)
        src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
        src_padding_mask = (src == vocab_en['<pad>'])
        trg_padding_mask = (trg == vocab_de['<pad>'])
        return src_mask, trg_mask, src_padding_mask, trg_padding_mask

    def forward(self, src, trg):
        src_mask, trg_mask, src_pad_mask, trg_pad_mask = self.create_mask(src, trg)
        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))

        output = self.transformer(src=src_emb, tgt=trg_emb,
                                  src_mask=src_mask, tgt_mask=trg_mask,
                                  src_key_padding_mask=src_pad_mask,
                                  tgt_key_padding_mask=trg_pad_mask,
                                  memory_key_padding_mask=src_pad_mask)
        return self.fc_out(output)

In [3]:
# ==========================================
# STEP 3: HELPER FUNCTIONS (Train, Infer, Metrics)
# ==========================================
from nltk.translate.bleu_score import corpus_bleu
from torchmetrics.text.rouge import ROUGEScore

def train_epoch(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)
        trg_input = trg[:, :-1]
        trg_expected = trg[:, 1:]

        optimizer.zero_grad()
        output = model(src, trg_input)
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg_expected = trg_expected.contiguous().view(-1)

        loss = criterion(output, trg_expected)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)
            trg_input = trg[:, :-1]
            trg_expected = trg[:, 1:]
            output = model(src, trg_input)
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg_expected = trg_expected.contiguous().view(-1)
            loss = criterion(output, trg_expected)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    model.eval()
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in spacy_en(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = ['<sos>'] + tokens + ['<eos>']
    src_indexes = [src_vocab.get(token, src_vocab['<unk>']) for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    trg_indexes = [trg_vocab['<sos>']]

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, trg_tensor)
        pred_token = output.argmax(2)[:,-1].item()
        trg_indexes.append(pred_token)
        if pred_token == trg_vocab['<eos>']: break

    trg_itos = {v: k for k, v in trg_vocab.items()}
    return [trg_itos[i] for i in trg_indexes][1:]

def calculate_metrics(data, model, src_vocab, trg_vocab, device):
    targets, outputs = [], []
    rouge_scorer = ROUGEScore()

    for example in data:
        src = example['en']
        trg = example['de']
        pred = translate_sentence(src, src_vocab, trg_vocab, model, device)
        targets.append([tokenize_de(trg)])
        outputs.append(" ".join([t for t in pred if t != '<eos>']))

    bleu = corpus_bleu(targets, [o.split() for o in outputs]) * 100
    ref_strings = [" ".join(t[0]) for t in targets]
    rouge = rouge_scorer(outputs, ref_strings)['rougeL_fmeasure'].item()

    return bleu, rouge

In [4]:
# ==========================================
# STEP 4: ABLATION STUDY & EXECUTION
# ==========================================
# Task 3e: Conduct ablation study varying layers and heads

configs = [
    {'name': 'Base', 'layers': 3, 'heads': 8},
    {'name': 'Shallow', 'layers': 1, 'heads': 8}, # Fewer layers
    {'name': 'Deep', 'layers': 6, 'heads': 8},    # More layers
    {'name': 'Low-Head', 'layers': 3, 'heads': 4} # Fewer heads
]

results = []
INPUT_DIM = len(vocab_en)
OUTPUT_DIM = len(vocab_de)
D_MODEL = 256
FF_DIM = 512
DROPOUT = 0.1
LR = 0.0005
N_EPOCHS = 10 # Short training for demonstration

test_subset = [test_data[i] for i in range(50)] # Fast eval

for conf in configs:
    print(f"\n{'='*40}")
    print(f"Training Config: {conf['name']} (L={conf['layers']}, H={conf['heads']})")
    print(f"{'='*40}")

    # Reset model
    model = TransformerModel(INPUT_DIM, OUTPUT_DIM, D_MODEL, conf['heads'],
                             conf['layers'], conf['layers'], FF_DIM, DROPOUT).to(device)

    optimizer = optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss(ignore_index=vocab_de['<pad>'])

    best_valid_loss = float('inf')
    total_train_time = 0

    # Training Loop
    for epoch in range(N_EPOCHS):
        start = time()
        train_loss = train_epoch(model, train_loader, optimizer, criterion, 1)
        valid_loss = evaluate(model, val_loader, criterion)
        end = time()

        total_train_time += (end - start)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f'{work_dir}/transformer_{conf["name"]}.pt')

        if (epoch+1) % 5 == 0:
            print(f"Epoch {epoch+1} | Val Loss: {valid_loss:.3f} | PPL: {math.exp(valid_loss):.3f}")

    # Load best model
    model.load_state_dict(torch.load(f'{work_dir}/transformer_{conf["name"]}.pt', map_location=device))

    # Metrics
    bleu, rouge = calculate_metrics(test_subset, model, vocab_en, vocab_de, device)

    # GPU Memory Tracking (Task d)
    gpu_mem = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0

    results.append({
        'Config': conf['name'],
        'Layers': conf['layers'],
        'Heads': conf['heads'],
        'BLEU': bleu,
        'ROUGE-L': rouge,
        'Train Time (s)': total_train_time,
        'GPU Mem (MB)': gpu_mem
    })

    # Clear memory
    del model
    torch.cuda.empty_cache()

# ==========================================
# FINAL REPORT
# ==========================================
print("\n" + "="*40)
print("ABLATION STUDY RESULTS (Task 3d & 3e)")
print("="*40)
df = pd.DataFrame(results)
print(df)
df.to_csv(f'{work_dir}/q3_ablation_results.csv', index=False)

# Sample Translation
print("\nSample Translation (Base Model):")
model = TransformerModel(INPUT_DIM, OUTPUT_DIM, D_MODEL, 8, 3, 3, FF_DIM, DROPOUT).to(device)
model.load_state_dict(torch.load(f'{work_dir}/transformer_Base.pt', map_location=device))
src_sent = test_data[12]['en']
trg_sent = test_data[12]['de']
pred = translate_sentence(src_sent, vocab_en, vocab_de, model, device)
print(f"Src: {src_sent}")
print(f"Ref: {trg_sent}")
print(f"Pred: {' '.join(pred[:-1])}")


Training Config: Base (L=3, H=8)




Epoch 5 | Val Loss: 2.247 | PPL: 9.456
Epoch 10 | Val Loss: 1.877 | PPL: 6.536

Training Config: Shallow (L=1, H=8)
Epoch 5 | Val Loss: 2.733 | PPL: 15.375
Epoch 10 | Val Loss: 2.373 | PPL: 10.725

Training Config: Deep (L=6, H=8)
Epoch 5 | Val Loss: 2.647 | PPL: 14.115
Epoch 10 | Val Loss: 2.002 | PPL: 7.407

Training Config: Low-Head (L=3, H=4)
Epoch 5 | Val Loss: 2.220 | PPL: 9.205
Epoch 10 | Val Loss: 1.853 | PPL: 6.381

ABLATION STUDY RESULTS (Task 3d & 3e)
     Config  Layers  Heads       BLEU   ROUGE-L  Train Time (s)  GPU Mem (MB)
0      Base       3      8  21.878689  0.519400      191.362544   1515.854004
1   Shallow       1      8  13.871573  0.428283      107.691583   1515.854004
2      Deep       6      8  22.532577  0.506596      324.334867   2177.535645
3  Low-Head       3      4  25.141814  0.512119      182.363690   2177.535645

Sample Translation (Base Model):
Src: A woman holding a bowl of food in a kitchen.
Ref: Eine Frau, die in einer Küche eine Schale mit Essen hä

