<a href="https://colab.research.google.com/github/eraykocabozdogan/Ceng543_Take_Home_Midterm/blob/main/ceng543_q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================
# STEP 1: SETUP, IMPORTS & DATA LOADING
# ==========================================
from google.colab import drive
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import spacy
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import math

# 1. Mount Drive & Setup Directory
# Everything the notebook persists (checkpoints, attention plots, results CSV)
# is written under `work_dir` on the mounted Google Drive.
drive.mount('/content/drive')
work_dir = '/content/drive/My Drive/CENG543_Midterm_Q2'
if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# 2. Install & Download Requirements
# NOTE(review): these installs run after the import lines at the top of this
# cell have already imported `datasets` and `spacy`, so on a runtime where
# those packages are missing the cell fails before reaching this point.
!pip install -q datasets spacy portalocker nltk torchmetrics
# spaCy language models used by the English / German tokenizers below;
# stdout is discarded to keep the cell output short.
!python -m spacy download en_core_web_sm > /dev/null
!python -m spacy download de_core_news_sm > /dev/null

# 3. Reproducibility
# Seed every random number generator this notebook touches: Python's `random`
# (teacher-forcing decisions), NumPy, and PyTorch on CPU and CUDA.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# 4. Tokenizers & Vocabulary
# Loaded once at module level; the tokenize_* helpers below use only the
# `.tokenizer` component of these pipelines.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text):
    """Split German `text` into a list of token strings (case is preserved)."""
    doc = spacy_de.tokenizer(text)
    pieces = []
    for token in doc:
        pieces.append(token.text)
    return pieces

def tokenize_en(text):
    """Split English `text` into a list of token strings (case is preserved)."""
    doc = spacy_en.tokenizer(text)
    pieces = []
    for token in doc:
        pieces.append(token.text)
    return pieces

def build_vocab(sentences, tokenizer, min_freq=2):
    """Build a token -> integer-id vocabulary from raw sentences.

    Ids 0-3 are reserved for '<pad>', '<unk>', '<sos>' and '<eos>'. The
    remaining ids are handed out in order of decreasing corpus frequency
    (ties keep first-seen order) to every token that occurs at least
    `min_freq` times.

    Args:
        sentences: iterable of raw strings.
        tokenizer: callable mapping one string to an iterable of tokens.
        min_freq: minimum number of occurrences for a token to get its own id.

    Returns:
        dict mapping token to a unique id; ids are contiguous from 0.
    """
    counter = {}
    for sentence in sentences:
        for token in tokenizer(sentence):
            counter[token] = counter.get(token, 0) + 1

    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    by_frequency = sorted(counter.items(), key=lambda pair: pair[1], reverse=True)
    for word, freq in by_frequency:
        if freq < min_freq:
            # Sorted by descending frequency, so nothing further can qualify.
            break
        # A corpus token that spells a reserved symbol must not overwrite that
        # symbol's id; doing so would make two tokens share one id and leave a
        # reserved id unused.
        if word not in vocab:
            vocab[word] = len(vocab)
    return vocab

# 5. Load Data
print("Loading Data...")
dataset = load_dataset("bentrevett/multi30k")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

# English is the source side, German the target side.
train_src = [example['en'] for example in train_data]
train_trg = [example['de'] for example in train_data]

# Vocabularies are built from the training split only.
vocab_en = build_vocab(train_src, tokenize_en, min_freq=2)
vocab_de = build_vocab(train_trg, tokenize_de, min_freq=2)

print(f"Vocab Sizes - EN: {len(vocab_en)}, DE: {len(vocab_de)}")

# 6. Collate Function & Loaders
def process_batch(batch):
    """Collate function: turn raw {'en', 'de'} examples into padded id tensors.

    Each sentence is tokenized, mapped to vocabulary ids (unknown tokens become
    '<unk>'), wrapped in '<sos>' ... '<eos>', and the batch is padded with
    '<pad>' to the longest sentence. Both returned tensors are laid out as
    [seq_len, batch] and moved to `device`.
    """
    def _to_ids(text, tokenizer, vocab):
        # <sos> + token ids (with <unk> fallback) + <eos>
        unk_id = vocab['<unk>']
        ids = [vocab['<sos>']]
        ids.extend(vocab.get(tok, unk_id) for tok in tokenizer(text))
        ids.append(vocab['<eos>'])
        return torch.tensor(ids, dtype=torch.long)

    src_seqs, trg_seqs = [], []
    for example in batch:
        src_seqs.append(_to_ids(example['en'], tokenize_en, vocab_en))
        trg_seqs.append(_to_ids(example['de'], tokenize_de, vocab_de))

    src_padded = pad_sequence(src_seqs, padding_value=vocab_en['<pad>'])
    trg_padded = pad_sequence(trg_seqs, padding_value=vocab_de['<pad>'])
    return src_padded.to(device), trg_padded.to(device)

BATCH_SIZE = 128
# One loader per split, built in train / validation / test order; only the
# training split is shuffled.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=reshuffle, collate_fn=process_batch)
    for split, reshuffle in ((train_data, True), (val_data, False), (test_data, False))
)
print("Data Ready!")

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: cuda
Loading Data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Vocab Sizes - EN: 6191, DE: 8014
Data Ready!


In [None]:
# ==========================================
# STEP 2: MODEL ARCHITECTURE
# ==========================================
import torch.nn.functional as F

class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source ids, runs them through a bidirectional GRU and projects
    the concatenated final forward/backward states through a tanh-activated
    linear layer to produce the decoder's initial hidden state.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(in_features=2 * enc_hid_dim, out_features=dec_hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode `src` ([src_len, batch] ids).

        Returns:
            outputs: per-position GRU outputs, [src_len, batch, 2 * enc_hid_dim].
            hidden: initial decoder state, [batch, dec_hid_dim].
        """
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)
        outputs, final_states = self.rnn(token_vectors)
        # The last two entries are the final forward and backward states.
        last_forward = final_states[-2]
        last_backward = final_states[-1]
        merged = torch.cat([last_forward, last_backward], dim=1)
        hidden = torch.tanh(self.fc(merged))
        return outputs, hidden

class Attention(nn.Module):
    """Attention scorer over encoder outputs with three selectable variants.

    * 'additive'       - tanh(W [hidden; enc_out]) followed by a learned vector v.
    * 'multiplicative' - (W enc_out) . hidden, with a learned projection W.
    * 'dot'            - enc_out . hidden / sqrt(dec_hid_dim); has no parameters
                         and therefore requires 2 * enc_hid_dim == dec_hid_dim.

    NOTE: source padding positions are not masked; they take part in the
    softmax like any other position.

    Raises:
        ValueError: if `method` is not one of the three variants, or if 'dot'
            is requested with incompatible hidden sizes.
    """

    METHODS = ('additive', 'multiplicative', 'dot')

    def __init__(self, enc_hid_dim, dec_hid_dim, method='additive'):
        super().__init__()
        # Fail at construction time instead of with an UnboundLocalError on the
        # first forward pass.
        if method not in self.METHODS:
            raise ValueError(
                f"Unknown attention method {method!r}; expected one of {self.METHODS}"
            )
        if method == 'dot' and enc_hid_dim * 2 != dec_hid_dim:
            raise ValueError(
                "'dot' attention needs 2 * enc_hid_dim == dec_hid_dim, got "
                f"enc_hid_dim={enc_hid_dim}, dec_hid_dim={dec_hid_dim}"
            )

        self.method = method
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        if self.method == 'additive':
            self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
            self.v = nn.Linear(dec_hid_dim, 1, bias=False)
        elif self.method == 'multiplicative':
            self.attn = nn.Linear((enc_hid_dim * 2), dec_hid_dim)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape [batch, src_len] (rows sum to 1).

        Args:
            hidden: decoder state, [batch, dec_hid_dim].
            encoder_outputs: [src_len, batch, 2 * enc_hid_dim].
        """
        # Work batch-first: [batch, src_len, 2 * enc_hid_dim].
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        if self.method == 'additive':
            src_len = encoder_outputs.shape[1]
            # expand() broadcasts without copying; cat() materializes the result.
            hidden_expanded = hidden.unsqueeze(1).expand(-1, src_len, -1)
            energy = torch.tanh(self.attn(torch.cat((hidden_expanded, encoder_outputs), dim=2)))
            attention = self.v(energy).squeeze(2)
        elif self.method == 'multiplicative':
            projected = self.attn(encoder_outputs)
            attention = torch.bmm(projected, hidden.unsqueeze(2)).squeeze(2)
        else:  # 'dot' (validated in __init__)
            attention = torch.bmm(encoder_outputs, hidden.unsqueeze(2)).squeeze(2)
            attention = attention / (self.dec_hid_dim ** 0.5)

        return F.softmax(attention, dim=1)

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Each call consumes one target token per batch element, builds an
    attention-weighted context vector, advances the GRU by one step and
    predicts scores over the target vocabulary.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        context_dim = enc_hid_dim * 2
        self.rnn = nn.GRU(input_size=context_dim + emb_dim, hidden_size=dec_hid_dim)
        self.fc_out = nn.Linear(
            in_features=context_dim + dec_hid_dim + emb_dim,
            out_features=output_dim,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: [batch] ids of the previous target token.
            hidden: [batch, dec_hid_dim] previous decoder state.
            encoder_outputs: [src_len, batch, 2 * enc_hid_dim].

        Returns:
            (prediction [batch, output_dim], new hidden [batch, dec_hid_dim],
             attention weights [batch, src_len]).
        """
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))

        attn_weights = self.attention(hidden, encoder_outputs)
        # Weighted sum of encoder outputs, returned to [1, batch, 2 * enc_hid_dim].
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs.permute(1, 0, 2))
        context = context.permute(1, 0, 2)

        step_input = torch.cat((embedded, context), dim=2)
        output, next_hidden = self.rnn(step_input, hidden.unsqueeze(0))

        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)),
            dim=1,
        )
        return self.fc_out(features), next_hidden.squeeze(0), attn_weights

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a whole target sequence step by step."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode `trg.shape[0] - 1` steps and return the per-step scores.

        Args:
            src: [src_len, batch] source ids.
            trg: [trg_len, batch] target ids; row 0 is fed as the first input.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own argmax prediction.

        Returns:
            [trg_len, batch, vocab] tensor of scores; row 0 stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]
        outputs = torch.zeros(
            trg_len, batch_size, self.decoder.output_dim, device=self.device
        )

        encoder_outputs, hidden = self.encoder(src)
        decoder_input = trg[0]
        for step in range(1, trg_len):
            scores, hidden, _ = self.decoder(decoder_input, hidden, encoder_outputs)
            outputs[step] = scores
            use_ground_truth = random.random() < teacher_forcing_ratio
            if use_ground_truth:
                decoder_input = trg[step]
            else:
                decoder_input = scores.argmax(1)
        return outputs

In [None]:
# ==========================================
# STEP 3: HELPER FUNCTIONS (Train, Eval, Time)
# ==========================================
import time

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Position 0 of the model output and of the target (the '<sos>' slot) is
    dropped before the loss is computed. Gradients are clipped to a maximum
    norm of `clip` before each optimizer step.
    """
    model.train()
    running_loss = 0
    for src, trg in iterator:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Flatten [trg_len - 1, batch, vocab] -> [N, vocab] and targets -> [N].
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss over `iterator` without updating the model.

    The model is put in eval mode, gradients are disabled, and teacher forcing
    is switched off (ratio 0) so the decoder consumes its own predictions.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            scores = model(src, trg, 0)  # teacher forcing disabled

            # Drop the '<sos>' slot, then flatten for the criterion.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()
    return running_loss / len(iterator)

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = total_seconds - (whole_minutes * 60)
    return whole_minutes, int(leftover_seconds)

In [None]:
# ==========================================
# STEP 4: VISUALIZATION & METRICS (BLEU, ROUGE, ENTROPY)
# ==========================================
from nltk.translate.bleu_score import corpus_bleu
from torchmetrics.text.rouge import ROUGEScore

# 1. Translate & Calculate Entropy
def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    """Greedily translate one source sentence and report attention statistics.

    Source tokens are looked up exactly as they are (case preserved), matching
    how the vocabulary and the training batches are built with `tokenize_en`.
    Lower-casing here would feed the model ids it never saw in those positions
    during training.

    Args:
        sentence: raw string (tokenized with `tokenize_en`) or an iterable of
            already-tokenized strings.
        src_vocab: source token -> id mapping containing '<unk>', '<sos>', '<eos>'.
        trg_vocab: target token -> id mapping containing '<sos>' and '<eos>'.
        model: object exposing `.eval()`, `.encoder(src)` and
            `.decoder(token, hidden, encoder_outputs)`.
        device: torch device the tensors are created on.
        max_len: maximum number of decoding steps.

    Returns:
        (tokens, attentions, mean_entropy):
            tokens - predicted target tokens, including '<eos>' when produced;
            attentions - [len(tokens), 1, src_len] attention weights per step;
            mean_entropy - mean Shannon entropy (nats) of the per-step
                attention distributions, or 0.0 if no step was decoded.
    """
    model.eval()
    if isinstance(sentence, str):
        # Same tokenizer-only path as training; avoids running the full spaCy pipeline.
        tokens = tokenize_en(sentence)
    else:
        tokens = list(sentence)

    tokens = ['<sos>'] + tokens + ['<eos>']
    unk_id = src_vocab['<unk>']
    src_indexes = [src_vocab.get(token, unk_id) for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    eos_id = trg_vocab['<eos>']
    trg_indexes = [trg_vocab['<sos>']]
    attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)
    entropies = []

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

        for i in range(max_len):
            trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
            output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs)

            # Entropy of this step's attention distribution (sharpness analysis);
            # the epsilon keeps log() finite for zero weights.
            attn_probs = attention.squeeze(0).squeeze(0)
            entropy = -torch.sum(attn_probs * torch.log(attn_probs + 1e-9)).item()
            entropies.append(entropy)

            attentions[i] = attention
            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)
            if pred_token == eos_id:
                break

    trg_itos = {index: token for token, index in trg_vocab.items()}
    trg_tokens = [trg_itos[i] for i in trg_indexes]

    # Guard against max_len <= 0, where no step is decoded.
    mean_entropy = np.mean(entropies) if entropies else 0.0
    return trg_tokens[1:], attentions[:len(trg_tokens) - 1], mean_entropy

# 2. Plot Attention Map
def display_attention(sentence, translation, attention, model_name, save_dir):
    """Save a heat-map of attention weights as `{save_dir}/attn_{model_name}.png`.

    Args:
        sentence: source tokens (without '<sos>'/'<eos>'); shown lower-cased
            on the x axis between '<sos>' and '<eos>'.
        translation: predicted target tokens, one per row of the heat-map.
        attention: tensor of shape [len(translation), 1, len(sentence) + 2].
        model_name: used in the plot title and the output file name.
        save_dir: directory the PNG is written to.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    weights = attention.squeeze(1).cpu().numpy()
    image = ax.matshow(weights, cmap='bone')
    fig.colorbar(image)

    # Pin one tick per matrix column/row before assigning labels. Setting
    # labels without fixed tick positions triggers matplotlib's
    # "FixedFormatter should only be used together with FixedLocator" warning
    # and needs a dummy leading '' label to line things up.
    src_labels = ['<sos>'] + [t.lower() for t in sentence] + ['<eos>']
    trg_labels = list(translation)
    ax.set_xticks(list(range(len(src_labels))))
    ax.set_xticklabels(src_labels, rotation=90)
    ax.set_yticks(list(range(len(trg_labels))))
    ax.set_yticklabels(trg_labels)

    ax.set_title(f"Attention: {model_name}")
    fig.tight_layout()
    fig.savefig(f'{save_dir}/attn_{model_name}.png')
    # Close explicitly so figures do not pile up when called in a loop.
    plt.close(fig)

# 3. Calculate All Metrics
def calculate_metrics(data, model, src_vocab, trg_vocab, device, model_name):
    """Translate every example in `data` and score the translations.

    Returns:
        (bleu, rouge_l, mean_entropy): corpus BLEU scaled to 0-100, the
        ROUGE-L F-measure, and the attention entropy averaged over examples.
    """
    rouge_scorer = ROUGEScore()
    references = []   # corpus_bleu format: one list of reference token lists per example
    hypotheses = []   # space-joined predictions with '<eos>' removed
    entropy_sum = 0

    print(f"Evaluating {model_name}...")
    for example in data:
        predicted, _, sentence_entropy = translate_sentence(
            example['en'], src_vocab, trg_vocab, model, device
        )
        references.append([tokenize_de(example['de'])])
        hypotheses.append(" ".join(tok for tok in predicted if tok != '<eos>'))
        entropy_sum += sentence_entropy

    hypothesis_tokens = [hyp.split() for hyp in hypotheses]
    bleu = 100 * corpus_bleu(references, hypothesis_tokens)

    # ROUGE works on plain strings, so join each single reference back together.
    reference_strings = [" ".join(refs[0]) for refs in references]
    rouge_scores = rouge_scorer(hypotheses, reference_strings)
    rouge_l = rouge_scores['rougeL_fmeasure'].item()

    return bleu, rouge_l, entropy_sum / len(data)

In [None]:
# ==========================================
# STEP 5: EXECUTION LOOP (Train, Viz, Compare)
# ==========================================
# Hyper-parameters shared by all three attention variants.
INPUT_DIM = len(vocab_en)
OUTPUT_DIM = len(vocab_de)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# The encoder is bidirectional, so its outputs are 2 * ENC_HID_DIM wide. The
# 'dot' attention branch multiplies those outputs directly with the decoder
# state, so DEC_HID_DIM must equal 2 * ENC_HID_DIM.
ENC_HID_DIM = 256
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
N_EPOCHS = 5
# Maximum gradient norm passed to train().
CLIP = 1

# One dict per attention method; turned into a DataFrame at the end.
results = []
# The same test sentence is visualized for every method so the attention maps
# are directly comparable.
fixed_idx = 12
fixed_src = test_data[fixed_idx]['en']
print(f"Visualizing Sentence: {fixed_src}")

# Use first 100 test samples for fast metric calculation
test_subset = [test_data[i] for i in range(100)]

# Train, checkpoint, visualize and score one model per attention variant.
# NOTE(review): the RNGs are not re-seeded inside this loop, so each variant
# starts from a different random state - confirm this is acceptable for the
# comparison.
for method in ['additive', 'multiplicative', 'dot']:
    print(f"\nTraining {method.upper()} Attention...")

    # Fresh model for every method; only the attention module differs.
    attn = Attention(ENC_HID_DIM, DEC_HID_DIM, method=method)
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
    model = Seq2Seq(enc, dec, device).to(device)

    optimizer = optim.Adam(model.parameters())
    # Padding positions in the target do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab_de['<pad>'])

    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):
        start = time.time()
        # train_loss is computed here but not reported below.
        train_loss = train(model, train_loader, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, val_loader, criterion)
        end = time.time()
        mins, secs = epoch_time(start, end)

        # Keep only the checkpoint with the lowest validation loss so far.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f'{work_dir}/model_{method}.pt')

        print(f'Epoch: {epoch+1:02} | Time: {mins}m {secs}s | Val. Loss: {valid_loss:.3f} | PPL: {math.exp(valid_loss):.3f}')

    # Evaluate Best Model
    # Reload the best checkpoint, which may not be the final epoch's weights.
    model.load_state_dict(torch.load(f'{work_dir}/model_{method}.pt', map_location=device))

    # 1. Visualize Same Sentence
    tok_src = tokenize_en(fixed_src)
    trans, attn_w, _ = translate_sentence(tok_src, vocab_en, vocab_de, model, device)
    display_attention(tok_src, trans, attn_w, method, work_dir)

    # 2. Metrics
    bleu, rouge, entropy = calculate_metrics(test_subset, model, vocab_en, vocab_de, device, method)
    results.append({
        'Method': method,
        'BLEU': bleu,
        'ROUGE-L': rouge,
        'Entropy': entropy
    })

# Summarize all methods side by side and persist the table next to the checkpoints.
print("\nFinal Results:")
df = pd.DataFrame(results)
print(df)
df.to_csv(f'{work_dir}/q2_results.csv', index=False)

Visualizing Sentence: A woman holding a bowl of food in a kitchen.

Training ADDITIVE Attention...
Epoch: 01 | Time: 1m 9s | Val. Loss: 3.850 | PPL: 47.009
Epoch: 02 | Time: 1m 9s | Val. Loss: 3.318 | PPL: 27.607
Epoch: 03 | Time: 1m 12s | Val. Loss: 3.251 | PPL: 25.823
Epoch: 04 | Time: 1m 13s | Val. Loss: 3.177 | PPL: 23.983
Epoch: 05 | Time: 1m 12s | Val. Loss: 3.154 | PPL: 23.441


  ax.set_xticklabels([''] + ['<sos>'] + [t.lower() for t in sentence] + ['<eos>'], rotation=90)
  ax.set_yticklabels([''] + translation)


Evaluating additive...

Training MULTIPLICATIVE Attention...
Epoch: 01 | Time: 1m 2s | Val. Loss: 4.225 | PPL: 68.359
Epoch: 02 | Time: 1m 2s | Val. Loss: 3.831 | PPL: 46.123
Epoch: 03 | Time: 1m 2s | Val. Loss: 3.653 | PPL: 38.593
Epoch: 04 | Time: 1m 1s | Val. Loss: 3.534 | PPL: 34.263
Epoch: 05 | Time: 1m 1s | Val. Loss: 3.546 | PPL: 34.659


  ax.set_xticklabels([''] + ['<sos>'] + [t.lower() for t in sentence] + ['<eos>'], rotation=90)
  ax.set_yticklabels([''] + translation)


Evaluating multiplicative...

Training DOT Attention...
Epoch: 01 | Time: 0m 50s | Val. Loss: 3.796 | PPL: 44.518
Epoch: 02 | Time: 0m 50s | Val. Loss: 3.395 | PPL: 29.826
Epoch: 03 | Time: 0m 50s | Val. Loss: 3.238 | PPL: 25.472
Epoch: 04 | Time: 0m 50s | Val. Loss: 3.147 | PPL: 23.263
Epoch: 05 | Time: 0m 50s | Val. Loss: 3.154 | PPL: 23.435


  ax.set_xticklabels([''] + ['<sos>'] + [t.lower() for t in sentence] + ['<eos>'], rotation=90)
  ax.set_yticklabels([''] + translation)


Evaluating dot...

Final Results:
           Method       BLEU   ROUGE-L   Entropy
0        additive  21.975630  0.532744  1.333818
1  multiplicative  17.865712  0.474366  0.029897
2             dot  21.046007  0.521606  1.560400
