In [1]:
# ==========================================
# QUESTION 5: INTERPRETABILITY & ERROR ANALYSIS (FINAL)
# ==========================================

from google.colab import drive
import os
import torch
import torch.nn as nn
import math
import spacy
import pandas as pd
import torch.nn.functional as F
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu

# 1. SETUP & DRIVE
drive.mount('/content/drive')
work_dir = '/content/drive/My Drive/CENG543_Midterm_Q3' # Load Q3 Model
model_path = f'{work_dir}/model_transformer.pt'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# 2. DATA SETUP
!pip install -q portalocker nltk spacy
!python -m spacy download en_core_web_sm > /dev/null
!python -m spacy download de_core_news_sm > /dev/null

spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text): return [tok.text for tok in spacy_de.tokenizer(text)]
def tokenize_en(text): return [tok.text for tok in spacy_en.tokenizer(text)]

def build_vocab(sentences, tokenizer, min_freq=2):
    counter = {}
    for sentence in sentences:
        tokens = tokenizer(sentence)
        for token in tokens:
            counter[token] = counter.get(token, 0) + 1
    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    sorted_words = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    for word, freq in sorted_words:
        if freq >= min_freq: vocab[word] = len(vocab)
    return vocab

print("Loading Data & Vocab...")
dataset = load_dataset("bentrevett/multi30k")
train_data, test_data = dataset['train'], dataset['test']
vocab_en = build_vocab([x['en'] for x in train_data], tokenize_en)
vocab_de = build_vocab([x['de'] for x in train_data], tokenize_de)

# 3. DEFINE MODEL (Must match Q3)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256, nhead=8,
                 num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=512, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        return self.fc_out(self.transformer(self.pos_encoder(self.src_embedding(src)),
                                            self.pos_encoder(self.trg_embedding(trg))))

# 4. LOAD WEIGHTS
print("Loading Saved Model Weights...")
model = TransformerModel(len(vocab_en), len(vocab_de), d_model=256, nhead=8,
                         num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=512, dropout=0.1)

if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    print("SUCCESS: Q3 Transformer Model Loaded!")
else:
    print(f"ERROR: Model file not found at {model_path}. Ensure Q3 ran successfully.")

# 5. DIAGNOSTIC TOOLS (Entropy & Interpretation)
# ------------------------------------------------
def calculate_entropy(logits):
    """Calculates Shannon Entropy: Uncertainty of the model"""
    probs = F.softmax(logits, dim=-1)
    log_probs = F.log_softmax(logits, dim=-1)
    entropy = -(probs * log_probs).sum(dim=-1)
    return entropy.item()

def translate_and_diagnose(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    model.eval()
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in spacy_en(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = ['<sos>'] + tokens + ['<eos>']
    src_indexes = [src_vocab.get(token, src_vocab['<unk>']) for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    trg_indexes = [trg_vocab['<sos>']]

    token_diagnostics = [] # Stores (word, entropy)
    total_entropy = 0

    # Reverse vocab for lookup
    trg_itos = {v: k for k, v in trg_vocab.items()}

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(src_tensor, trg_tensor)

        last_token_logits = output[0, -1, :]
        step_entropy = calculate_entropy(last_token_logits)
        total_entropy += step_entropy

        pred_token = last_token_logits.argmax().item()
        trg_indexes.append(pred_token)

        word = trg_itos.get(pred_token, '<unk>')
        token_diagnostics.append((word, step_entropy))

        if pred_token == trg_vocab['<eos>']:
            break

    avg_entropy = total_entropy / len(token_diagnostics) if token_diagnostics else 0
    return token_diagnostics, avg_entropy

# 6. ANALYSIS EXECUTION
# ------------------------------------------------
print("Scanning Test Set for Failure Cases...")
results = []
scan_limit = 200 # Check first 200 samples

for i in range(min(scan_limit, len(test_data))):
    src = test_data[i]['en']
    trg = test_data[i]['de']

    diagnostics, entropy = translate_and_diagnose(src, vocab_en, vocab_de, model, device)
    pred_sentence = " ".join([x[0] for x in diagnostics])

    ref_tokens = [tokenize_de(trg)]
    pred_tokens = pred_sentence.split()

    # BLEU-2 for sentence level
    score = sentence_bleu(ref_tokens, pred_tokens, weights=(0.5, 0.5, 0, 0))

    results.append({
        'id': i,
        'src': src,
        'ref': trg,
        'hyp': pred_sentence,
        'bleu': score,
        'entropy': entropy,
        'tokens': diagnostics
    })

df = pd.DataFrame(results)
df.to_csv(f'{work_dir}/failure_analysis.csv', index=False)

# 7. REPORT GENERATION
# ------------------------------------------------
failures = df.sort_values(by='bleu', ascending=True).head(5)
successes = df.sort_values(by='bleu', ascending=False).head(2)

print(f"\n{'='*20} FAILURE CASES ANALYSIS (Task 5c & 5d) {'='*20}")
for index, row in failures.iterrows():
    print(f"\n[Case ID: {row['id']}] BLEU: {row['bleu']:.4f} | Avg Entropy: {row['entropy']:.4f}")
    print(f"Source:    {row['src']}")
    print(f"Reference: {row['ref']}")
    print(f"Prediction: {row['hyp']}")

    # Interpretability: Show High Uncertainty Tokens
    print("Token-Level Uncertainty (Entropy > 2.0 is high):")
    high_uncertainty = [f"{t[0]}({t[1]:.2f})" for t in row['tokens'] if t[1] > 1.5]
    if high_uncertainty:
        print(f"  -> Confused at: {', '.join(high_uncertainty)}")
    else:
        print("  -> Model was confidently wrong.")
    print("-" * 50)

print(f"\n{'='*20} SUCCESS CASES (Task 5b - Interpretability) {'='*20}")
for index, row in successes.iterrows():
    print(f"\n[Case ID: {row['id']}] BLEU: {row['bleu']:.4f} | Avg Entropy: {row['entropy']:.4f}")
    print(f"Source: {row['src']}")
    print(f"Prediction: {row['hyp']}")
    print("Token-Level Confidence:")
    # Show low entropy tokens (Confident ones)
    print(f"  -> {', '.join([f'{t[0]}({t[1]:.2f})' for t in row['tokens']])}")

print(f"\nAnalysis Complete. Saved to {work_dir}/failure_analysis.csv")

Mounted at /content/drive
Device: cpu
Loading Data & Vocab...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Loading Saved Model Weights...
SUCCESS: Q3 Transformer Model Loaded!
Scanning Test Set for Failure Cases...


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()




[Case ID: 28] BLEU: 0.0000 | Avg Entropy: 2.7755
Source:    Women, wearing traditional clothing, are reenacting native life.
Reference: Frauen, die traditionelle Kleidung tragen, spielen das Leben Einheimischer nach.
Prediction: <unk> in <unk> Kleidung . <eos>
Token-Level Uncertainty (Entropy > 2.0 is high):
  -> Confused at: <unk>(4.03), in(3.79), <unk>(4.35), Kleidung(3.78)
--------------------------------------------------

[Case ID: 51] BLEU: 0.0000 | Avg Entropy: 3.6419
Source:    A wakeboarder performs a flip while being towed at high speed.
Reference: Ein Wakeboarder macht einen Salto während er bei hoher Geschwindigkeit an einem Seil gezogen wird.
Prediction: Ein <unk> wird in einem <unk> . <eos>
Token-Level Uncertainty (Entropy > 2.0 is high):
  -> Confused at: Ein(4.35), <unk>(4.67), wird(4.96), in(3.10), einem(1.58), <unk>(5.71), .(4.76)
--------------------------------------------------

[Case ID: 109] BLEU: 0.0000 | Avg Entropy: 3.9738
Source:    A mother teaches her two