In [1]:
import os
import re
import zipfile
import string
import glob
import pandas as pd
from sklearn.model_selection import train_test_split

# -------------------------------
# Step 1: Download & Extract Dataset
# -------------------------------
dataset_url = "https://github.com/amir9ume/urdu_ghazals_rekhta/raw/main/dataset/dataset.zip"
!wget -O dataset.zip "$dataset_url"

unzip_dir = "urdu_dataset"
os.makedirs(unzip_dir, exist_ok=True)

with zipfile.ZipFile("dataset.zip", "r") as zip_ref:
    zip_ref.extractall(unzip_dir)

print("✅ Dataset extracted at:", unzip_dir)

# -------------------------------
# Step 2: Collect Files (Urdu + Roman)
# -------------------------------
urdu_files = sorted(glob.glob(os.path.join(unzip_dir, "dataset", "**", "ur", "*"), recursive=True))
roman_files = sorted(glob.glob(os.path.join(unzip_dir, "dataset", "**", "en", "*"), recursive=True))

print("📊 Urdu files:", len(urdu_files))
print("📊 Roman files:", len(roman_files))

# -------------------------------
# Step 3: Create Urdu-Roman Pairs
# -------------------------------
pairs = []
for ur_path, ro_path in zip(urdu_files, roman_files):
    with open(ur_path, "r", encoding="utf-8") as ur, open(ro_path, "r", encoding="utf-8") as ro:
        ur_lines, ro_lines = ur.readlines(), ro.readlines()
        for u, r in zip(ur_lines, ro_lines):
            u, r = u.strip(), r.strip()
            if u and r:
                pairs.append((u, r))

df = pd.DataFrame(pairs, columns=["Urdu Text", "Roman Transliteration"])
print("✅ Total pairs:", len(df))

# -------------------------------
# Step 4: Clean Urdu & Roman Text
# -------------------------------
def clean_urdu(text):
    text = re.sub(r'[ًٌٍَُِّْٰ]', '', text)  # remove diacritics
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")  # normalize alef
    text = text.replace("ي", "ی").replace("ك", "ک")  # normalize Yeh, Kaaf
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # keep only Urdu letters
    return re.sub(r'\s+', ' ', text).strip()

def clean_roman(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return re.sub(r'\s+', ' ', text).strip()

df["Urdu Text"] = df["Urdu Text"].apply(clean_urdu)
df["Roman Transliteration"] = df["Roman Transliteration"].apply(clean_roman)

print("✅ Cleaned sample:\n", df.sample(3))

# -------------------------------
# Step 5: Tokenization (character-level for transliteration)
# -------------------------------
def char_tokenize(text):
    return " ".join(list(text))

df["Urdu Tokens"] = df["Urdu Text"].apply(char_tokenize)
df["Roman Tokens"] = df["Roman Transliteration"].apply(char_tokenize)

print("📂 Tokenized sample:\n", df.sample(3))

# -------------------------------
# Step 6: Train/Validation/Test Split (50/25/25)
# -------------------------------
train_data, temp_data = train_test_split(df, test_size=0.50, random_state=42)
valid_data, test_data = train_test_split(temp_data, test_size=0.50, random_state=42)

print("✅ Split sizes:")
print("Train:", len(train_data))
print("Validation:", len(valid_data))
print("Test:", len(test_data))

# -------------------------------
# Step 7: Save Splits to CSV
# -------------------------------
train_data.to_csv("train_data.csv", index=False)
valid_data.to_csv("valid_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

print("📂 CSV files saved: train_data.csv, valid_data.csv, test_data.csv")


--2025-09-22 04:50:35--  https://github.com/amir9ume/urdu_ghazals_rekhta/raw/main/dataset/dataset.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/amir9ume/urdu_ghazals_rekhta/main/dataset/dataset.zip [following]
--2025-09-22 04:50:36--  https://raw.githubusercontent.com/amir9ume/urdu_ghazals_rekhta/main/dataset/dataset.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2927519 (2.8M) [application/zip]
Saving to: ‘dataset.zip’


2025-09-22 04:50:36 (33.6 MB/s) - ‘dataset.zip’ saved [2927519/2927519]

✅ Dataset extracted at: urdu_dataset
📊 Urdu files: 1314
📊 Roman files: 1314
✅ Total pairs: 2

In [2]:
# Full pipeline: dataset -> preprocess -> char vocab -> Seq2Seq (BiLSTM enc, LSTM dec) -> train (5 epochs) -> eval metrics
# Save as run_transliteration_experiments.py and run, or paste into a notebook cell.

import os
import re
import zipfile
import glob
import random
import string
import math
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# ---------- Utilities ----------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

def download_and_extract():
    dataset_url = "https://github.com/amir9ume/urdu_ghazals_rekhta/raw/main/dataset/dataset.zip"
    if not os.path.exists("dataset.zip"):
        print("Downloading dataset.zip ...")
        # If running in a shell-enabled environment like Colab, uncomment:
        # !wget -O dataset.zip "$dataset_url"
        # If wget unavailable, the user should provide dataset.zip manually.
        raise RuntimeError("Please download dataset.zip manually into working directory or enable shell commands.")
    print("Extracting dataset.zip ...")
    unzip_dir = "urdu_dataset"
    os.makedirs(unzip_dir, exist_ok=True)
    with zipfile.ZipFile("dataset.zip", "r") as zip_ref:
        zip_ref.extractall(unzip_dir)
    return unzip_dir

def collect_pairs(unzip_dir):
    urdu_files = sorted(glob.glob(os.path.join(unzip_dir, "dataset", "**", "ur", "*"), recursive=True))
    roman_files = sorted(glob.glob(os.path.join(unzip_dir, "dataset", "**", "en", "*"), recursive=True))
    pairs = []
    for u_path, r_path in zip(urdu_files, roman_files):
        with open(u_path, "r", encoding="utf-8", errors="ignore") as uf, open(r_path, "r", encoding="utf-8", errors="ignore") as rf:
            u_lines = uf.readlines()
            r_lines = rf.readlines()
            for u, r in zip(u_lines, r_lines):
                u, r = u.strip(), r.strip()
                if u and r:
                    pairs.append((u, r))
    return pd.DataFrame(pairs, columns=["Urdu Text", "Roman Transliteration"])

# ---------- Text cleaning ----------
def clean_urdu(text):
    # remove diacritics
    text = re.sub(r'[ًٌٍَُِّْٰ]', '', text)
    # normalize alef and variants
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    # normalize yeh/kaaf etc
    text = text.replace("ي", "ی").replace("ك", "ک").replace("ھ", "ہ")
    # remove non-Arabic/Persian block chars (keep Urdu block)
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    # collapse spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def clean_roman(text):
    text = text.lower()
    # keep letters and digits and spaces and basic punctuation maybe, but we will remove punctuation for transliteration
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# ---------- Tokenization & vocab ----------
PAD_TOKEN = "<pad>"
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"

class CharVocab:
    def __init__(self, tokens=None, min_freq=1):
        self.min_freq = min_freq
        self.idx2token = []
        self.token2idx = {}
        if tokens is None:
            tokens = []
        self.build(tokens)

    def build(self, tokens):
        # tokens: iterable of characters (flattened)
        counts = Counter(tokens)
        # required specials
        specials = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
        self.idx2token = specials + [t for t, c in counts.items() if c >= self.min_freq and t not in specials]
        self.token2idx = {t: i for i, t in enumerate(self.idx2token)}

    def __len__(self):
        return len(self.idx2token)

    def token2index(self, t):
        return self.token2idx.get(t, self.token2idx.get(UNK_TOKEN))

    def index2token(self, i):
        return self.idx2token[i]

# ---------- Dataset ----------
def char_tokenize_spaced(text):
    # returns list of characters (no spaces removed since we want spaces as token too)
    # preserve spaces as actual ' ' tokens
    return list(text)

class TranslitDataset(Dataset):
    def __init__(self, df, src_vocab, trg_vocab, max_len=200):
        self.src = df["Urdu Text"].tolist()
        self.trg = df["Roman Transliteration"].tolist()
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.src)

    def encode_seq(self, seq, vocab):
        # add SOS and EOS for target only outside if needed
        ids = [vocab.token2index(ch) for ch in seq]
        return ids

    def __getitem__(self, idx):
        src_chars = char_tokenize_spaced(self.src[idx])
        trg_chars = char_tokenize_spaced(self.trg[idx])
        src_ids = [self.src_vocab.token2index(ch) for ch in src_chars]
        trg_ids = [self.trg_vocab.token2index(ch) for ch in trg_chars]
        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(trg_ids, dtype=torch.long)

def collate_fn(batch):
    # batch: list of (src_ids, trg_ids)
    src_seqs, trg_seqs = zip(*batch)
    src_lengths = [len(s) for s in src_seqs]
    trg_lengths = [len(t) for t in trg_seqs]
    max_src = max(src_lengths)
    max_trg = max(trg_lengths) + 2  # for SOS EOS in decoder input/output

    padded_src = torch.full((len(batch), max_src), fill_value=src_vocab.token2index(PAD_TOKEN), dtype=torch.long)
    padded_trg = torch.full((len(batch), max_trg), fill_value=trg_vocab.token2index(PAD_TOKEN), dtype=torch.long)

    for i, (s, t) in enumerate(zip(src_seqs, trg_seqs)):
        padded_src[i, :len(s)] = s
        # target decoder input: <sos> + trg + <eos>
        trg_in = [trg_vocab.token2index(SOS_TOKEN)] + t.tolist() + [trg_vocab.token2index(EOS_TOKEN)]
        padded_trg[i, :len(trg_in)] = torch.tensor(trg_in, dtype=torch.long)

    src_lengths = torch.tensor(src_lengths, dtype=torch.long)
    trg_lengths = torch.tensor([len(t)+2 for t in trg_seqs], dtype=torch.long)
    return padded_src.to(DEVICE), src_lengths.to(DEVICE), padded_trg.to(DEVICE), trg_lengths.to(DEVICE)

# ---------- Seq2Seq Model ----------
class EncoderBiLSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.3, pad_idx=0):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, bidirectional=True, batch_first=True, dropout=dropout if n_layers > 1 else 0.0)
        # we'll use a linear to map encoder final states to decoder initial states
        self.fc_hidden = nn.Linear(hid_dim * 2, hid_dim)
        self.fc_cell = nn.Linear(hid_dim * 2, hid_dim)

    def forward(self, src, src_lengths):
        # src: (B, L)
        embedded = self.embedding(src)  # (B, L, E)
        # pack padded would be better, but for brevity we won't pack; model handles paddings via embeddings PAD idx
        outputs, (hidden, cell) = self.lstm(embedded)  # hidden: (2*n_layers, B, hid_dim)
        # Take last layer's forward and backward hidden (for each layer it's organized as (fwd_l1, bwd_l1, fwd_l2, bwd_l2,...))
        # We'll extract last forward and backward pair
        # hidden shape: (num_directions * n_layers, B, hid_dim)
        # For simplicity, take the last two rows (last layer fwd and bwd)
        # hidden[-2] = forward last layer, hidden[-1] = backward last layer
        h_last_fwd = hidden[-2]  # (B, hid_dim)
        h_last_bwd = hidden[-1]  # (B, hid_dim)
        c_last_fwd = cell[-2]
        c_last_bwd = cell[-1]

        h_cat = torch.cat((h_last_fwd, h_last_bwd), dim=1)  # (B, hid_dim*2)
        c_cat = torch.cat((c_last_fwd, c_last_bwd), dim=1)

        h_dec = torch.tanh(self.fc_hidden(h_cat))  # (B, hid_dim)
        c_dec = torch.tanh(self.fc_cell(c_cat))    # (B, hid_dim)

        # return full outputs for attention (not used here) and decoder initial states
        return outputs, (h_dec.unsqueeze(0), c_dec.unsqueeze(0))  # (1,B,hid) as initial for decoder

class DecoderLSTM(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=4, dropout=0.3, pad_idx=0):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, batch_first=True, dropout=dropout if n_layers > 1 else 0.0)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.n_layers = n_layers
        self.hid_dim = hid_dim

    def forward(self, input_token, hidden, cell):
        # input_token: (B,) single token ids
        embedded = self.embedding(input_token).unsqueeze(1)  # (B,1,E)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))  # (B, output_dim)
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, dec_n_layers):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.dec_n_layers = dec_n_layers

        # If encoder produces only 1 layer hidden (h_dec shape (1,B,H)), but decoder expects dec_n_layers,
        # we replicate/expand encoder state to match decoder layers
    def forward(self, src, src_lengths, trg=None, teacher_forcing_ratio=0.5):
        # src: (B, L), trg: (B, T) with <sos>... we assume trg includes SOS as first token
        batch_size = src.shape[0]
        max_trg_len = trg.shape[1] if trg is not None else 200
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, max_trg_len, trg_vocab_size).to(DEVICE)
        enc_outputs, (h_enc, c_enc) = self.encoder(src, src_lengths)  # h_enc: (1,B,H)

        # Expand encoder hidden to decoder layers
        # h_enc: (1,B,H) -> create (dec_layers,B,H) by repeating
        h_init = h_enc.repeat(self.dec_n_layers, 1, 1).contiguous()
        c_init = c_enc.repeat(self.dec_n_layers, 1, 1).contiguous()

        # first input token is assumed in trg[:,0] as <sos>
        input_tok = trg[:, 0]

        hidden, cell = h_init, c_init

        for t in range(1, max_trg_len):
            output, hidden, cell = self.decoder(input_tok, hidden, cell)
            outputs[:, t, :] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input_tok = trg[:, t] if (trg is not None and teacher_force) else top1
        return outputs

# ---------- Levenshtein / CER ----------
def levenshtein(a, b):
    # a, b are lists or strings
    if len(a) < len(b):
        return levenshtein(b, a)
    # now len(a) >= len(b)
    previous_row = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current_row = [i]
        for j, cb in enumerate(b, start=1):
            insertions = previous_row[j] + 1
            deletions = current_row[j-1] + 1
            substitutions = previous_row[j-1] + (0 if ca == cb else 1)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

def cer_score(ref, hyp):
    # ref,hyp strings
    edits = levenshtein(list(ref), list(hyp))
    denom = max(1, len(ref))
    return edits / denom

# ---------- BLEU function ----------
smooth = SmoothingFunction().method4
def corpus_bleu_from_lists(references, hypotheses):
    # references: list of reference token lists (each inner list: list of reference lists)
    # hypotheses: list of hypothesis token lists
    scores = []
    for refs, hyp in zip(references, hypotheses):
        try:
            sc = sentence_bleu(refs, hyp, smoothing_function=smooth)
        except Exception:
            sc = 0.0
        scores.append(sc)
    return sum(scores) / len(scores) if scores else 0.0

# ---------- Experiment runner ----------
def build_vocabs(df_all, min_freq=1):
    # gather chars from both columns
    src_chars = []
    trg_chars = []
    for s in df_all["Urdu Text"]:
        src_chars.extend(list(s))
    for t in df_all["Roman Transliteration"]:
        trg_chars.extend(list(t))
    sv = CharVocab(src_chars, min_freq=min_freq)
    tv = CharVocab(trg_chars, min_freq=min_freq)
    # ensure specials included
    for sp in [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]:
        if sp not in sv.token2idx:
            sv.idx2token.insert(0, sp)
            sv.token2idx = {t:i for i,t in enumerate(sv.idx2token)}
        if sp not in tv.token2idx:
            tv.idx2token.insert(0, sp)
            tv.token2idx = {t:i for i,t in enumerate(tv.idx2token)}
    return sv, tv

def decode_indices(indices, vocab):
    # indices: list of ints (without PAD)
    tokens = []
    for i in indices:
        t = vocab.index2token(i)
        if t in (PAD_TOKEN, SOS_TOKEN, EOS_TOKEN):
            continue
        tokens.append(t)
    return "".join(tokens)

def greedy_decode(model, src_batch, src_lengths, max_len=200):
    model.eval()
    with torch.no_grad():
        batch_size = src_batch.size(0)
        # create dummy trg with SOS at position 0 for each
        sos_idx = trg_vocab.token2index(SOS_TOKEN)
        trg_dummy = torch.full((batch_size, max_len), fill_value=trg_vocab.token2index(PAD_TOKEN), dtype=torch.long).to(DEVICE)
        trg_dummy[:,0] = sos_idx
        outputs = model(src_batch, src_lengths, trg_dummy, teacher_forcing_ratio=0.0)  # (B, T, V)
        preds = outputs.argmax(2).cpu().numpy()  # (B, T)
    # convert preds to strings
    results = []
    for p in preds:
        # stop at EOS if appears
        chars = []
        for idx in p:
            tok = trg_vocab.index2token(idx)
            if tok == EOS_TOKEN:
                break
            if tok == PAD_TOKEN or tok == SOS_TOKEN:
                continue
            chars.append(tok)
        results.append("".join(chars))
    return results

# ---------- Main: prepare data (download extraction is manual here) ----------
# If you have dataset.zip in working dir, uncomment download_and_extract() call and adapt for your environment.
# unzip_dir = download_and_extract()
unzip_dir = "urdu_dataset"  # assume dataset already extracted into this folder. If not, extract dataset.zip manually here.
if not os.path.exists(unzip_dir):
    raise RuntimeError(f"Please extract dataset.zip into folder '{unzip_dir}' (or run download_and_extract()).")

df = collect_pairs(unzip_dir)
print("Initial pairs:", len(df))

# Clean
df["Urdu Text"] = df["Urdu Text"].apply(clean_urdu)
df["Roman Transliteration"] = df["Roman Transliteration"].apply(clean_roman)
# drop empty after cleaning
df = df[(df["Urdu Text"].str.len() > 0) & (df["Roman Transliteration"].str.len() > 0)].reset_index(drop=True)
print("After cleaning pairs:", len(df))

# Save full cleaned dataset
df.to_csv("all_pairs_clean.csv", index=False)

# Build vocabs
src_vocab, trg_vocab = build_vocabs(df, min_freq=1)
print("Src vocab size:", len(src_vocab), "Trg vocab size:", len(trg_vocab))

# Add tokens columns for convenience
df["Urdu Tokens"] = df["Urdu Text"].apply(lambda x: " ".join(list(x)))
df["Roman Tokens"] = df["Roman Transliteration"].apply(lambda x: " ".join(list(x)))

# ---------- Splits: 50/25/25 ----------
train_df, rest_df = train_test_split(df, test_size=0.5, random_state=42)
valid_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42)

train_df.to_csv("train_data.csv", index=False)
valid_df.to_csv("valid_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)
print("Saved CSV splits: train/valid/test sizes", len(train_df), len(valid_df), len(test_df))

# ---------- Experiments config ----------
experiments = [
    {
        "name": "exp1",
        "emb_dim": 128,
        "hid_dim": 256,
        "enc_layers": 2,
        "dec_layers": 4,
        "dropout": 0.3,
        "lr": 1e-3,
        "batch_size": 64
    },
    {
        "name": "exp2",
        "emb_dim": 256,
        "hid_dim": 256,
        "enc_layers": 2,
        "dec_layers": 4,
        "dropout": 0.1,
        "lr": 5e-4,
        "batch_size": 64
    },
    {
        "name": "exp3",
        "emb_dim": 256,
        "hid_dim": 512,
        "enc_layers": 2,
        "dec_layers": 4,
        "dropout": 0.3,
        "lr": 1e-4,
        "batch_size": 32
    }
]

N_EPOCHS = 5

# ---------- Training / Evaluation ----------
for cfg in experiments:
    print("\n" + "="*60)
    print("Starting experiment:", cfg["name"])
    print(cfg)
    # Build model
    encoder = EncoderBiLSTM(input_dim=len(src_vocab), emb_dim=cfg["emb_dim"], hid_dim=cfg["hid_dim"], n_layers=cfg["enc_layers"], dropout=cfg["dropout"], pad_idx=src_vocab.token2index(PAD_TOKEN)).to(DEVICE)
    decoder = DecoderLSTM(output_dim=len(trg_vocab), emb_dim=cfg["emb_dim"], hid_dim=cfg["hid_dim"], n_layers=cfg["dec_layers"], dropout=cfg["dropout"], pad_idx=trg_vocab.token2index(PAD_TOKEN)).to(DEVICE)
    model = Seq2Seq(encoder, decoder, dec_n_layers=cfg["dec_layers"]).to(DEVICE)

    optimizer = optim.Adam(model.parameters(), lr=cfg["lr"])
    criterion = nn.CrossEntropyLoss(ignore_index=trg_vocab.token2index(PAD_TOKEN))

    # Datasets & loaders
    train_dataset = TranslitDataset(train_df, src_vocab, trg_vocab)
    val_dataset = TranslitDataset(valid_df, src_vocab, trg_vocab)
    test_dataset = TranslitDataset(test_df, src_vocab, trg_vocab)

    train_loader = DataLoader(train_dataset, batch_size=cfg["batch_size"], shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=cfg["batch_size"], shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=cfg["batch_size"], shuffle=False, collate_fn=collate_fn)

    best_bleu = 0.0

    for epoch in range(1, N_EPOCHS+1):
        # Train
        model.train()
        train_loss_sum = 0.0
        for src_batch, src_lens, trg_batch, trg_lens in train_loader:
            optimizer.zero_grad()
            output = model(src_batch, src_lens, trg_batch, teacher_forcing_ratio=0.5)  # (B, T, V)
            output_dim = output.shape[-1]
            # Flatten predictions and targets (ignore first token <sos> in target)
            pred = output[:,1:,:].reshape(-1, output_dim)
            target = trg_batch[:,1:].reshape(-1)
            loss = criterion(pred, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_loss_sum += loss.item()
        train_loss = train_loss_sum / len(train_loader)

        # Validate: compute loss + generate outputs for BLEU/CER
        model.eval()
        val_loss_sum = 0.0
        references = []
        hypotheses = []
        cer_vals = []

        with torch.no_grad():
            for src_batch, src_lens, trg_batch, trg_lens in val_loader:
                output = model(src_batch, src_lens, trg_batch, teacher_forcing_ratio=0.0)  # no TF
                output_dim = output.shape[-1]
                pred = output[:,1:,:].reshape(-1, output_dim)
                target = trg_batch[:,1:].reshape(-1)
                loss = criterion(pred, target)
                val_loss_sum += loss.item()

                # greedy decode for metrics
                hyps = greedy_decode(model, src_batch, src_lens, max_len=trg_batch.size(1))
                # build references
                for i in range(trg_batch.size(0)):
                    # reconstruct reference string (remove SOS/EOS/PAD)
                    trg_ids = trg_batch[i].cpu().numpy().tolist()
                    # remove SOS
                    if len(trg_ids) > 0 and trg_ids[0] == trg_vocab.token2index(SOS_TOKEN):
                        trg_ids = trg_ids[1:]
                    # collect until EOS or PAD
                    ref_chars = []
                    for idx in trg_ids:
                        tok = trg_vocab.index2token(idx)
                        if tok == EOS_TOKEN or tok == PAD_TOKEN:
                            break
                        if tok == SOS_TOKEN:
                            continue
                        ref_chars.append(tok)
                    references.append([ref_chars])  # sentence_bleu expects list of references
                hypotheses.extend([list(h) for h in hyps])
                # CER per sample
                for ref_chars, hyp_str in zip(references[-trg_batch.size(0):], hyps):
                    # ref_chars is [[c1,c2,...]] so pick first
                    ref_str = "".join(ref_chars[0])
                    cer_vals.append(cer_score(ref_str, hyp_str))

        val_loss = val_loss_sum / len(val_loader)
        perplexity = math.exp(val_loss) if val_loss < 700 else float("inf")  # avoid overflow

        # Compute BLEU over the whole validation set (references/hypotheses assembled)
        # Note: our `references` list has one element per validation sequence (list of lists)
        # but we appended in loop accumulating; make sure lengths match
        # If lengths mismatch, take min
        n_items = min(len(references), len(hypotheses))
        refs_trim = references[:n_items]
        hyps_trim = hypotheses[:n_items]
        # convert hyps_trim to lists of tokens
        hyp_tokens = [list(h) for h in hyps_trim]
        bleu = corpus_bleu_from_lists(refs_trim, hyp_tokens)
        avg_cer = sum(cer_vals)/len(cer_vals) if cer_vals else 0.0

        print(f"[{cfg['name']}] Epoch {epoch}/{N_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | PPL: {perplexity:.3f} | BLEU: {bleu:.4f} | CER: {avg_cer:.4f}")

        # print a few qualitative examples
        print("Some validation examples (src => pred | ref):")
        # pick first batch from val_loader for demonstration
        for i in range(3):
            # random sample from val dataset
            idx = random.randint(0, len(val_dataset)-1)
            src_str = val_dataset.src[idx]
            ref_str = val_dataset.trg[idx]
            # encode single example
            model.eval()
            with torch.no_grad():
                s_ids = torch.tensor([ [src_vocab.token2index(ch) for ch in list(src_str)] ], dtype=torch.long).to(DEVICE)
                s_len = torch.tensor([s_ids.size(1)], dtype=torch.long).to(DEVICE)
                pred_list = greedy_decode(model, s_ids, s_len, max_len=200)
            print(f"{src_str} => {pred_list[0]} | {ref_str}")

    # After training each experiment, you can optionally save the model
    torch.save(model.state_dict(), f"seq2seq_{cfg['name']}.pth")
    print(f"Saved model seq2seq_{cfg['name']}.pth")

print("All experiments complete.")


Device: cpu
Initial pairs: 21003
After cleaning pairs: 21003
Src vocab size: 53 Trg vocab size: 40
Saved CSV splits: train/valid/test sizes 10501 5251 5251

Starting experiment: exp1
{'name': 'exp1', 'emb_dim': 128, 'hid_dim': 256, 'enc_layers': 2, 'dec_layers': 4, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 64}
[exp1] Epoch 1/5 | Train Loss: 3.0083 | Val Loss: 2.9997 | PPL: 20.081 | BLEU: 0.0222 | CER: 1.0667
Some validation examples (src => pred | ref):
میں نے مدت سے کوئی خواب نہیں دیکہا ہے => kaa  aaaa aaa aaaa aaaa aaaaaaa aaaaaaaaaaaaaaaaa | maiñ ne muddat se koī ḳhvāb nahīñ dekhā hai
پلکوں پہ کچی نیندوں کا رس پہیلتا ہو جب => kaa  aaaa aaa aaaa aaaa aaaaaaa aaaaaaaaaaaaaaaaa | palkoñ pe kachchī nīñdoñ kā ras phailtā ho jab
جو بے ثبات ہو اس سر خوشی کو کیا کیجے => kaa  aaaa aaa aaaa aaaa aaaaaaa aaaaaaaaaaaaaaaaa | jo besabāt ho us sarḳhushī ko kyā kiije
[exp1] Epoch 2/5 | Train Loss: 2.7256 | Val Loss: 2.9792 | PPL: 19.672 | BLEU: 0.0337 | CER: 0.8315
Some validation examples (src =