**Model Architecture**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers,
                            dropout=dropout, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.hid_dim = hid_dim
        self.n_layers = n_layers

    def forward(self, src, src_lengths):
        # src: [batch, src_len]
        embedded = self.dropout(self.embedding(src))
        # pack for efficiency
        packed = nn.utils.rnn.pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, (hidden, cell) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        # outputs: [batch, src_len, hid_dim*2]
        return outputs, hidden, cell

In [3]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=4, dropout=0.5):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers,
                            dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

In [4]:
import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, teacher_forcing_ratio=0.5):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.teacher_forcing_ratio = teacher_forcing_ratio

        # number of layers
        self.enc_layers = encoder.n_layers
        self.dec_layers = decoder.lstm.num_layers

        # hidden sizes
        self.enc_hid_dim = encoder.hid_dim
        self.dec_hid_dim = decoder.lstm.hidden_size


        if self.enc_hid_dim != self.dec_hid_dim:
            self.bridge = nn.Linear(self.enc_hid_dim, self.dec_hid_dim)
        else:
            self.bridge = None

    def forward(self, src, src_lengths, trg):

        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        # 1. Encode
        encoder_outputs, hidden, cell = self.encoder(src, src_lengths)

        # 2. Bridge hidden states
        hidden = self._bridge_hidden(hidden)
        cell   = self._bridge_hidden(cell)


        input = trg[:, 0]  # [batch]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t, :] = output
            teacher_force = torch.rand(1).item() < self.teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs

    def _bridge_hidden(self, hidden):

        enc_layers = hidden.size(0) // 2


        hidden = hidden[0:enc_layers] + hidden[enc_layers:2*enc_layers]


        if self.bridge is not None:
            hidden = self.bridge(hidden)


        if hidden.size(0) < self.dec_layers:
            hidden = hidden.repeat(self.dec_layers // hidden.size(0), 1, 1)
        elif hidden.size(0) > self.dec_layers:
            hidden = hidden[:self.dec_layers]

        return hidden




In [None]:
!unzip  /content/Preprocessed.zip

Archive:  /content/Preprocessed.zip
  inflating: Preprocessed/dataset_full.pkl  
 extracting: Preprocessed/entoken.zip  
   creating: Preprocessed/entoken/
  inflating: Preprocessed/entoken/bpe_merges.txt  
  inflating: Preprocessed/entoken/tgt_bpe.json  
  inflating: Preprocessed/test.pkl   
  inflating: Preprocessed/train.pkl  
 extracting: Preprocessed/urtoken.zip  
   creating: Preprocessed/urtoken/
  inflating: Preprocessed/urtoken/bpe_mergesur.txt  
  inflating: Preprocessed/urtoken/tgt_bpeur.json  
  inflating: Preprocessed/valid.pkl  


**Training and Hyperparameters**

In [5]:
import json, pickle, random
from collections import Counter

# ---------------- Config ----------------
SRC_FILE = "/content/tgt_bpeur.json"   # Urdu tokenized
TGT_FILE = "/content/tgt_bpe.json"     # Roman Urdu tokenized

OUT_DIR = "/content"   # where train.pkl / valid.pkl / test.pkl will be saved

# ---------------- Load tokenized sentences ----------------
with open(SRC_FILE, "r", encoding="utf-8") as f:
    src_sentences = json.load(f)   # list of lists

with open(TGT_FILE, "r", encoding="utf-8") as f:
    tgt_sentences = json.load(f)

assert len(src_sentences) == len(tgt_sentences), "Mismatch between src and tgt!"
print(f"Loaded {len(src_sentences)} parallel sentences.")

# ---------------- Build vocabulary from sentences ----------------
def build_vocab(sentences, min_freq=1):
    counter = Counter(tok for sent in sentences for tok in sent)
    # special tokens always at the start
    tokens = ["<pad>", "<unk>", "<sos>", "<eos>"]
    tokens += [tok for tok, c in counter.items() if c >= min_freq]
    token2id = {tok: idx for idx, tok in enumerate(tokens)}
    id2token = {idx: tok for tok, idx in token2id.items()}
    return token2id, id2token

src_token2id, src_id2token = build_vocab(src_sentences)
tgt_token2id, tgt_id2token = build_vocab(tgt_sentences)

print(f"Source vocab size: {len(src_token2id)}")
print(f"Target vocab size: {len(tgt_token2id)}")

# ---------------- Encode sentences ----------------
def encode(tokens, vocab):
    return [vocab.get(tok, vocab["<unk>"]) for tok in tokens]

pairs = []
for src, tgt in zip(src_sentences, tgt_sentences):
    src_ids = encode(src, src_token2id)
    tgt_ids = [tgt_token2id["<sos>"]] + encode(tgt, tgt_token2id) + [tgt_token2id["<eos>"]]
    pairs.append((src_ids, tgt_ids))

print("Example encoded pair:", pairs[0])

# ---------------- Split into train/valid/test ----------------
random.shuffle(pairs)
n = len(pairs)
train = pairs[:int(0.5*n)]
valid = pairs[int(0.5*n):int(0.75*n)]
test  = pairs[int(0.75*n):]

with open(f"{OUT_DIR}/train.pkl", "wb") as f: pickle.dump(train, f)
with open(f"{OUT_DIR}/valid.pkl", "wb") as f: pickle.dump(valid, f)
with open(f"{OUT_DIR}/test.pkl",  "wb") as f: pickle.dump(test, f)

# ---------------- Save vocabs ----------------
with open(f"{OUT_DIR}/src_vocab.json", "w", encoding="utf-8") as f:
    json.dump(src_token2id, f, ensure_ascii=False, indent=2)
with open(f"{OUT_DIR}/tgt_vocab.json", "w", encoding="utf-8") as f:
    json.dump(tgt_token2id, f, ensure_ascii=False, indent=2)

print("Saved splits and vocab files:")
print("Train:", len(train), "Valid:", len(valid), "Test:", len(test))
print("src_vocab.json / tgt_vocab.json saved with sizes:",
      len(src_token2id), len(tgt_token2id))



Loaded 21003 parallel sentences.
Source vocab size: 4905
Target vocab size: 4916
Example encoded pair: ([4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 3])
Saved splits and vocab files:
Train: 10501 Valid: 5251 Test: 5251
src_vocab.json / tgt_vocab.json saved with sizes: 4905 4916


In [6]:
import pickle

# Load the dataset
with open("/content/train.pkl", "rb") as f:
    dataset = pickle.load(f)
for p in pairs[:4]:
    print(p)

([56, 649, 56, 1953, 68, 42, 2237, 363, 3366], [2, 2930, 2931, 1881, 2327, 2143, 2144, 45, 2445, 395, 3548, 3])
([259, 67, 12, 109, 2748, 62, 281, 76], [2, 274, 74, 12, 116, 696, 1299, 69, 298, 83, 3])
([116, 2001, 71, 230, 74, 356, 622, 10, 295], [2, 123, 2200, 78, 333, 81, 387, 682, 10, 313, 3])
([1451, 314, 1621, 712, 1541, 3097, 3097, 109], [2, 1596, 337, 1781, 780, 1271, 149, 341, 2043, 999, 341, 3982, 116, 3])


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# ---------------- Dataset Wrapper ----------------
class TranslationDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        src, trg = self.pairs[idx]
        return torch.tensor(src, dtype=torch.long), torch.tensor(trg, dtype=torch.long)

# ---------------- Loss & Optimizer ----------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_data, valid_data, epochs=10, lr=1e-3, batch_size=64):
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)
    valid_loader = DataLoader(valid_data, batch_size=batch_size, collate_fn=lambda x: x)

    criterion = nn.CrossEntropyLoss(ignore_index=0)  # assume PAD=0
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # ---- Train ----
        model.train()
        total_loss = 0
        for batch in train_loader:
            src, trg = zip(*batch)
            src = nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=0).to(device)
            trg = nn.utils.rnn.pad_sequence(trg, batch_first=True, padding_value=0).to(device)
            src_lengths = torch.tensor([len(s) for s in src]).to(device)

            optimizer.zero_grad()
            output = model(src, src_lengths, trg)

            # Shift target for teacher forcing
            output_dim = output.shape[-1]
            loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # ---- Validation ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in valid_loader:
                src, trg = zip(*batch)
                src = nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=0).to(device)
                trg = nn.utils.rnn.pad_sequence(trg, batch_first=True, padding_value=0).to(device)
                src_lengths = torch.tensor([len(s) for s in src]).to(device)

                output = model(src, src_lengths, trg)
                output_dim = output.shape[-1]
                loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
                val_loss += loss.item()

        print(f"Epoch {epoch+1}: Train Loss={total_loss/len(train_loader):.3f}, "
              f"Val Loss={val_loss/len(valid_loader):.3f}")


In [None]:
!unzip /content/PreprocessedR.zip

Archive:  /content/PreprocessedR.zip
  inflating: Preprocessed/dataset_full.pkl  
   creating: Preprocessed/entoken/
  inflating: Preprocessed/entoken/bpe_merges.txt  
  inflating: Preprocessed/entoken/tgt_bpe.json  
  inflating: Preprocessed/src_vocab.json  
  inflating: Preprocessed/test.pkl   
  inflating: Preprocessed/tgt_vocab.json  
  inflating: Preprocessed/train.pkl  
   creating: Preprocessed/urtoken/
  inflating: Preprocessed/urtoken/bpe_mergesur.txt  
  inflating: Preprocessed/urtoken/tgt_bpeur.json  
  inflating: Preprocessed/valid.pkl  


In [7]:
import pickle, random
INPUT_DIM = len(src_token2id)
OUTPUT_DIM = len(tgt_token2id)



Training model with emb_dim=256, hid_dim=512 and droput=0.5, encoder layers=2 decoder layers = 4

In [12]:
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pickle, os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------- Dataset ----------------
class TranslationDataset(Dataset):
    def __init__(self, pairs): self.pairs = pairs
    def __len__(self): return len(self.pairs)
    def __getitem__(self, i): return self.pairs[i]

def collate_fn(batch):
    src,tgt = zip(*batch)
    src = [torch.tensor(s) for s in src]
    tgt = [torch.tensor(t) for t in tgt]
    return (nn.utils.rnn.pad_sequence(src,batch_first=True,padding_value=0),
            nn.utils.rnn.pad_sequence(tgt,batch_first=True,padding_value=0))

# ---------------- Load Data ----------------
train = pickle.load(open("/content/train.pkl","rb"))
valid = pickle.load(open("/content/valid.pkl","rb"))
test  = pickle.load(open("/content/test.pkl","rb"))

train_loader = DataLoader(TranslationDataset(train), batch_size=64, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(TranslationDataset(valid), batch_size=64, collate_fn=collate_fn)

In [14]:

# ---------------- Model ----------------

INPUT_DIM, OUTPUT_DIM = len(src_token2id), len(tgt_token2id)
encoder = Encoder(INPUT_DIM, 256, 512, n_layers=2, dropout=0.5)
decoder = Decoder(OUTPUT_DIM, 256, 512, n_layers=4, dropout=0.5)
model = Seq2Seq(encoder, decoder, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [20]:
# ---------------- Translation ----------------
def translate_sentence(model, src_ids, max_len=40):
    model.eval()
    src_tensor = torch.tensor([src_ids], dtype=torch.long, device=device)
    src_lengths = torch.tensor([len(src_ids)], device=device)

    with torch.no_grad():
        enc_outputs, hidden, cell = model.encoder(src_tensor, src_lengths)
        hidden = model._bridge_hidden(hidden)
        cell   = model._bridge_hidden(cell)

    # start with <sos>
    input_tok = torch.tensor([tgt_token2id["<sos>"]], device=device)
    preds = []
    for _ in range(max_len):
        output, hidden, cell = model.decoder(input_tok, hidden, cell)
        pred = output.argmax(1).item()
        if pred == tgt_token2id["<eos>"]:
            break
        preds.append(pred)
        input_tok = torch.tensor([pred], device=device)
    return preds
def detokenize(tokens):
    return "".join(tok.replace("</w>", " ") for tok in tokens).strip()

In [None]:



# ---------------- Training ----------------
# ---------------- Training epoch function ----------------
def train_epoch(loader):
    model.train()
    total = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)

        # lengths for packed sequences
        src_lengths = torch.tensor([len(s[s!=0]) for s in src], device=device)

        optimizer.zero_grad()
        output = model(src, src_lengths, tgt)   # forward pass

        # shift for loss (ignore first token <sos>)
        output_dim = output.shape[-1]
        loss = criterion(
            output[:, 1:].reshape(-1, output_dim),
            tgt[:, 1:].reshape(-1)
        )
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total += loss.item()

    return total / len(loader)

# ---------------- Training loop ----------------
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    tr_loss = train_epoch(train_loader)
    print(f"Epoch {epoch+1}/{N_EPOCHS} - Train Loss: {tr_loss:.3f}")






src_ids = [src_token2id.get(tok, src_token2id["<unk>"]) for tok in ["آپ","کیسے","ہیں","؟"]]
pred_ids = translate_sentence(model, src_ids)
pred_tokens = [tgt_id2token[i] for i in pred_ids]
print("Prediction:", detokenize(pred_tokens))

# ---------------- BLEU Evaluation ----------------
def evaluate_bleu(model, loader, n_batches=5):
    smooth = SmoothingFunction().method1
    scores=[]
    for i,(src,tgt) in enumerate(loader):
        if i>=n_batches: break
        for s,t in zip(src,tgt):
            s = s[s!=0].tolist()
            t = t[t!=0].tolist()
            pred_ids = translate_sentence(model, s)
            ref = [tgt_id2token[i] for i in t[1:-1]]   # drop <sos>, <eos>
            hyp = [tgt_id2token[i] for i in pred_ids]
            if len(hyp)>0:
                scores.append(sentence_bleu([ref], hyp, smoothing_function=smooth))
    return sum(scores)/len(scores) if scores else 0.0

In [11]:


# ---------------- Run Training ----------------
N_EPOCHS = 30
os.makedirs("/content/checkpoints", exist_ok=True)

for epoch in range(N_EPOCHS):
    tr_loss = train_epoch(train_loader)
    print(f"Epoch {epoch+1}/{N_EPOCHS} - train_loss={tr_loss:.3f}")

    if (epoch+1)%5==0:
        bleu = evaluate_bleu(model, valid_loader)
        print(f"  >> Validation BLEU: {bleu:.4f}")
        torch.save(model.state_dict(), f"/content/checkpoints/model_epoch{epoch+1}.pt")


Epoch 1/10 - Train Loss: 6.530
Epoch 2/10 - Train Loss: 5.736
Epoch 3/10 - Train Loss: 5.293
Epoch 4/10 - Train Loss: 4.939
Epoch 5/10 - Train Loss: 4.611
Epoch 6/10 - Train Loss: 4.324
Epoch 7/10 - Train Loss: 4.052
Epoch 8/10 - Train Loss: 3.810
Epoch 9/10 - Train Loss: 3.580
Epoch 10/10 - Train Loss: 3.359
Prediction: lab lab lab
Epoch 1/30 - train_loss=3.157
Epoch 2/30 - train_loss=2.966
Epoch 3/30 - train_loss=2.804
Epoch 4/30 - train_loss=2.643
Epoch 5/30 - train_loss=2.496
  >> Validation BLEU: 0.0983
Epoch 6/30 - train_loss=2.354
Epoch 7/30 - train_loss=2.227
Epoch 8/30 - train_loss=2.103
Epoch 9/30 - train_loss=1.983
Epoch 10/30 - train_loss=1.894
  >> Validation BLEU: 0.1232
Epoch 11/30 - train_loss=1.786
Epoch 12/30 - train_loss=1.699
Epoch 13/30 - train_loss=1.605
Epoch 14/30 - train_loss=1.532
Epoch 15/30 - train_loss=1.448
  >> Validation BLEU: 0.1384
Epoch 16/30 - train_loss=1.375
Epoch 17/30 - train_loss=1.313
Epoch 18/30 - train_loss=1.245
Epoch 19/30 - train_loss=1.18

Evaluation and Testing of model trained on emb_dim=256, hid_dim=512

In [8]:
import math

@torch.no_grad()
def evaluate_loss(model, loader):
    model.eval()
    total_loss, total_tokens = 0, 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        src_lengths = torch.tensor([len(s[s!=0]) for s in src], device=device)

        output = model(src, src_lengths, tgt)
        output_dim = output.shape[-1]

        loss = criterion(
            output[:, 1:].reshape(-1, output_dim),
            tgt[:, 1:].reshape(-1)
        )
        # accumulate loss * number of tokens
        total_loss += loss.item() * (tgt[:,1:].numel())
        total_tokens += tgt[:,1:].numel()

    avg_loss = total_loss / total_tokens
    return math.exp(avg_loss)   # perplexity


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(INPUT_DIM, 256, 512, n_layers=2, dropout=0.3)
decoder = Decoder(OUTPUT_DIM, 256, 512, n_layers=4, dropout=0.3)
model = Seq2Seq(encoder, decoder, device, teacher_forcing_ratio=0.5).to(device)
model.load_state_dict(torch.load("/content/model_epoch30.pt", map_location=device))

<All keys matched successfully>

In [23]:
ppl = evaluate_loss(model, valid_loader)
print(f"Validation Perplexity: {ppl:.2f}")

Validation Perplexity: 10.32


In [16]:
!pip install editdistance
import editdistance

@torch.no_grad()
def evaluate_cer(model, loader, n_batches=5):
    model.eval()
    total_dist, total_chars = 0, 0
    for i, (src, tgt) in enumerate(loader):
        if i >= n_batches: break
        for s, t in zip(src, tgt):
            s = s[s!=0].tolist()
            t = t[t!=0].tolist()
            pred_ids = translate_sentence(model, s)

            # detokenize to strings
            ref = detokenize([tgt_id2token[i] for i in t[1:-1]])
            hyp = detokenize([tgt_id2token[i] for i in pred_ids])

            # edit distance
            dist = editdistance.eval(hyp, ref)
            total_dist += dist
            total_chars += len(ref)

    return total_dist / total_chars if total_chars > 0 else 0.0




In [24]:
cer = evaluate_cer(model, valid_loader, n_batches=5)
print(f"Validation CER: {cer:.4f}")

Validation CER: 0.2789


In [None]:
cer = evaluate_cer(model, valid_loader, n_batches=5)
print(f"Validation CER: {cer:.4f}")

**In Case of further training**

In [None]:
encoder = Encoder(INPUT_DIM, 256, 512, n_layers=2, dropout=0.3)
decoder = Decoder(OUTPUT_DIM, 256, 512, n_layers=4, dropout=0.3)
model = Seq2Seq(encoder, decoder, device, teacher_forcing_ratio=0.5).to(device)
model.load_state_dict(torch.load("/content/checkpoints/model_epoch30.pt", map_location=device))

# Recreate optimizer with lower LR
optimizer = optim.Adam(model.parameters(), lr=5e-4,weight_decay=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=src_token2id["<pad>"])

start_epoch = 30   # where you left off
N_EPOCHS = 20      # train 20 more epochs

In [None]:
@torch.no_grad()
def evaluate(loader):
    model.eval(); total=0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        src_lengths = torch.tensor([len(s[s!=0]) for s in src], device=device)
        output = model(src, src_lengths, tgt)
        output_dim = output.shape[-1]
        loss = criterion(
            output[:,1:].reshape(-1, output_dim),
            tgt[:,1:].reshape(-1)
        )
        total += loss.item()
    return total/len(loader)
for epoch in range(start_epoch+1, start_epoch+N_EPOCHS+1):
    tr_loss = train_epoch(train_loader)
    val_loss = evaluate(valid_loader)

    # Decay teacher forcing ratio
    model.teacher_forcing_ratio = max(0.1, model.teacher_forcing_ratio * 0.9)

    print(f"Epoch {epoch} | Train Loss: {tr_loss:.3f} | Val Loss: {val_loss:.3f} | TF={model.teacher_forcing_ratio:.2f}")

    if epoch % 5 == 0:
        bleu = evaluate_bleu(valid_loader, n_samples=200)
        print(f"   Validation BLEU: {bleu*100:.2f}")

    # Save checkpoint
    torch.save(model.state_dict(), f"model_epoch{epoch}.pt")