In [1]:
URDU_NORMALIZATION_MAP = {
    "ي": "ی",  # Arabic Yeh → Urdu Yeh
    "ك": "ک",  # Arabic Kaf → Urdu Kaf
    "ۀ": "ہ",  # Heh with hamza → Heh
    "ة": "ہ",  # Taa marbuta → Heh
    "ؤ": "و",  # Waw with hamza → Waw
}


In [2]:
import re
DIACRITICS_PATTERN = re.compile(r"[\u064B-\u065F\u0670\u06D6-\u06ED]")

def remove_diacritics(text):
    return DIACRITICS_PATTERN.sub("", text)


In [3]:
PUNCTUATION_REMOVE = '[“”«»—–…"]'

def clean_punctuation(text):
    return re.sub(PUNCTUATION_REMOVE, "", text)


In [15]:
def clean_whitespace(text):
    return re.sub(r"\s+", " ", text).strip()


In [4]:
def normalize_urdu(text):
    for src, tgt in URDU_NORMALIZATION_MAP.items():
        text = text.replace(src, tgt)
    text = remove_diacritics(text)
    text = clean_punctuation(text)
    text = clean_whitespace(text)
    return text


In [10]:
    !git clone https://github.com/amir9ume/urdu_ghazals_rekhta

Cloning into 'urdu_ghazals_rekhta'...
remote: Enumerating objects: 112, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 112 (delta 7), reused 6 (delta 6), pack-reused 103 (from 1)[K
Receiving objects: 100% (112/112), 2.03 MiB | 30.18 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [12]:
!unzip /content/urdu_ghazals_rekhta/dataset/dataset.zip

Archive:  /content/urdu_ghazals_rekhta/dataset/dataset.zip
   creating: dataset/
   creating: dataset/ahmad-faraz/
   creating: dataset/ahmad-faraz/ur/
  inflating: dataset/ahmad-faraz/ur/silsile-tod-gayaa-vo-sabhii-jaate-jaate-ahmad-faraz-ghazals  
  inflating: dataset/ahmad-faraz/ur/kyaa-aise-kam-sukhan-se-koii-guftuguu-kare-ahmad-faraz-ghazals  
  inflating: dataset/ahmad-faraz/ur/havaa-ke-zor-se-pindaar-e-baam-o-dar-bhii-gayaa-ahmad-faraz-ghazals  
  inflating: dataset/ahmad-faraz/ur/avval-avval-kii-dostii-hai-abhii-ahmad-faraz-ghazals  
  inflating: dataset/ahmad-faraz/ur/saaqiyaa-ek-nazar-jaam-se-pahle-pahle-ahmad-faraz-ghazals  
  inflating: dataset/ahmad-faraz/ur/saamne-us-ke-kabhii-us-kii-sataaish-nahiin-kii-ahmad-faraz-ghazals  
  inflating: dataset/ahmad-faraz/ur/vahshaten-badhtii-gaiin-hijr-ke-aazaar-ke-saath-ahmad-faraz-ghazals  
  inflating: dataset/ahmad-faraz/ur/ranjish-hii-sahii-dil-hii-dukhaane-ke-liye-aa-ahmad-faraz-ghazals  
  inflating: dataset/ahmad-faraz/ur/qurba

In [16]:
import os, pickle
from pathlib import Path

root_dir = "/content/dataset"
pairs = []

for poet in os.listdir(root_dir):
    poet_dir = Path(root_dir) / poet
    ur_dir = poet_dir / "ur"
    en_dir = poet_dir / "en"
    if not ur_dir.exists() or not en_dir.exists():
        continue

    for fname in os.listdir(ur_dir):
        ur_file = ur_dir / fname
        en_file = en_dir / fname
        if not en_file.exists():
            continue

        with open(ur_file, "r", encoding="utf-8", errors="ignore") as f_ur, \
             open(en_file, "r", encoding="utf-8", errors="ignore") as f_en:
            ur_lines = [normalize_urdu(l) for l in f_ur.readlines() if l.strip()]
            en_lines = [clean_whitespace(l.lower()) for l in f_en.readlines() if l.strip()]

        for ur, en in zip(ur_lines, en_lines):
            pairs.append({"src": ur, "tgt": en})

# Save full dataset
os.makedirs("data/processed", exist_ok=True)
pickle.dump(pairs, open("data/processed/dataset_full.pkl","wb"))

print("Total pairs:", len(pairs))


Total pairs: 21003


In [17]:
import pickle

# Load the dataset
with open("data/processed/dataset_full.pkl", "rb") as f:
    dataset = pickle.load(f)

print(type(dataset))     # should be <class 'list'>
print(len(dataset))      # number of sentence pairs


<class 'list'>
21003


In [19]:
import pickle

# Load the dataset
with open("data/processed/dataset_full.pkl", "rb") as f:
    dataset = pickle.load(f)
for p in pairs[:4]:
    print(p)

{'src': 'کٹھن ہے راہ گزر تھوڑی دور ساتھ چلو', 'tgt': 'kathin hai rāhguzar thoḍī duur saath chalo'}
{'src': 'بہت کڑا ہے سفر تھوڑی دور ساتھ چلو', 'tgt': 'bahut kaḍā hai safar thoḍī duur saath chalo'}
{'src': 'تمام عمر کہاں کوئی ساتھ دیتا ہے', 'tgt': 'tamām umr kahāñ koī saath detā hai'}
{'src': 'یہ جانتا ہوں مگر تھوڑی دور ساتھ چلو', 'tgt': 'ye jāntā huuñ magar thoḍī duur saath chalo'}


In [22]:
# BPE for roman urdu
import pickle
from collections import defaultdict,Counter

def load_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def get_vocab(corpus):
    vocab = Counter()
    for line in corpus:
        text=line["tgt"]
        for word in text.strip().split():
           symbols=tuple(list(word)+["</w>"])
           vocab[symbols] += 1
    return vocab

def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        for i in range(len(word) - 1):
            pairs[(word[i], word[i + 1])] += freq
    return pairs

def merge_vocab(pair, vocab):
    new_vocab = Counter()
    a, b = pair
    new_sym = a + b
    for word, freq in vocab.items():
        new_word = []
        i = 0
        while i < len(word):
            if i < len(word) - 1 and word[i] == a and word[i + 1] == b:
                new_word.append(new_sym)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        new_vocab[tuple(new_word)] += freq
    return new_vocab

def learn_bpe(corpus, num_merges=100):
    vocab = get_vocab(corpus)
    merges = []
    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs: break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        merges.append(best)
    return merges

def apply_bpe_word(word, merges):
    symbols = list(word) + ["</w>"]
    for a, b in merges:
        i = 0
        new = []
        while i < len(symbols):
            if i < len(symbols) - 1 and symbols[i] == a and symbols[i + 1] == b:
                new.append(a + b)
                i += 2
            else:
                new.append(symbols[i])
                i += 1
        symbols = new
    return symbols


corpus= load_pkl("/content/data/processed/dataset_full.pkl")
merges = learn_bpe(corpus, num_merges=10000)
print("Merges:", merges)


Merges: [('e', '</w>'), ('h', 'a'), ('ā', '</w>'), ('ñ', '</w>'), ('ī', '</w>'), ('r', '</w>'), ('o', '</w>'), ('e', '-'), ('ha', 'i'), ('-', 'e-'), ('a', 'a'), ('a', 'h'), ('hai', '</w>'), ('a', 'r'), ('s', 'h'), ('e', 'ñ</w>'), ('m', '</w>'), ('n', 'a'), ('a', 'r</w>'), ('h', 'ī</w>'), ('m', 'a'), ('h', 'u'), ('s', 'e</w>'), ('h', '</w>'), ('l', '</w>'), ('t', '</w>'), ('y', 'ā</w>'), ('m', 'eñ</w>'), ('a', 'b'), ('k', 'i'), ('n', '</w>'), ('s', '</w>'), ('c', 'h'), ('n', 'e</w>'), ('b', 'a'), ('k', 'e</w>'), ('na', '</w>'), ('d', 'i'), ('k', 'ī</w>'), ('ā', 'ñ</w>'), ('ī', 'ñ</w>'), ('t', 'a'), ('k', 'o</w>'), ('s', 'a'), ('r', 'a'), ('y', 'e</w>'), ('i', 'r'), ('k', 'ā</w>'), ('hai', 'ñ</w>'), ('d', 'a'), ('ah', 'īñ</w>'), ('b', 'hī</w>'), ('d', '</w>'), ('t', 'o</w>'), ('k', '</w>'), ('o', 'ñ</w>'), ('m', 'u'), ('ab', '</w>'), ('ḳ', 'h'), ('t', 'h'), ('t', 'ā</w>'), ('t', 'e</w>'), ('n', 'ahīñ</w>'), ('ha', 'm</w>'), ('l', 'a'), ('s', 'ha'), ('k', 'h'), ('u', 'ñ</w>'), ('n', 'ā</w

AttributeError: 'dict' object has no attribute 'strip'

In [24]:
with open("bpe_merges.txt", "w", encoding="utf-8") as f:
    for a, b in merges:
        f.write(f"{a} {b}\n")

print("Saved merges to bpe_merges.txt")


Saved merges to bpe_merges.txt


In [23]:
def apply_bpe(corpus, merges):
    out = []
    for line in corpus:
        tokens = []
        text=line["tgt"]
        for word in text.strip().split():
            tokens.extend(apply_bpe_word(word, merges))
        out.append(tokens)
    return out
print("BPE:", apply_bpe(corpus, merges))

BPE: [['kathin</w>', 'hai</w>', 'rāhguzar</w>', 'thoḍī</w>', 'duur</w>', 'saath</w>', 'chalo</w>'], ['bahut</w>', 'kaḍā</w>', 'hai</w>', 'safar</w>', 'thoḍī</w>', 'duur</w>', 'saath</w>', 'chalo</w>'], ['tamām</w>', 'umr</w>', 'kahāñ</w>', 'koī</w>', 'saath</w>', 'detā</w>', 'hai</w>'], ['ye</w>', 'jāntā</w>', 'huuñ</w>', 'magar</w>', 'thoḍī</w>', 'duur</w>', 'saath</w>', 'chalo</w>'], ['nashe</w>', 'meñ</w>', 'chuur</w>', 'huuñ</w>', 'maiñ</w>', 'bhī</w>', 'tumheñ</w>', 'bhī</w>', 'hosh</w>', 'nahīñ</w>'], ['baḍā</w>', 'maza</w>', 'ho</w>', 'agar</w>', 'thoḍī</w>', 'duur</w>', 'saath</w>', 'chalo</w>'], ['ye</w>', 'ek</w>', 'shab</w>', 'kī</w>', 'mulāqāt</w>', 'bhī</w>', 'ġhanīmat</w>', 'hai</w>'], ['kise</w>', 'hai</w>', 'kal</w>', 'kī</w>', 'ḳhabar</w>', 'thoḍī</w>', 'duur</w>', 'saath</w>', 'chalo</w>'], ['abhī</w>', 'to</w>', 'jaag</w>', 'rahe</w>', 'haiñ</w>', 'charāġh</w>', 'rāhoñ</w>', 'ke</w>'], ['abhī</w>', 'hai</w>', 'duur</w>', 'sahar</w>', 'thoḍī</w>', 'duur</w>', 'saath</

In [25]:
import json
bpe_tokens=apply_bpe(corpus, merges)
with open("data/processed/tgt_bpe.json", "w", encoding="utf-8") as f:
    json.dump(bpe_tokens, f, ensure_ascii=False, indent=2)

print("Saved BPE tokenized corpus to data/processed/tgt_bpe.json")

Saved BPE tokenized corpus to data/processed/tgt_bpe.json


In [26]:
# BPE for urdu
import pickle
from collections import defaultdict,Counter

def load_pkl(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def get_vocab(corpus):
    vocab = Counter()
    for line in corpus:
        text=line["src"]
        for word in text.strip().split():
           symbols=tuple(list(word)+["</w>"])
           vocab[symbols] += 1
    return vocab

def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        for i in range(len(word) - 1):
            pairs[(word[i], word[i + 1])] += freq
    return pairs

def merge_vocab(pair, vocab):
    new_vocab = Counter()
    a, b = pair
    new_sym = a + b
    for word, freq in vocab.items():
        new_word = []
        i = 0
        while i < len(word):
            if i < len(word) - 1 and word[i] == a and word[i + 1] == b:
                new_word.append(new_sym)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        new_vocab[tuple(new_word)] += freq
    return new_vocab

def learn_bpe(corpus, num_merges=100):
    vocab = get_vocab(corpus)
    merges = []
    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs: break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        merges.append(best)
    return merges

def apply_bpe_word(word, merges):
    symbols = list(word) + ["</w>"]
    for a, b in merges:
        i = 0
        new = []
        while i < len(symbols):
            if i < len(symbols) - 1 and symbols[i] == a and symbols[i + 1] == b:
                new.append(a + b)
                i += 2
            else:
                new.append(symbols[i])
                i += 1
        symbols = new
    return symbols


corpus= load_pkl("/content/data/processed/dataset_full.pkl")
merges = learn_bpe(corpus, num_merges=10000)
print("Merges:", merges)
with open("bpe_mergesur.txt", "w", encoding="utf-8") as f:
    for a, b in merges:
        f.write(f"{a} {b}\n")

print("Saved merges to bpe_mergesur.txt")
def apply_bpe(corpus, merges):
    out = []
    for line in corpus:
        tokens = []
        text=line["src"]
        for word in text.strip().split():
            tokens.extend(apply_bpe_word(word, merges))
        out.append(tokens)
    return out
print("BPE:", apply_bpe(corpus, merges))

import json
bpe_tokens=apply_bpe(corpus, merges)
with open("data/processed/tgt_bpeur.json", "w", encoding="utf-8") as f:
    json.dump(bpe_tokens, f, ensure_ascii=False, indent=2)

print("Saved BPE tokenized corpus to data/processed/tgt_bpeur.json")

Merges: [('ے', '</w>'), ('ں', '</w>'), ('ا', '</w>'), ('ی', '</w>'), ('ر', '</w>'), ('ہ', '</w>'), ('ی', 'ں</w>'), ('و', '</w>'), ('ہ', 'ے</w>'), ('م', '</w>'), ('ل', '</w>'), ('م', 'یں</w>'), ('ا', 'ن'), ('و', 'ں</w>'), ('ہ', 'یں</w>'), ('ت', '</w>'), ('س', 'ے</w>'), ('ی', 'ا</w>'), ('ب', '</w>'), ('ب', 'ھ'), ('س', '</w>'), ('ا', 'ں</w>'), ('ک', 'ے</w>'), ('ھ', '</w>'), ('ک', 'ی</w>'), ('د', '</w>'), ('بھ', 'ی</w>'), ('ن', '</w>'), ('ن', 'ہ</w>'), ('ک', 'و</w>'), ('ت', 'و</w>'), ('ن', 'ے</w>'), ('ک', '</w>'), ('ئ', 'ے</w>'), ('ہ', 'و'), ('ئ', 'ی</w>'), ('ک', 'ا</w>'), ('ا', 'ر</w>'), ('ک', 'ھ'), ('ت', 'ھ'), ('ن', 'ہیں</w>'), ('ک', 'یا</w>'), ('ت', 'ا</w>'), ('ر', 'ے</w>'), ('ت', 'ے</w>'), ('ر', 'ی</w>'), ('د', 'ل</w>'), ('ہ', 'و</w>'), ('ر', 'ا'), ('ا', 'س</w>'), ('د', 'ی'), ('ک', 'ہ</w>'), ('ہ', 'ی</w>'), ('و', 'ہ</w>'), ('ہ', 'م</w>'), ('ی', 'ہ</w>'), ('ج', 'ا'), ('ب', 'ا'), ('و', 'ر</w>'), ('ن', 'ا</w>'), ('م', 'ج'), ('ا', 'ل'), ('ک', 'و'), ('ر', 'ا</w>'), ('ان', '</w>'), ('ق', '</