In [3]:
import os  # ← make sure to import this
import zipfile

zip_path = "/content/urdu_ghazals_rekhta-main.zip"
extract_path = "/content/urdu_ghazals_rekhta-main"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("/content/")

print(f" Dataset unzipped at {extract_path}")
print("Folders inside dataset:", os.listdir(extract_path))


 Dataset unzipped at /content/urdu_ghazals_rekhta-main
Folders inside dataset: ['dataset', 'LICENSE', 'sample_dataset', 'rekhta_parser.ipynb', 'README.md']


In [4]:
cd urdu_ghazals_rekhta-main


/content/urdu_ghazals_rekhta-main


In [5]:
ls


[0m[01;34mdataset[0m/  LICENSE  README.md  rekhta_parser.ipynb  [01;34msample_dataset[0m/


In [6]:
!pip install -q scikit-learn sentencepiece sacrebleu tqdm torch


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
# show top 200 lines of the zip listing so we can see file names inside
!unzip -l dataset/dataset.zip | sed -n '1,200p'


Archive:  dataset/dataset.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
        0  2020-12-23 17:10   dataset/
        0  2020-12-23 05:33   dataset/ahmad-faraz/
        0  2020-12-23 05:33   dataset/ahmad-faraz/ur/
      628  2020-12-23 05:32   dataset/ahmad-faraz/ur/silsile-tod-gayaa-vo-sabhii-jaate-jaate-ahmad-faraz-ghazals
      708  2020-12-23 05:33   dataset/ahmad-faraz/ur/kyaa-aise-kam-sukhan-se-koii-guftuguu-kare-ahmad-faraz-ghazals
     1044  2020-12-23 05:33   dataset/ahmad-faraz/ur/havaa-ke-zor-se-pindaar-e-baam-o-dar-bhii-gayaa-ahmad-faraz-ghazals
     1009  2020-12-23 05:33   dataset/ahmad-faraz/ur/avval-avval-kii-dostii-hai-abhii-ahmad-faraz-ghazals
      748  2020-12-23 05:33   dataset/ahmad-faraz/ur/saaqiyaa-ek-nazar-jaam-se-pahle-pahle-ahmad-faraz-ghazals
     1221  2020-12-23 05:32   dataset/ahmad-faraz/ur/saamne-us-ke-kabhii-us-kii-sataaish-nahiin-kii-ahmad-faraz-ghazals
     1119  2020-12-23 05:33   dataset/ahmad-faraz/ur/vahshaten-badhti

In [8]:
# extract the zip into dataset/raw_data/
!mkdir -p dataset/raw_data
!unzip -o dataset/dataset.zip -d dataset/raw_data
# show top-level of extracted folder
!ls -la dataset/raw_data | sed -n '1,200p'


Archive:  dataset/dataset.zip
   creating: dataset/raw_data/dataset/
   creating: dataset/raw_data/dataset/ahmad-faraz/
   creating: dataset/raw_data/dataset/ahmad-faraz/ur/
  inflating: dataset/raw_data/dataset/ahmad-faraz/ur/silsile-tod-gayaa-vo-sabhii-jaate-jaate-ahmad-faraz-ghazals  
  inflating: dataset/raw_data/dataset/ahmad-faraz/ur/kyaa-aise-kam-sukhan-se-koii-guftuguu-kare-ahmad-faraz-ghazals  
  inflating: dataset/raw_data/dataset/ahmad-faraz/ur/havaa-ke-zor-se-pindaar-e-baam-o-dar-bhii-gayaa-ahmad-faraz-ghazals  
  inflating: dataset/raw_data/dataset/ahmad-faraz/ur/avval-avval-kii-dostii-hai-abhii-ahmad-faraz-ghazals  
  inflating: dataset/raw_data/dataset/ahmad-faraz/ur/saaqiyaa-ek-nazar-jaam-se-pahle-pahle-ahmad-faraz-ghazals  
  inflating: dataset/raw_data/dataset/ahmad-faraz/ur/saamne-us-ke-kabhii-us-kii-sataaish-nahiin-kii-ahmad-faraz-ghazals  
  inflating: dataset/raw_data/dataset/ahmad-faraz/ur/vahshaten-badhtii-gaiin-hijr-ke-aazaar-ke-saath-ahmad-faraz-ghazals  
  in

In [11]:
# list .txt/.csv files (first 200 results)
!find dataset/raw_data -type f \( -iname "*.txt" -o -iname "*.csv" -o -iname "*.utf8" \) | sed -n '1,200p'

# show files whose names include urdu / roman / translit (case-insensitive)
!find dataset/raw_data -type f | grep -Ei 'urdu|roman|translit|transliteration|latin' || true


In [13]:
import os

input_dir = "/content/urdu_ghazals_rekhta-main/dataset/raw_data"  # adjust if needed
output_dir = "/content/processed"

os.makedirs(output_dir, exist_ok=True)
all_lines = []

for poet_folder in os.listdir(input_dir):
    poet_path = os.path.join(input_dir, poet_folder)
    if os.path.isdir(poet_path):
        for file_name in os.listdir(poet_path):
            if file_name.endswith(".txt"):
                file_path = os.path.join(poet_path, file_name)
                with open(file_path, "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            all_lines.append(line)

# Save processed file
output_file = os.path.join(output_dir, "all_poems_clean.txt")
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n".join(all_lines))

print(f" All poems processed and saved to {output_file}")
print("Processed folder contents:", os.listdir(output_dir))


 All poems processed and saved to /content/processed/all_poems_clean.txt
Processed folder contents: ['all_poems_clean.txt']


In [14]:
!find dataset/raw_data -maxdepth 3 -type f -iname "*.txt" -print

In [15]:
!unzip -l dataset/dataset.zip | head -40


Archive:  dataset/dataset.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
        0  2020-12-23 17:10   dataset/
        0  2020-12-23 05:33   dataset/ahmad-faraz/
        0  2020-12-23 05:33   dataset/ahmad-faraz/ur/
      628  2020-12-23 05:32   dataset/ahmad-faraz/ur/silsile-tod-gayaa-vo-sabhii-jaate-jaate-ahmad-faraz-ghazals
      708  2020-12-23 05:33   dataset/ahmad-faraz/ur/kyaa-aise-kam-sukhan-se-koii-guftuguu-kare-ahmad-faraz-ghazals
     1044  2020-12-23 05:33   dataset/ahmad-faraz/ur/havaa-ke-zor-se-pindaar-e-baam-o-dar-bhii-gayaa-ahmad-faraz-ghazals
     1009  2020-12-23 05:33   dataset/ahmad-faraz/ur/avval-avval-kii-dostii-hai-abhii-ahmad-faraz-ghazals
      748  2020-12-23 05:33   dataset/ahmad-faraz/ur/saaqiyaa-ek-nazar-jaam-se-pahle-pahle-ahmad-faraz-ghazals
     1221  2020-12-23 05:32   dataset/ahmad-faraz/ur/saamne-us-ke-kabhii-us-kii-sataaish-nahiin-kii-ahmad-faraz-ghazals
     1119  2020-12-23 05:33   dataset/ahmad-faraz/ur/vahshaten-badhti

In [16]:
# Full automatic extractor + pairer (run as one cell in Colab)
import os, re, sys, shutil
from pathlib import Path
import regex
from collections import defaultdict
from sklearn.model_selection import train_test_split

# 1) extract zip into dataset/raw_data if not already extracted
zip_path = Path("dataset/dataset.zip")
raw_root = Path("dataset/raw_data")
if not raw_root.exists() or not any(raw_root.iterdir()):
    print("Extracting dataset/dataset.zip -> dataset/raw_data (this may take a moment)...")
    raw_root.mkdir(parents=True, exist_ok=True)
    # unzip command (Colab has unzip)
    !unzip -o dataset/dataset.zip -d dataset/raw_data >/dev/null
else:
    print("dataset/raw_data already exists; skipping extraction.")

# 2) find all author folders under dataset/raw_data/dataset (some archives include top-level 'dataset/' directory)
# Prefer extracted top-level that contains author folders
candidates = [p for p in raw_root.iterdir() if p.is_dir()]
# if top-level has a single folder named 'dataset', descend
if len(candidates) == 1 and candidates[0].name.lower() == "dataset":
    base = candidates[0]
else:
    # some zips list files with leading 'dataset/' paths; check and pick the directory that contains author subfolders
    # try raw_root / "dataset" else raw_root
    if (raw_root/"dataset").exists():
        base = raw_root/"dataset"
    else:
        base = raw_root

print("Base extracted folder:", str(base))
authors = [p for p in base.iterdir() if p.is_dir()]
print("Found", len(authors), "author folders (showing up to 12):")
for a in authors[:12]:
    print(" -", a.name)

# helper to compute Arabic char ratio in a file (sample few lines)
def arabic_ratio_in_file(path, sample_lines=200):
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            lines = []
            for i,line in enumerate(f):
                if i>=sample_lines: break
                l=line.strip()
                if l: lines.append(l)
    except Exception as e:
        return 0.0
    if not lines: return 0.0
    arabic_chars=0
    total_chars=0
    for L in lines:
        for ch in L:
            total_chars += 1
            if regex.search(r'[\p{Arabic}]', ch):
                arabic_chars += 1
    return arabic_chars / max(1, total_chars)

# pair files
paired_src = []
paired_tgt = []
unpaired = []

for author in authors:
    # look for an 'ur' folder (case-insensitive)
    ur_folders = [d for d in author.iterdir() if d.is_dir() and 'ur' in d.name.lower()]
    if not ur_folders:
        # maybe files directly under author
        ur_candidates = [f for f in author.rglob("*.txt") if arabic_ratio_in_file(f) > 0.05]
        if ur_candidates:
            # treat them as ur files
            ur_folder_files = ur_candidates
        else:
            continue
    else:
        ur_folder = ur_folders[0]
        ur_folder_files = [f for f in ur_folder.glob("*") if f.is_file()]

    # gather sibling candidate folders/files (non


dataset/raw_data already exists; skipping extraction.
Base extracted folder: dataset/raw_data/dataset
Found 30 author folders (showing up to 12):
 - jaan-nisar-akhtar
 - nazm-tabatabai
 - kaifi-azmi
 - faiz-ahmad-faiz
 - meer-taqi-meer
 - waseem-barelvi
 - ahmad-faraz
 - habib-jalib
 - javed-akhtar
 - mohsin-naqvi
 - bahadur-shah-zafar
 - ameer-khusrau


In [17]:
# Run this single cell in Colab to show pairing/processed status and helpful details.
from pathlib import Path
import os, re, sys
import regex

print("=== Does processed/ exist and what files? ===")
if Path("processed").exists():
    print("processed/ exists. Listing files and counts:")
    !ls -la processed || true
    print("\nLine counts:")
    !wc -l processed/*.utf8 || true
    print("\nSample (first 4 lines) from train.source.utf8 and train.target.utf8:")
    !sed -n '1,4p' processed/train.source.utf8 || true
    !sed -n '1,4p' processed/train.target.utf8 || true
    sys.exit(0)

# If processed/ doesn't exist, print diagnostic info so I can pick files manually.
print("processed/ does not exist yet. Showing author folders and ur/* contents (up to 8 authors):")
base = Path("dataset/raw_data/dataset") if (Path("dataset/raw_data/dataset").exists()) else Path("dataset/raw_data")
authors = [p for p in base.iterdir() if p.is_dir()]
print("Base folder used:", base)
print("Number of author folders found:", len(authors))
print("Showing up to 12 authors and their 'ur' folder files (if present):")
for a in authors[:12]:
    print("\n--- AUTHOR:", a.name, "---")
    ur_folders = [d for d in a.iterdir() if d.is_dir() and 'ur' in d.name.lower()]
    if ur_folders:
        for uf in ur_folders:
            print(" ur folder:", uf.relative_to(base))
            files = list(uf.glob("*"))
            if not files:
                print("   (no files)")
            else:
                for f in files[:10]:
                    print("   -", f.name, " size:", f.stat().st_size)
    else:
        # list small sample of text files under author
        txts = list(a.rglob("*.txt"))[:10]
        if txts:
            for f in txts:
                print("  text:", f.relative_to(base), " size:", f.stat().st_size)
        else:
            print("  (no ur folder or txt files found)")

# Also show any likely latin/roman files inside same authors (show up to 12)
print("\nNow show candidate roman-like files (low Arabic-ratio) across these authors (top 60 files):")
def arabic_ratio(path, sample_lines=200):
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            total=0; ar=0; i=0
            for line in f:
                if i>=sample_lines: break
                line=line.strip()
                if not line: continue
                i+=1
                for ch in line:
                    total+=1
                    if regex.search(r'[\p{Arabic}]', ch):
                        ar+=1
    except:
        return 0.0
    return ar/max(1,total)

roman_candidates = []
for a in authors[:60]:
    # scan files under each author (max depth)
    for f in a.rglob("*"):
        if f.is_file() and f.suffix.lower() in ['.txt','.utf8','.csv','.md']:
            r = arabic_ratio(f)
            if r < 0.02:  # likely roman / latin
                roman_candidates.append((f.relative_to(base), r, f.stat().st_size))
# sort by size
roman_candidates_sorted = sorted(roman_candidates, key=lambda x: (x[1], -x[2]))
for f,r,s in roman_candidates_sorted[:60]:
    print("ROMAN-CAND:", f, "arabic_ratio={:.4f}".format(r), "size=", s)

print("\nIf you see a pair that looks correct, copy-paste EXACTLY two paths (one Urdu file path and one Roman file path) from the outputs above.")
print("Example of what to paste back here (copy and paste both lines):")
print("dataset/raw_data/dataset/ahmad-faraz/ur/silsile-tod-gayaa-vo-sabhii-jaate-jaate-ahmad-faraz-ghazals")
print("dataset/raw_data/dataset/ahmad-faraz/en/silsile-tod-gayaa-vo-sabhii-jaate-jaate-ahmad-faraz-ghazals")


=== Does processed/ exist and what files? ===
processed/ does not exist yet. Showing author folders and ur/* contents (up to 8 authors):
Base folder used: dataset/raw_data/dataset
Number of author folders found: 30
Showing up to 12 authors and their 'ur' folder files (if present):

--- AUTHOR: jaan-nisar-akhtar ---
 ur folder: jaan-nisar-akhtar/ur
   - aahat-sii-koii-aae-to-lagtaa-hai-ki-tum-ho-jaan-nisar-akhtar-ghazals  size: 638
   - mauj-e-gul-mauj-e-sabaa-mauj-e-sahar-lagtii-hai-jaan-nisar-akhtar-ghazals  size: 994
   - tuluu-e-subh-hai-nazren-uthaa-ke-dekh-zaraa-jaan-nisar-akhtar-ghazals  size: 838
   - ham-se-bhaagaa-na-karo-duur-gazaalon-kii-tarah-jaan-nisar-akhtar-ghazals  size: 1485
   - har-ek-ruuh-men-ik-gam-chhupaa-lage-hai-mujhe-jaan-nisar-akhtar-ghazals  size: 1284
   - laakh-aavaara-sahii-shahron-ke-futpaathon-pe-ham-jaan-nisar-akhtar-ghazals  size: 705
   - vo-log-hii-har-daur-men-mahbuub-rahe-hain-jaan-nisar-akhtar-ghazals  size: 851
   - vo-ham-se-aaj-bhii-daaman-kash

In [18]:
from pathlib import Path
import re, sys
import regex  # <-- using regex library instead of re
from sklearn.model_selection import train_test_split

# Simple character-level transliteration mapping
MAP = {
    'ا':'a','آ':'a','أ':'a','إ':'i','ب':'b','پ':'p','ت':'t','ٹ':'t','ث':'s','ج':'j','چ':'ch',
    'ح':'h','خ':'kh','د':'d','ڈ':'d','ذ':'z','ر':'r','ڑ':'r','ز':'z','ژ':'zh','س':'s','ش':'sh',
    'ص':'s','ض':'z','ط':'t','ظ':'z','ع':"'",'غ':'gh','ف':'f','ق':'q','ک':'k','گ':'g',
    'ل':'l','م':'m','ن':'n','ں':'n','ھ':'h','ہ':'h','ۀ':'h','و':'o','ؤ':'o','ی':'y','ئ':'y','ے':'e',
    'ٔ':'','ٰ':'a','ً':'a','ُ':'u','ِ':'i','َ':'a','ّ':'','ْ':''
}

# remove unwanted punctuation but keep Arabic chars
PUNCT_RE = regex.compile(r"[^\p{Arabic}\w\s\-]", flags=regex.UNICODE)

def transliterate_line(s):
    s = s.strip()
    s = regex.sub(r"\s+", " ", s)
    s = PUNCT_RE.sub(" ", s)
    out_chars = []
    for ch in s:
        if ch.isspace():
            out_chars.append(" ")
            continue
        if ch in MAP:
            out_chars.append(MAP[ch])
        elif ord(ch) < 128:  # keep ascii
            out_chars.append(ch)
        # else skip unknown non-ascii
    r = "".join(out_chars)
    return regex.sub(r"\s+", " ", r).strip()

# === collect Urdu files ===
base = Path("dataset/raw_data/dataset") if Path("dataset/raw_data/dataset").exists() else Path("dataset/raw_data")
ur_files = sorted([p for p in base.rglob("*") if p.is_file() and "ur" in p.parent.name.lower()])

print("Found", len(ur_files), "Urdu files (showing up to 10):")
for f in ur_files[:10]:
    print(" -", f.relative_to(base))

all_src_lines, all_tgt_lines = [], []
for f in ur_files:
    lines = [L.strip() for L in f.read_text(encoding="utf-8", errors="ignore").splitlines() if L.strip()]
    if not lines: continue
    translit_lines = [transliterate_line(L) for L in lines]
    paired = [(s,t) for s,t in zip(lines, translit_lines) if s and t]
    if paired:
        srcs, tgts = zip(*paired)
        all_src_lines.extend(srcs)
        all_tgt_lines.extend(tgts)

print(f"\nCollected {len(all_src_lines)} lines.")

# Save to dataset
Path("dataset").mkdir(parents=True, exist_ok=True)
Path("dataset/urdu.txt").write_text("\n".join(all_src_lines), encoding="utf-8")
Path("dataset/roman.txt").write_text("\n".join(all_tgt_lines), encoding="utf-8")

print("Wrote dataset/urdu.txt and dataset/roman.txt")

# Split into train/valid/test (50/25/25)
n = len(all_src_lines)
train_src, temp_src, train_tgt, temp_tgt = train_test_split(all_src_lines, all_tgt_lines, test_size=0.5, random_state=42)
valid_src, test_src, valid_tgt, test_tgt = train_test_split(temp_src, temp_tgt, test_size=0.5, random_state=42)

out = Path("processed"); out.mkdir(exist_ok=True)
def writef(path, L):
    with open(path, "w", encoding="utf-8") as fh:
        for line in L: fh.write(line.strip()+"\n")

writef(out/"train.source.utf8", train_src)
writef(out/"train.target.utf8", train_tgt)
writef(out/"valid.source.utf8", valid_src)
writef(out/"valid.target.utf8", valid_tgt)
writef(out/"test.source.utf8", test_src)
writef(out/"test.target.utf8", test_tgt)

print("\nSaved processed/ with splits. Counts:")
for p in out.glob("*.utf8"):
    print(p.name, "-", sum(1 for _ in open(p, encoding="utf-8")))

print("\nFirst 6 pairs:")
for i in range(min(6, n)):
    print(f"\n[{i+1}]")
    print("SRC:", all_src_lines[i])
    print("TGT:", all_tgt_lines[i])


Found 1314 Urdu files (showing up to 10):
 - ahmad-faraz/ur/aankh-se-duur-na-ho-dil-se-utar-jaaegaa-ahmad-faraz-ghazals
 - ahmad-faraz/ur/aashiqii-men-miir-jaise-khvaab-mat-dekhaa-karo-ahmad-faraz-ghazals
 - ahmad-faraz/ur/ab-aur-kyaa-kisii-se-maraasim-badhaaen-ham-ahmad-faraz-ghazals
 - ahmad-faraz/ur/ab-ke-ham-bichhde-to-shaayad-kabhii-khvaabon-men-milen-ahmad-faraz-ghazals
 - ahmad-faraz/ur/ab-ke-tajdiid-e-vafaa-kaa-nahiin-imkaan-jaanaan-ahmad-faraz-ghazals
 - ahmad-faraz/ur/ab-kyaa-sochen-kyaa-haalaat-the-kis-kaaran-ye-zahr-piyaa-hai-ahmad-faraz-ghazals
 - ahmad-faraz/ur/ab-shauq-se-ki-jaan-se-guzar-jaanaa-chaahiye-ahmad-faraz-ghazals
 - ahmad-faraz/ur/abhii-kuchh-aur-karishme-gazal-ke-dekhte-hain-ahmad-faraz-ghazals
 - ahmad-faraz/ur/agarche-zor-havaaon-ne-daal-rakkhaa-hai-ahmad-faraz-ghazals
 - ahmad-faraz/ur/aisaa-hai-ki-sab-khvaab-musalsal-nahiin-hote-ahmad-faraz-ghazals

Collected 21068 lines.
Wrote dataset/urdu.txt and dataset/roman.txt

Saved processed/ with splits. Counts:


In [19]:
!pip install -q torch sentencepiece sacrebleu tqdm


In [20]:
import os

dataset_folder = "/content/urdu_ghazals_rekhta-main"  # Correct path
output_file = "all_poems_clean.txt"

all_lines = []

for poet_folder in os.listdir(dataset_folder):
    poet_path = os.path.join(dataset_folder, poet_folder)
    if os.path.isdir(poet_path):
        for file_name in os.listdir(poet_path):
            if file_name.endswith(".txt"):
                file_path = os.path.join(poet_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                    clean_lines = [line.strip() for line in lines if line.strip()]
                    all_lines.extend(clean_lines)

all_lines = list(dict.fromkeys(all_lines))  # Remove duplicates

with open(output_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_lines))

print(f" All poems merged and cleaned into '{output_file}'")

# Preview first 10 lines
for i, line in enumerate(all_lines[:10]):
    print(line)


 All poems merged and cleaned into 'all_poems_clean.txt'
ankh se dor nh ho dl se atr jaye ga
oqt ka kya he gzrta he gzr jaye ga
atna manos nh ho khlot ghm se apny
to kbhy khod ko bhy dykhe ga to dr jaye ga
dobte dobte kshty ko achhala de don
myn nhyn koyy to sahl ph atr jaye ga
zndgy tyry 'ta he to yh jane oala
tyry bkhshsh try dhlyz ph dhr jaye ga
zbt lazm he mgr dkh he qyamt ka fraz
zalm ab ke bhy nh roye ga to mr jaye ga


In [21]:
import os

# -----------------------------
# File paths
# -----------------------------
input_file = "all_poems_clean.txt"
output_file = "all_poems_roman.txt"

# -----------------------------
# Simple Urdu → Roman mapping
# -----------------------------
# You can expand this mapping as needed
mapping = {
    'ا': 'a', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ٹ': 't', 'ث': 's',
    'ج': 'j', 'چ': 'ch', 'ح': 'h', 'خ': 'kh', 'د': 'd', 'ڈ': 'd',
    'ذ': 'z', 'ر': 'r', 'ڑ': 'r', 'ز': 'z', 'ژ': 'zh', 'س': 's',
    'ش': 'sh', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': 'a',
    'غ': 'gh', 'ف': 'f', 'ق': 'q', 'ک': 'k', 'گ': 'g', 'ل': 'l',
    'م': 'm', 'ن': 'n', 'و': 'w', 'ہ': 'h', 'ء': "'", 'ی': 'y',
    'ے': 'e', 'آ': 'aa', 'ؤ': 'o', 'ئ': 'i', ' ': ' ', '\n': '\n'
}

# -----------------------------
# Function to romanize a line
# -----------------------------
def romanize(text):
    return ''.join(mapping.get(char, char) for char in text)

# -----------------------------
# Read, romanize, and save
# -----------------------------
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

roman_lines = [romanize(line) for line in lines]

with open(output_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(roman_lines))

print(f"Roman Urdu saved to '{output_file}'")

# -----------------------------
# Preview first 10 lines
# -----------------------------
print("\n--- Preview ---")
for line in roman_lines[:10]:
    print(line)


Roman Urdu saved to 'all_poems_roman.txt'

--- Preview ---
ankh se dor nh ho dl se atr jaye ga

oqt ka kya he gzrta he gzr jaye ga

atna manos nh ho khlot ghm se apny

to kbhy khod ko bhy dykhe ga to dr jaye ga

dobte dobte kshty ko achhala de don

myn nhyn koyy to sahl ph atr jaye ga

zndgy tyry 'ta he to yh jane oala

tyry bkhshsh try dhlyz ph dhr jaye ga

zbt lazm he mgr dkh he qyamt ka fraz

zalm ab ke bhy nh roye ga to mr jaye ga



In [22]:
import os

print("Current Directory:", os.getcwd())
dataset_folder = "C:/Users/YourName/Downloads/Urdu_Poetry_Dataset"
# or for Linux/Colab
# dataset_folder = "/content/Urdu_Poetry_Dataset"



Current Directory: /content/urdu_ghazals_rekhta-main


In [23]:
from tqdm import tqdm

input_file = "all_poems_roman.txt"
tokenized_file = "all_poems_tokenized.txt"

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Simple word-level tokenization
tokenized_lines = [' '.join(line.strip().split()) for line in lines]

with open(tokenized_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(tokenized_lines))

print(f"Tokenized dataset saved to '{tokenized_file}'")
print("\n--- Preview ---")
for line in tokenized_lines[:10]:
    print(line)


Tokenized dataset saved to 'all_poems_tokenized.txt'

--- Preview ---
ankh se dor nh ho dl se atr jaye ga

oqt ka kya he gzrta he gzr jaye ga

atna manos nh ho khlot ghm se apny

to kbhy khod ko bhy dykhe ga to dr jaye ga

dobte dobte kshty ko achhala de don



In [None]:
from collections import Counter

# Create vocabulary
all_words = [word for line in tokenized_lines for word in line.split()]
vocab = Counter(all_words)
vocab = {word: i+1 for i, (word, _) in enumerate(vocab.most_common())}  # +1 to reserve 0 for padding

print(f"Vocabulary size: {len(vocab)}")

# Convert lines to sequences
sequences = [[vocab[word] for word in line.split()] for line in tokenized_lines]

# Preview first 3 sequences
print("\n--- First 3 sequences ---")
for seq in sequences[:3]:
    print(seq)


Vocabulary size: 15211

--- First 3 sequences ---
[384, 2, 270, 7, 29, 10, 2, 524, 120, 55]
[]
[306, 8, 9, 1, 2008, 1, 217, 120, 55]


In [24]:
import os

# Set dataset folder
dataset_folder = '/content/urdu_ghazals_rekhta-main'

all_lines = []

# Iterate through folders/files and merge
for poet_folder in os.listdir(dataset_folder):
    poet_path = os.path.join(dataset_folder, poet_folder)
    if os.path.isdir(poet_path):
        for file in os.listdir(poet_path):
            file_path = os.path.join(poet_path, file)
            if file_path.endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    lines = [line.strip() for line in f if line.strip()]
                    all_lines.extend(lines)

# Save merged and cleaned Urdu poems
with open('all_poems_clean.txt', 'w', encoding='utf-8') as f:
    for line in all_lines:
        f.write(line + '\n')

print(f" All poems merged into 'all_poems_clean.txt'. Total lines: {len(all_lines)}")
# Simple Urdu → Roman transliteration
def urdu_to_roman(urdu_line):
    mapping = {
        'ا':'a', 'ب':'b', 'پ':'p', 'ت':'t', 'ٹ':'ṭ', 'ث':'s',
        'ج':'j', 'چ':'ch', 'ح':'h', 'خ':'kh', 'د':'d', 'ڈ':'ḍ',
        'ر':'r', 'ز':'z', 'ژ':'zh', 'س':'s', 'ش':'sh', 'ص':'s',
        'ض':'z', 'ط':'t', 'ظ':'z', 'ع':'a', 'غ':'gh', 'ف':'f',
        'ق':'q', 'ک':'k', 'گ':'g', 'ل':'l', 'م':'m', 'ن':'n',
        'و':'w', 'ہ':'h', 'ء':'', 'ی':'y', 'ے':'e',
        ' ': ' ', '\n':'\n'
    }
    return ''.join([mapping.get(ch, ch) for ch in urdu_line])

# Read merged Urdu file
with open('all_poems_clean.txt', 'r', encoding='utf-8') as f:
    urdu_lines = [line.strip() for line in f if line.strip()]

# Generate Roman Urdu
roman_lines = [urdu_to_roman(line) for line in urdu_lines]

# Save to file
with open('all_poems_roman.txt', 'w', encoding='utf-8') as f:
    for line in roman_lines:
        f.write(line + '\n')

print(f" Roman Urdu file created: all_poems_roman.txt | Total lines: {len(roman_lines)}")


 All poems merged into 'all_poems_clean.txt'. Total lines: 42136
 Roman Urdu file created: all_poems_roman.txt | Total lines: 42136


In [26]:
!pip install python-Levenshtein


Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected package

In [27]:
# --------------------------
# Urdu → Roman Urdu Seq2Seq (Fast Version)
# --------------------------
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# --------------------------
# Load a subset of data (~2000 lines for speed)
# --------------------------
with open("all_poems_clean.txt","r",encoding="utf-8") as f: src_lines=[l.strip() for l in f if l.strip()]
with open("all_poems_roman.txt","r",encoding="utf-8") as f: tgt_lines=[l.strip() for l in f if l.strip()]
src_lines, tgt_lines = src_lines[:2000], tgt_lines[:2000]

# --------------------------
# Vocabulary
# --------------------------
def build_vocab(lines):
    chars = sorted(set(c for l in lines for c in l))
    vocab = {c:i+3 for i,c in enumerate(chars)}  # 0=PAD,1=SOS,2=EOS
    vocab['<PAD>']=0; vocab['<SOS>']=1; vocab['<EOS>']=2
    return vocab

src_vocab, tgt_vocab = build_vocab(src_lines), build_vocab(tgt_lines)
inv_tgt_vocab = {i:c for c,i in tgt_vocab.items()}

def encode(line, vocab):
    return [vocab['<SOS>']] + [vocab[c] for c in line] + [vocab['<EOS>']]

# --------------------------
# Dataset & DataLoader
# --------------------------
class SeqDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = [torch.tensor(encode(l, src_vocab)) for l in src]
        self.tgt = [torch.tensor(encode(l, tgt_vocab)) for l in tgt]
    def __len__(self): return len(self.src)
    def __getitem__(self, i): return self.src[i], self.tgt[i]

def collate(batch):
    src, tgt = zip(*batch)
    return pad_sequence(src, batch_first=True, padding_value=0), pad_sequence(tgt, batch_first=True, padding_value=0)

loader = DataLoader(SeqDataset(src_lines, tgt_lines), batch_size=32, shuffle=True, collate_fn=collate)

# --------------------------
# Encoder & Decoder
# --------------------------
class Encoder(nn.Module):
    def __init__(self, in_dim, emb_dim, hid_dim, layers, dropout):
        super().__init__()
        self.emb = nn.Embedding(in_dim, emb_dim, padding_idx=0)
        self.rnn = nn.LSTM(emb_dim, hid_dim, layers, bidirectional=True, batch_first=True)
        self.fc_h = nn.Linear(hid_dim*2, hid_dim)
        self.fc_c = nn.Linear(hid_dim*2, hid_dim)
    def forward(self, src):
        _, (h, c) = self.rnn(self.emb(src))
        h_cat = torch.cat((h[-2], h[-1]), dim=1)
        c_cat = torch.cat((c[-2], c[-1]), dim=1)
        return self.fc_h(h_cat).unsqueeze(0), self.fc_c(c_cat).unsqueeze(0)

class Decoder(nn.Module):
    def __init__(self, out_dim, emb_dim, hid_dim, layers):
        super().__init__()
        self.emb = nn.Embedding(out_dim, emb_dim, padding_idx=0)
        self.rnn = nn.LSTM(emb_dim, hid_dim, layers, batch_first=True)
        self.fc = nn.Linear(hid_dim, out_dim)
    def forward(self, input, h, c):
        input = input.unsqueeze(1)
        out, (h, c) = self.rnn(self.emb(input), (h, c))
        return self.fc(out.squeeze(1)), h, c

class Seq2Seq(nn.Module):
    def __init__(self, enc, dec, device):
        super().__init__()
        self.encoder = enc
        self.decoder = dec
        self.device = device
    def forward(self, src, tgt, tf_ratio=0.7):
        B, T, V = src.size(0), tgt.size(1), self.decoder.fc.out_features
        out = torch.zeros(B, T, V).to(self.device)
        h, c = self.encoder(src)
        input_tok = tgt[:,0]
        for t in range(1, T):
            pred, h, c = self.decoder(input_tok, h, c)
            out[:,t,:] = pred
            teacher_force = torch.rand(1).item() < tf_ratio
            input_tok = tgt[:,t] if teacher_force else pred.argmax(1)
        return out

# --------------------------
# Hyperparameters & Model
# --------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ENC_EMB_DIM, DEC_EMB_DIM, HID_DIM = 128, 128, 256
ENC_LAYERS, DEC_LAYERS = 1, 1

model = Seq2Seq(
    Encoder(len(src_vocab), ENC_EMB_DIM, HID_DIM, ENC_LAYERS, 0.2),
    Decoder(len(tgt_vocab), DEC_EMB_DIM, HID_DIM, DEC_LAYERS),
    device
).to(device)

opt = torch.optim.Adam(model.parameters(), lr=1e-3)
crit = nn.CrossEntropyLoss(ignore_index=0)

# --------------------------
# Training (10 epochs)
# --------------------------
for epoch in range(10):
    model.train()
    loss_epoch = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        opt.zero_grad()
        out = model(src, tgt)
        loss = crit(out[:,1:,:].reshape(-1,out.shape[-1]), tgt[:,1:].reshape(-1))
        loss.backward()
        opt.step()
        loss_epoch += loss.item()
    print(f"Epoch {epoch+1}/10 | Loss: {loss_epoch/len(loader):.4f}")

# --------------------------
# Quick evaluation (first 5 sentences)
# --------------------------
def translate(src_tensor, max_len=50):
    h, c = model.encoder(src_tensor.unsqueeze(0).to(device))
    input_tok = torch.tensor([tgt_vocab['<SOS>']], device=device)
    tokens = []
    for _ in range(max_len):
        pred, h, c = model.decoder(input_tok, h, c)
        top1 = pred.argmax(1)
        if top1.item() == tgt_vocab['<EOS>']: break
        tokens.append(top1.item())
        input_tok = top1
    return ''.join([inv_tgt_vocab[i] for i in tokens])

smooth = SmoothingFunction().method4
for i in range(5):
    src = torch.tensor(encode(src_lines[i], src_vocab))
    tgt = tgt_lines[i]
    pred = translate(src)
    print(f"\nSRC: {src_lines[i]}\nTGT: {tgt}\nPRED: {pred}")
    print(f"BLEU: {sentence_bleu([list(tgt)], list(pred), smoothing_function=smooth):.4f}")

print("\n Training done and predictions generated!")


Epoch 1/10 | Loss: 2.5793
Epoch 2/10 | Loss: 2.2564
Epoch 3/10 | Loss: 2.1246
Epoch 4/10 | Loss: 2.0032
Epoch 5/10 | Loss: 1.9117
Epoch 6/10 | Loss: 1.8241
Epoch 7/10 | Loss: 1.7540
Epoch 8/10 | Loss: 1.6976
Epoch 9/10 | Loss: 1.6137
Epoch 10/10 | Loss: 1.5444

SRC: ankh se dor nh ho dl se atr jaye ga
TGT: ankh se dor nh ho dl se atr jaye ga
PRED: ankh sr do se hon to andr apne ala he jate
BLEU: 0.3187

SRC: oqt ka kya he gzrta he gzr jaye ga
TGT: oqt ka kya he gzrta he gzr jaye ga
PRED: oqt ka kya he grzar ge gr gya he araz
BLEU: 0.5702

SRC: atna manos nh ho khlot ghm se apny
TGT: atna manos nh ho khlot ghm se apny
PRED: atna mnan o shol he mhlo kh nh rha he myn
BLEU: 0.3162

SRC: to kbhy khod ko bhy dykhe ga to dr jaye ga
TGT: to kbhy khod ko bhy dykhe ga to dr jaye ga
PRED: to kh bhy dod ko kh to bhy apra  jane gye
BLEU: 0.5171

SRC: dobte dobte kshty ko achhala de don
TGT: dobte dobte kshty ko achhala de don
PRED: dotee doton bhy dl khad ke dath he ashan
BLEU: 0.1627

 Training do

In [None]:
# Urdu → Roman Urdu Seq2Seq

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

# -----------------------------
# Load dataset
# -----------------------------
SRC_FILE = "all_poems_clean.txt"
TGT_FILE = "all_poems_roman.txt"

with open(SRC_FILE, 'r', encoding='utf-8') as f:
    src_lines = [line.strip() for line in f if line.strip()]
with open(TGT_FILE, 'r', encoding='utf-8') as f:
    tgt_lines = [line.strip() for line in f if line.strip()]

# -----------------------------
# Use first 5000 sentences
# -----------------------------
src_lines = src_lines[:5000]
tgt_lines = tgt_lines[:5000]
print(f"Using {len(src_lines)} sentence pairs for optimized run.")

# -----------------------------
# Build vocab
# -----------------------------
def build_vocab(lines):
    chars = [c for line in lines for c in line]
    vocab = {c: i+1 for i, c in enumerate(set(chars))}
    vocab['<PAD>'] = 0
    vocab['<SOS>'] = len(vocab)
    vocab['<EOS>'] = len(vocab)
    return vocab

src_vocab = build_vocab(src_lines)
tgt_vocab = build_vocab(tgt_lines)
inv_tgt_vocab = {i: c for c, i in tgt_vocab.items()}

def encode(line, vocab):
    return [vocab[c] for c in line]

src_encoded = [encode(l, src_vocab) for l in src_lines]
tgt_encoded = [encode(l, tgt_vocab) for l in tgt_lines]

# -----------------------------
# Dataset & DataLoader
# -----------------------------
class Seq2SeqDataset(Dataset):
    def __init__(self, src, tgt):
        self.src, self.tgt = src, tgt
    def __len__(self):
        return len(self.src)
    def __getitem__(self, idx):
        return torch.tensor(self.src[idx]), torch.tensor(self.tgt[idx])

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    return pad_sequence(src_batch, batch_first=True, padding_value=0), \
           pad_sequence(tgt_batch, batch_first=True, padding_value=0)

dataset = Seq2SeqDataset(src_encoded, tgt_encoded)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

# -----------------------------
# Encoder & Decoder
# -----------------------------
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dec_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, bidirectional=True, batch_first=True)
        self.fc_h = nn.Linear(hid_dim*2, hid_dim*2)
        self.fc_c = nn.Linear(hid_dim*2, hid_dim*2)
        self.dec_layers = dec_layers
    def forward(self, src):
        e = self.embedding(src)
        out, (h, c) = self.rnn(e)
        h_cat = torch.cat((h[-2], h[-1]), 1)
        c_cat = torch.cat((c[-2], c[-1]), 1)
        h_dec = self.fc_h(h_cat).unsqueeze(0).repeat(self.dec_layers,1,1)
        c_dec = self.fc_c(c_cat).unsqueeze(0).repeat(self.dec_layers,1,1)
        return out, h_dec, c_dec

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.rnn = nn.LSTM(emb_dim, hid_dim*2, n_layers, batch_first=True)
        self.fc_out = nn.Linear(hid_dim*2, output_dim)
    def forward(self, input, h, c):
        input = input.unsqueeze(1)
        e = self.embedding(input)
        out, (h, c) = self.rnn(e, (h, c))
        return self.fc_out(out.squeeze(1)), h, c

class Seq2Seq(nn.Module):
    def __init__(self, enc, dec, device):
        super().__init__()
        self.enc = enc
        self.dec = dec
        self.device = device
    def forward(self, src, tgt, teacher_forcing=0.7):
        B, t_len = tgt.size(0), tgt.size(1)
        out_dim = self.dec.fc_out.out_features
        outputs = torch.zeros(B, t_len, out_dim).to(self.device)
        _, h, c = self.enc(src)
        inp = tgt[:,0]
        for t in range(1, t_len):
            out, h, c = self.dec(inp, h, c)
            outputs[:,t,:] = out
            top1 = out.argmax(1)
            inp = tgt[:,t] if torch.rand(1).item() < teacher_forcing else top1
        return outputs

# -----------------------------
# Hyperparameters
# -----------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
enc = Encoder(len(src_vocab), 128, 256, 1, 1)
dec = Decoder(len(tgt_vocab), 128, 256, 1)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# -----------------------------
# Training loop
# -----------------------------
EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        o_dim = output.shape[-1]
        loss = criterion(output[:,1:,:].reshape(-1,o_dim), tgt[:,1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss/len(dataloader):.4f}")

# -----------------------------
# Prediction function
# -----------------------------
def translate(src_tensor, max_len=50):
    src_tensor = src_tensor.unsqueeze(0).to(device)
    with torch.no_grad():
        _, h, c = model.enc(src_tensor)
    inp = torch.tensor([tgt_vocab['<SOS>']], device=device)
    out_tokens = []
    for _ in range(max_len):
        with torch.no_grad():
            out, h, c = model.dec(inp, h, c)
            top1 = out.argmax(1)
        if top1.item() == tgt_vocab['<EOS>']:
            break
        out_tokens.append(top1.item())
        inp = top1
    return ''.join([inv_tgt_vocab[i] for i in out_tokens])

# -----------------------------
# Quick evaluation (first 5 examples)
# -----------------------------
smooth = SmoothingFunction().method4
for i in range(5):
    src = torch.tensor(src_encoded[i])
    tgt = tgt_lines[i]
    pred = translate(src)
    print(f"\nSRC: {src_lines[i]}\nTGT: {tgt}\nPRED: {pred}")
    print(f"BLEU: {sentence_bleu([list(tgt)], list(pred), smoothing_function=smooth):.4f}")


Using 5000 sentence pairs for optimized run.
Epoch 1/5 | Loss: 2.4350
Epoch 2/5 | Loss: 2.0851
Epoch 3/5 | Loss: 1.8396
Epoch 4/5 | Loss: 1.6588
Epoch 5/5 | Loss: 1.5255

SRC: ankh se dor nh ho dl se atr jaye ga
TGT: ankh se dor nh ho dl se atr jaye ga
PRED: akh se dor nh dor se ata ae ale aye galan hyn hyn 
BLEU: 0.4696

SRC: oqt ka kya he gzrta he gzr jaye ga
TGT: oqt ka kya he gzrta he gzr jaye ga
PRED: ot ka kh aye gzr ate agr ae azare gya he dykhte hy
BLEU: 0.3718

SRC: atna manos nh ho khlot ghm se apny
TGT: atna manos nh ho khlot ghm se apny
PRED: aan mnshon ko chha to ghl se myn aoan se dya he nh
BLEU: 0.1387

SRC: to kbhy khod ko bhy dykhe ga to dr jaye ga
TGT: to kbhy khod ko bhy dykhe ga to dr jaye ga
PRED: o kbhy ko bhy dykh dor be aga da dya he jane gya h
BLEU: 0.4775

SRC: dobte dobte kshty ko achhala de don
TGT: dobte dobte kshty ko achhala de don
PRED: obte dode khth shal ko dl ka to aya dya he dyan he
BLEU: 0.3005


In [29]:
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import random

# -----------------------------
# Load first 5000 sentences
# -----------------------------
SRC_FILE, TGT_FILE = "all_poems_clean.txt", "all_poems_roman.txt"
with open(SRC_FILE,'r',encoding='utf-8') as f: src_lines = [l.strip() for l in f if l.strip()]
with open(TGT_FILE,'r',encoding='utf-8') as f: tgt_lines = [l.strip() for l in f if l.strip()]

src_lines, tgt_lines = src_lines[:5000], tgt_lines[:5000]
print(f"Using {len(src_lines)} sentences")

# -----------------------------
# Build vocab
# -----------------------------
def build_vocab(lines):
    chars = [c for line in lines for c in line]
    vocab = {c:i+1 for i,c in enumerate(set(chars))}
    vocab['<PAD>']=0; vocab['<SOS>']=len(vocab); vocab['<EOS>']=len(vocab)
    return vocab

src_vocab, tgt_vocab = build_vocab(src_lines), build_vocab(tgt_lines)
inv_tgt_vocab = {i:c for c,i in tgt_vocab.items()}

def encode(line, vocab): return [vocab[c] for c in line]

src_encoded = [encode(l, src_vocab) for l in src_lines]
tgt_encoded = [encode(l, tgt_vocab) for l in tgt_lines]

# -----------------------------
# Dataset & DataLoader
# -----------------------------
class Seq2SeqDataset(Dataset):
    def __init__(self, src, tgt): self.src,self.tgt=src,tgt
    def __len__(self): return len(self.src)
    def __getitem__(self,idx): return torch.tensor(self.src[idx]), torch.tensor(self.tgt[idx])

def collate_fn(batch):
    src_batch,tgt_batch=zip(*batch)
    return pad_sequence(src_batch,batch_first=True,padding_value=0), pad_sequence(tgt_batch,batch_first=True,padding_value=0)

dataloader = DataLoader(Seq2SeqDataset(src_encoded, tgt_encoded), batch_size=64, shuffle=True, collate_fn=collate_fn)

# -----------------------------
# Encoder-Decoder
# -----------------------------
class Encoder(nn.Module):
    def __init__(self,input_dim,emb_dim,hid_dim,n_layers,dec_layers):
        super().__init__()
        self.embedding=nn.Embedding(input_dim,emb_dim,padding_idx=0)
        self.rnn=nn.LSTM(emb_dim,hid_dim,n_layers,bidirectional=True,batch_first=True)
        self.fc_h=nn.Linear(hid_dim*2,hid_dim*2)
        self.fc_c=nn.Linear(hid_dim*2,hid_dim*2)
        self.dec_layers=dec_layers
    def forward(self,src):
        e=self.embedding(src)
        out,(h,c)=self.rnn(e)
        h_cat=torch.cat((h[-2],h[-1]),1); c_cat=torch.cat((c[-2],c[-1]),1)
        h_dec=self.fc_h(h_cat).unsqueeze(0).repeat(self.dec_layers,1,1)
        c_dec=self.fc_c(c_cat).unsqueeze(0).repeat(self.dec_layers,1,1)
        return out,h_dec,c_dec

class Decoder(nn.Module):
    def __init__(self,output_dim,emb_dim,hid_dim,n_layers):
        super().__init__()
        self.embedding=nn.Embedding(output_dim,emb_dim,padding_idx=0)
        self.rnn=nn.LSTM(emb_dim,hid_dim*2,n_layers,batch_first=True)
        self.fc_out=nn.Linear(hid_dim*2,output_dim)
    def forward(self,input,h,c):
        input=input.unsqueeze(1)
        e=self.embedding(input)
        out,(h,c)=self.rnn(e,(h,c))
        return self.fc_out(out.squeeze(1)),h,c

class Seq2Seq(nn.Module):
    def __init__(self,enc,dec,device): super().__init__(); self.enc=enc; self.dec=dec; self.device=device
    def forward(self,src,tgt,teacher_forcing=0.9):
        B,t_len=tgt.size(0),tgt.size(1)
        outputs=torch.zeros(B,t_len,self.dec.fc_out.out_features).to(self.device)
        _,h,c=self.enc(src)
        inp=tgt[:,0]
        for t in range(1,t_len):
            out,h,c=self.dec(inp,h,c)
            outputs[:,t,:]=out
            top1=out.argmax(1)
            inp=tgt[:,t] if random.random()<teacher_forcing else top1
        return outputs

# -----------------------------
# Hyperparameters
# -----------------------------
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
enc=Encoder(len(src_vocab),128,256,1,1)
dec=Decoder(len(tgt_vocab),128,256,1)
model=Seq2Seq(enc,dec,device).to(device)
optimizer=torch.optim.Adam(model.parameters(),lr=1e-3)
criterion=nn.CrossEntropyLoss(ignore_index=0)

# -----------------------------
# Train (3-5 epochs for speed)
# -----------------------------
for epoch in range(3):
    model.train(); total_loss=0
    for src,tgt in dataloader:
        src,tgt=src.to(device),tgt.to(device)
        optimizer.zero_grad()
        output=model(src,tgt)
        loss=criterion(output[:,1:,:].reshape(-1,output.shape[-1]),tgt[:,1:].reshape(-1))
        loss.backward(); optimizer.step()
        total_loss+=loss.item()
    print(f"Epoch {epoch+1}/3 | Loss: {total_loss/len(dataloader):.4f}")

# -----------------------------
# Prediction function
# -----------------------------
def translate(src_tensor,max_len=50):
    src_tensor=src_tensor.unsqueeze(0).to(device)
    with torch.no_grad(): _,h,c=model.enc(src_tensor)
    inp=torch.tensor([tgt_vocab['<SOS>']],device=device); out_tokens=[]
    for _ in range(max_len):
        with torch.no_grad():
            out,h,c=model.dec(inp,h,c)
            top1=out.argmax(1)
        if top1.item()==tgt_vocab['<EOS>']: break
        out_tokens.append(top1.item()); inp=top1
    return ''.join([inv_tgt_vocab[i] for i in out_tokens])

# -----------------------------
# Quick evaluation (5 examples)
# -----------------------------
smooth=SmoothingFunction().method4
for i in range(5):
    src=torch.tensor(src_encoded[i]); tgt=tgt_lines[i]
    pred=translate(src)
    print(f"\nSRC: {src_lines[i]}\nTGT: {tgt}\nPRED: {pred}")
    print(f"BLEU: {sentence_bleu([list(tgt)], list(pred),smoothing_function=smooth):.4f}")


Using 5000 sentences
Epoch 1/3 | Loss: 2.2875
Epoch 2/3 | Loss: 1.8579
Epoch 3/3 | Loss: 1.5696

SRC: ankh se dor nh ho dl se atr jaye ga
TGT: ankh se dor nh ho dl se atr jaye ga
PRED: nah ke do se ho an se dl ke andar he nhyn hoa he a
BLEU: 0.3035

SRC: oqt ka kya he gzrta he gzr jaye ga
TGT: oqt ka kya he gzrta he gzr jaye ga
PRED: ot ka aye he ate aye gr zar kh dykhte hyn as ke dy
BLEU: 0.2369

SRC: atna manos nh ho khlot ghm se apny
TGT: atna manos nh ho khlot ghm se apny
PRED: anat mon shoa he kh lo kh nhyn he mlan ke dykha he
BLEU: 0.1290

SRC: to kbhy khod ko bhy dykhe ga to dr jaye ga
TGT: to kbhy khod ko bhy dykhe ga to dr jaye ga
PRED: o khy khob dykh bhy do kr bhy an myn aor dl ke dyk
BLEU: 0.3532

SRC: dobte dobte kshty ko achhala de don
TGT: dobte dobte kshty ko achhala de don
PRED: ote dlbte ho khad ko bhy dl ke hoa he aya he aya n
BLEU: 0.2005


In [30]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# -----------------------------
# Translation function
# -----------------------------
def translate(src_tensor, model, tgt_vocab, inv_tgt_vocab, device, max_len=50):
    src_tensor = src_tensor.unsqueeze(0).to(device)
    with torch.no_grad():
        _, h, c = model.enc(src_tensor)
    inp = torch.tensor([tgt_vocab['<SOS>']], device=device)
    out_tokens = []
    for _ in range(max_len):
        with torch.no_grad():
            out, h, c = model.dec(inp, h, c)
            top1 = out.argmax(1)
        if top1.item() == tgt_vocab['<EOS>']:
            break
        out_tokens.append(top1.item())
        inp = top1
    return ''.join([inv_tgt_vocab[i] for i in out_tokens])

# -----------------------------
# Quick evaluation on first 10 sentences
# -----------------------------
smooth = SmoothingFunction().method4
for i in range(10):
    src = torch.tensor(src_encoded[i])
    tgt = tgt_lines[i]
    pred = translate(src, model, tgt_vocab, inv_tgt_vocab, device)
    bleu = sentence_bleu([list(tgt)], list(pred), smoothing_function=smooth)
    print(f"\nSRC: {src_lines[i]}")
    print(f"TGT: {tgt}")
    print(f"PRED: {pred}")
    print(f"BLEU: {bleu:.4f}")



SRC: ankh se dor nh ho dl se atr jaye ga
TGT: ankh se dor nh ho dl se atr jaye ga
PRED: nah ke do se ho an se dl ke andar he nhyn hoa he a
BLEU: 0.3035

SRC: oqt ka kya he gzrta he gzr jaye ga
TGT: oqt ka kya he gzrta he gzr jaye ga
PRED: ot ka aye he ate aye gr zar kh dykhte hyn as ke dy
BLEU: 0.2369

SRC: atna manos nh ho khlot ghm se apny
TGT: atna manos nh ho khlot ghm se apny
PRED: anat mon shoa he kh lo kh nhyn he mlan ke dykha he
BLEU: 0.1290

SRC: to kbhy khod ko bhy dykhe ga to dr jaye ga
TGT: to kbhy khod ko bhy dykhe ga to dr jaye ga
PRED: o khy khob dykh bhy do kr bhy an myn aor dl ke dyk
BLEU: 0.3532

SRC: dobte dobte kshty ko achhala de don
TGT: dobte dobte kshty ko achhala de don
PRED: ote dlbte ho khad ko bhy dl ke hoa he aya he aya n
BLEU: 0.2005

SRC: myn nhyn koyy to sahl ph atr jaye ga
TGT: myn nhyn koyy to sahl ph atr jaye ga
PRED: yn hyn ko yh aya to myn ashar kya he aya he nh hoa
BLEU: 0.2665

SRC: zndgy tyry 'ta he to yh jane oala
TGT: zndgy tyry 'ta he to yh j