<a href="https://colab.research.google.com/github/deviljerry/Urdu-Conversational-Chatbot-Transformer-with-Multi-Head-Attention/blob/main/Urdu_ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===========================================
# 🚀 Urdu Conversational Chatbot using Transformer (from scratch)
# ===========================================

# Install dependencies
!pip install torch torchvision torchaudio nltk datasets sentencepiece sacrebleu rouge-score gradio urduhack kaggle

import os
import re
import math
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm
import nltk
from datasets import load_dataset
import gradio as gr
from rouge_score import rouge_scorer
import sacrebleu
import urduhack


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m988.9 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting urduhack
  Downloading urduhack-1.1.1-py3-none-any.whl.metadata (7.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting tf2crf (from urduhack)
  Downloading tf2crf-0.1.33-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting tensorflow-datasets~=3.1 (from urduhack)
  Downloading tensorflow_datasets-3.2.1-py3-none-any.whl.metadata (4.8 kB)
Collecting click (from nltk)
  Downloading click-7.1.2-py2.py3-none-any.whl.metadata (2.9 kB)
INFO: pip is looking at multiple versions of typ

In [3]:
# ===========================================
# 🔹 Step 1: Load Dataset from Kaggle (Fixed)
# ===========================================

# Set up Kaggle API key (upload kaggle.json to Colab first)
!mkdir -p ~/.kaggle
!echo '{"username":"YOUR_KAGGLE_USERNAME","key":"YOUR_KAGGLE_KEY"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

# Download and unzip dataset
!kaggle datasets download -d muhammadahmedansari/urdu-dataset-20000 -p ./data
!unzip -q ./data/urdu-dataset-20000.zip -d ./data

# -------------------------------------------
# 🔹 Load the main TSV dataset
# -------------------------------------------
import pandas as pd

# Try reading TSV file safely
file_path = "./data/final_main_dataset.tsv"
data = pd.read_csv(file_path, sep="\t", quoting=3, on_bad_lines='skip')

# Inspect available columns
print("Columns:", data.columns.tolist())
print("Sample rows:")
print(data.head(3))

# -------------------------------------------
# 🔹 Identify text columns
# -------------------------------------------
# Most likely columns: 'input', 'response', 'question', 'answer', etc.
# You can adjust column names below once you see printed names.

if "input" in data.columns and "response" in data.columns:
    data = data[["input", "response"]]
elif "question" in data.columns and "answer" in data.columns:
    data = data[["question", "answer"]]
else:
    # Fallback: use first two columns
    data = data.iloc[:, :2]

data.columns = ["input_text", "target_text"]

# Drop NaNs and sample subset to fit in Colab memory
data = data.dropna().sample(20000, random_state=42)

print("✅ Loaded dataset with shape:", data.shape)
print(data.sample(3))


Dataset URL: https://www.kaggle.com/datasets/muhammadahmedansari/urdu-dataset-20000
License(s): other
urdu-dataset-20000.zip: Skipping, found more recently modified local copy (use --force to force download)
replace ./data/char_to_num_vocab.pkl? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ./data/final_main_dataset.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ./data/limited_wav_files/limited_wav_files/common_voice_ur_26562732.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ./data/limited_wav_files/limited_wav_files/common_voice_ur_26562733.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ./data/limited_wav_files/limited_wav_files/common_voice_ur_26562734.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
n
Columns: ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
Sample rows:
                                           client_id  \
0  e53f84d151d6cc6d45a57decde08a99efe47d7751a4ca6...   
1  e53f84d151d6c

In [4]:
# ===========================================
# 🔹 Step 1: Load and Prepare Urdu Text Dataset
# ===========================================
import pandas as pd

file_path = "./data/final_main_dataset.tsv"
data = pd.read_csv(file_path, sep="\t", quoting=3, on_bad_lines='skip')

print("Columns:", data.columns.tolist())
print("Sample sentences:")
print(data["sentence"].head(5))

# Use only the text column
sentences = data["sentence"].dropna().astype(str).tolist()

# Simulate conversational pairs:
# e.g. sentence[i] -> sentence[i+1]
input_texts = sentences[:-1]
target_texts = sentences[1:]

# Build DataFrame
data = pd.DataFrame({
    "input_text": input_texts,
    "target_text": target_texts
})

# Take random subset to avoid memory overload
data = data.sample(20000, random_state=42).reset_index(drop=True)

print("✅ Dataset prepared for chatbot training")
print(data.head(5))


Columns: ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
Sample sentences:
0                   کبھی کبھار ہی خیالی پلاو بناتا ہوں
1                    اور پھر ممکن ہے کہ پاکستان بھی ہو
2                        یہ فیصلہ بھی گزشتہ دو سال میں
3                       ان کے بلے بازوں کے سامنے ہو گا
4    آبی جانور میں بطخ بگلا اور دُوسْرا آبی پرندہ ش...
Name: sentence, dtype: object
✅ Dataset prepared for chatbot training
                                  input_text  \
0                                یہ سب تسلیم   
1      اور وہ تم پر اپنے نگران مقرر رکھتا ہے   
2      سیاسی حقوق کے مطالبات کو جائز قرار دی   
3                     لوگ چل کرگئےاورمرکرآئے   
4  حضرت علی بن حسین رضی اللہ عنہ سے روایت ہے   

                                      target_text  
0                  انھیں ہیرو کا درجہ بھی دیا گیا  
1            انسان کے اندر شعور کی عدالت قائم ہے۔  
2  ہر فریق دوسرے کے لیے سِفارشات کی تجویز دیتا ہے  
3          

In [9]:
# ===========================================
# 🔹 Step 2: Preprocessing (Final Fixed Version)
# ===========================================
import re
import nltk

# Download both required tokenizer models
nltk.download('punkt')
nltk.download('punkt_tab')

# ✅ Urdu normalization function (custom implementation)
def normalize_urdu(text):
    text = str(text)
    text = re.sub(r'[\u064B-\u065F\u0670\u06D6-\u06ED]', '', text)  # remove diacritics
    text = re.sub(r'[اآإأٱ]', 'ا', text)  # normalize Alef
    text = re.sub(r'[يىئ]', 'ی', text)    # normalize Yeh
    text = re.sub(r'[ھہۀھٰ]', 'ہ', text)  # normalize Heh
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # keep only Urdu chars
    text = re.sub(r'\s+', ' ', text).strip()  # clean spaces
    return text

# Apply normalization
data["input_text"] = data["input_text"].astype(str).apply(normalize_urdu)
data["target_text"] = data["target_text"].astype(str).apply(normalize_urdu)

# ✅ Tokenization
from nltk.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text)

data["input_tokens"] = data["input_text"].apply(tokenize)
data["target_tokens"] = data["target_text"].apply(tokenize)

# ✅ Build vocabulary
from collections import Counter

all_tokens = [token for tokens in data["input_tokens"] + data["target_tokens"] for token in tokens]
vocab = ["<pad>", "<sos>", "<eos>", "<unk>"] + sorted(set(all_tokens))
vocab2idx = {w: i for i, w in enumerate(vocab)}
idx2vocab = {i: w for w, i in vocab2idx.items()}

print("✅ Preprocessing complete!")
print("Vocabulary size:", len(vocab))
print("Sample tokens:", data['input_tokens'].head(3).tolist())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


✅ Preprocessing complete!
Vocabulary size: 10897
Sample tokens: [['یہ', 'سب', 'تسلیم'], ['اور', 'وہ', 'تم', 'پر', 'اپنے', 'نگران', 'مقرر', 'رکہتا', 'ہے'], ['سیاسی', 'حقوق', 'کے', 'مطالبات', 'کو', 'جایز', 'قرار', 'دی']]


In [11]:
# ===========================================
# 🔹 Step 3: Dataset & Dataloader
# ===========================================

MAX_LEN = 40

def encode(tokens):
    tokens = ["<sos>"] + tokens[:MAX_LEN-2] + ["<eos>"]
    ids = [vocab2idx.get(t, vocab2idx["<unk>"]) for t in tokens]
    return ids + [vocab2idx["<pad>"]] * (MAX_LEN - len(ids))

class UrduChatDataset(Dataset):
    def __init__(self, df):
        self.inputs = df["input_tokens"].tolist()
        self.targets = df["target_tokens"].tolist()
    def __len__(self):
        return len(self.inputs)
    def __getitem__(self, idx):
        return torch.tensor(encode(self.inputs[idx])), torch.tensor(encode(self.targets[idx]))

dataset = UrduChatDataset(data)
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_ds, val_ds, test_ds = random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32)

In [15]:
# ===========================================
# 🔹 Step 4: Transformer Model (Fixed for Batch-First + Mask Shape)
# ===========================================
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape: [1, max_len, d_model]
        self.register_buffer("pe", pe)

    def forward(self, x):
        # x: [batch, seq_len, d_model]
        return x + self.pe[:, :x.size(1)]

class TransformerChatbot(nn.Module):
    def __init__(self, vocab_size, d_model=256, n_heads=2, num_layers=2, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        # ✅ batch_first=True is critical here
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, n_heads, dim_feedforward=512, dropout=dropout, batch_first=True
        )
        decoder_layer = nn.TransformerDecoderLayer(
            d_model, n_heads, dim_feedforward=512, dropout=dropout, batch_first=True
        )

        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def _generate_square_subsequent_mask(self, sz):
        """Generate autoregressive mask for decoder to prevent peeking ahead."""
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask  # shape: [sz, sz]

    def forward(self, src, trg):
        # src, trg: [batch, seq_len]
        src_emb = self.embedding(src) * math.sqrt(self.d_model)
        trg_emb = self.embedding(trg) * math.sqrt(self.d_model)

        src_emb = self.pos_encoder(src_emb)
        trg_emb = self.pos_encoder(trg_emb)

        # ✅ Create target mask dynamically per batch
        tgt_seq_len = trg_emb.size(1)
        tgt_mask = self._generate_square_subsequent_mask(tgt_seq_len).to(trg.device)

        # Encoder + Decoder
        memory = self.encoder(src_emb)
        output = self.decoder(trg_emb, memory, tgt_mask=tgt_mask)

        # Output projection
        return self.fc_out(output)


In [16]:
# ===========================================
# 🔹 Step 5: Training
# ===========================================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerChatbot(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=vocab2idx["<pad>"])
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for src, trg in tqdm(train_loader):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        loss = criterion(output.reshape(-1, len(vocab)), trg[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

torch.save(model.state_dict(), "urdu_chatbot.pt")

100%|██████████| 500/500 [12:52<00:00,  1.54s/it]


Epoch 1, Loss: 6.4982


100%|██████████| 500/500 [12:09<00:00,  1.46s/it]


Epoch 2, Loss: 5.5674


100%|██████████| 500/500 [12:18<00:00,  1.48s/it]


Epoch 3, Loss: 4.9248


100%|██████████| 500/500 [12:12<00:00,  1.47s/it]


Epoch 4, Loss: 4.3796


100%|██████████| 500/500 [11:54<00:00,  1.43s/it]

Epoch 5, Loss: 3.8975





In [17]:
# ===========================================
# 🔹 Step 6: Evaluation
# ===========================================

def evaluate_bleu(model, loader):
    model.eval()
    refs, hyps = [], []
    with torch.no_grad():
        for src, trg in loader:
            src = src.to(device)
            output = model(src, trg[:, :-1].to(device))
            pred = output.argmax(-1)
            for i in range(pred.size(0)):
                ref = [idx2vocab[t.item()] for t in trg[i] if t.item() not in [0, 1, 2]]
                hyp = [idx2vocab[t.item()] for t in pred[i] if t.item() not in [0, 1, 2]]
                refs.append([' '.join(ref)])
                hyps.append(' '.join(hyp))
    bleu = sacrebleu.corpus_bleu(hyps, list(zip(*refs)))
    print(f"BLEU: {bleu.score:.2f}")

evaluate_bleu(model, val_loader)


BLEU: 1.99


In [18]:
# ===========================================
# 🔹 Step 7: Gradio Chat Interface
# ===========================================

def generate_reply(prompt):
    model.eval()
    tokens = tokenize(normalize_urdu(prompt))
    ids = torch.tensor([encode(tokens)]).to(device)
    trg = torch.tensor([[vocab2idx["<sos>"]]]).to(device)
    for _ in range(MAX_LEN):
        out = model(ids, trg)
        next_token = out.argmax(-1)[:, -1]
        trg = torch.cat([trg, next_token.unsqueeze(0)], dim=1)
        if next_token.item() == vocab2idx["<eos>"]:
            break
    result = [idx2vocab[i.item()] for i in trg[0]][1:-1]
    return " ".join(result)

iface = gr.Interface(
    fn=generate_reply,
    inputs=gr.Textbox(label="🗨️ Urdu Input", placeholder="اپنا سوال یہاں لکھیں...", rtl=True),
    outputs=gr.Textbox(label="🤖 Chatbot Reply", rtl=True),
    title="Urdu Transformer Chatbot"
)
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8aa1a0523f28bd041e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


