In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and model paths used below live under it.
drive.mount('/content/drive')

# Load the dataset, clean it, and save the splits to Drive

### Data Preprocessing
This section prepares and cleans the dataset for training.

In [None]:
from datasets import load_dataset, DatasetDict
import pandas as pd
import re
import unicodedata
from pathlib import Path
from itertools import islice

# CONFIG
HF_DATASET = "itsbib/itsbib-nepali-english-bidirectional-1"
SAVE_DIR = Path("/content/drive/MyDrive/fyp_translator")  # change if you want local path
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Target split sizes. Only a subset of the Hub dataset is kept:
# train 50k, val 10k, test 10k (70k pairs in total).
TRAIN_TARGET = 50_000
VAL_TARGET   = 10_000
TEST_TARGET  = 10_000

# If you want a "middle slice" (e.g., center portion of the dataset), set this True.
USE_MIDDLE_SLICE = True

def normalize_ne(text: str) -> str:
    """Return `text` in NFC form with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed. No Nepali-specific rules are
    applied yet; this is the place to add them.
    """
    composed = unicodedata.normalize("NFC", text)
    # split() with no argument cuts on any Unicode whitespace run and drops the
    # leading/trailing ones, so re-joining with a single space both collapses and trims.
    return " ".join(composed.split())

def normalize_en(text: str) -> str:
    """Return `text` NFC-normalized, trimmed, and with whitespace runs squeezed to one space."""
    words = unicodedata.normalize("NFC", text).split()
    return " ".join(words)

# 1) Load the dataset
# Loads every split published under HF_DATASET into one DatasetDict.
ds = load_dataset(HF_DATASET)

# Inspect columns once (helpful if schema changes)
print(ds)

# Try to detect column names
def find_cols(example):
    """Guess which keys of `example` hold the English and the Nepali text.

    Keys are compared case-insensitively against a fixed list of known names.

    Returns:
        (en_key, ne_key) — the first matching key for each language in the
        example's key order, or None for a side where nothing matches.
    """
    en_names = ["en", "english", "source_en", "src_en"]
    ne_names = ["ne", "nepali", "target_ne", "tgt_ne"]

    def first_match(known_names):
        for key in example.keys():
            if key.lower() in known_names:
                return key
        return None

    return first_match(en_names), first_match(ne_names)

# Probe one example from the "train" split, or from the first split when there is no "train".
sample_split = "train" if "train" in ds else list(ds.keys())[0]
en_col, ne_col = find_cols(ds[sample_split][0])

# Fail fast, showing the actual column names, if either side was not detected.
assert en_col is not None and ne_col is not None, f"Couldn't auto-detect columns. Found: {ds[sample_split].column_names}"

print(f"Detected columns: EN='{en_col}', NE='{ne_col}'")

# 2) Concatenate all available splits (some datasets put data in multiple splits)
# NOTE(review): a record printed later in this notebook shows the "en" field holding
# tagged Nepali text ("[NE] [] ... [EN]") with English under "ne". The dataset is
# named "bidirectional" — confirm the column semantics before treating en_col as
# always-English.
MIN_WORDS, MAX_WORDS = 1, 150  # whitespace-token bounds applied to both sides

all_rows = []
for split_name in ds.keys():
    for ex in ds[split_name]:
        raw_en, raw_ne = ex[en_col], ex[ne_col]
        # A null cell would make unicodedata.normalize raise TypeError; skip the pair.
        if raw_en is None or raw_ne is None:
            continue
        en = normalize_en(raw_en)
        ne = normalize_ne(raw_ne)
        # Drop pairs where either side is empty or too long (improves quality).
        # An empty string has zero words, so the lower bound also rejects empties.
        if MIN_WORDS <= len(en.split()) <= MAX_WORDS and MIN_WORDS <= len(ne.split()) <= MAX_WORDS:
            all_rows.append((en, ne))

print(f"Total parallel pairs after basic cleaning: {len(all_rows):,}")

# 3) Deduplicate exact duplicates FIRST, so the slice below is cut from unique
#    pairs. Deduplicating after slicing shrank the 70,000-row slice and left the
#    test split short of TEST_TARGET.
all_rows = list(dict.fromkeys(all_rows))  # dict keys keep first-seen order
print(f"After dedupe: {len(all_rows):,}")

# 4) (Optional) Take a "middle slice" to reduce domain bias
N = len(all_rows)
if USE_MIDDLE_SLICE:
    # center window of size TRAIN+VAL+TEST, or the maximum available
    want = min(TRAIN_TARGET + VAL_TARGET + TEST_TARGET, N)
    start = max(0, (N - want) // 2)
    all_rows = all_rows[start:start+want]
    print(f"Middle slice taken: {len(all_rows):,} examples (from {start} to {start+want})")

# 5) Create splits: train/val/test
# Three consecutive, non-overlapping windows over the cleaned rows. A split comes
# out short (or empty) when fewer rows are available than requested.
val_start = TRAIN_TARGET
test_start = val_start + VAL_TARGET
split_end = test_start + TEST_TARGET

total_needed = min(len(all_rows), split_end)
rows = all_rows[:total_needed]
train = rows[:val_start]
val   = rows[val_start:test_start]
test  = rows[test_start:split_end]

print(f"Final splits → train: {len(train):,}, val: {len(val):,}, test: {len(test):,}")

def write_parallel(pairs, prefix: Path):
    """Write (en, ne) pairs to two line-aligned UTF-8 files.

    The files are `prefix` with its suffix set to ".en" and ".ne"; line i of one
    file corresponds to line i of the other. Existing files are overwritten.

    Args:
        pairs: Iterable of (english, nepali) string tuples.
        prefix: Path whose suffix is replaced to form the two output paths.
    """
    en_path = prefix.with_suffix(".en")
    ne_path = prefix.with_suffix(".ne")
    with open(en_path, "w", encoding="utf-8") as en_out:
        with open(ne_path, "w", encoding="utf-8") as ne_out:
            for en_text, ne_text in pairs:
                en_out.write(en_text + "\n")
                ne_out.write(ne_text + "\n")

# Write each split as a pair of line-aligned files under SAVE_DIR.
write_parallel(train, SAVE_DIR / "train")
write_parallel(val,   SAVE_DIR / "val")
write_parallel(test,  SAVE_DIR / "test")

print(f"Wrote:\n{SAVE_DIR/'train.en'}\n{SAVE_DIR/'train.ne'}\n{SAVE_DIR/'val.en'}\n{SAVE_DIR/'val.ne'}\n{SAVE_DIR/'test.en'}\n{SAVE_DIR/'test.ne'}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/565 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1124315 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/140539 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/140540 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'ne'],
        num_rows: 1124315
    })
    validation: Dataset({
        features: ['en', 'ne'],
        num_rows: 140539
    })
    test: Dataset({
        features: ['en', 'ne'],
        num_rows: 140540
    })
})
Detected columns: EN='en', NE='ne'
Total parallel pairs after basic cleaning: 1,405,303
Middle slice taken: 70,000 examples (from 667651 to 737651)
After dedupe: 69,917
Final splits → train: 50,000, val: 10,000, test: 9,917
Wrote:
/content/drive/MyDrive/fyp_translator/train.en
/content/drive/MyDrive/fyp_translator/train.ne
/content/drive/MyDrive/fyp_translator/val.en
/content/drive/MyDrive/fyp_translator/val.ne
/content/drive/MyDrive/fyp_translator/test.en
/content/drive/MyDrive/fyp_translator/test.ne


# Install the Packages

### Install Dependencies
This section installs the libraries used for training and evaluation.

In [None]:
# =========================
# Install dependencies
# =========================
!pip install -q --upgrade transformers datasets sacrebleu sentencepiece evaluate accelerate

# Import the Packages

## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# =========================
# Imports
# =========================
from pathlib import Path
from datasets import Dataset
from transformers import (
    MBart50TokenizerFast, MBartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)
import evaluate
import torch

In [None]:
import torch
# Report whether CUDA is usable and, if so, the name of device 0.
print("GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


GPU available: True
GPU name: Tesla T4


## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# =========================
# CONFIG
# =========================
DATA_DIR = Path("/content/drive/MyDrive/fyp_translator")  # change as needed
OUTPUT_DIR = Path("/content/drive/MyDrive/fyp_translator/models")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

BATCH_SIZE = 8        # per-device batch size for training and evaluation
GRAD_ACCUM = 4        # effective batch size = BATCH_SIZE * GRAD_ACCUM = 32
NUM_EPOCHS = 10       # number of passes over the training set
MAX_SRC_LEN = 128     # source sequences are truncated to this many tokens
MAX_TGT_LEN = 128     # target sequences are truncated to this many tokens
BEAM_SIZE = 5         # beams used when generating during evaluation and testing

# Optimizer schedule
LEARNING_RATE = 5e-5
WARMUP_STEPS = 1000

SRC_LANG = "en_XX"    # mBART-50 language code used for English
TGT_LANG = "ne_NP"    # mBART-50 language code used for Nepali


### Load Parallel Text Files
Reads the saved train/val/test text files into aligned Hugging Face datasets.

In [None]:
# =========================
# Load dataset from text files
# =========================
def build_aligned(split: str, data_dir=None):
    """Load `<split>.en` and `<split>.ne` and pair them line by line.

    Args:
        split: File stem, e.g. "train", "val" or "test".
        data_dir: Directory holding the two files. Defaults to the notebook-level
            DATA_DIR when omitted.

    Returns:
        A list of {"en": ..., "ne": ...} dicts, one per line, with surrounding
        whitespace stripped from each side.

    Raises:
        AssertionError: If the two files have a different number of lines.
    """
    base = Path(DATA_DIR if data_dir is None else data_dir)
    # Context managers close both handles; bare open() inside a comprehension
    # left them open until garbage collection.
    with open(base / f"{split}.en", encoding="utf-8") as f_en, \
         open(base / f"{split}.ne", encoding="utf-8") as f_ne:
        en_lines = [line.strip() for line in f_en]
        ne_lines = [line.strip() for line in f_ne]
    assert len(en_lines) == len(ne_lines), (
        f"{split}: {len(en_lines)} English lines vs {len(ne_lines)} Nepali lines"
    )
    return [{"en": e, "ne": n} for e, n in zip(en_lines, ne_lines)]

# Build one in-memory Dataset per split from the aligned text files.
train_hf = Dataset.from_list(build_aligned("train"))
val_hf   = Dataset.from_list(build_aligned("val"))
test_hf  = Dataset.from_list(build_aligned("test"))

## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# =========================
# Tokenizer & Models
# =========================
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# Each direction gets its own tokenizer/model pair. The tokenizer carries the
# language codes (src_lang for inputs, tgt_lang for labels); the model is told
# which language token to emit first when generating.

# EN->NE
tok_en2ne = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
tok_en2ne.src_lang = SRC_LANG
tok_en2ne.tgt_lang = TGT_LANG  # language code used when encoding targets
mod_en2ne = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
mod_en2ne.config.forced_bos_token_id = tok_en2ne.lang_code_to_id[TGT_LANG]
# generate() reads generation_config, so set the forced first token there as well
# rather than relying only on the legacy model-config attribute.
mod_en2ne.generation_config.forced_bos_token_id = tok_en2ne.lang_code_to_id[TGT_LANG]

# NE->EN (source and target codes swapped)
tok_ne2en = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
tok_ne2en.src_lang = TGT_LANG
tok_ne2en.tgt_lang = SRC_LANG
mod_ne2en = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
mod_ne2en.config.forced_bos_token_id = tok_ne2en.lang_code_to_id[SRC_LANG]
mod_ne2en.generation_config.forced_bos_token_id = tok_ne2en.lang_code_to_id[SRC_LANG]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

### Data Preprocessing
This section prepares and cleans the dataset for training.

In [None]:
# =========================
# Preprocessing for mBART-50
# =========================
def preprocess_en2ne(examples):
    """Tokenize a batch of EN->NE pairs for mBART-50.

    Args:
        examples: Batch dict with parallel lists under "en" (source) and "ne" (target).

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under "labels". Sequences are truncated but not padded.
    """
    inputs  = examples["en"]
    targets = examples["ne"]

    # Sources are tagged en_XX; labels must be tagged ne_NP.
    tok_en2ne.src_lang = "en_XX"
    tok_en2ne.tgt_lang = "ne_NP"

    # Encode source
    model_inputs = tok_en2ne(inputs, max_length=MAX_SRC_LEN, truncation=True)

    # text_target switches the tokenizer to target mode so the labels carry tgt_lang.
    # Passing the targets positionally encoded them in source mode, i.e. with the
    # source language code.
    labels = tok_en2ne(text_target=targets, max_length=MAX_TGT_LEN, truncation=True)

    # No padding here: labels pre-padded to max_length keep real pad ids that the
    # loss would count. DataCollatorForSeq2Seq pads per batch and uses -100 for labels.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_ne2en(examples):
    """Tokenize a batch of NE->EN pairs for mBART-50.

    Args:
        examples: Batch dict with parallel lists under "ne" (source) and "en" (target).

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under "labels". Sequences are truncated but not padded.
    """
    inputs  = examples["ne"]
    targets = examples["en"]

    # Sources are tagged ne_NP; labels must be tagged en_XX.
    tok_ne2en.src_lang = "ne_NP"
    tok_ne2en.tgt_lang = "en_XX"

    # Encode source
    model_inputs = tok_ne2en(inputs, max_length=MAX_SRC_LEN, truncation=True)

    # text_target switches the tokenizer to target mode so the labels carry tgt_lang.
    # Passing the targets positionally encoded them in source mode, i.e. with the
    # source language code.
    labels = tok_ne2en(text_target=targets, max_length=MAX_TGT_LEN, truncation=True)

    # No padding here: labels pre-padded to max_length keep real pad ids that the
    # loss would count. DataCollatorForSeq2Seq pads per batch and uses -100 for labels.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs




# Map to dataset
# Tokenize in batches and drop the raw text columns so only model inputs remain.
tokenized_train_en2ne = train_hf.map(preprocess_en2ne, batched=True, remove_columns=["en","ne"])
tokenized_val_en2ne   = val_hf.map(preprocess_en2ne, batched=True, remove_columns=["en","ne"])

tokenized_train_ne2en = train_hf.map(preprocess_ne2en, batched=True, remove_columns=["en","ne"])
tokenized_val_ne2en   = val_hf.map(preprocess_ne2en, batched=True, remove_columns=["en","ne"])



Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

### Tokenization
This section handles text tokenization and vocabulary preparation.

In [None]:
# =========================
# Data collators
# =========================
# One collator per direction, each bound to that direction's tokenizer and model.
collator_en2ne = DataCollatorForSeq2Seq(tokenizer=tok_en2ne, model=mod_en2ne)
collator_ne2en = DataCollatorForSeq2Seq(tokenizer=tok_ne2en, model=mod_ne2en)


### Evaluation Metric
This section defines the SacreBLEU metric computed during evaluation.

In [None]:
import evaluate

# Load sacrebleu metric globally
sacrebleu = evaluate.load("sacrebleu")

def compute_metrics(preds_labels, tokenizer):
    """Decode predictions and labels and score them with SacreBLEU.

    Args:
        preds_labels: Pair of (predictions, label_ids) token-id batches. When the
            predictions are a tuple, only its first element is used.
        tokenizer: Tokenizer used to turn token ids back into text.

    Returns:
        The dict returned by the SacreBLEU metric, with int/float values rounded
        to 4 decimals and all other values passed through unchanged.
    """
    preds, labels = preds_labels
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _decodable(batch):
        # -100 is an ignore/padding marker, not a vocabulary id, and cannot be decoded.
        # It is used for label padding and may also pad gathered predictions, so
        # both sides are cleaned before decoding.
        return [[(int(tok) if tok != -100 else pad_id) for tok in seq] for seq in batch]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(_decodable(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_decodable(labels), skip_special_tokens=True)

    # Clean up extra spaces; SacreBLEU expects a list of references per prediction.
    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [[l.strip()] for l in decoded_labels]

    # Compute BLEU score
    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Round numeric values only; the result also holds list-valued entries.
    result = {k: (round(v, 4) if isinstance(v, (int, float)) else v) for k, v in result.items()}
    return result


Downloading builder script: 0.00B [00:00, ?B/s]

### Training Loop Function
Defines the training loop for the model.

In [None]:
# =========================
# Training Arguments
# =========================
def get_training_args(output_dir):
    """Build the Seq2SeqTrainingArguments shared by both translation directions.

    Args:
        output_dir: Directory for checkpoints; converted to str before use.

    Returns:
        A Seq2SeqTrainingArguments instance configured from the notebook-level
        constants (BATCH_SIZE, GRAD_ACCUM, NUM_EPOCHS, LEARNING_RATE,
        WARMUP_STEPS, BEAM_SIZE).
    """
    return Seq2SeqTrainingArguments(
        output_dir=str(output_dir),
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        warmup_steps=WARMUP_STEPS,
        lr_scheduler_type="linear",

        # evaluate every 1000 steps
        eval_strategy="steps",
        eval_steps=1000,

        # save every 1000 steps, keeping at most 3 checkpoints
        save_strategy="steps",
        save_steps=1000,
        save_total_limit=3,

        # evaluation decodes with beam search rather than scoring logits
        predict_with_generate=True,
        generation_num_beams=BEAM_SIZE,
        fp16=True,
        weight_decay=0.01,
        load_best_model_at_end=True,
        label_smoothing_factor=0.1,
        report_to="none",
    )


In [None]:
import transformers
# Print the installed version; resuming a checkpoint saved under a different version triggers a warning.
print(transformers.__version__)

4.56.1


## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# =========================
# Trainer EN->NE (Auto Resume)
# =========================
# Resume from the highest-numbered checkpoint-<step> directory, if one exists.
checkpoint_en2ne = None
checkpoints = sorted((OUTPUT_DIR/"EN2NE").glob("checkpoint-*"), key=lambda p: int(p.name.split("-")[-1]))
if checkpoints:
    checkpoint_en2ne = str(checkpoints[-1])
    print(f"Resuming EN->NE training from checkpoint: {checkpoint_en2ne}")

trainer_en2ne = Seq2SeqTrainer(
    model=mod_en2ne,
    args=get_training_args(OUTPUT_DIR/"EN2NE"),
    train_dataset=tokenized_train_en2ne,
    eval_dataset=tokenized_val_en2ne,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # emitted a warning at construction time.
    processing_class=tok_en2ne,
    data_collator=collator_en2ne,
    # Bind this direction's tokenizer; the Trainer passes only the prediction object.
    compute_metrics=lambda preds_labels: compute_metrics((preds_labels[0], preds_labels[1]), tok_en2ne)
)

# resume_from_checkpoint=None starts a fresh run.
trainer_en2ne.train(resume_from_checkpoint=checkpoint_en2ne)
trainer_en2ne.save_model(OUTPUT_DIR/"EN2NE/final")
tok_en2ne.save_pretrained(OUTPUT_DIR/"EN2NE/final")
print("EN->NE mBART-50 model saved.")

Resuming EN->NE training from checkpoint: /content/drive/MyDrive/fyp_translator/models/EN2NE/checkpoint-6000


  trainer_en2ne = Seq2SeqTrainer(
You are resuming training from a checkpoint trained with 4.56.1 of Transformers but your current version is 4.56.2. This is not recommended and could yield to errors or unwanted behaviors.
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# =========================
# Trainer NE->EN (Auto Resume)
# =========================
# Resume from the highest-numbered checkpoint-<step> directory, if one exists.
checkpoint_ne2en = None
checkpoints = sorted((OUTPUT_DIR/"NE2EN").glob("checkpoint-*"), key=lambda p: int(p.name.split("-")[-1]))
if checkpoints:
    checkpoint_ne2en = str(checkpoints[-1])
    print(f"Resuming NE->EN training from checkpoint: {checkpoint_ne2en}")

trainer_ne2en = Seq2SeqTrainer(
    model=mod_ne2en,
    args=get_training_args(OUTPUT_DIR/"NE2EN"),
    train_dataset=tokenized_train_ne2en,
    eval_dataset=tokenized_val_ne2en,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tok_ne2en,
    data_collator=collator_ne2en,
    # Bind this direction's tokenizer; the Trainer passes only the prediction object.
    compute_metrics=lambda preds_labels: compute_metrics((preds_labels[0], preds_labels[1]), tok_ne2en)
)

# resume_from_checkpoint=None starts a fresh run.
trainer_ne2en.train(resume_from_checkpoint=checkpoint_ne2en)
trainer_ne2en.save_model(OUTPUT_DIR/"NE2EN/final")
tok_ne2en.save_pretrained(OUTPUT_DIR/"NE2EN/final")
print("NE->EN mBART-50 model saved.")


### Test Evaluation Function
Defines a helper that translates the test set and scores it with SacreBLEU.

In [None]:
# =========================
# Test Evaluation
# =========================
def evaluate_model(model_path, src_file, tgt_file, tokenizer, batch_size=8, max_length=128, device=None):
    """Translate a source file with a saved model and score it against references.

    Args:
        model_path: Directory of the saved MBart model.
        src_file: UTF-8 text file with one source sentence per line.
        tgt_file: UTF-8 text file with the reference translations, line-aligned
            with `src_file`.
        tokenizer: Tokenizer used for encoding and decoding; presumably already
            configured by the caller for the source language of `src_file`.
        batch_size: Number of sentences translated per generate() call.
        max_length: Token limit for both the encoded source and the generated output.
        device: Torch device string. Defaults to "cuda" when available, else "cpu".

    Returns:
        The dict produced by the SacreBLEU metric.

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    if device is None:
        # A hard-coded "cuda" fails outright on a CPU-only runtime.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = MBartForConditionalGeneration.from_pretrained(model_path).to(device).eval()

    with open(src_file, encoding="utf-8") as f_src, open(tgt_file, encoding="utf-8") as f_tgt:
        src_lines = [line.strip() for line in f_src]
        tgt_lines = [line.strip() for line in f_tgt]
    if len(src_lines) != len(tgt_lines):
        raise ValueError(
            f"Line count mismatch: {len(src_lines)} in {src_file} vs {len(tgt_lines)} in {tgt_file}"
        )

    preds = []
    for i in range(0, len(src_lines), batch_size):
        chunk = src_lines[i:i+batch_size]
        enc = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
        with torch.no_grad():
            gen = model.generate(**enc, num_beams=BEAM_SIZE, max_length=max_length)
        preds.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))

    result = sacrebleu.compute(predictions=preds, references=[[t] for t in tgt_lines])
    # Print the whole path: both saved models live in a directory named "final",
    # so the bare directory name cannot tell the two directions apart.
    print(f"Test SacreBLEU ({model_path}):", result)
    return result


### Evaluation
This section evaluates the trained model using BLEU or accuracy metrics.

In [None]:
# Evaluate
# Score each direction on the held-out test files: (source, reference) is
# (test.en, test.ne) for EN->NE and the reverse for NE->EN.
evaluate_model(OUTPUT_DIR/"EN2NE/final", DATA_DIR/"test.en", DATA_DIR/"test.ne", tok_en2ne)
evaluate_model(OUTPUT_DIR/"NE2EN/final", DATA_DIR/"test.ne", DATA_DIR/"test.en", tok_ne2en)

## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Path to your "final" folder (where model.safetensors + config.json live)
model_path = "/content/drive/MyDrive/fyp_translator/models/EN2NE/final"

# Load tokenizer & model
# NOTE(review): the names `tokenizer` and `model` are reassigned by the M2M100 cells
# further down, so re-running this cell after those changes what later cells see.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Example input (English → Nepali)
src_text = "where are you from ?"

# Tokenize
inputs = tokenizer(src_text, return_tensors="pt")

# Generate translation
outputs = model.generate(**inputs, max_length=100)

# Decode
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Translation:", translation)


Translation: तपाईं कहाँबाट आएका हो ?


## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# =========================
# Imports (M2M100 experiment)
# =========================
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset
import pandas as pd
import torch
import evaluate

In [None]:
# Settings for the M2M100 fine-tuning experiment.
# NOTE(review): MODEL_NAME and BATCH_SIZE reuse names from the mBART config cells
# above, so re-running earlier cells after this one picks up these values instead.
MODEL_NAME = "facebook/m2m100_418M"
SOURCE_LANG = "en"
TARGET_LANG = "ne"
MODEL_SAVE_PATH = "/content/drive/MyDrive/fyp_translator/models"
MAX_LENGTH = 128
EPOCHS = 3
BATCH_SIZE = 2

## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# Load the pretrained M2M100 tokenizer and model named by MODEL_NAME.
tokenizer = M2M100Tokenizer.from_pretrained(MODEL_NAME)
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_NAME)

### Training Loop Function
Defines the training loop for the model.

In [None]:
# -------- LOAD CUSTOM DATA --------
def load_text_data(src_path, tgt_path):
    """Read two line-aligned text files into a DataFrame of translation records.

    Args:
        src_path: UTF-8 file with one source sentence per line.
        tgt_path: UTF-8 file with the matching target sentences.

    Returns:
        A DataFrame with a single "translation" column; each cell is a dict
        {SOURCE_LANG: source_text, TARGET_LANG: target_text}. Pairs where either
        side is empty after stripping are dropped.

    Raises:
        ValueError: If the files have different line counts.
    """
    with open(src_path, "r", encoding="utf-8") as src_file, open(tgt_path, "r", encoding="utf-8") as tgt_file:
        src_lines = [line.strip() for line in src_file]
        tgt_lines = [line.strip() for line in tgt_file]

    # zip() stops at the shorter input, which would silently hide a misaligned corpus.
    if len(src_lines) != len(tgt_lines):
        raise ValueError(
            f"Line count mismatch: {len(src_lines)} in {src_path} vs {len(tgt_lines)} in {tgt_path}"
        )

    # remove empty lines and make sure both have text
    data = [
        {"translation": {SOURCE_LANG: s, TARGET_LANG: t}}
        for s, t in zip(src_lines, tgt_lines)
        if s and t
    ]
    return pd.DataFrame(data)

# Read the saved train/val text files and wrap them as Hugging Face datasets.
train_df = load_text_data(
    "/content/drive/MyDrive/fyp_translator/train.en",
    "/content/drive/MyDrive/fyp_translator/train.ne"
)
valid_df = load_text_data(
    "/content/drive/MyDrive/fyp_translator/val.en",
    "/content/drive/MyDrive/fyp_translator/val.ne"
)

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)


### Data Preprocessing
This section prepares and cleans the dataset for training.

In [None]:
# -------- PREPROCESSING --------
def preprocess_function(examples):
    """Tokenize a batch of translation records for M2M100.

    Args:
        examples: Batch dict whose "translation" entry is a list of
            {SOURCE_LANG: ..., TARGET_LANG: ...} dicts.

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under "labels". Sequences are truncated but not padded. One output row
        is produced per input row; a missing record or field is encoded as an
        empty string.
    """
    inputs = []
    targets = []

    for ex in examples["translation"]:
        # Keep one output row per input row: a batched `map` that keeps the input
        # columns needs the returned columns to match the batch length, so rows
        # must not be skipped here.
        ex = ex or {}
        inputs.append(ex.get(SOURCE_LANG) or "")
        targets.append(ex.get(TARGET_LANG) or "")

    tokenizer.src_lang = SOURCE_LANG
    # Target-side encoding looks up tokenizer.tgt_lang; leaving it unset (None)
    # is what raised `KeyError: None`.
    tokenizer.tgt_lang = TARGET_LANG

    model_inputs = tokenizer(inputs, max_length=MAX_LENGTH, truncation=True)

    # Target side tokenization
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
    )
    # No padding here: labels pre-padded to max_length keep real pad ids that the
    # loss would count. DataCollatorForSeq2Seq pads per batch and uses -100 for labels.
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


# Tokenize both datasets in batches; the original "translation" column is kept alongside the new fields.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_valid = valid_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

KeyError: None

## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# Inspect the first record to sanity-check which text landed in which language field.
print(train_dataset[0])


{'translation': {'en': '[NE] [] यो शहर पाकिस्तानको ८ औँ ठूलो शहर हो । [EN]', 'ne': 'It is the 86th largest city in Pakistan.'}}


### Tokenization
This section handles text tokenization and vocabulary preparation.

In [None]:
# -------- DATA COLLATOR --------
# Collator bound to the M2M100 tokenizer and model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### Data Preprocessing
This section prepares and cleans the dataset for training.

In [None]:
# `load_metric` is not imported anywhere in this notebook; the `evaluate` package
# (imported above) provides the same SacreBLEU metric.
bleu_metric = evaluate.load("sacrebleu")

# NOTE(review): this redefines `compute_metrics`, replacing the two-argument version
# defined for the mBART trainers earlier in the notebook.
def compute_metrics(eval_pred):
    """Decode predictions and labels with the global `tokenizer` and return BLEU.

    Args:
        eval_pred: Pair of (predictions, label_ids) token-id batches. When the
            predictions are a tuple, only its first element is used.

    Returns:
        {"bleu": score} with the SacreBLEU score rounded to 2 decimals.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _decodable(batch):
        # -100 is an ignore/padding marker, not a vocabulary id, and cannot be decoded.
        return [[(int(tok) if tok != -100 else pad_id) for tok in seq] for seq in batch]

    decoded_preds = tokenizer.batch_decode(_decodable(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_decodable(labels), skip_special_tokens=True)

    # Clean & format for BLEU: one list of references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": round(bleu["score"], 2)}

## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# -------- TRAINING ARGUMENTS --------
# NOTE(review): output_dir and logging_dir are relative paths, not under the mounted
# Drive folder used elsewhere — confirm checkpoints are meant to be session-local.
training_args = Seq2SeqTrainingArguments(
    output_dir="./models/checkpoints",
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument and matches the spelling used by get_training_args above.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",
)

## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# -------- TRAINER --------
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
# -------- TRAIN & EVALUATE --------
trainer.train()

# The Trainer prefixes metric names with "eval_", so compute_metrics' "bleu" is read as "eval_bleu".
metrics = trainer.evaluate()
print(f"\n✅ Final BLEU Score: {metrics['eval_bleu']}")

# -------- SAVE MODEL --------
# Model and tokenizer are written to the same directory.
trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)
print(f"\n✅ Fine-tuned model saved to {MODEL_SAVE_PATH}")


In [None]:
# Confirm whether a GPU is attached; the fp16 flag in training_args is gated on this value.
print(torch.cuda.is_available())

False


In [None]:
# Cut a 50k-line parallel subset from the full english.txt / nepali.txt files.
# file paths
eng_file = "/content/drive/MyDrive/fyp_translator/english.txt"
nep_file = "/content/drive/MyDrive/fyp_translator/nepali.txt"

# how many pairs you want
target_count = 50000

# read full data
with open(eng_file, "r", encoding="utf-8") as f:
    eng_lines = f.readlines()

with open(nep_file, "r", encoding="utf-8") as f:
    nep_lines = f.readlines()

# check total lines
# NOTE(review): the counts are only printed, not compared; a mismatch would not stop the cell.
print("English lines:", len(eng_lines))
print("Nepali lines:", len(nep_lines))

# take only the first target_count lines from each side
eng_subset = eng_lines[:target_count]
nep_subset = nep_lines[:target_count]

# save into new files (lines keep their original newline characters)
with open("/content/drive/MyDrive/fyp_translator/english_50k.txt", "w", encoding="utf-8") as f:
    f.writelines(eng_subset)

with open("/content/drive/MyDrive/fyp_translator/nepali_50k.txt", "w", encoding="utf-8") as f:
    f.writelines(nep_subset)

print(f"✅ Saved first {target_count} lines to english_50k.txt and nepali_50k.txt")


English lines: 177334
Nepali lines: 177334
✅ Saved first 50000 lines to english_50k.txt and nepali_50k.txt


In [None]:
# Verify that the two 50k subset files have the same number of lines.
# file paths
eng_file = "/content/drive/MyDrive/fyp_translator/english_50k.txt"
nep_file = "/content/drive/MyDrive/fyp_translator/nepali_50k.txt"

# count lines in both files
with open(eng_file, "r", encoding="utf-8") as f:
    eng_lines = f.readlines()

with open(nep_file, "r", encoding="utf-8") as f:
    nep_lines = f.readlines()

print("English lines:", len(eng_lines))
print("Nepali lines:", len(nep_lines))

if len(eng_lines) == len(nep_lines):
    print("✅ Both files have the same number of lines.")
else:
    print("⚠️ Mismatch detected! Difference:", abs(len(eng_lines) - len(nep_lines)))
    # list the indexes that exist only in the longer file
    min_len = min(len(eng_lines), len(nep_lines))
    for i in range(min_len, max(len(eng_lines), len(nep_lines))):
        print(f"Extra line at index {i}")


English lines: 50000
Nepali lines: 50000
✅ Both files have the same number of lines.


### Model Definition
Defines the Transformer architecture and its parameters.

In [None]:
# embeddings

class Inputembedding(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed the token ids in `x` and multiply the vectors by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        embedded = self.embedding(x)
        return embedded * scale

### Model Definition
Defines the Transformer architecture and its parameters.

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to the input, then applies dropout.

    The table is built once for `max_seq_len` positions and stored as the
    non-trainable buffer `pe` with shape (1, max_seq_len, d_model).
    """

    def __init__(self,d_model:int,max_seq_len:int,dropout:float):
        super().__init__()
        self.d_model=d_model
        self.max_seq_len=max_seq_len
        self.dropout=nn.Dropout(dropout)
        # positional encoding table, initially all zeros
        pe=torch.zeros(max_seq_len,d_model)
        # column vector of positions 0..max_seq_len-1
        position=torch.arange(0,max_seq_len,dtype=torch.float).unsqueeze(1)
        # one frequency per even column index: 10000^(-i/d_model), computed via exp/log
        dividend_term=torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000)/d_model))
        angles=position*dividend_term
        # sine on even indices
        pe[:,0::2]=torch.sin(angles)
        # cosine on odd indices; with an odd d_model there is one fewer odd column
        # than even columns, so trim the angles to fit instead of failing
        pe[:,1::2]=torch.cos(angles[:,:d_model//2])
        # leading dimension of size 1 so the table broadcasts over the batch
        pe=pe.unsqueeze(0)
        self.register_buffer("pe",pe)

    def forward(self,x):
        """Add the first x.shape[1] position vectors to `x` and apply dropout."""
        x=x+(self.pe[:,:x.shape[1],:])
        return self.dropout(x)



In [None]:
class LayerNormalization(nn.Module):
    """Normalizes over the last dimension with one learnable scale and one learnable shift.

    `alpha` and `bias` each hold a single value. The divisor is the standard
    deviation returned by Tensor.std over the last dimension plus `eps`.
    """

    def __init__(self, eps: float = 10**-6):
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + bias, statistics taken over dim -1."""
        centered = x - x.mean(dim=-1, keepdim=True)
        denom = x.std(dim=-1, keepdim=True) + self.eps
        scaled = self.alpha * centered
        return scaled / denom + self.bias



### Model Definition
Defines the Transformer architecture and its parameters.

In [None]:
class FeedForwardNeuralNetwork(nn.Module):
    """Position-wise feed-forward block: Linear(d_model, d_ff) -> ReLU -> Dropout -> Linear(d_ff, d_model)."""

    def __init__(self, d_model: int, d_ff: int, dropout: float):
        super().__init__()
        self.firstlayer = nn.Linear(in_features=d_model, out_features=d_ff)
        self.dropout = nn.Dropout(dropout)
        self.secondlayer = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Expand to d_ff, apply ReLU and dropout, then project back to d_model."""
        hidden = self.firstlayer(x)
        activated = torch.relu(hidden)
        dropped = self.dropout(activated)
        return self.secondlayer(dropped)

### Model Definition
Defines the Transformer architecture and its parameters.

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to d_model, split into
    `num_heads` heads of size d_k = d_model // num_heads, attended per head,
    concatenated, and projected once more. The attention weights of the most
    recent forward pass are kept on `self.attention_scores`.
    """

    def __init__(self,d_model:int,num_heads:int,dropout:float):
        super().__init__()
        self.d_model=d_model
        self.num_heads=num_heads
        assert d_model%num_heads==0,'Dimension of model should be divisible by no of heads'
        self.d_k=d_model//num_heads

        # projection matrices
        self.w_q=nn.Linear(d_model,d_model) # query projection
        self.w_k=nn.Linear(d_model,d_model) # key projection
        self.w_v=nn.Linear(d_model,d_model) # value projection
        self.w_o=nn.Linear(d_model,d_model) # output projection applied to the concatenated heads
        self.dropout=nn.Dropout(dropout)    # dropout applied to the attention weights

    @staticmethod
    def attention(query,key,value,mask,dropout=None):
        """Scaled dot-product attention.

        Args:
            query, key, value: Tensors whose last dimension is the per-head size.
            mask: Optional tensor broadcastable to the score matrix; positions
                where it equals 0 are excluded from attention.
            dropout: Optional dropout module (an instance) applied to the
                attention weights. The previous default was the nn.Dropout
                class itself, which cannot be applied to a tensor.

        Returns:
            (output, attention_weights).
        """
        d_k=query.shape[-1]
        attention_scores=(query@key.transpose(-2,-1))/math.sqrt(d_k)
        if mask is not None:
            # In-place: a large negative score makes softmax assign ~0 weight to masked positions
            attention_scores.masked_fill_(mask==0,-1e9)
        attention_scores=attention_scores.softmax(dim=-1) # normalize over the key positions
        if dropout is not None:
            attention_scores=dropout(attention_scores)
        return (attention_scores@value) ,attention_scores

    def forward(self,q,k,v,mask):
        """Project q/k/v, attend per head, and merge the heads back to d_model."""
        query=self.w_q(q)
        key=self.w_k(k)
        value=self.w_v(v)
        # (batch, seq_len, d_model) -> (batch, seq_len, num_heads, d_k) -> (batch, num_heads, seq_len, d_k)
        # The transpose puts the head axis first so each head attends independently.
        query=query.view(query.shape[0],query.shape[1],self.num_heads,self.d_k).transpose(1,2)
        key=key.view(key.shape[0],key.shape[1],self.num_heads,self.d_k).transpose(1,2)
        value=value.view(value.shape[0],value.shape[1],self.num_heads,self.d_k).transpose(1,2)
        # obtain output and attention weights
        x,self.attention_scores=MultiHeadAttention.attention(query,key,value,mask,self.dropout)
        # concatenate the heads: (batch, num_heads, seq_len, d_k) -> (batch, seq_len, d_model)
        x=x.transpose(1,2).contiguous().view(x.shape[0],-1,self.num_heads*self.d_k)
        return self.w_o(x)



In [None]:
class ResidualConnection(nn.Module):
    """Pre-norm residual wrapper: x + dropout(sublayer(norm(x)))."""

    def __init__(self, dropout: float) -> None:
        super().__init__()
        # dropout is applied to the sublayer output before it is added back
        self.dropout = nn.Dropout(dropout)
        # normalization is applied to the sublayer input
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Normalize `x`, run `sublayer` on it, apply dropout, and add the original `x`."""
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch

In [None]:
# Building Encoder Block
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside a residual connection."""

    def __init__(self, self_attention_block: MultiHeadAttention, ffn: FeedForwardNeuralNetwork, dropout: float):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.ffn = ffn
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Apply masked self-attention and the feed-forward network to `x`."""
        attn_residual, ffn_residual = self.residual_connections

        def self_attend(normed):
            # query, key and value all come from the same (normalized) input
            return self.self_attention_block(normed, normed, normed, src_mask)

        x = attn_residual(x, self_attend)
        return ffn_residual(x, self.ffn)

class Encoder(nn.Module):
    """Stack of encoder layers followed by a final normalization."""

    def __init__(self, layers: nn.ModuleList)-> None:
        """Keep the given layers and create the output normalization.

        Args:
            layers: the encoder layers, applied in list order.
        """
        super().__init__()
        self.layers = layers
        # Applied once, after the last layer.
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed ``x`` through every layer (each receives ``mask``) and normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)



In [None]:
# Decoderblock #it takes multihead attention and crossattention
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention, then feed-forward.

    Each of the three sublayers is wrapped in its own ``ResidualConnection``.
    """

    def __init__(self,self_attention_block:MultiHeadAttention,crossattentionblock:MultiHeadAttention,ffn:FeedForwardNeuralNetwork,dropout:float):
        """Store the three sublayers and build one residual wrapper for each.

        Args:
            self_attention_block: attention over the decoder input itself.
            crossattentionblock: attention whose keys and values are the encoder output.
            ffn: feed-forward module applied last.
            dropout: dropout probability given to each ``ResidualConnection``.
        """
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = crossattentionblock
        self.ffn = ffn
        sublayer_wrappers = [ResidualConnection(dropout) for _ in range(3)]
        self.residual_connection = nn.ModuleList(sublayer_wrappers)

    def forward(self,x,encoderoutput,src_mask,tgt_mask):
        """Apply the three sublayers in order and return the result.

        Args:
            x: decoder-side input.
            encoderoutput: encoder result used as key and value in cross-attention.
            src_mask: mask passed to cross-attention.
            tgt_mask: mask passed to self-attention.
        """
        self_residual, cross_residual, ffn_residual = self.residual_connection

        def masked_self_attention(normalized):
            return self.self_attention_block(normalized, normalized, normalized, tgt_mask)

        def cross_attention(normalized):
            # Queries come from the decoder; keys and values from the encoder.
            return self.cross_attention_block(normalized, encoderoutput, encoderoutput, src_mask)

        x = self_residual(x, masked_self_attention)
        x = cross_residual(x, cross_attention)
        return ffn_residual(x, self.ffn)


class Decoder(nn.Module):
    """Stack of decoder layers followed by a final normalization."""

    def __init__(self,layers:nn.ModuleList):
        """Keep the given layers and create the output normalization.

        Args:
            layers: the decoder layers, applied in list order.
        """
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self,x,encoder_output,src_mask,tgt_mask):
        """Feed ``x`` through every layer and normalize the result.

        Every layer receives the same ``encoder_output``, ``src_mask`` and ``tgt_mask``.
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)



### Model Definition
Defines the Transformer architecture and its parameters.

In [None]:
# Projection layer

class ProjectionLayer(nn.Module):
    """Linear map from ``d_model`` features to vocabulary log-probabilities."""

    def __init__(self,d_model:int,vocab_size:int):
        """Create the ``d_model -> vocab_size`` linear layer.

        Args:
            d_model: size of the incoming feature axis.
            vocab_size: number of output classes.
        """
        super().__init__()
        self.projection = nn.Linear(d_model, vocab_size)

    def forward(self,x):
        """Return the log-softmax over the last axis of the projected input."""
        logits = self.projection(x)
        return logits.log_softmax(dim=-1)

In [None]:
# The Transformer Architecture
# Contains all the Encoder Decoder Embeddings
class Transformer(nn.Module):
    """Container tying together embeddings, positional encodings, encoder, decoder and projection.

    The model is driven through three separate steps - ``encode``, ``decode``
    and ``project`` - rather than a single ``forward``.
    """

    def __init__(self,encoder:Encoder,decoder:Decoder,src_embeding:Inputembedding,tgt_embedding:Inputembedding,src_position:PositionalEncoding,tgt_position:PositionalEncoding,projection_layer:ProjectionLayer) -> None:
        """Register the already-built components as submodules."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embedding = src_embeding
        self.tgt_embedding = tgt_embedding
        self.src_position = src_position
        self.tgt_position = tgt_position
        self.projection_layer = projection_layer

    def encode(self,source,src_mask):
        """Embed ``source``, add positional information and run the encoder with ``src_mask``."""
        embedded = self.src_embedding(source)
        positioned = self.src_position(embedded)
        return self.encoder(positioned, src_mask)

    def decode(self,encoder_output,src_mask,target,tgt_mask):
        """Embed ``target``, add positional information and run the decoder.

        The decoder receives ``encoder_output`` together with both masks.
        """
        embedded = self.tgt_embedding(target)
        positioned = self.tgt_position(embedded)
        return self.decoder(positioned, encoder_output, src_mask, tgt_mask)

    def project(self,x):
        """Pass ``x`` through the projection layer."""
        return self.projection_layer(x)


In [None]:
def build_transformer(src_vocab_size:int,tgt_vocab_size:int,src_seq_len:int,tgt_seq_len:int,d_model:int=512,N:int=6,h:int=8,dropout:float=0.2,d_ff:int=2048) -> Transformer:
    """Assemble a ``Transformer`` and Xavier-initialise its weight matrices.

    Args:
        src_vocab_size: vocabulary size for the source embedding.
        tgt_vocab_size: vocabulary size for the target embedding and the projection layer.
        src_seq_len: sequence length passed to the source positional encoding.
        tgt_seq_len: sequence length passed to the target positional encoding.
        d_model: model width shared by all components.
        N: number of encoder blocks and of decoder blocks.
        h: value forwarded to every ``MultiHeadAttention`` as its second argument.
        dropout: dropout value forwarded to every component that takes one.
        d_ff: value forwarded to every ``FeedForwardNeuralNetwork`` as its second argument.

    Returns:
        The assembled ``Transformer``; every parameter with more than one
        dimension has been re-initialised with ``nn.init.xavier_uniform_``.
    """
    # Embeddings and positional encodings, source side first.
    src_embed = Inputembedding(d_model, src_vocab_size)
    tgt_embed = Inputembedding(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N encoder blocks: self-attention + feed-forward each.
    encoder_blocks = [
        EncoderBlock(
            MultiHeadAttention(d_model, h, dropout),
            FeedForwardNeuralNetwork(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder blocks: self-attention + cross-attention + feed-forward each.
    decoder_blocks = [
        DecoderBlock(
            MultiHeadAttention(d_model, h, dropout),
            MultiHeadAttention(d_model, h, dropout),
            FeedForwardNeuralNetwork(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Only matrices (dim > 1) are re-initialised; vectors such as biases keep their defaults.
    for weight in transformer.parameters():
        if weight.dim() > 1:
            nn.init.xavier_uniform_(weight)

    return transformer


In [None]:
def get_all_sentences(ds,lang):
    """Lazily yield the ``lang`` side of every pair in ``ds``.

    Each item of ``ds`` is indexed as ``item['translation'][lang]``.
    """
    yield from (pair['translation'][lang] for pair in ds)

### Tokenization
This section handles text tokenization and vocabulary preparation.

In [None]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset
def build_tokenizer(config,ds,lang):
    """Load the word-level tokenizer for ``lang``, training and saving it if missing.

    Args:
        config: dict whose ``'tokenizer_file'`` entry is a format string; it is
            filled with ``lang`` to get the tokenizer's JSON path.
        ds: sentence pairs passed to ``get_all_sentences`` when training is needed.
        lang: language key, used both for the file name and to pick sentences.

    Returns:
        The tokenizer read from the file, or the newly trained one.
    """
    # e.g. a template 'tokenizer_{0}.json' with lang 'ne' gives 'tokenizer_ne.json'
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously saved tokenizer when there is one.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Otherwise train a whitespace-split word-level tokenizer from scratch.
    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=['[UNK]','[PAD]','[SOS]','[EOS]'])
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer


In [None]:
def casual_mask(size):
    """Return a boolean mask of shape ``(1, size, size)`` that is True on and below the diagonal.

    Entry ``[0, i, j]`` is True when ``j <= i``, i.e. position ``i`` may look
    at itself and at earlier positions only.
    """
    # Ones strictly above the diagonal mark the "future" positions.
    above_diagonal = torch.ones(1, size, size).triu(diagonal=1).to(torch.int)
    return above_diagonal.eq(0)

### Bilingual Dataset
This section wraps the raw sentence pairs in a PyTorch `Dataset` that tokenizes, truncates, pads and masks every example to a fixed sequence length.

In [None]:
from torch.utils.data import Dataset
class BilingualDataset(Dataset):
    """Map-style dataset turning raw sentence pairs into fixed-length training tensors.

    Every item of ``ds`` is indexed as ``item['translation'][lang]``. Both
    sentences are tokenized, truncated so the special tokens always fit, and
    padded to ``max_seq_len``.

    Encoder-side special tokens (SOS/EOS/PAD) are taken from the *source*
    tokenizer and decoder-side ones from the *target* tokenizer, so the ids are
    always valid for the embedding that consumes them even if the two
    vocabularies number their special tokens differently.
    """

    def __init__(self,ds,tokenizer_src,tokenizer_tgt,src_lang,tgt_lang,max_seq_len)-> None:
        """Store the data, tokenizers and language keys, and look up special-token ids.

        Args:
            ds: indexable collection of ``{'translation': {lang: text}}`` items.
            tokenizer_src: tokenizer for the source language.
            tokenizer_tgt: tokenizer for the target language.
            src_lang: key of the source sentence inside ``'translation'``.
            tgt_lang: key of the target sentence inside ``'translation'``.
            max_seq_len: length every returned sequence is padded/truncated to.
        """
        super().__init__()
        self.seq_len=max_seq_len
        self.ds=ds
        self.tokenizer_src=tokenizer_src
        self.tokenizer_tgt=tokenizer_tgt
        self.src_lang=src_lang
        self.tgt_lang=tgt_lang

        # Target-side special tokens (attribute names kept for existing users).
        self.sos_token=self._special_token(tokenizer_tgt,'[SOS]')
        self.eos_token=self._special_token(tokenizer_tgt,'[EOS]')
        self.pad_token=self._special_token(tokenizer_tgt,'[PAD]')

        # Source-side special tokens, used for the encoder input and its mask.
        self.src_sos_token=self._special_token(tokenizer_src,'[SOS]')
        self.src_eos_token=self._special_token(tokenizer_src,'[EOS]')
        self.src_pad_token=self._special_token(tokenizer_src,'[PAD]')

    @staticmethod
    def _special_token(tokenizer,name):
        """Return the id of special token ``name`` as a one-element int64 tensor."""
        return torch.tensor([tokenizer.token_to_id(name)],dtype=torch.int64)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.ds)

    def __getitem__(self,index:Any)-> Any:
        """Build the tensors for the pair at ``index``.

        Returns:
            dict with
            ``encoder_input``: ``[SOS] src tokens [EOS] [PAD]...`` (length ``seq_len``),
            ``decoder_input``: ``[SOS] tgt tokens [PAD]...`` (length ``seq_len``),
            ``label``: ``tgt tokens [EOS] [PAD]...`` (length ``seq_len``),
            ``encoder_mask``: ``(1, 1, seq_len)`` int mask, 0 at padding,
            ``decoder_mask``: padding mask combined with ``casual_mask``,
            ``src_text`` / ``tgt_text``: the raw sentences.

        Raises:
            ValueError: if a padding count comes out negative (only possible
                when ``seq_len`` is too small to hold the special tokens).
        """
        src_target_pair=self.ds[index]
        src_text=src_target_pair['translation'][self.src_lang]
        tgt_text=src_target_pair['translation'][self.tgt_lang]

        # Tokenize, then cut over-long sentences so SOS and EOS always fit.
        max_tokens=self.seq_len-2
        enc_input_tokens=self.tokenizer_src.encode(src_text).ids[:max_tokens]
        dec_input_tokens=self.tokenizer_tgt.encode(tgt_text).ids[:max_tokens]

        # Encoder sequence carries both SOS and EOS, hence -2.
        enc_num_padding_tokens=self.seq_len-len(enc_input_tokens)-2
        # Decoder input carries only SOS (the label carries only EOS), hence -1.
        dec_num_padding_tokens=self.seq_len-len(dec_input_tokens)-1

        if enc_num_padding_tokens<0 or dec_num_padding_tokens<0:
            raise ValueError("Sentences seem to be long")

        enc_padding=torch.full((enc_num_padding_tokens,),self.src_pad_token.item(),dtype=torch.int64)
        dec_padding=torch.full((dec_num_padding_tokens,),self.pad_token.item(),dtype=torch.int64)
        dec_tokens=torch.tensor(dec_input_tokens,dtype=torch.int64)

        # [SOS] source tokens [EOS] [PAD]...  (all ids from the source vocabulary)
        encoder_input=torch.cat([
            self.src_sos_token,
            torch.tensor(enc_input_tokens,dtype=torch.int64),
            self.src_eos_token,
            enc_padding,
        ])

        # [SOS] target tokens [PAD]...
        decoder_input=torch.cat([self.sos_token,dec_tokens,dec_padding])

        # The label is what the loss compares the model output with:
        # target tokens [EOS] [PAD]...
        label=torch.cat([dec_tokens,self.eos_token,dec_padding])

        # All three sequences must have exactly seq_len entries.
        assert encoder_input.size(0)==self.seq_len,'Encoder input doesnt match with sequencelength'
        assert decoder_input.size(0)==self.seq_len,'Edecoder input doesnt match with sequencelength'
        assert label.size(0)==self.seq_len,'label  doesnt match with sequencelength'

        return {
            'encoder_input':encoder_input,
            'decoder_input':decoder_input,
            # Padding positions are masked out on the encoder side.
            'encoder_mask': (encoder_input!=self.src_pad_token).unsqueeze(0).unsqueeze(0).int(),
            # Decoder positions may see neither padding nor later positions.
            'decoder_mask': (decoder_input!=self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)),
            'label':label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }





In [None]:
def read_text_files(src_file,tgt_file):
    """Read two line-aligned UTF-8 text files into a list of translation pairs.

    Args:
        src_file: path of the source-language file, one sentence per line.
        tgt_file: path of the target-language file, one sentence per line.

    Returns:
        A list of ``{'translation': {'src': ..., 'tgt': ...}}`` dicts, one per
        line, with surrounding whitespace stripped from each sentence.

    Raises:
        AssertionError: if the two files have a different number of lines.
    """
    with open(src_file,'r',encoding='utf-8') as src_f, open(tgt_file,"r",encoding='utf-8') as tgt_f:
        src_lines = list(src_f)
        tgt_lines = list(tgt_f)

    assert len(src_lines) == len(tgt_lines), "Source and target files must have the same number of lines and lengths"

    dataset = []
    for src, tgt in zip(src_lines, tgt_lines):
        dataset.append({'translation': {'src': src.strip(), 'tgt': tgt.strip()}})
    return dataset

### Data Loading
This section builds the tokenizers, splits the sentence pairs into training and validation sets, and creates the data loaders.

In [None]:
def get_ds(config):
    """Build the tokenizers and the train/validation data loaders.

    Reads the sentence pairs named by ``config['src_file']`` and
    ``config['tgt_file']``, builds (or loads) one tokenizer per language,
    splits the pairs 90/10 at random, wraps both parts in ``BilingualDataset``
    and prints the longest tokenized source and target sentence.

    Returns:
        ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
        The training loader uses ``config['batch_size']``; the validation
        loader uses a batch size of 1. Both shuffle.
    """
    lang_src = config['lang_src']
    lang_tgt = config['lang_tgt']

    # Raw sentence pairs and one tokenizer per language.
    ds_raw = read_text_files(config['src_file'], config['tgt_file'])
    tokenizer_src = build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = build_tokenizer(config, ds_raw, lang_tgt)

    # 90 % of the pairs for training, the remainder for validation.
    n_train = int(0.9 * len (ds_raw))
    n_val = len(ds_raw) - n_train
    train_ds_raw, val_ds_raw = random_split(ds_raw, [n_train, n_val])

    def as_bilingual(subset):
        return BilingualDataset(subset, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])

    train_ds = as_bilingual(train_ds_raw)
    val_ds = as_bilingual(val_ds_raw)

    # Longest tokenized sentence on each side (informational only).
    max_len_src = max((len(tokenizer_src.encode(pair['translation'][lang_src]).ids) for pair in ds_raw), default=0)
    max_len_tgt = max((len(tokenizer_tgt.encode(pair['translation'][lang_tgt]).ids) for pair in ds_raw), default=0)

    print(f"MAx Length of source Sentence: {max_len_src}")
    print(f"MAx Length of target Sentence: {max_len_tgt}")

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt


### Inference and Validation
This section defines greedy decoding for inference and a helper that prints a few sample translations during validation.

In [None]:
#greedy decode for inferenceing

def greedy_decode(model,source,source_mask,tokenizer_src,tokenizer_tgt,max_len,device):
    """Translate one source sequence by always picking the most probable next token.

    The encoder runs once; the decoder is then called repeatedly on the tokens
    produced so far, starting from ``[SOS]``. Decoding stops when ``[EOS]`` is
    produced or when the sequence reaches ``max_len`` tokens.

    Args:
        model: object providing ``encode``, ``decode`` and ``project``.
        source: encoder input; the generated ids take its dtype.
        source_mask: encoder mask; the decoder mask takes its dtype.
        tokenizer_src: unused here.
        tokenizer_tgt: used to look up the ``[SOS]`` and ``[EOS]`` ids.
        max_len: maximum number of tokens in the result, ``[SOS]`` included.
        device: device the decoder tensors are moved to.

    Returns:
        1-D tensor of generated token ids, beginning with ``[SOS]``.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder output is reused for every decoding step.
    encoder_output = model.encode(source, source_mask)

    # Start with a single [SOS] token, typed like the source ids.
    decoder_input = torch.empty(1,1).fill_(sos_idx).type_as(source).to(device)

    while decoder_input.size(1) != max_len:
        # Mask sized to the tokens generated so far.
        decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)
        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # Only the last position predicts the next token.
        prob = model.project(out[:,-1])
        next_word = torch.max(prob, dim=1).indices

        next_token = torch.empty(1,1).type_as(source).fill_(next_word.item()).to(device)
        decoder_input = torch.cat([decoder_input, next_token], dim=1)

        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    """Print source, reference and greedy translation for a few validation examples.

    Args:
        model: model to evaluate; it is switched to eval mode.
        validation_ds: iterable of batches; every batch must have batch size 1.
        tokenizer_src: forwarded to ``greedy_decode``.
        tokenizer_tgt: forwarded to ``greedy_decode`` and used to decode its output.
        max_len: maximum generated length, forwarded to ``greedy_decode``.
        device: device the batch tensors are moved to.
        print_msg: callable receiving each output line.
        global_state: unused here.
        writer: unused here.
        num_examples: stop once this many examples have been shown.
    """
    model.eval()
    console_width = 80  # width of the separator line

    with torch.no_grad():  # inference only
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            # greedy_decode handles one sentence at a time.
            assert encoder_input.size(0)==1, 'Batch size must be 1 for validation.'

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0]  # reference translation
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            report = (
                '-'*console_width,
                f'SOURCE: {source_text}',
                f'TARGET: {target_text}',
                f'PREDICTED: {model_out_text}',
            )
            for line in report:
                print_msg(line)

            if count >= num_examples:
                break

### BLEU Evaluation
This section computes sentence-level BLEU scores for the model's greedy translations of the validation set.

In [None]:

from nltk.translate.bleu_score import sentence_bleu
import torch
from tqdm import tqdm

# This function will be used to compute BLEU score between reference and predicted translations
def compute_bleu(reference, candidate):
    """Score one predicted translation against a single reference with ``sentence_bleu``.

    Args:
        reference (list of str): tokens of the ground-truth translation.
        candidate (list of str): tokens of the predicted translation.

    Returns:
        float: the value returned by ``sentence_bleu``.
    """
    # sentence_bleu takes a list of references; there is only one here.
    references = [reference]
    return sentence_bleu(references, candidate)
def calculate_bleu_for_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, max_len, device):
    """Average the sentence-level BLEU of greedy translations over ``val_dataloader``.

    For every batch one translation is produced with ``greedy_decode`` and
    scored against each reference text in the batch. The first scored example
    is printed as a sample.

    Args:
        model: model to evaluate; it is switched to eval mode.
        val_dataloader: iterable of batches with ``encoder_input``,
            ``encoder_mask`` and ``tgt_text`` entries.
        tokenizer_src: forwarded to ``greedy_decode``.
        tokenizer_tgt: forwarded to ``greedy_decode`` and used to decode its output.
        max_len: maximum generated length.
        device: device the batch tensors are moved to.

    Returns:
        Mean BLEU over all scored examples, or 0 when there were none.
    """
    model.eval()
    bleu_sum = 0
    scored = 0

    with torch.no_grad():  # inference only
        for batch in tqdm(val_dataloader, desc="Calculating BLEU for validation"):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            target_texts = batch['tgt_text']  # reference translations (list of strings)

            model_output = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)
            predicted_text = tokenizer_tgt.decode(model_output.tolist(), skip_special_tokens=True)

            # Whitespace tokens of the prediction, shared by every reference in the batch.
            candidate = predicted_text.split()

            for target_text in target_texts:
                bleu_score = compute_bleu(target_text.split(), candidate)
                bleu_sum += bleu_score
                scored += 1

                # Show the very first scored example only.
                if scored == 1:
                    print("\n--- Sample Validation Output ---")
                    print(f"Real: {target_text}")
                    print(f"Predicted: {predicted_text}")
                    print(f"BLEU score for this example: {bleu_score:.4f}")

    if scored > 0:
        return bleu_sum / scored
    return 0




In [None]:
# we pass as parameters the config dictionary, the length of the vocabulary of the source language and the target language
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used as both the source and the target sequence
    length and ``config['d_model']`` as the model width; every other
    ``build_transformer`` argument keeps its default.
    """
    seq_len = config['seq_len']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, d_model=config['d_model'])

### Configuration
This section defines the training configuration and the helper that builds checkpoint file paths.

In [None]:
# define settings for building and training the transfomer model
def get_config():
    """Return a fresh dict with the hyper-parameters and file locations for training."""
    # NOTE(review): 'model_basename' is an absolute path. get_weights_file_path joins it
    # after 'model_folder', and pathlib lets an absolute component replace what precedes
    # it, so checkpoints end up beside rather than inside 'model_folder' - confirm intended.
    return dict(
        batch_size=32,
        num_epochs=50,
        lr=5e-4,
        seq_len=256,
        d_model=512,  # embedding width; 512 as in the original Transformer paper
        lang_src='src',  # key of the source sentence in each pair
        lang_tgt='tgt',  # key of the target sentence in each pair
        src_file='/content/drive/MyDrive/fyp_translator/english_50k.txt',  # English text file
        tgt_file='/content/drive/MyDrive/fyp_translator/nepali_50k.txt',  # Nepali text file
        model_folder='/content/drive/MyDrive/fyp_translator/weights',
        model_basename='/content/drive/MyDrive/fyp_translator/tmodel_',
        preload='',  # empty string: start training from scratch
        tokenizer_file='/content/drive/MyDrive/fyp_translator/tokenizer_{0}.json',
        experiment_name='/content/drive/MyDrive/fyp_translator/runs/tmodel',
    )


# function to construct the path for saving and retrieving model weights
def get_weights_file_path(config, epoch: str):
    """Return the checkpoint path for ``epoch`` as a string.

    The file name is ``config['model_basename']`` followed by ``epoch`` and
    ``.pt``; it is joined onto ``'.'`` and ``config['model_folder']`` with
    ``pathlib``. An absolute ``model_basename`` therefore replaces the folder
    part entirely.
    """
    folder = config['model_folder']
    basename = config['model_basename']
    filename = f'{basename}{epoch}.pt'
    return str(Path('.').joinpath(folder, filename))


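### Training Loop
This section defines the training loop: it builds the data loaders and model, optimizes with Adam, prints sample translations and saves a checkpoint after every epoch, and reports the final BLEU score.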

In [None]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path
from tqdm import tqdm
import os  # Needed for file deletion

def train_model(config):
    """Train the transformer described by ``config`` and report validation BLEU.

    Steps: pick the device, create the output folders, build data loaders and
    model, optionally resume from a checkpoint, then for every epoch run the
    training batches, print sample translations, save a checkpoint and delete
    the previous one. After the last epoch the average BLEU score on the
    validation loader is printed.

    When ``config['preload']`` is set, the checkpoint's model weights,
    optimizer state, epoch and global step are all restored, so training
    continues from where it stopped.

    Args:
        config: dict as produced by ``get_config``. Keys read here:
            ``model_folder``, ``experiment_name``, ``lr``, ``preload``,
            ``num_epochs`` and ``seq_len`` (others are read by the helpers).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Using device {device}')

    # Folders for the checkpoints and for the TensorBoard logs.
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)
    experiment_path = Path(config['experiment_name'])
    experiment_path.mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    tgt_vocab_size = tokenizer_tgt.get_vocab_size()
    model = get_model(config, tokenizer_src.get_vocab_size(), tgt_vocab_size).to(device)

    # TensorBoard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    initial_epoch = 0
    global_step = 0

    # Resume from a saved checkpoint when one is requested.
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f'Preloading model {model_filename}')

        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
        state = torch.load(model_filename, map_location=device)

        # Restore the weights as well as the optimizer; without this the run
        # would continue with freshly initialised weights.
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])

        # Continue with the epoch after the one that was saved.
        initial_epoch = state['epoch'] + 1
        global_step = state['global_step']

    # Labels are target-vocabulary ids, so the padding id to ignore must come
    # from the target tokenizer.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    previous_model_filename = None  # checkpoint written by the previous epoch

    for epoch in range(initial_epoch, config['num_epochs']):
        batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch:02d}')

        for i, batch in enumerate(batch_iterator):
            # run_validation leaves the model in eval mode, so switch back.
            model.train()

            # Move inputs and masks to the training device.
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            decoder_mask = batch['decoder_mask'].to(device)

            # Forward pass through encoder, decoder and projection.
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            proj_output = model.project(decoder_output)

            label = batch['label'].to(device)

            # Flatten to (positions, vocab) vs (positions,) for the loss.
            loss = loss_fn(proj_output.view(-1, tgt_vocab_size), label.view(-1))

            batch_iterator.set_postfix({'loss': f'{loss.item():6.3f}'})

            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagate, update the weights, then clear the gradients.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            global_step += 1

            # Extra console/TensorBoard entry every 100 steps.
            if global_step % 100 == 0:
                print(f'Iteration {global_step}: loss = {loss.item():6.3f}')
                writer.add_scalar('iteration loss', loss.item(), global_step)
                writer.flush()

        # Sample translations at the end of each epoch.
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

        # Save a checkpoint after every epoch.
        model_filename = get_weights_file_path(config, f'epoch_{epoch+1}')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)

        print(f'Saved model for epoch {epoch+1}: {model_filename}')

        # Keep only the most recent checkpoint on disk.
        if previous_model_filename and os.path.exists(previous_model_filename):
            os.remove(previous_model_filename)
            print(f'Deleted previous model: {previous_model_filename}')

        previous_model_filename = model_filename

    # All training scalars have been written; release the log file.
    writer.close()

    print("\nTraining complete. Calculating BLEU score on validation data...")
    avg_bleu_score = calculate_bleu_for_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device)
    print(f"\nFinal Average BLEU score on validation data: {avg_bleu_score:.4f}")



In [None]:
import torch
torch.cuda.empty_cache()


## Transformer Training Method

This section trains the Transformer model using this specific configuration.

In [None]:
import warnings
if __name__=='__main__':
    warnings.filterwarnings('ignore')
    config=get_config() #retrieving config settings
    train_model(config) # training model with config arguments

Using device cuda
MAx Length of source Sentence: 226
MAx Length of target Sentence: 246
torch.Size([1, 256, 512])
torch.Size([1, 256, 512])


Processing epoch 00:   7%|▋         | 100/1407 [01:51<26:20,  1.21s/it, loss=7.662]

Iteration 100: loss =  7.662


Processing epoch 00:  14%|█▍        | 200/1407 [03:44<24:31,  1.22s/it, loss=7.337]

Iteration 200: loss =  7.337


Processing epoch 00:  21%|██▏       | 300/1407 [05:36<22:31,  1.22s/it, loss=7.350]

Iteration 300: loss =  7.350


Processing epoch 00:  28%|██▊       | 400/1407 [07:28<20:25,  1.22s/it, loss=6.919]

Iteration 400: loss =  6.919


Processing epoch 00:  36%|███▌      | 500/1407 [09:21<18:25,  1.22s/it, loss=7.068]

Iteration 500: loss =  7.068


Processing epoch 00:  43%|████▎     | 600/1407 [11:13<16:24,  1.22s/it, loss=6.988]

Iteration 600: loss =  6.988


Processing epoch 00:  50%|████▉     | 700/1407 [13:06<14:21,  1.22s/it, loss=6.835]

Iteration 700: loss =  6.835


Processing epoch 00:  57%|█████▋    | 800/1407 [14:58<12:18,  1.22s/it, loss=6.995]

Iteration 800: loss =  6.995


Processing epoch 00:  64%|██████▍   | 900/1407 [16:50<10:18,  1.22s/it, loss=6.647]

Iteration 900: loss =  6.647


Processing epoch 00:  71%|███████   | 1000/1407 [18:43<08:14,  1.22s/it, loss=6.678]

Iteration 1000: loss =  6.678


Processing epoch 00:  78%|███████▊  | 1100/1407 [20:35<06:14,  1.22s/it, loss=6.699]

Iteration 1100: loss =  6.699


Processing epoch 00:  85%|████████▌ | 1200/1407 [22:28<04:11,  1.22s/it, loss=6.628]

Iteration 1200: loss =  6.628


Processing epoch 00:  92%|█████████▏| 1300/1407 [24:20<02:10,  1.22s/it, loss=6.590]

Iteration 1300: loss =  6.590


Processing epoch 00: 100%|█████████▉| 1400/1407 [26:13<00:08,  1.22s/it, loss=6.528]

Iteration 1400: loss =  6.528


Processing epoch 00: 100%|██████████| 1407/1407 [26:20<00:00,  1.12s/it, loss=6.516]


--------------------------------------------------------------------------------
SOURCE: Necessary facilities will also be provided to establish a slaughter house in Kathmandu in the private sector. e. Brick and Tile Factory:
TARGET: भविष्यमा मेची, कोशी र सगरमाथा अञ्चलको पहाडी भागको र विराटनगर क्षेत्रको हुन सक्ने माग पूर्तिको लागि तमोर र अरुण नदीमा जलविद्युत आयोजनाको लागि प्रारम्भिक सर्वेक्षण गरिनेछ।
PREDICTED: 
--------------------------------------------------------------------------------
SOURCE: It was so, that after they had carried it about, the hand of Yahweh was against the city with a very great confusion: and he struck the men of the city, both small and great; and tumors broke out on them.
TARGET: फेरि परमेश्वरको पवित्र सन्दूक जब गातमा पुग्यो तब देखि नै परमप्रभुले त्यस शहरलाई सजाय दिन थाल्नुभयो। मानिसहरूमा खैलाबैला भयो। परमप्रभुले के बालक के वृद्ध सबलाई दुःख दिनथाल्नु भयो। परमेश्वरले गातवासीहरूमा गिर्खाको रूढी फैलाइ दिनुभयो।
PREDICTED: तब दाऊदले भने , “ , , , , , , , , , , ,

Processing epoch 01:   7%|▋         | 93/1407 [01:44<26:39,  1.22s/it, loss=6.416]

Iteration 1500: loss =  6.416


Processing epoch 01:  14%|█▎        | 193/1407 [03:37<24:36,  1.22s/it, loss=6.363]

Iteration 1600: loss =  6.363


Processing epoch 01:  21%|██        | 293/1407 [05:29<22:35,  1.22s/it, loss=6.640]

Iteration 1700: loss =  6.640


Processing epoch 01:  28%|██▊       | 393/1407 [07:21<20:36,  1.22s/it, loss=6.181]

Iteration 1800: loss =  6.181


Processing epoch 01:  35%|███▌      | 493/1407 [09:14<18:31,  1.22s/it, loss=6.141]

Iteration 1900: loss =  6.141


Processing epoch 01:  42%|████▏     | 593/1407 [11:06<16:31,  1.22s/it, loss=6.268]

Iteration 2000: loss =  6.268


Processing epoch 01:  49%|████▉     | 693/1407 [12:59<14:28,  1.22s/it, loss=6.096]

Iteration 2100: loss =  6.096


Processing epoch 01:  56%|█████▋    | 793/1407 [14:51<12:25,  1.21s/it, loss=6.359]

Iteration 2200: loss =  6.359


Processing epoch 01:  63%|██████▎   | 893/1407 [16:43<10:24,  1.22s/it, loss=6.138]

Iteration 2300: loss =  6.138


Processing epoch 01:  71%|███████   | 993/1407 [18:35<08:24,  1.22s/it, loss=6.182]

Iteration 2400: loss =  6.182


Processing epoch 01:  78%|███████▊  | 1093/1407 [20:28<06:21,  1.22s/it, loss=6.389]

Iteration 2500: loss =  6.389


Processing epoch 01:  85%|████████▍ | 1193/1407 [22:20<04:19,  1.21s/it, loss=6.203]

Iteration 2600: loss =  6.203


Processing epoch 01:  92%|█████████▏| 1293/1407 [24:12<02:18,  1.21s/it, loss=6.195]

Iteration 2700: loss =  6.195


Processing epoch 01:  99%|█████████▉| 1393/1407 [26:04<00:16,  1.21s/it, loss=6.033]

Iteration 2800: loss =  6.033


Processing epoch 01: 100%|██████████| 1407/1407 [26:19<00:00,  1.12s/it, loss=6.513]


--------------------------------------------------------------------------------
SOURCE: Although reliable statistics on this question should be available from the national census to be held in 1971, there is little reason to believe that there has been any remarkable change in the industrial distribution of the labour force. This is because of the underdeveloped nature of the country and the fact that the rate of economic development has been minimal.
TARGET: यस विषयमा भरपर्दे तथ्याक आगामी २०२८ सालमा गरिने राष्ट्रव्यापी जनगणनाबाट प्राप्त हुनेछ तापनि नेपालको आर्थिक अवस्था ज्यादै अविकशितरुपमा भएको र आर्थिक विकासको दर ज्यादै न्यूनतम देखिन आएकोले श्रमशक्तिको ब्यावसायिक विभागमा ठोस परिवर्तन आएको विश्वास गर्न सकिने स्थित अवश्यै छैन
PREDICTED: साथै
--------------------------------------------------------------------------------
SOURCE: I will walk before Yahweh in the land of the living.
TARGET: जीवितहरूको देशमा पनि म परमप्रभुको सेवा अनवरत गर्नेछु।
PREDICTED: म मेरो ।
Saved model for epoch 2

Processing epoch 02:   6%|▌         | 86/1407 [01:36<26:43,  1.21s/it, loss=5.938]

Iteration 2900: loss =  5.938


Processing epoch 02:  13%|█▎        | 186/1407 [03:28<24:45,  1.22s/it, loss=5.987]

Iteration 3000: loss =  5.987


Processing epoch 02:  20%|██        | 286/1407 [05:21<22:43,  1.22s/it, loss=5.863]

Iteration 3100: loss =  5.863


Processing epoch 02:  27%|██▋       | 386/1407 [07:13<20:41,  1.22s/it, loss=5.815]

Iteration 3200: loss =  5.815


Processing epoch 02:  35%|███▍      | 486/1407 [09:05<18:44,  1.22s/it, loss=6.046]

Iteration 3300: loss =  6.046


Processing epoch 02:  42%|████▏     | 586/1407 [10:58<16:35,  1.21s/it, loss=5.832]

Iteration 3400: loss =  5.832


Processing epoch 02:  49%|████▉     | 686/1407 [12:50<14:38,  1.22s/it, loss=5.891]

Iteration 3500: loss =  5.891


Processing epoch 02:  56%|█████▌    | 786/1407 [14:42<12:34,  1.21s/it, loss=5.862]

Iteration 3600: loss =  5.862


Processing epoch 02:  63%|██████▎   | 886/1407 [16:34<10:33,  1.22s/it, loss=5.997]

Iteration 3700: loss =  5.997


Processing epoch 02:  70%|███████   | 986/1407 [18:26<08:29,  1.21s/it, loss=6.064]

Iteration 3800: loss =  6.064


Processing epoch 02:  77%|███████▋  | 1086/1407 [20:18<06:30,  1.22s/it, loss=6.119]

Iteration 3900: loss =  6.119


Processing epoch 02:  84%|████████▍ | 1186/1407 [22:10<04:28,  1.21s/it, loss=6.024]

Iteration 4000: loss =  6.024


Processing epoch 02:  91%|█████████▏| 1286/1407 [24:03<02:26,  1.21s/it, loss=5.834]

Iteration 4100: loss =  5.834


Processing epoch 02:  99%|█████████▊| 1386/1407 [25:55<00:25,  1.21s/it, loss=5.820]

Iteration 4200: loss =  5.820


Processing epoch 02: 100%|██████████| 1407/1407 [26:18<00:00,  1.12s/it, loss=5.721]


--------------------------------------------------------------------------------
SOURCE: "Vanity of vanities," says the Preacher; "Vanity of vanities, all is vanity."
TARGET: उपदेशकको भनाइ छ; सबै थोक व्यर्थैका छन र अकारथ छ। अर्थात् सब कुरा व्यर्थ छ।
PREDICTED: “ यदि कुनै मानिसले मलाई भने , उसले भने , “ ।”
--------------------------------------------------------------------------------
SOURCE: For whom he foreknew, he also predestined to be conformed to the image of his Son, that he might be the firstborn among many brothers.
TARGET: यो संसार बनाउन भन्दा अघिदेखि नै परमेश्वरले ती मानिसहरूलाई जान्नु हुन्थ्यो।अनि परमेश्वरले तिनीहरू आफ्नै पुत्र जस्तै होउन् भन्ने निर्णय गर्नुभयो। त्यसैकारण येशू धेरै दाज्यू-भाईहरू र दिदी-बहिनीहरूमा जेठो हुनुभयो।
PREDICTED: “ यदि कुनै मानिसले आफ्नो बाबुलाई भने उसले आफ्नो । उसले आफ्नो ।
Saved model for epoch 3: /content/drive/MyDrive/fyp_translator/tmodel_epoch_3.pt
Deleted previous model: /content/drive/MyDrive/fyp_translator/tmodel_epoch_2.pt


Processing epoch 03:   6%|▌         | 79/1407 [01:28<26:52,  1.21s/it, loss=5.812]

Iteration 4300: loss =  5.812


Processing epoch 03:  13%|█▎        | 179/1407 [03:21<24:54,  1.22s/it, loss=5.640]

Iteration 4400: loss =  5.640


Processing epoch 03:  20%|█▉        | 279/1407 [05:13<22:48,  1.21s/it, loss=5.773]

Iteration 4500: loss =  5.773


Processing epoch 03:  27%|██▋       | 379/1407 [07:05<20:52,  1.22s/it, loss=5.541]

Iteration 4600: loss =  5.541


Processing epoch 03:  34%|███▍      | 479/1407 [08:57<18:52,  1.22s/it, loss=5.663]

Iteration 4700: loss =  5.663


Processing epoch 03:  40%|████      | 566/1407 [10:35<15:41,  1.12s/it, loss=5.336]

In [None]:

# Load the TensorBoard notebook extension. The %reload_ext that follows reloads it,
# so re-running this cell in a kernel where it is already loaded still works.
%load_ext tensorboard
%reload_ext tensorboard

# Start TensorBoard inline in the notebook, reading event files from the relative
# directory runs/tmodel (presumably where the training loop writes its logs -- confirm).
%tensorboard --logdir runs/tmodel


In [None]:


# TODO: implement a function that iterates over the validation data and computes the BLEU score.


In [None]:
# Scratch demo of zip(): it pairs the i-th items of both lists into tuples.
list1, list2 = [1, 2, 3], [3, 4, 5]
zipped = zip(list1, list2)
# zip() is lazy, so unpack it into a list to display the pairs (this exhausts `zipped`).
print([*zipped])

In [None]:
# Smoke-test fixture for MultiHeadAttention: small dummy hyperparameters and inputs.
batch_size, sequence_length = 2, 200
d_model, num_heads = 512, 8
dropout = 0.1

# Random query/key/value tensors, each of shape [batch_size, sequence_length, d_model],
# drawn in q, k, v order.
q, k, v = (torch.randn(batch_size, sequence_length, d_model) for _ in range(3))

# Optional mask of shape [batch_size, sequence_length]; all True, i.e. nothing is masked.
mask = torch.ones(batch_size, sequence_length, dtype=torch.bool)

# Build the multi-head attention module under test.
mha = MultiHeadAttention(
    d_model=d_model,
    num_heads=num_heads,
    dropout=dropout,
)

# Forward pass is currently disabled; uncomment to run it.
# output = mha(q, k, v, mask)

# # Inspect the result. Expected shape is presumably torch.Size([2, 200, 512]),
# # i.e. [batch_size, sequence_length, d_model] -- TODO confirm against the module.
# print("Output shape:", output.shape)
# print("Output tensor:", output)


In [None]:
# mha.forward(q,k,v,mask)