# Banglish to Bengali Transliteration Model

This notebook fine-tunes mBART, a pretrained multilingual sequence-to-sequence model, to transliterate Banglish (Bengali written in the Latin alphabet) into Bengali script.

In [None]:
# Install required packages
%pip install transformers datasets torch sentencepiece tensorboard --quiet

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration,
    MBartTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Loading the SKNahin/bengali-transliteration-data dataset from the Hugging Face Hub
print("Loading bengali-transliteration-data dataset...")
ds = load_dataset("SKNahin/bengali-transliteration-data")
print("Dataset loaded successfully!")
print(f"Number of examples: {len(ds['train'])}")

# Preview a few rows.
# Slicing a Dataset (ds['train'][:3]) returns a dict of columns, so iterating
# over it would yield column names rather than rows. `select` keeps the
# row-wise view, and iterating a Dataset yields one dict per example.
# NOTE(review): the 'banglish' / 'bangla' column names are assumed here and in
# the preprocessing cell -- confirm them against ds['train'].column_names.
print("\nFirst few examples from the dataset:")
preview_count = min(3, len(ds['train']))
for i, example in enumerate(ds['train'].select(range(preview_count))):
    print(f"\nExample {i+1}:")
    print(f"Banglish: {example['banglish']}")
    print(f"Bangla: {example['bangla']}")

In [None]:
# Initializing tokenizer and model
model_checkpoint = "facebook/mbart-large-cc25"

# mBART tokenizers append a language code to every sequence, and target-side
# tokenization looks up `tgt_lang`; leaving it unset makes that lookup fail.
# The cc25 checkpoint has no Bengali code (bn_IN only exists in mBART-50), so
# an existing code is used as a consistent placeholder on both sides while the
# <bn>...</bn> markers added below identify the transliteration task.
tokenizer = MBartTokenizer.from_pretrained(
    model_checkpoint,
    src_lang="en_XX",
    tgt_lang="en_XX",
)
model = MBartForConditionalGeneration.from_pretrained(model_checkpoint)

# Adding the <bn>/</bn> task markers.
# `additional_special_tokens` is overwritten by add_special_tokens, so the
# tokenizer's existing entries (the mBART language codes) are carried over;
# otherwise they would stop being treated as special tokens when decoding
# with skip_special_tokens=True.
special_tokens = {
    'additional_special_tokens': list(tokenizer.additional_special_tokens) + ['<bn>', '</bn>']
}
tokenizer.add_special_tokens(special_tokens)

# During training mBART feeds the decoder the target language code first, so
# generation (evaluation and inference) must start from the same token.
decoder_start_token_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)
model.config.decoder_start_token_id = decoder_start_token_id
if getattr(model, "generation_config", None) is not None:
    model.generation_config.decoder_start_token_id = decoder_start_token_id

# Grow the embedding matrix to cover the newly added marker tokens
model.resize_token_embeddings(len(tokenizer))

In [None]:
def preprocess_data(examples, max_length=128):
    """Tokenize a batch of Banglish/Bengali pairs for seq2seq training.

    Args:
        examples: A batch (dict of lists) with a 'banglish' column holding the
            romanized inputs and a 'bangla' column holding the Bengali targets.
        max_length: Maximum number of tokens kept for inputs and for targets;
            longer sequences are truncated.

    Returns:
        The tokenizer output for the inputs, with the tokenized targets stored
        under 'labels'.

    Sequences are deliberately left unpadded. Padding labels to a fixed length
    here would fill them with the real pad token id, which the loss does not
    ignore; DataCollatorForSeq2Seq pads each batch dynamically and uses -100
    for label padding instead.

    Note: target-side tokenization with MBartTokenizer relies on the
    tokenizer's `tgt_lang` being set.
    """
    # Wrap every input in the <bn>...</bn> markers that flag Banglish text
    inputs = [f"<bn>{text}</bn>" for text in examples['banglish']]
    targets = examples['bangla']

    # `text_target` tokenizes the targets in target mode and stores them under
    # 'labels'; it replaces the deprecated `as_target_tokenizer` context manager.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )
    return model_inputs

In [None]:
# Hold out 10% of the training split for validation (fixed seed for repeatability)
train_val = ds['train'].train_test_split(test_size=0.1, seed=42)


def _tokenize_split(split):
    """Run preprocess_data over a split in batches, dropping its original columns."""
    return split.map(
        preprocess_data,
        batched=True,
        remove_columns=split.column_names,
    )


# Tokenize both partitions with the same preprocessing
train_dataset = _tokenize_split(train_val['train'])
val_dataset = _tokenize_split(train_val['test'])

In [None]:
# Configuring training arguments

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# pick whichever keyword the installed version declares so the cell runs on both.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in Seq2SeqTrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    # Match the 128-token limit used for preprocessing and inference
    generation_max_length=128,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=4,
    logging_steps=100,
    **{eval_strategy_key: "epoch"},
)

# Initializing data collator (pads each batch dynamically; label padding is -100)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Defining metrics for evaluation
def compute_metrics(eval_pred):
    """Compute position-wise character accuracy between predictions and references.

    Args:
        eval_pred: A (predictions, labels) pair of token-id arrays. Predictions
            may also arrive as a tuple whose first element holds the token ids.

    Returns:
        A dict with 'character_accuracy': the number of characters that match
        the reference at the same position, divided by the total number of
        reference characters (0.0 when every reference decodes to an empty string).
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks padding/ignored positions and is not a valid token id, so swap
    # it for the pad token before decoding -- for predictions as well as labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Calculating character-level accuracy
    total_chars = sum(len(label) for label in decoded_labels)
    correct_chars = sum(
        1
        for pred, label in zip(decoded_preds, decoded_labels)
        for p, l in zip(pred, label)
        if p == l
    )

    # Guard against an evaluation set whose references are all empty
    if total_chars == 0:
        return {"character_accuracy": 0.0}
    return {"character_accuracy": correct_chars / total_chars}

In [None]:
# Collect everything the trainer needs, then build it in one call
_trainer_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**_trainer_kwargs)

In [None]:
# Training the model: kick off fine-tuning with the `trainer` built in the
# previous cell. The call's return value is left as the cell's last expression
# so the notebook displays it.
print("Starting training...")
trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded as a single checkpoint
model_path = "./model"
print("Saving model...")
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Testing the model
def translate_text(text):
    """Transliterate one Banglish string into Bengali script with the fine-tuned model.

    Args:
        text: Banglish (romanized Bengali) input string.

    Returns:
        The decoded model output with special tokens removed.
    """
    # Training leaves the model in train mode; switch dropout off for inference
    model.eval()

    # Same <bn>...</bn> wrapping and 128-token limit as in preprocessing. The
    # encoded batch is moved to the model's device, since the model sits on
    # the GPU after GPU training while the tokenizer returns CPU tensors.
    inputs = tokenizer(
        f"<bn>{text}</bn>",
        return_tensors="pt",
        truncation=True,
        max_length=128,
    ).to(model.device)

    # mBART expects the decoder to start from the target language code; fall
    # back to the source code when no target language is configured.
    start_lang = getattr(tokenizer, "tgt_lang", None) or tokenizer.src_lang
    outputs = model.generate(
        **inputs,
        max_length=128,
        decoder_start_token_id=tokenizer.convert_tokens_to_ids(start_lang),
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# A handful of Banglish phrases to sanity-check the fine-tuned model
test_texts = ["amar sonar bangla", "ami tomake bhalobashi", "kemon acho"]

print("\nTest translations:")
# map() is lazy, so each phrase is transliterated right before it is printed
for text, translated in zip(test_texts, map(translate_text, test_texts)):
    print(f"Input: {text}")
    print(f"Output: {translated}\n")