In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from datasets import load_dataset
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

In [None]:
# Download the transliteration corpus and split its "train" portion 90/10.
# The result exposes the held-out 10% under the "test" key; the fixed seed
# keeps the split reproducible across kernel restarts.
data = (
    load_dataset("SKNahin/bengali-transliteration-data")["train"]
    .train_test_split(test_size=0.1, seed=42)
)

In [None]:
# Checkpoint shared by the tokenizer and the model.
model_name = "facebook/mbart-large-50"

# Tokenizer configured with "en_XX" as the source language code and
# "bn_BD" as the target language code.
tokenizer = MBart50TokenizerFast.from_pretrained(
    model_name,
    src_lang="en_XX",
    tgt_lang="bn_BD",
)

# Pretrained sequence-to-sequence model to be fine-tuned below.
model = MBartForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Preprocessing Function
def preprocess_function(examples):
    """Tokenize source/target text pairs for sequence-to-sequence training.

    Args:
        examples: Mapping with an ``"rm"`` entry (source text) and a ``"bn"``
            entry (target text). With ``Dataset.map(..., batched=True)`` each
            entry is a list of strings; a single string per entry is also
            accepted.

    Returns:
        The source encoding produced by ``tokenizer`` (padded/truncated to
        128 tokens) with an added ``"labels"`` entry holding the target token
        ids, where every padding position is replaced by ``-100``.
    """
    model_inputs = tokenizer(
        examples["rm"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        text_target=examples["bn"],  # Use `text_target` for labels
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Targets are padded to a fixed length, so most label positions are pad
    # tokens. Replace them with -100 (the index the model's loss ignores);
    # otherwise the model is also trained to predict padding.
    pad_id = tokenizer.pad_token_id
    target_ids = labels["input_ids"]
    if target_ids and isinstance(target_ids[0], int):
        # Un-batched call: a single flat sequence of token ids.
        model_inputs["labels"] = [
            token_id if token_id != pad_id else -100 for token_id in target_ids
        ]
    else:
        # Batched call: one sequence of token ids per example.
        model_inputs["labels"] = [
            [token_id if token_id != pad_id else -100 for token_id in sequence]
            for sequence in target_ids
        ]
    return model_inputs

In [None]:
# Tokenize every split; batched=True hands preprocess_function lists of
# examples instead of one example at a time.
tokenized_datasets = data.map(
    function=preprocess_function,
    batched=True,
)

# Hyperparameters and bookkeeping for the fine-tuning run
training_args = Seq2SeqTrainingArguments(
    # --- where artifacts go ---
    output_dir="./mbart-banglish-to-bangla",
    logging_dir="./logs",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    # --- evaluation ---
    eval_strategy="epoch",
    predict_with_generate=True,
    # --- logging / checkpointing cadence ---
    logging_steps=100,
    save_steps=500,
    save_total_limit=3,
)


In [None]:
# Wire the model, tokenizer, hyperparameters and tokenized splits together.
# The "test" split produced by train_test_split serves as the validation set.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)

# Run fine-tuning.
trainer.train()

# Persist the fine-tuned model to disk.
trainer.save_model("./mbart-banglish-to-bangla")

In [None]:
# Example Inference Function
def translate_banglish_to_bangla(text):
    """Generate Bengali-script text for a romanized (Banglish) input.

    Args:
        text: Source string (or list of strings; only the first result is
            returned) to run through the fine-tuned model.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Select the source language through the tokenizer attribute; `src_lang`
    # is not a documented argument of the tokenizer call itself.
    tokenizer.src_lang = "en_XX"

    # After training the model usually lives on the GPU, so the encoded inputs
    # must be moved to the same device before calling generate().
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Switch off dropout so that decoding does not depend on whether the
    # model was last left in training mode.
    model.eval()

    # Force the first generated token to be the target language code. The id
    # is looked up through the vocabulary, which does not rely on the
    # tokenizer exposing a `lang_code_to_id` attribute.
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("bn_BD"),
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke-test the inference helper on one sample sentence
example_text = "tumi vaat kheyecho?"
bangla_output = translate_banglish_to_bangla(example_text)
print(bangla_output)