In [1]:
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    MarianTokenizer,
    MarianMTModel,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# Parameters
MODEL_NAME = "Helsinki-NLP/opus-mt-hi-en"
MAX_LEN_HI = 40
MAX_LEN_EN = 40
BATCH_SIZE = 8
EPOCHS = 10
LR = 5e-5


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
dataset = load_dataset("cfilt/iitb-english-hindi", split="train[:5%]")
print(dataset)



Dataset({
    features: ['translation'],
    num_rows: 82954
})


In [3]:
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)



In [4]:
def preprocess_function(examples):
    src_texts = [ex["hi"] for ex in examples["translation"]]
    tgt_texts = [ex["en"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        src_texts, max_length=MAX_LEN_HI, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        text_target=tgt_texts, max_length=MAX_LEN_EN, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [5]:
split = dataset.train_test_split(test_size=0.3, seed=42)
train_ds = split["train"]
val_ds = split["test"]
print(f"Train size: {len(train_ds)} | Validation size: {len(val_ds)}")


Train size: 58067 | Validation size: 24887


In [6]:
tokenized_train = train_ds.map(preprocess_function, batched=True, remove_columns=["translation"])
tokenized_val = val_ds.map(preprocess_function, batched=True, remove_columns=["translation"])


Map: 100%|██████████| 58067/58067 [00:07<00:00, 7500.18 examples/s]
Map: 100%|██████████| 24887/24887 [00:03<00:00, 8173.35 examples/s]


In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

bleu = evaluate.load("sacrebleu")


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]
    return {"bleu": bleu.compute(predictions=decoded_preds, references=labels)["score"]}


In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results-hi-en-2percent",
    eval_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
)


In [9]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [10]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Bleu
1,0.0529,0.038977,89.379874
2,0.0297,0.037266,90.979713
3,0.0235,0.032446,91.729577
4,0.0209,0.031196,92.258146
5,0.0151,0.030554,92.44064
6,0.0144,0.029498,92.291173
7,0.0125,0.029185,92.786146
8,0.0099,0.028136,92.920774
9,0.0094,0.027944,92.524119
10,0.0074,0.027881,92.811357




TrainOutput(global_step=72590, training_loss=0.023325058743552674, metrics={'train_runtime': 20268.2469, 'train_samples_per_second': 28.649, 'train_steps_per_second': 3.581, 'total_flos': 6151176113356800.0, 'train_loss': 0.023325058743552674, 'epoch': 10.0})

In [11]:
metrics = trainer.evaluate()
print(f"BLEU Score: {metrics['eval_bleu']:.2f}")

model.save_pretrained("./fine_tuned_hi_en")
tokenizer.save_pretrained("./fine_tuned_hi_en")

print("✅ Fine-tuning complete. Model saved to ./fine_tuned_hi_en")


BLEU Score: 92.81
✅ Fine-tuning complete. Model saved to ./fine_tuned_hi_en
