In [45]:
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    MarianTokenizer,
    MarianMTModel,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# ---------------------------------------------------------------------------
# Experiment configuration read by the cells of this notebook.
# ---------------------------------------------------------------------------

# Pretrained Hindi -> English Marian checkpoint used as the starting point.
MODEL_NAME = "Helsinki-NLP/opus-mt-hi-en"

# Token budgets passed as max_length when tokenising source / target text.
MAX_LEN_HI, MAX_LEN_EN = 40, 40

# Optimisation settings.
LR = 5e-5
BATCH_SIZE = 8  # used for both the train and eval per-device batch size
# NOTE(review): the recorded IITB run below reports 'epoch': 10.0, so it was
# presumably executed before this value was raised to 20 - confirm.
EPOCHS = 20


In [34]:
# Load the translation model and its tokenizer from the same pretrained
# checkpoint so that vocabulary and weights match.
model = MarianMTModel.from_pretrained(MODEL_NAME)
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)



# IITB-HI-EN-Dataset

In [2]:
# Only the first 5% of the IITB English-Hindi training split is loaded.
dataset = load_dataset(
    "cfilt/iitb-english-hindi",
    split="train[:5%]",
)
print(dataset)



Dataset({
    features: ['translation'],
    num_rows: 82954
})


In [4]:
def preprocess_function(examples):
    """Tokenise a batch of Hindi/English pairs for seq2seq training.

    Args:
        examples: Batched rows whose ``"translation"`` entry is a list of
            dicts with ``"hi"`` (source) and ``"en"`` (target) strings.

    Returns:
        The tokenizer output for the Hindi side, with the English token ids
        attached under ``"labels"``.
    """
    pairs = examples["translation"]
    hindi_sentences = [pair["hi"] for pair in pairs]
    english_sentences = [pair["en"] for pair in pairs]

    # Both sides are truncated and padded to a fixed length.
    fixed_length = {"truncation": True, "padding": "max_length"}
    encoded = tokenizer(hindi_sentences, max_length=MAX_LEN_HI, **fixed_length)
    target = tokenizer(
        text_target=english_sentences, max_length=MAX_LEN_EN, **fixed_length
    )

    # NOTE(review): padded label positions keep the tokenizer's pad id rather
    # than -100, so they presumably contribute to the training loss. Masking
    # them here would also require compute_metrics to map -100 back to the
    # pad id before decoding - confirm before changing.
    encoded["labels"] = target["input_ids"]
    return encoded


In [5]:
# Hold out 30% of the pairs for validation; the seed is fixed so the split
# is the same on every run.
split = dataset.train_test_split(seed=42, test_size=0.3)
train_ds, val_ds = split["train"], split["test"]
print(f"Train size: {len(train_ds)} | Validation size: {len(val_ds)}")


Train size: 58067 | Validation size: 24887


In [6]:
# Tokenise both splits with identical settings; the raw "translation" column
# is dropped so only the tokenizer outputs remain.
tokenized_train, tokenized_val = (
    part.map(preprocess_function, batched=True, remove_columns=["translation"])
    for part in (train_ds, val_ds)
)


Map: 100%|██████████| 58067/58067 [00:07<00:00, 7500.18 examples/s]
Map: 100%|██████████| 24887/24887 [00:03<00:00, 8173.35 examples/s]


In [7]:
# sacreBLEU metric used by compute_metrics.
bleu = evaluate.load("sacrebleu")

# Batch collator handed to the trainer, built from the tokenizer and model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


def compute_metrics(eval_preds):
    """Compute the sacreBLEU score for one evaluation pass.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is taken as the generated token ids.

    Returns:
        dict: ``{"bleu": <sacreBLEU score>}``.
    """
    import numpy as np  # local import keeps this cell self-contained

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore-index sentinel used for padded positions; it is not
    # a valid token id, so map it back to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # sacreBLEU expects a list of reference lists, one per prediction.
    references = [[tokenizer.decode(ids, skip_special_tokens=True)] for ids in labels]
    return {"bleu": bleu.compute(predictions=decoded_preds, references=references)["score"]}


In [8]:
# Training configuration for the IITB fine-tuning run.
# NOTE(review): output_dir says "2percent" while the dataset cell loads
# "train[:5%]" - confirm the directory name is intentional.
training_args = Seq2SeqTrainingArguments(
    # Output locations.
    output_dir="./results-hi-en-2percent",
    logging_dir="./logs",
    save_total_limit=2,
    # Optimisation.
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=torch.cuda.is_available(),  # fp16 only when a CUDA device is present
    # Evaluation: once per epoch, scoring generated text.
    eval_strategy="epoch",
    predict_with_generate=True,
)


In [9]:
# Wire model, data, collator and metric into the seq2seq trainer.
# `processing_class` is the current keyword for passing the tokenizer; the
# older `tokenizer=` keyword is deprecated and emits a FutureWarning when the
# trainer is constructed.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [10]:
# Run fine-tuning with the trainer built in the previous cell. Left as a bare
# expression so the notebook displays the returned value.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Bleu
1,0.0529,0.038977,89.379874
2,0.0297,0.037266,90.979713
3,0.0235,0.032446,91.729577
4,0.0209,0.031196,92.258146
5,0.0151,0.030554,92.44064
6,0.0144,0.029498,92.291173
7,0.0125,0.029185,92.786146
8,0.0099,0.028136,92.920774
9,0.0094,0.027944,92.524119
10,0.0074,0.027881,92.811357




TrainOutput(global_step=72590, training_loss=0.023325058743552674, metrics={'train_runtime': 20268.2469, 'train_samples_per_second': 28.649, 'train_steps_per_second': 3.581, 'total_flos': 6151176113356800.0, 'train_loss': 0.023325058743552674, 'epoch': 10.0})

In [11]:
# Final evaluation pass with the trainer; report the BLEU it returns.
metrics = trainer.evaluate()
print(f"BLEU Score: {metrics['eval_bleu']:.2f}")

# Save the model and the tokenizer into the same directory.
model.save_pretrained("./fine_tuned_hi_en")
tokenizer.save_pretrained("./fine_tuned_hi_en")

print("✅ Fine-tuning complete. Model saved to ./fine_tuned_hi_en")


BLEU Score: 92.81
✅ Fine-tuning complete. Model saved to ./fine_tuned_hi_en


# BhasaAnuvaad Dataset (Spoken-Tutorial subset, Hindi → English)

In [1]:
import torch
import os
import re
import json
import random
import string
from dataclasses import dataclass
from typing import Dict, List, Union, Optional
import torch
import torchaudio
import librosa
import evaluate
from datasets import load_dataset, Audio, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
# "hindi" split of the "indic2en" configuration of the Spoken-Tutorial corpus.
data = load_dataset(
    "ai4bharat/Spoken-Tutorial",
    "indic2en",
    split="hindi",
)

In [41]:
# Name the two text columns after their language codes:
# "text" -> "hi" and "en_text" -> "en".
data = (
    data
    .rename_column("text", "hi")
    .rename_column("en_text", "en")
)

In [42]:
def to_translation(example):
    """Wrap one row's Hindi and English text in a nested ``translation`` dict.

    Args:
        example: Mapping with ``"hi"`` and ``"en"`` entries.

    Returns:
        dict: ``{"translation": {"hi": ..., "en": ...}}``.
    """
    pair = {lang: example[lang] for lang in ("hi", "en")}
    return {"translation": pair}

# Add the nested "translation" column, then drop the flat text columns it was
# built from.
data = data.map(to_translation).remove_columns(["hi", "en"])


In [43]:
# Hold out 30% of the pairs for validation; the seed is fixed so the split
# is the same on every run.
split = data.train_test_split(seed=42, test_size=0.3)
train_ds, val_ds = split["train"], split["test"]
print(f"Train size: {len(train_ds)} | Validation size: {len(val_ds)}")

Train size: 25629 | Validation size: 10984


In [44]:
# Row counts of the two splits, one per line.
print(len(train_ds), len(val_ds), sep="\n")

25629
10984


In [46]:
# Bare expression: lets the notebook display the training split's summary
# (features and row count) before the extra columns are removed.
train_ds

Dataset({
    features: ['chunked_audio_filepath', 'pred_text', 'audio_filepath', 'start_time', 'duration', 'alignment_score', 'video_id', 'en_mining_score', 'translation'],
    num_rows: 25629
})

In [47]:
# Strip the audio / alignment metadata columns from both splits so that only
# "translation" is left.
train_ds, val_ds = (
    part.remove_columns([
        "pred_text",
        "chunked_audio_filepath",
        "audio_filepath",
        "start_time",
        "duration",
        "alignment_score",
        "video_id",
        "en_mining_score",
    ])
    for part in (train_ds, val_ds)
)

In [48]:
# Bare expression: display the training split again to check which columns
# remain after the removal.
train_ds

Dataset({
    features: ['translation'],
    num_rows: 25629
})

In [49]:
def display_samples(dataset, num_samples=10):
    """Print the first ``num_samples`` Hindi/English pairs of ``dataset``.

    Args:
        dataset: Dataset with a ``"translation"`` column whose rows are dicts
            that may carry ``"hi"`` and ``"en"`` entries.
        num_samples: Upper bound on the number of pairs printed; fewer are
            shown when the dataset is shorter.

    Returns:
        None. The pairs are written to stdout.
    """
    # with_format returns a formatted view, so the caller's dataset keeps its
    # own format (set_format would change it in place).
    view = dataset.with_format(type="python", columns=["translation"])
    for i in range(min(num_samples, len(view))):
        pair = view[i]["translation"]  # fetch the row once per iteration
        hi = pair.get("hi", "")
        en = pair.get("en", "")
        print(f"{i+1}. {hi}\n   → {en}\n")

# Print up to ten training pairs as a quick visual check of the data.
display_samples(train_ds, num_samples=10)


1. किन्तु हमारे पास बहुत बड़ा डेटा समूह है और एक एक करके प्रत्येक विद्यार्थी का औसत निकालना नामुमकिन है
   → But we have such a large data set and calculating the mean of each student one by one is impossible

2. अब यह कोड का नया भाग है
   → Now, this is a new bit of code

3. नंबर बेस को 8 या 16 में बदलकर अपनी स्लाइड्स पर वापस आते हैं
   → The conversion depends on this number base

4. पहले सम्पर्क चुनें
   → First, select the Contact

5. अब फिर से नॉर्मल व्यू बटन पर क्लिक करते हैं
   → Lets click on the Normal view button again

6. और जब मैं फिक्स पर क्लिक करता हूँ तो मैं पूरे सिलेक्शन को चला सकता हूँ
   → And when I click on fix, I can move the whole selection

7. तो कृपया आगे बढ़ने से पहले इंट्रोडक्शन टो जियोजेब्रा ट्यूटोरियल देखें
   → If not, please go through the Introduction to Geogebra tutorial before proceeding further

8. हम यहाँ वेरिएबल में पासवर्ड स्टोर करने जा रहे हैं
   → We are going to store the password in a variable here

9. अधिक जानकारी दिए गए लिंक पर उपलब्ध है https

In [50]:
def preprocess_function(examples):
    """Tokenise a batch of Hindi/English pairs for seq2seq training.

    Same logic as the definition in the IITB section; redefined in this cell.

    Args:
        examples: Batched rows whose ``"translation"`` entry is a list of
            dicts with ``"hi"`` (source) and ``"en"`` (target) strings.

    Returns:
        The tokenizer output for the Hindi side, with the English token ids
        attached under ``"labels"``.
    """
    pairs = examples["translation"]
    hindi_sentences = [pair["hi"] for pair in pairs]
    english_sentences = [pair["en"] for pair in pairs]

    # Both sides are truncated and padded to a fixed length.
    fixed_length = {"truncation": True, "padding": "max_length"}
    encoded = tokenizer(hindi_sentences, max_length=MAX_LEN_HI, **fixed_length)
    target = tokenizer(
        text_target=english_sentences, max_length=MAX_LEN_EN, **fixed_length
    )

    # NOTE(review): padded label positions keep the tokenizer's pad id rather
    # than -100, so they presumably contribute to the training loss. Masking
    # them here would also require compute_metrics to map -100 back to the
    # pad id before decoding - confirm before changing.
    encoded["labels"] = target["input_ids"]
    return encoded


In [51]:
# Tokenise both splits with identical settings; the raw "translation" column
# is dropped so only the tokenizer outputs remain.
tokenized_train, tokenized_val = (
    part.map(preprocess_function, batched=True, remove_columns=["translation"])
    for part in (train_ds, val_ds)
)


Map: 100%|██████████| 25629/25629 [00:05<00:00, 4698.39 examples/s]
Map: 100%|██████████| 10984/10984 [00:02<00:00, 4987.38 examples/s]


In [52]:
# Start this experiment from the pretrained checkpoint again. Without the
# reload, a top-to-bottom run of the notebook would keep training the `model`
# object that the IITB section already fine-tuned and saved.
model = MarianMTModel.from_pretrained(MODEL_NAME)

# Batch collator handed to the trainer, built from the tokenizer and the
# freshly loaded model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# sacreBLEU metric used by compute_metrics.
bleu = evaluate.load("sacrebleu")


def compute_metrics(eval_preds):
    """Compute the sacreBLEU score for one evaluation pass.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is taken as the generated token ids.

    Returns:
        dict: ``{"bleu": <sacreBLEU score>}``.
    """
    import numpy as np  # local import keeps this cell self-contained

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore-index sentinel used for padded positions; it is not
    # a valid token id, so map it back to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # sacreBLEU expects a list of reference lists, one per prediction.
    references = [[tokenizer.decode(ids, skip_special_tokens=True)] for ids in labels]
    return {"bleu": bleu.compute(predictions=decoded_preds, references=references)["score"]}


In [53]:
# Training configuration for the Spoken-Tutorial (BA) fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Output locations.
    output_dir="./results-hi-en-BA",
    logging_dir="./logs",
    save_total_limit=2,
    # Optimisation.
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=torch.cuda.is_available(),  # fp16 only when a CUDA device is present
    # Evaluation: once per epoch, scoring generated text.
    eval_strategy="epoch",
    predict_with_generate=True,
)


In [54]:
# Wire model, data, collator and metric into the seq2seq trainer.
# `processing_class` is the current keyword for passing the tokenizer; the
# older `tokenizer=` keyword is deprecated and emits a FutureWarning when the
# trainer is constructed.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [55]:
# Run fine-tuning with the trainer built in the previous cell. Left as a bare
# expression so the notebook displays the returned value.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,0.8392,0.768914,31.454601
2,0.6232,0.694886,34.651562
3,0.4904,0.665377,35.857059
4,0.3924,0.656972,36.848582
5,0.3183,0.654907,37.924812
6,0.2502,0.663318,37.555238
7,0.1988,0.673486,38.477458
8,0.1665,0.682632,39.305956
9,0.1359,0.701377,39.160696
10,0.1089,0.711098,40.297085




TrainOutput(global_step=64080, training_loss=0.20190642944510362, metrics={'train_runtime': 27321.9135, 'train_samples_per_second': 18.761, 'train_steps_per_second': 2.345, 'total_flos': 5429882467123200.0, 'train_loss': 0.20190642944510362, 'epoch': 20.0})

In [56]:
# Final evaluation pass with the trainer; report the BLEU it returns.
metrics = trainer.evaluate()
print(f"BLEU Score: {metrics['eval_bleu']:.2f}")

# Save the model and the tokenizer into the same directory.
model.save_pretrained("./fine_tuned_hi_en_BA")
tokenizer.save_pretrained("./fine_tuned_hi_en_BA")

print("✅ Fine-tuning complete. Model saved to ./fine_tuned_hi_en_BA")


BLEU Score: 41.14
✅ Fine-tuning complete. Model saved to ./fine_tuned_hi_en_BA
