# 🇮🇳 Multilingual Translation: English–Hindi using Hugging Face Datasets
This notebook demonstrates neural machine translation with three models (MarianMT, a Seq2Seq demo, and mBART) on the `cfilt/iitb-english-hindi` dataset.

In [None]:

# âœ… Install dependencies
!pip install datasets transformers sentencepiece sacrebleu --quiet


In [None]:

# ✅ Load the IITB English–Hindi dataset
from datasets import load_dataset

ds = load_dataset(path="cfilt/iitb-english-hindi")

# Print the first training record as a quick look at the record layout.
print(ds["train"][0])


In [None]:

# ✅ Preprocessing
from transformers import MarianTokenizer
from sklearn.model_selection import train_test_split

model_name = 'Helsinki-NLP/opus-mt-en-hi'
tokenizer = MarianTokenizer.from_pretrained(model_name)

train_data = ds['train']

# Read the 'translation' column once and split each pair into its English and
# Hindi side, instead of walking the whole training split row by row twice.
translation_pairs = train_data['translation']
en_texts = [pair['en'] for pair in translation_pairs]
hi_texts = [pair['hi'] for pair in translation_pairs]

# Hold out 10% of the sentence pairs. The seed is fixed so the held-out
# sentences (and therefore the BLEU scores computed on them) are the same on
# every run.
train_en, val_en, train_hi, val_hi = train_test_split(
    en_texts, hi_texts, test_size=0.1, random_state=42
)

def tokenize_marian(src_texts, tgt_texts):
    """Tokenize parallel source/target sentences into one padded PyTorch batch.

    Uses the module-level ``tokenizer``. The targets are passed through the
    ``text_target`` argument of the tokenizer call, which replaces the
    deprecated ``prepare_seq2seq_batch`` helper.

    Args:
        src_texts: Source-language sentences (list of strings).
        tgt_texts: Target-language sentences, aligned with ``src_texts``.

    Returns:
        The encoding returned by the tokenizer call, with padding and
        truncation enabled and tensors in PyTorch format.
    """
    return tokenizer(
        src_texts,
        text_target=tgt_texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
    )

# Tokenize the first 500 held-out sentence pairs as a single batch.
val_inputs = tokenize_marian(src_texts=val_en[:500], tgt_texts=val_hi[:500])


In [None]:

# ✅ MarianMT Translation
from transformers import MarianMTModel
import torch
from sacrebleu import corpus_bleu

# Use the GPU when one is available; otherwise fall back to the CPU instead
# of failing on the hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = MarianMTModel.from_pretrained(model_name).to(device)
model.eval()

# Evaluate on the first 100 held-out pairs; slice once so the sources and the
# references are guaranteed to cover the same sentences.
eval_src = val_en[:100]
eval_ref = val_hi[:100]

translated = []
with torch.no_grad():
    # Translate in batches of 10 sentences.
    for start in range(0, len(eval_src), 10):
        batch = tokenizer(
            eval_src[start:start + 10],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        translated_tokens = model.generate(**batch)
        translated += tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# sacrebleu expects a list of reference streams, hence the extra list level.
bleu_marian = corpus_bleu(translated, [eval_ref])
print(f"BLEU (MarianMT): {bleu_marian.score:.2f}")


In [None]:

# ✅ Seq2Seq (BERT–BERT) demo loss
from transformers import AutoTokenizer, EncoderDecoderModel
import torch

# Use the GPU when one is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

encoder_name = "bert-base-uncased"
decoder_name = "bert-base-multilingual-cased"

seq2seq_model = EncoderDecoderModel.from_encoder_decoder_pretrained(encoder_name, decoder_name)
seq2seq_model.to(device)
seq2seq_model.eval()

# Each BERT checkpoint must receive ids from its own vocabulary, so load the
# tokenizer matching each side rather than reusing the MarianMT tokenizer.
enc_tokenizer = AutoTokenizer.from_pretrained(encoder_name)
dec_tokenizer = AutoTokenizer.from_pretrained(decoder_name)

# The model derives decoder inputs by shifting the labels to the right, which
# requires a start token id and a pad token id in the model config.
seq2seq_model.config.decoder_start_token_id = dec_tokenizer.cls_token_id
seq2seq_model.config.pad_token_id = dec_tokenizer.pad_token_id

enc_batch = enc_tokenizer(train_en[:100], return_tensors="pt", padding=True, truncation=True)
inputs = enc_batch.input_ids
labels = dec_tokenizer(train_hi[:100], return_tensors="pt", padding=True, truncation=True).input_ids
# Exclude padding positions from the loss.
labels[labels == dec_tokenizer.pad_token_id] = -100

# Only the loss value is reported, so no gradients are needed.
with torch.no_grad():
    loss = seq2seq_model(
        input_ids=inputs.to(device),
        attention_mask=enc_batch.attention_mask.to(device),
        labels=labels.to(device),
    ).loss
print(f"Seq2Seq RNN Loss (sample): {loss.item():.4f}")


In [None]:

# ✅ mBART Translation
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from sacrebleu import corpus_bleu
import torch

# Use the GPU when one is available; otherwise fall back to the CPU instead
# of failing on the hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name)
mbart_model = MBartForConditionalGeneration.from_pretrained(mbart_model_name).to(device)
mbart_model.eval()

mbart_tokenizer.src_lang = "en_XX"
mbart_tokenizer.tgt_lang = "hi_IN"
# Id of the Hindi language-code token; forcing it as the first generated
# token selects Hindi as the output language.
hi_token_id = mbart_tokenizer.convert_tokens_to_ids("hi_IN")

# Evaluate on the first 100 held-out pairs; slice once so the sources and the
# references are guaranteed to cover the same sentences.
eval_src = val_en[:100]
eval_ref = val_hi[:100]

mbart_translations = []
with torch.no_grad():
    # Translate in batches of 10 sentences (same batching as the MarianMT
    # cell) rather than pushing all 100 sentences through the model at once.
    for start in range(0, len(eval_src), 10):
        batch = mbart_tokenizer(
            eval_src[start:start + 10],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        generated_ids = mbart_model.generate(**batch, forced_bos_token_id=hi_token_id)
        mbart_translations += mbart_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# sacrebleu expects a list of reference streams, hence the extra list level.
bleu_mbart = corpus_bleu(mbart_translations, [eval_ref])
print(f"BLEU (mBART): {bleu_mbart.score:.2f}")


In [None]:

# ✅ Final Summary
# Emit the report with a single print call, one entry per output line.
print(
    "==== Final BLEU Scores ====",
    f"MarianMT BLEU: {bleu_marian.score:.2f}",
    f"mBART BLEU:    {bleu_mbart.score:.2f}",
    "Seq2Seq RNN:   [Loss shown; BLEU requires fine-tuning]",
    sep="\n",
)
