# 🔄 Multilingual Translation System
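This Colab notebook demonstrates English–Tamil translation on the Samanantar dataset using three models: MarianMT, a Seq2Seq encoder–decoder with attention (built from BERT checkpoints), and mBART. MarianMT and mBART are scored with BLEU; the Seq2Seq model is only run once to report a sample loss.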

In [None]:

# Install required libraries (the leading `!` runs the line as a shell command
# from the notebook).
#   datasets     -> load_dataset
#   transformers -> tokenizers and models used in the cells below
#   sacrebleu    -> corpus_bleu
#   sentencepiece is not imported directly in this notebook — presumably a
#   dependency of the tokenizers; confirm before removing it.
# --quiet reduces pip's console output.
!pip install datasets transformers sentencepiece sacrebleu --quiet


In [None]:

# Load and explore dataset
from datasets import load_dataset

# Fetch the "ta" configuration of Samanantar, then print the first training
# record so the record layout is visible before any preprocessing.
ds = load_dataset(path="ai4bharat/samanantar", name="ta")
print(ds["train"][0])


In [None]:

# Preprocessing
from transformers import MarianTokenizer
from sklearn.model_selection import train_test_split

# Checkpoint name shared by the Marian tokenizer here and the MarianMT model
# loaded in the translation cell.
model_name = 'Helsinki-NLP/opus-mt-en-ta'
tokenizer = MarianTokenizer.from_pretrained(model_name)

train_data = ds['train']

# Read whole columns instead of iterating row by row: row iteration builds one
# Python dict per example, which is very slow on a multi-million-row split.
if "translation" in train_data.column_names:
    # Nested layout: {"translation": {"en": ..., "ta": ...}}
    pairs = train_data["translation"]
    en_texts = [pair["en"] for pair in pairs]
    ta_texts = [pair["ta"] for pair in pairs]
else:
    # Flat layout documented for Samanantar: English text in `src`,
    # Indic-language text in `tgt`. Indexing by 'translation' raises KeyError
    # on this layout.
    en_texts = list(train_data["src"])
    ta_texts = list(train_data["tgt"])

# Fixed seed so the validation sentences (and therefore the BLEU scores
# reported below) are the same on every run.
train_en, val_en, train_ta, val_ta = train_test_split(
    en_texts, ta_texts, test_size=0.1, random_state=42
)

def tokenize_marian(src_texts, tgt_texts):
    """Tokenize parallel sentences into one padded PyTorch batch.

    Args:
        src_texts: Source-language sentences (list of str).
        tgt_texts: Target-language sentences aligned with ``src_texts``.

    Returns:
        The tokenizer's batch encoding. Source ids and mask are under
        ``input_ids`` / ``attention_mask``; target ids are under ``labels``.
    """
    # `prepare_seq2seq_batch` is deprecated in transformers; passing the
    # targets through `text_target` is the supported replacement and yields
    # the same keys.
    inputs = tokenizer(
        src_texts,
        text_target=tgt_texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
    )
    return inputs

# Tokenize up to the first 500 validation pairs as a single padded batch.
val_inputs = tokenize_marian(
    val_en[:500],
    val_ta[:500],
)


In [None]:

# MarianMT Translation
from transformers import MarianMTModel
import torch
from sacrebleu import corpus_bleu

# Fall back to CPU when no GPU runtime is attached; a hard-coded 'cuda'
# raises as soon as the model is moved.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = MarianMTModel.from_pretrained(model_name).to(device)
model.eval()

# Evaluate on a small, fixed-size slice of the validation split. Sources and
# references are cut from the same indices so they stay aligned.
eval_sources = val_en[:100]
eval_references = val_ta[:100]
batch_size = 10

translated = []
with torch.no_grad():
    for start in range(0, len(eval_sources), batch_size):
        batch = tokenizer(
            eval_sources[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        translated_tokens = model.generate(**batch)
        translated.extend(
            tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        )

# corpus_bleu takes the hypotheses plus a list of reference streams; there is
# a single reference per sentence here, hence the one-element outer list.
bleu_marian = corpus_bleu(translated, [eval_references])
print(f"BLEU (MarianMT): {bleu_marian.score:.2f}")


In [None]:

# Seq2Seq encoder-decoder with cross-attention.
# NOTE: despite the "RNN" label in the printed message, this is a
# BERT-to-BERT transformer warm-started from two pretrained checkpoints.
import torch
from transformers import AutoTokenizer, EncoderDecoderModel

encoder_name = "bert-base-uncased"
decoder_name = "bert-base-multilingual-cased"

# Fall back to CPU when no GPU runtime is attached.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seq2seq_model = EncoderDecoderModel.from_encoder_decoder_pretrained(encoder_name, decoder_name)
seq2seq_model.to(device)

# Each side must be tokenized with the vocabulary its checkpoint was trained
# on. Feeding Marian token ids into BERT embeddings is meaningless and can
# index past the embedding table.
encoder_tokenizer = AutoTokenizer.from_pretrained(encoder_name)
decoder_tokenizer = AutoTokenizer.from_pretrained(decoder_name)

# Required to build decoder inputs from `labels`; without them the forward
# pass raises when it tries to shift the labels right.
seq2seq_model.config.decoder_start_token_id = decoder_tokenizer.cls_token_id
seq2seq_model.config.pad_token_id = decoder_tokenizer.pad_token_id

encoded_src = encoder_tokenizer(train_en[:100], return_tensors="pt", padding=True, truncation=True)
inputs = encoded_src.input_ids
labels = decoder_tokenizer(train_ta[:100], return_tensors="pt", padding=True, truncation=True).input_ids
# -100 is the ignore index of the loss, so padded positions do not count.
labels[labels == decoder_tokenizer.pad_token_id] = -100

# The loss is only printed, never back-propagated, so skip building the
# autograd graph (it would otherwise stay alive through `loss`).
with torch.no_grad():
    loss = seq2seq_model(
        input_ids=inputs.to(device),
        attention_mask=encoded_src.attention_mask.to(device),
        labels=labels.to(device),
    ).loss
print(f"Seq2Seq RNN Loss (sample): {loss.item():.4f}")


In [None]:

# mBART Translation
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Fall back to CPU when no GPU runtime is attached.
device = "cuda" if torch.cuda.is_available() else "cpu"

mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name)
mbart_model = MBartForConditionalGeneration.from_pretrained(mbart_model_name).to(device)

mbart_tokenizer.src_lang = "en_XX"
mbart_tokenizer.tgt_lang = "ta_IN"
# The language code is an ordinary vocabulary token, so it can be looked up
# without relying on the tokenizer-specific `lang_code_to_id` mapping.
target_lang_id = mbart_tokenizer.convert_tokens_to_ids("ta_IN")

# Same evaluation slice as the MarianMT cell, so both BLEU scores are computed
# against identical references.
mbart_sources = val_en[:100]
mbart_references = val_ta[:100]
mbart_batch_size = 10

# Generate in small batches: pushing all sentences through a large model in a
# single call risks running out of accelerator memory.
mbart_translations = []
with torch.no_grad():
    for start in range(0, len(mbart_sources), mbart_batch_size):
        batch = mbart_tokenizer(
            mbart_sources[start:start + mbart_batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        # forced_bos_token_id makes the decoder start with the Tamil language token.
        generated_ids = mbart_model.generate(**batch, forced_bos_token_id=target_lang_id)
        mbart_translations.extend(
            mbart_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        )

bleu_mbart = corpus_bleu(mbart_translations, [mbart_references])
print(f"BLEU (mBART): {bleu_mbart.score:.2f}")


In [None]:

# Final Evaluation Summary
# Collect the report lines first, then emit them in one write; the output is
# the same four lines in the same order.
summary_lines = [
    "==== Final BLEU Scores ====",
    f"MarianMT BLEU: {bleu_marian.score:.2f}",
    f"mBART BLEU:    {bleu_mbart.score:.2f}",
    "Seq2Seq RNN:   [Requires fine-tuning for full BLEU evaluation]",
]
print("\n".join(summary_lines))
