In [None]:
!pip install transformers datasets evaluate sacrebleu sentencepiece accelerate -q

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.1/84.1 kB 3.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.8/100.8 kB 6.3 MB/s eta 0:00:00

In [None]:
# --- 1. SETUP & INSTALLATION ---
# Install required libraries quietly
!pip install transformers datasets evaluate sacrebleu sentencepiece accelerate -q

import os
import shutil
import warnings

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# Silence all warnings so the notebook output stays readable
warnings.filterwarnings(action="ignore")

# --- 2. CONFIGURATION (LOCAL SAVE) ---
# Folder inside the Colab instance where the trained model is written
local_save_path = "./en_hi_model"

# --- 3. TRANSLATION SYSTEM CLASS ---
class TranslationSystem:
    """Fine-tune and run a pretrained sequence-to-sequence translation model.

    Bundles a Hugging Face tokenizer/model pair with a sacreBLEU metric and
    offers helpers to tokenize the IIT Bombay English-Hindi corpus, fine-tune
    on a small slice of it, and translate individual sentences.
    """

    def __init__(self, model_checkpoint="Helsinki-NLP/opus-mt-en-hi", source_lang="en", target_lang="hi"):
        """Load the tokenizer, the model and the evaluation metric.

        Args:
            model_checkpoint: Identifier handed to ``from_pretrained`` for both
                the tokenizer and the model.
            source_lang: Key of the source sentence inside each
                ``translation`` record.
            target_lang: Key of the target sentence inside each
                ``translation`` record.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_checkpoint = model_checkpoint
        self.source_lang = source_lang
        self.target_lang = target_lang

        print(f"\nInitializing System on {self.device.upper()}...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_checkpoint).to(self.device)
        self.metric = evaluate.load("sacrebleu")

    def preprocess_function(self, examples, max_length=128):
        """Tokenize a batch of translation pairs.

        Args:
            examples: Batch whose ``"translation"`` entry is a list of dicts
                keyed by language code.
            max_length: Token limit applied (with truncation) to both the
                source and the target side.

        Returns:
            The tokenized source batch with the target token ids stored
            under ``"labels"``.
        """
        pairs = examples["translation"]
        inputs = [ex[self.source_lang] for ex in pairs]
        targets = [ex[self.target_lang] for ex in pairs]

        model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True)
        labels = self.tokenizer(text_target=targets, max_length=max_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def load_and_prep_data(self, train_size=1000, val_size=100):
        """Load the corpus and tokenize a small train/validation slice.

        Args:
            train_size: Number of leading training examples to keep.
            val_size: Number of leading validation examples to keep.

        Returns:
            A ``(tokenized_train, tokenized_val)`` tuple.
        """
        print("Loading IIT Bombay English-Hindi Dataset...")
        dataset = load_dataset("cfilt/iitb-english-hindi")

        train_split = dataset["train"]
        val_split = dataset["validation"]

        # Use a small slice for assignment speed. The requested sizes are
        # clamped so asking for more rows than a split holds does not fail.
        small_train_dataset = train_split.select(range(min(train_size, len(train_split))))
        small_val_dataset = val_split.select(range(min(val_size, len(val_split))))

        print("Tokenizing data...")
        tokenized_train = small_train_dataset.map(self.preprocess_function, batched=True)
        tokenized_val = small_val_dataset.map(self.preprocess_function, batched=True)

        return tokenized_train, tokenized_val

    def compute_metrics(self, eval_preds):
        """Compute the sacreBLEU score for one evaluation pass.

        Args:
            eval_preds: ``(predictions, labels)`` pair of token-id arrays;
                ``predictions`` may itself be a tuple whose first element
                holds the token ids.

        Returns:
            ``{"bleu": score}`` where ``score`` is the metric's ``"score"``
            value.
        """
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]

        # -100 marks positions to ignore and is not a real token id, so it
        # must be swapped for the pad token before decoding. This applies to
        # the predictions as well as the labels.
        pad_id = self.tokenizer.pad_token_id
        preds = np.where(preds != -100, preds, pad_id)
        labels = np.where(labels != -100, labels, pad_id)

        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds = [pred.strip() for pred in decoded_preds]
        # The metric expects a list of references per prediction.
        decoded_labels = [[label.strip()] for label in decoded_labels]

        result = self.metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    def train(self, output_dir):
        """Fine-tune the model for one epoch and save it with its tokenizer.

        Args:
            output_dir: Directory used for checkpoints and for the final
                model and tokenizer files.
        """
        train_data, val_data = self.load_and_prep_data()

        print(f"Training will save to local path: {output_dir}")

        args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            weight_decay=0.01,
            save_total_limit=2,
            num_train_epochs=1,
            # Evaluate with generate() so compute_metrics receives token ids.
            predict_with_generate=True,
            # Mixed precision only when a CUDA device is present.
            fp16=torch.cuda.is_available(),
            report_to="none"
        )

        data_collator = DataCollatorForSeq2Seq(self.tokenizer, model=self.model)

        # The tokenizer is deliberately not passed to the trainer (the
        # 'tokenizer=' keyword raised a TypeError here); it is saved
        # explicitly once training has finished.
        trainer = Seq2SeqTrainer(
            model=self.model,
            args=args,
            train_dataset=train_data,
            eval_dataset=val_data,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
        )

        print("Starting Training...")
        trainer.train()
        print("Training Complete.")

        trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

    def translate_text(self, text):
        """Translate one piece of source-language text.

        Args:
            text: The text to translate.

        Returns:
            The first decoded translation, with special tokens removed.
        """
        self.model.eval()
        # Truncate so an over-long input cannot exceed the length the
        # tokenizer is configured for.
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True).to(self.device)
        translated_tokens = self.model.generate(**inputs)
        translated_text = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        return translated_text

# --- 4. EXECUTION ---
import numpy as np

# Build the system: loads the tokenizer, the model and the BLEU metric
mt_system = TranslationSystem()

# Fine-tune and write the result to Colab's temporary storage
mt_system.train(output_dir=local_save_path)

# Quick inference check on a few English sentences
print("\n--- Testing Translation ---")
test_sentences = [
    "The government has announced a new policy for public health.",
    "All schools will remain closed tomorrow due to heavy rain.",
    "Please wear a mask and maintain social distance.",
]

for sent in test_sentences:
    # Echo the source first, then translate, so output order is unchanged
    print(f"En: {sent}")
    hindi = mt_system.translate_text(sent)
    print(f"Hi: {hindi}")
    print("-" * 30)

# --- 5. (OPTIONAL) DOWNLOAD MODEL MANUALLY ---
# Packs the saved model folder into a zip that can be fetched from the
# Colab file explorer
print("\nZipping model for download...")
shutil.make_archive(
    base_name="my_translation_model",
    format="zip",
    root_dir=local_save_path,
)
print("Model zipped as 'my_translation_model.zip'. You can download it from the Files tab on the left.")


Initializing System on CUDA...


Loading weights:   0%|          | 0/258 [00:00<?, ?it/s]



Loading IIT Bombay English-Hindi Dataset...
Tokenizing data...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Training will save to local path: ./en_hi_model
Starting Training...


Epoch,Training Loss,Validation Loss,Bleu
1,No log,4.71605,5.233337


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Training Complete.


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


--- Testing Translation ---
En: The government has announced a new policy for public health.
Hi: सरकार ने जनता की सेहत के लिए एक नयी नीति घोषित की है ।
------------------------------
En: All schools will remain closed tomorrow due to heavy rain.
Hi: सभी स्कूल दिन भारी वर्षा के कारण बंद हो जाएँगे ।
------------------------------
En: Please wear a mask and maintain social distance.
Hi: प्लीज़ मास्क पहनो और सामाजिक दूरी बनाए रखें.
------------------------------

Zipping model for download...
