In [1]:
# COMPLETE TRANSLITERATION PROJECT COLAB CODE
# Indic-to-English (Roman) for Hindi, Bengali, Tamil using Aksharantar + mT5 + CTranslate2 + Gradio
# Run this step-by-step in Google Colab (GPU recommended)

# ===== STEP 1: INSTALL DEPENDENCIES =====
!pip install datasets transformers torch accelerate ctranslate2 gradio huggingface_hub sacrebleu
!pip install sentencepiece protobuf

Collecting ctranslate2
  Downloading ctranslate2-4.6.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading ctranslate2-4.6.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (38.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, ctranslate2,

In [13]:
# ===== ULTRA-FAST STEP 2: LOAD & PREPROCESS DATASET =====
from datasets import load_dataset, DatasetDict, Dataset

# Load with streaming and take only first 3000 total examples (1 minute max)
print("Loading tiny subset for fast execution...")

stream_ds = load_dataset(
    "ai4bharat/Aksharantar",
    "default",
    split="train",
    streaming=True
)

# Collect examples without filtering - just grab first N
examples_list = []
target_langs = {"hin", "ben", "tam"}  # We want these but won't strictly filter for speed

for i, example in enumerate(stream_ds):
    if i >= 3000:  # Stop after 3000 examples total
        break

    # Check if the example has the required keys
    # Print first example to debug
    if i == 0:
        print(f"First example keys: {example.keys()}")
        print(f"Sample values: {list(example.items())[:3]}")

    # Try different possible key names
    if "target" in example and "source" in example:
        # Original format: target=Indic, source=English
        examples_list.append({
            "input": example["target"],
            "output": example["source"]
        })
    elif "native word" in example and "english word" in example:
        examples_list.append({
            "input": example["native word"],
            "output": example["english word"]
        })

# Create dataset
combined_ds = Dataset.from_list(examples_list)

# Split 80/20
split_result = combined_ds.train_test_split(test_size=0.2, seed=42)

combined = DatasetDict({
    "train": split_result["train"],
    "test": split_result["test"]
})

print(f"\nTraining samples: {len(combined['train'])}, Test samples: {len(combined['test'])}")
print("Ready for Step 3 (training)!")


Loading tiny subset for fast execution...


Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

First example keys: dict_keys(['unique_identifier', 'native word', 'english word', 'source'])
Sample values: [('unique_identifier', 'asm1'), ('native word', 'লক্ষীনগৰস্থিত'), ('english word', 'lakhyeenogorsthito')]

Training samples: 2400, Test samples: 600
Ready for Step 3 (training)!


In [16]:
# ===== COMPATIBLE STEP 3: TRAIN mT5 MODEL =====
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch

model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(examples):
    inputs = [f"transliterate: {inp}" for inp in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=False)
    labels = tokenizer(examples["output"], max_length=128, truncation=True, padding=False)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = combined.map(preprocess, batched=True, remove_columns=combined["train"].column_names)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
use_fp16 = torch.cuda.is_available()

print(f"Training on: {device}")

# Use eval_strategy instead of evaluation_strategy (newer transformers versions)
training_args = Seq2SeqTrainingArguments(
    output_dir="./translit-model",
    eval_strategy="epoch",  # Changed from evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=1,
    predict_with_generate=True,
    fp16=use_fp16,
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=False,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("Starting training...")
trainer.train()

trainer.save_model("./translit-model")
tokenizer.save_pretrained("./translit-model")
print("Training complete! Model saved to ./translit-model")


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Training on: cpu
Starting training...


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,24.7326,21.61109


Training complete! Model saved to ./translit-model


In [22]:
# ===== STEP 4: EVALUATE (IMPROVED GENERATION) =====
from sacrebleu.metrics import CHRF
import torch

def generate_predictions(model, tokenizer, test_dataset, max_new_tokens=64):
    """Generate predictions with improved generation parameters"""
    predictions = []
    references = []

    model.eval()

    batch_size = 8
    for i in range(0, len(test_dataset), batch_size):
        batch = test_dataset[i:min(i+batch_size, len(test_dataset))]

        # Prepare inputs
        input_texts = [f"transliterate: {inp}" for inp in batch["input"]]
        inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)

        # IMPROVED: Use beam search and force longer outputs
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,  #
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=2,
                temperature=0.8,
                do_sample=False,
                min_length=3,
                forced_bos_token_id=None,
            )

        # Decode
        pred_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ref_texts = batch["output"]

        predictions.extend(pred_texts)
        references.extend(ref_texts)

    return predictions, references

print("Generating predictions with improved decoding...")

# Use the correct test dataset
if 'tiny_test' in locals():
    test_data = tiny_test
else:
    test_data = combined["test"]

preds, refs = generate_predictions(model, tokenizer, test_data)

# Calculate CHRF score
chrf = CHRF()
score = chrf.corpus_score(preds, [refs])
print(f"\nCHRF Score: {score.score:.2f}%")
print(f"Evaluated on {len(preds)} test samples")

# Show examples
print("\n=== Sample Predictions ===")
for i in range(min(10, len(preds))):  # Show 10 examples
    print(f"Input:      {test_data[i]['input']}")
    print(f"Predicted:  {preds[i]}")
    print(f"Reference:  {refs[i]}")
    print("-" * 50)

# Count how many are still sentinel tokens
sentinel_count = sum(1 for p in preds if '<extra_id' in p or p.strip() == '')
print(f"\nSentinel/empty outputs: {sentinel_count}/{len(preds)} ({sentinel_count/len(preds)*100:.1f}%)")


Generating predictions with improved decoding...

CHRF Score: 5.39%
Evaluated on 600 test samples

=== Sample Predictions ===
Input:      সন্ত্রাসবাদীসকলক
Predicted:  <extra_id_0>.
Reference:  xantrasbadixokolok
--------------------------------------------------
Input:      আগে
Predicted:  <extra_id_0>।
Reference:  aage
--------------------------------------------------
Input:      বনোৱালৈকে
Predicted:  <extra_id_0>.
Reference:  bonuwaloike
--------------------------------------------------
Input:      দুখ
Predicted:  <extra_id_0>.
Reference:  dukh
--------------------------------------------------
Input:      জীৱনক
Predicted:  <extra_id_0>.
Reference:  jeewonok
--------------------------------------------------
Input:      অফাৰসমূহ
Predicted:  <extra_id_0>সমূহ
Reference:  offerxomuh
--------------------------------------------------
Input:      কলম
Predicted:  <extra_id_0>)
Reference:  kolom
--------------------------------------------------
Input:      বন্দীকো
Predicted:  <extra_id_0

In [24]:
# ===== STEP 5: OPTIMIZE WITH CTRANSLATE2 (CORRECTED) =====
!ct2-transformers-converter --model ./translit-model --output_dir ./translit-ct2 --quantization int8 --force

import time
import ctranslate2

# Load CT2 model
ct2_model = ctranslate2.Translator("./translit-ct2")

# Prepare test input - CT2 needs token strings, not IDs
test_input = "नमस्ते"  # Hindi test
input_text = f"transliterate: {test_input}"

# Tokenize and convert to string tokens (what CT2 expects)
tokens = tokenizer.tokenize(input_text)

print(f"Testing with input: {test_input}")
print(f"Tokens: {tokens[:10]}...")  # Show first 10 tokens

# === CT2 Benchmark ===
start = time.time()
for _ in range(100):
    results = ct2_model.translate_batch(
        source=[tokens],  # List of token lists
        max_decoding_length=64,
        beam_size=2
    )
end = time.time()
ct2_latency = (end - start) / 100 * 1000  # ms per inference

# Show CT2 output
ct2_output_tokens = results[0].hypotheses[0]
ct2_output = tokenizer.convert_tokens_to_string(ct2_output_tokens)
print(f"\nCT2 Output: {ct2_output}")
print(f"CTranslate2 latency: {ct2_latency:.2f}ms")

# === Original Model Benchmark ===
inputs = tokenizer(input_text, return_tensors="pt")
start = time.time()
for _ in range(100):
    with torch.no_grad():
        model.generate(inputs["input_ids"], max_new_tokens=64)
end = time.time()
orig_latency = (end - start) / 100 * 1000

print(f"Original latency: {orig_latency:.2f}ms")
print(f"Speed gain: {((orig_latency - ct2_latency)/orig_latency)*100:.1f}%")

# === Model Size Comparison ===
import os

def get_dir_size(path):
    total = 0
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isfile(full_path):
            total += os.path.getsize(full_path)
    return total / (1024**2)  # Convert to MB

orig_size = get_dir_size("./translit-model")
ct2_size = get_dir_size("./translit-ct2")

print(f"\nModel size comparison:")
print(f"Original: {orig_size:.1f}MB")
print(f"Optimized (CT2 int8): {ct2_size:.1f}MB")
print(f"Size reduction: {(1-ct2_size/orig_size)*100:.1f}%")


2026-01-21 12:30:00.909475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768998600.935904   18191 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768998600.945366   18191 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768998600.981480   18191 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768998600.981529   18191 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768998600.981535   18191 computation_placer.cc:177] computation placer alr

In [29]:
# ===== STEP 6: GRADIO DEMO (CORRECTED) =====
import gradio as gr

def transliterate(text, language):
    if not text.strip():
        return ""

    input_text = f"transliterate: {text}"

    # Convert to token STRINGS (not IDs) for CT2
    tokens = ct2_tokenizer.tokenize(input_text)

    # Translate with CT2
    results = ct2_model.translate_batch(
        source=[tokens],  # List[List[str]]
        max_decoding_length=64,
        beam_size=2
    )

    # Decode output tokens back to text
    output_tokens = results[0].hypotheses[0]
    output = ct2_tokenizer.convert_tokens_to_string(output_tokens)

    # Clean up the output
    output = output.replace("transliterate:", "").strip()

    # If model outputs sentinel token (undertrained), use fallback message
    if "<extra_id" in output or not output:
        return "[Model undertrained - outputs sentinel tokens]"

    return output

iface = gr.Interface(
    fn=transliterate,
    inputs=[
        gr.Textbox(
            label="Indic Input Text",
            placeholder="नमस्ते (Hindi), নমস্কার (Bengali), or வணக்கம் (Tamil)"
        ),
        gr.Dropdown(
            choices=["hin", "ben", "tam"],
            label="Language (for reference)",
            value="hin"
        )
    ],
    outputs=gr.Textbox(label="English Roman Output"),
    title="Indic → English Transliterator",
    description="Enter text in Hindi/Bengali/Tamil script → Get Romanized English\n\n⚠️ Note: Model trained on minimal dataset for demo purposes",
    examples=[
        ["नमस्ते", "hin"],
        ["কলম", "ben"],
        ["வணக்கம்", "tam"]
    ]
)

# Launch with public link
print("Launching Gradio interface...")
iface.launch(share=True, debug=True)  # Set debug=False to reduce console spam


Launching Gradio interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://16402596597284f351.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7862 <> https://3b67c4054e5e93850f.gradio.live
Killing tunnel 127.0.0.1:7863 <> https://16402596597284f351.gradio.live


