In [None]:
# --- Environment & warnings ---
import os, warnings, torch, random, re
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=UserWarning)

from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)

# -----------------------------
# 1) Load CSV
# -----------------------------
# Read the parallel corpus as a single "train" split. build_pairs (further down)
# reads the columns "Garhwali", "RomanGarhwali", "Hindi" and "English" from it.
dataset = load_dataset("csv", data_files={"train": "/kaggle/input/latest/LDataset.csv"})["train"]

# -----------------------------
# 2) Helpers (Roman normalization + length filter)
# -----------------------------
def normalize_roman(s):
    """Normalise romanised text to lowercase ASCII letters, apostrophes and single spaces.

    Args:
        s: Any value; ``None`` is treated as missing text.

    Returns:
        The cleaned string, or ``""`` when ``s`` is ``None`` or nothing survives.
    """
    if s is None:
        return ""
    text = str(s).lower().strip()
    # First pass blanks out everything that is not a-z, whitespace or an
    # apostrophe; second pass collapses the resulting whitespace runs.
    for pattern in (r"[^a-z\s']", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()

def ok_len(ex):
    """Return True when the pair's whitespace-token counts are within bounds.

    The source must have 3..40 tokens and the target 1..64 tokens. Counts come
    from ``str.split()`` on the raw strings, so the prompt prefix in the source
    is included in its count.
    """
    n_src = len(str(ex["source"]).split())
    n_tgt = len(str(ex["target"]).split())
    source_out_of_range = n_src < 3 or n_src > 40
    target_out_of_range = n_tgt < 1 or n_tgt > 64
    return not (source_out_of_range or target_out_of_range)

# -----------------------------
# 3) Expand into source-target pairs with language tags
# -----------------------------
def build_pairs(batch):
    """Expand each CSV row into tagged source/target translation pairs.

    For every row, up to four pairs are produced, in this order:
    ``<gar>``->``<eng>``, ``<gar>``->``<hin>``, ``<rgar>``->``<eng>``,
    ``<rgar>``->``<hin>``. Devanagari Garhwali, Hindi and English are stripped;
    Roman Garhwali goes through ``normalize_roman``.

    A source side that is empty after cleaning is skipped. Otherwise the row
    would yield a bare prompt (e.g. ``"<gar> translate to <eng>: "``) that still
    passes the word-count filter and teaches the model to translate nothing.
    Empty targets are still emitted here; the later ``has_target`` filter drops them.

    Args:
        batch: Column-oriented dict with "Garhwali", "RomanGarhwali", "Hindi"
            and "English" lists (the shape ``Dataset.map(batched=True)`` passes).

    Returns:
        ``{"source": [...], "target": [...]}`` with equal-length lists.
    """
    sources, targets = [], []
    rows = zip(batch["Garhwali"], batch["RomanGarhwali"], batch["Hindi"], batch["English"])
    for g, r, h, e in rows:
        g = "" if g is None else str(g).strip()
        r = normalize_roman(r)
        h = "" if h is None else str(h).strip()
        e = "" if e is None else str(e).strip()

        for src_tag, src_text in (("<gar>", g), ("<rgar>", r)):
            if not src_text:
                continue  # nothing to translate on this side of the row
            for tgt_tag, tgt_text in (("<eng>", e), ("<hin>", h)):
                sources.append(f"{src_tag} translate to {tgt_tag}: {src_text}")
                targets.append(tgt_text)
    return {"source": sources, "target": targets}

# Batched map: build_pairs returns more rows than it receives, so the four
# original columns are dropped and only "source"/"target" remain.
expanded = dataset.map(
    build_pairs, batched=True,
    remove_columns=["Garhwali","RomanGarhwali","Hindi","English"]
)

# -----------------------------
# 4) Clean: drop empty targets, dedup sources, filter lengths
# -----------------------------
def has_target(example):
    """Return True only when ``example["target"]`` is a non-blank string."""
    candidate = example["target"]
    if not isinstance(candidate, str):
        # Covers None as well as any non-string cell value.
        return False
    return bool(candidate.strip())

# Keep only pairs whose target is a non-blank string.
clean = expanded.filter(has_target)

# Deduplicate by source using a filter
# Sources already encountered by dedup(); module-level so it persists across calls.
seen = set()
def dedup(example):
    """Return True the first time a given ``source`` string is seen, False afterwards.

    Side effect: records the source in the module-level ``seen`` set.
    """
    key = example["source"]
    first_occurrence = key not in seen
    if first_occurrence:
        seen.add(key)
    return first_occurrence

# Keep the first pair for each distinct source string, then drop pairs whose
# word counts fall outside the ok_len bounds.
clean = clean.filter(dedup)
clean = clean.filter(ok_len)

# -----------------------------
# 5) Split
# -----------------------------
# Hold out 10% of the cleaned pairs for evaluation; the fixed seed keeps the split reproducible.
split = clean.train_test_split(test_size=0.1, seed=42)
train_raw, eval_raw = split["train"], split["test"]

# -----------------------------
# 6) Tokenizer & Model (IndicBART) + add special tokens
# -----------------------------
# IndicBART uses an ALBERT-style SentencePiece tokenizer. Its model card asks
# for these three flags: without keep_accents=True the tokenizer strips combining
# marks (which damages Devanagari text), and do_lower_case=False keeps English
# casing intact. use_fast=False selects the slow tokenizer the card documents.
tokenizer = AutoTokenizer.from_pretrained(
    "ai4bharat/indicbart",
    do_lower_case=False,
    use_fast=False,
    keep_accents=True,
)

# Register the prompt tags as special tokens so each maps to a single id.
special_tokens = {"additional_special_tokens": ["<gar>", "<rgar>", "<hin>", "<eng>"]}
num_added = tokenizer.add_special_tokens(special_tokens)

model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indicbart")
if num_added > 0:
    # The embedding matrix must grow to cover the newly added token ids.
    model.resize_token_embeddings(len(tokenizer))

# Decoding starts from (and is forced to begin with) the tokenizer's BOS token.
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.forced_bos_token_id = tokenizer.bos_token_id

# -----------------------------
# 7) Preprocess with masking
# -----------------------------
# Padding id used to find padded label positions; falls back to 0 if the tokenizer defines none.
PAD_ID = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id

def preprocess_function(examples):
    """Tokenize a batch of source/target pairs and build loss-masked labels.

    Sources and targets are both truncated/padded to 96 tokens. Missing or
    blank targets are replaced by the literal "<unk>". Padding positions in
    the labels are set to -100.

    Args:
        examples: Batch dict with "source" and "target" lists.

    Returns:
        The tokenizer output for the sources, with an added "labels" entry.
    """
    src_texts = list(map(str, examples["source"]))

    tgt_texts = []
    for raw in examples["target"]:
        if raw is None or not str(raw).strip():
            tgt_texts.append("<unk>")
        else:
            tgt_texts.append(str(raw))

    tok_kwargs = dict(max_length=96, truncation=True, padding="max_length")
    model_inputs = tokenizer(src_texts, **tok_kwargs)
    target_enc = tokenizer(tgt_texts, **tok_kwargs)

    model_inputs["labels"] = [
        [-100 if tok == PAD_ID else tok for tok in row]
        for row in target_enc["input_ids"]
    ]
    return model_inputs

# -----------------------------
# 8) Oversample Hindi and Roman
# -----------------------------
# Select by tag substring: "<hin>" appears only as the target tag and "<rgar>"
# only as the source tag in prompts produced by build_pairs.
hindi_pairs = train_raw.filter(lambda x: "<hin>" in x["source"])
roman_pairs = train_raw.filter(lambda x: "<rgar>" in x["source"])

# Effective multiplicities after concatenation (the two subsets overlap):
#   <gar>->eng x1, <gar>->hin x2, <rgar>->eng x3, <rgar>->hin x4.
balanced_train = concatenate_datasets([
    train_raw,
    hindi_pairs,
    roman_pairs, roman_pairs
]).shuffle(seed=42)

# -----------------------------
# 9) Tokenize datasets
# -----------------------------
# Tokenize both splits in batches; the raw text columns are dropped so only the
# tokenizer outputs and "labels" remain.
tokenized_train = balanced_train.map(preprocess_function, batched=True, remove_columns=["source","target"])
tokenized_eval = eval_raw.map(preprocess_function, batched=True, remove_columns=["source","target"])

def has_any_label(ex):
    """Return True if at least one label position is not the ignore value -100."""
    for label_id in ex["labels"]:
        if label_id != -100:
            return True
    return False
# Drop eval rows whose labels are entirely masked (-100).
tokenized_eval = tokenized_eval.filter(has_any_label)

# -----------------------------
# 10) Data collator
# -----------------------------
# Batch collator for seq2seq training; any label padding it adds uses -100,
# the same ignore value preprocess_function writes.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# -----------------------------
# 11) Training arguments
# -----------------------------
# Training configuration: 6 epochs, batch size 8 per device, evaluation and
# checkpointing once per epoch (at most 2 checkpoints kept), full precision,
# gradient clipping at 1.0, label smoothing 0.1, no external experiment reporting.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",         # older transformers releases name this 'evaluation_strategy' — confirm against the installed version
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    fp16=False,
    max_grad_norm=1.0,
    learning_rate=3e-5,
    warmup_steps=500,
    label_smoothing_factor=0.1,
    report_to="none",
    remove_unused_columns=True
)

# -----------------------------
# 12) Trainer
# -----------------------------
# Wire model, arguments, tokenized splits and collator together.
# NOTE(review): the tokenizer is not passed to the Trainer here; it is saved
# separately in later cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator
)

# -----------------------------
# 13) Train
# -----------------------------
# Run fine-tuning; train_out holds the object returned by trainer.train().
# The status emoji were mis-encoded in the source and are restored here.
print("🚀 Starting training...")
train_out = trainer.train()
print("✅ Training complete.")

# -----------------------------
# 14) Inference helper
# -----------------------------
def translate(text, beams=6):
    """Translate one tagged prompt with beam search and return the decoded text.

    Args:
        text: Full prompt, e.g. ``"<gar> translate to <hin>: ..."``.
        beams: Number of beams for beam search.

    Returns:
        The decoded translation with special tokens and any literal
        "[CLS]"/"[SEP]" markers removed, stripped of surrounding whitespace.
    """
    device = model.device
    # Make sure dropout is disabled even if the model was left in train mode.
    model.eval()
    # Truncate to the same 96-token limit used when tokenizing training data.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=96
    ).to(device)
    # The tokenizer may emit token_type_ids; they are not passed on to generate().
    inputs.pop("token_type_ids", None)

    outputs = model.generate(
        **inputs,
        max_new_tokens=80,
        num_beams=beams,
        do_sample=False,
        no_repeat_ngram_size=3,
        repetition_penalty=1.05,
        length_penalty=1.0,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded.replace("[CLS]", "").replace("[SEP]", "").strip()

# -----------------------------
# 15) Sanity check loop
# -----------------------------
def sanity_check(dataset, n=5):
    """Print source, reference and model output for up to ``n`` random examples.

    Args:
        dataset: Indexable collection of dicts with "source" and "target" keys.
        n: Number of examples to show; clamped to ``0..len(dataset)`` because
            ``random.sample`` raises ValueError when asked for more items than
            the population holds.
    """
    n = max(0, min(n, len(dataset)))
    for idx in random.sample(range(len(dataset)), n):
        ex = dataset[idx]
        src = ex["source"]
        tgt = ex["target"]
        pred = translate(src)
        print("📝 Source:", src)
        print("🎯 Target:", tgt)
        print("🤖 Model Output:", pred)
        print("-" * 60)

# Spot-check five random held-out pairs (raw text, not the tokenized split).
sanity_check(eval_raw, n=5)

In [None]:
# -----------------------------
# 16) Save model and tokenizer to output directory
# -----------------------------
# Persist tokenizer (including the added tag tokens) and model weights side by side.
SAVE_PATH = "/kaggle/working/translation_model"

tokenizer.save_pretrained(SAVE_PATH)
model.save_pretrained(SAVE_PATH)

# The status emoji was mis-encoded in the source and is restored here.
print(f"✅ Model and tokenizer saved to {SAVE_PATH}")

In [None]:
# The same sentence in both scripts, translated into both target languages.
# Example: Garhwali → Hindi
print(translate("<gar> translate to <hin>: म्यार नौं साक्षी छ"))

# Example: Garhwali → English
print(translate("<gar> translate to <eng>: म्यार नौं साक्षी छ"))

# Example: Roman Garhwali → Hindi
print(translate("<rgar> translate to <hin>: myar naun sakshi ch"))

# Example: Roman Garhwali → English
print(translate("<rgar> translate to <eng>: myar naun sakshi ch"))

In [None]:
# Further spot checks: two Devanagari prompts and one Roman sentence in both directions.
print(translate("<gar> translate to <hin>: मी बाजार जानू छौं।"))
print(translate("<gar> translate to <eng>: तू आज भिणसारि भात चढायलि?"))
print(translate("<rgar> translate to <hin>: mi kitaab paddu chon"))
print(translate("<rgar> translate to <eng>: mi kitaab paddu chon"))

In [None]:
# Save final model + tokenizer into a folder
# Save the trainer's model and the tokenizer into the same folder so it can be
# reloaded as one unit.
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")

In [None]:
import shutil

# Move the trained model folder into Kaggle's output directory
# NOTE(review): other cells write to /kaggle/working; confirm that
# /kaggle/outputs exists and is actually persisted by Kaggle.
shutil.move("./trained_model", "/kaggle/outputs/trained_model")

In [None]:
# IPython shell escape: archive the saved model directory recursively.
!zip -r /kaggle/working/translation_model.zip /kaggle/working/translation_model

In [None]:
# Archive the saved model directory, then move the archive out of /kaggle/working.
!zip -r /kaggle/working/translation_model.zip /kaggle/working/translation_model
import shutil
shutil.move("/kaggle/working/translation_model.zip", "/kaggle/outputs/translation_model.zip")

In [None]:
# List the contents of /kaggle/outputs with human-readable sizes.
!ls -lh /kaggle/outputs/

In [None]:
# Shell-only variant of the archive-and-move step above.
!zip -r /kaggle/working/translation_model.zip /kaggle/working/translation_model
!mv /kaggle/working/translation_model.zip /kaggle/outputs/translation_model.zip

In [None]:
import gradio as gr

def translate_ui(text, src_token, tgt_token):
    """Build the tagged prompt for the UI and translate it without ever raising.

    Args:
        text: User-entered sentence.
        src_token: Source tag, e.g. ``"<gar>"`` or ``"<rgar>"``.
        tgt_token: Target tag, e.g. ``"<hin>"`` or ``"<eng>"``.

    Returns:
        The translation; ``""`` for blank input (so the model is never asked to
        translate a bare prompt); or the literal ``"Error"`` if translation fails.
    """
    import logging  # local import keeps this cell self-contained

    if text is None or not str(text).strip():
        return ""
    prompt = f"{src_token} translate to {tgt_token}: {text}"
    try:
        return translate(prompt)
    except Exception:
        # UI boundary: keep the app responsive, but record the traceback
        # instead of discarding it.
        logging.getLogger(__name__).exception("Translation failed for prompt %r", prompt)
        return "Error"

# Gradio front end: one source selector, one target selector, input/output
# text boxes and a Translate button wired to on_translate.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown(
        "<h1 style='text-align: center; color: #FFD700;'>🌐 Garhwali/Hindi/English Translator</h1>",
    )

    with gr.Row():
        src_dropdown = gr.Dropdown(
            choices=["Garhwali"],
            label="Source",
            value="Garhwali"
        )
        gr.Markdown("<h2 style='text-align: center;'>⇄</h2>")
        tgt_dropdown = gr.Dropdown(
            choices=["Hindi", "English"],
            label="Target",
            value="Hindi"
        )

    with gr.Row():
        input_text = gr.Textbox(lines=8, label="Input Text", placeholder="Enter text here...")
        output_text = gr.Textbox(lines=8, label="Translated Output")

    with gr.Row():
        translate_btn = gr.Button("Translate", variant="primary")

    def on_translate(text, src_ui, tgt_ui):
        """Pick the internal tags for the request and run the translation.

        The source dropdown offers only "Garhwali", so the script is detected
        from the text itself: any Devanagari character selects ``<gar>``,
        otherwise the input is treated as Roman Garhwali (``<rgar>``) and
        normalised the same way the Roman training sources were. ``src_ui`` is
        accepted for the click wiring but does not affect the tag.
        """
        text = "" if text is None else str(text)
        has_devanagari = any("\u0900" <= ch <= "\u097f" for ch in text)
        if has_devanagari:
            src_token = "<gar>"
        else:
            src_token = "<rgar>"
            text = normalize_roman(text)
        tgt_token = "<hin>" if tgt_ui == "Hindi" else "<eng>"
        return translate_ui(text, src_token, tgt_token)

    translate_btn.click(
        on_translate,
        inputs=[input_text, src_dropdown, tgt_dropdown],
        outputs=output_text
    )

demo.launch(share=True, inline=True)

In [None]:
# Save tokenizer and model again, this time to a path relative to the working directory.
SAVE_PATH = "./translation_model"
tokenizer.save_pretrained(SAVE_PATH)
model.save_pretrained(SAVE_PATH)

In [None]:
# Archive the relative ./translation_model directory into /kaggle/working.
!zip -r /kaggle/working/translation_model.zip ./translation_model