In [1]:
import os 
import sys
import numpy as np
import transformers
import evaluate
from datasets import Dataset
from transformers import MBartForConditionalGeneration,MBart50TokenizerFast
from transformers import DataCollatorForSeq2Seq,Seq2SeqTrainer,Seq2SeqTrainingArguments
from transformers import AdamWeightDecay




## Loading the Configurations
### The model source
source: <a href="https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt">https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt</a>
### The dataset source
source: <a href="https://opus.nlpl.eu/XLEnt/fr&ta/v1.2/XLEnt">https://opus.nlpl.eu/XLEnt/fr&ta/v1.2/XLEnt</a>

In [3]:
# Config
# --- Model and data locations ---
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"  # checkpoint the tokenizer and model are loaded from
FR_FILE = "fr-ta/XLEnt.fr-ta.fr"  # French text file, read line by line
TA_FILE = "fr-ta/XLEnt.fr-ta.ta"  # Tamil text file, read line by line
OUTPUT_DIR = "french_to_tamil_model"  # training output / saved model directory

# --- Training hyperparameters ---
EPOCHS = 6
BATCH_SIZE = 16  # used for both the train and eval per-device batch size
LR = 5e-5

# --- Tokenization limits (passed as max_length to the tokenizer) ---
MAX_SOURCE_LENGTH = MAX_TARGET_LENGTH = 10

In [4]:
# 1) Load files
# Each file is read fully and split into lines. The filtering step below pairs
# line i of the French file with line i of the Tamil file.
with open(FR_FILE, "r", encoding="utf-8") as f:
    fr_sentences = f.read().splitlines()

with open(TA_FILE, "r", encoding="utf-8") as f:
    tam_sentences = f.read().splitlines()

# The pairing is purely positional, so a length mismatch means the two files
# are not aligned and every pair after the mismatch would be wrong.
if len(fr_sentences) != len(tam_sentences):
    raise ValueError(
        f"Parallel files are not aligned: {len(fr_sentences)} French lines "
        f"vs {len(tam_sentences)} Tamil lines"
    )

# 2) Keep only the pairs whose French side is exactly 5 characters long.
# Pair by position with zip(): looking the Tamil line up through
# fr_sentences.index(sentence) returns the FIRST occurrence of a French string,
# so repeated French entries would all get the Tamil line of the first one
# (and every lookup rescans the whole list).
filtered_pairs = [
    (fr_line, ta_line)
    for fr_line, ta_line in zip(fr_sentences, tam_sentences)
    if len(fr_line) == 5
]
french_sentences = [fr_line for fr_line, _ in filtered_pairs]
tamil_sentences = [ta_line for _, ta_line in filtered_pairs]

print("No. of French sentences",len(french_sentences))
print("No. of Tamil sentences",len(tamil_sentences))

No. of French sentences 10962
No. of Tamil sentences 10962


In [5]:
# 3) Prepare Hugging Face Dataset
# Two parallel columns: "fr" holds the French strings, "ta" the Tamil strings.
dataset = Dataset.from_dict(dict(fr=french_sentences, ta=tamil_sentences))

In [None]:
# 4) Load tokenizer and model
# Tokenizer and model are both loaded from the checkpoint named by MODEL_NAME.
# NOTE(review): the saved output of this cell mentions T5Tokenizer, which does
# not match MBart50TokenizerFast -- the output looks stale; re-run to confirm.
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

In [None]:
# 5) Tokenization function
# Tell the tokenizer which language codes to use for the source (French) text
# and for the target (Tamil) text passed via `text_target`.
tokenizer.src_lang = "fr_XX"
tokenizer.tgt_lang = "ta_IN"


def preprocess_function(examples):
    """Tokenize a batch of French/Tamil pairs into model inputs and labels.

    Args:
        examples: batch dict with parallel lists under the keys 'fr' and 'ta'.

    Returns:
        The tokenizer output for the French text (padded/truncated to
        MAX_SOURCE_LENGTH) with an extra 'labels' entry holding the Tamil token
        ids (padded/truncated to MAX_TARGET_LENGTH), where padding positions
        are replaced by -100.
    """
    inputs = examples['fr']
    targets = examples['ta']
    model_inputs = tokenizer(inputs, max_length=MAX_SOURCE_LENGTH, truncation=True, padding='max_length')
    # Tokenize Tamil using the new recommended method
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )
    # Labels are padded to a fixed length here, so the padding must be masked
    # with -100 (the index the loss ignores); otherwise the model is also
    # trained to predict pad tokens. compute_metrics maps -100 back to the pad
    # id before decoding.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs


tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['fr','ta'])

In [None]:
# 6) Split train/validation
# 10% of the tokenized examples are held out for evaluation; the fixed seed
# makes the split repeatable.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split['train'], split['test']

In [None]:
# 7) Data collator
# Batches are padded to the longest example in the batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding='longest',
)

In [None]:
# 8) Metric
# Load the "sacrebleu" metric through the `evaluate` library; compute_metrics
# calls bleu.compute(...) to score decoded predictions against references.
bleu = evaluate.load("sacrebleu")


def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the labels as reference lists.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions and a list
        in which every stripped label is wrapped in its own one-element list.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_pred):
    """Compute the sacreBLEU score for one evaluation pass.

    Args:
        eval_pred: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict with a single key ``'bleu'`` holding the sacreBLEU ``score``.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is a padding/ignore marker, not a real token id, and cannot be
    # decoded. It can appear in the generated predictions as well as in the
    # labels, so map it back to the pad token id in both before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {'bleu': result['score']}

In [None]:
# 9) Training arguments
training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint retention
    output_dir=str(OUTPUT_DIR),
    save_total_limit=2,
    push_to_hub=False,
    # Optimisation
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=0.01,
    fp16=False,
    # Batching
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    remove_unused_columns=False,
    # Evaluation: once per epoch, scoring generated text
    evaluation_strategy='epoch',
    predict_with_generate=True,
)

In [None]:
# 10) Create Trainer
# Wire together the model, its tokenizer, the train/eval splits, the batch
# collator and the BLEU metric callback.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# 11) Train
# Runs fine-tuning with the trainer built in the previous cell. As the last
# expression of the cell, the value returned by train() is shown as output.
trainer.train()

In [None]:
# 12) Save model
# Write the fine-tuned model to OUTPUT_DIR (the same directory that was passed
# as output_dir in the training arguments).
trainer.save_model(str(OUTPUT_DIR))
print('Saved fine-tuned model to', OUTPUT_DIR)

In [None]:
def translate_5_letter(word):
    """Translate one 5-character French word into Tamil with the loaded model.

    Args:
        word: French input string; must be exactly 5 characters long.

    Returns:
        The decoded translation with special tokens removed.

    Raises:
        ValueError: if ``word`` is not exactly 5 characters long.
        RuntimeError: if the tokenizer has no token id for the ``ta_IN``
            language code.
    """
    if len(word) != 5:
        raise ValueError('Input must be exactly 5 characters long')

    # mBART requires the target language code to be forced as the first
    # generated token; look its id up in the tokenizer vocabulary.
    tgt_token_id = tokenizer.convert_tokens_to_ids('ta_IN')
    if tgt_token_id is None or tgt_token_id == tokenizer.unk_token_id:
        raise RuntimeError("Tokenizer has no language code 'ta_IN'")

    # Make sure the input is tagged as French even if this cell is run before
    # (or without) the tokenization cell.
    tokenizer.src_lang = 'fr_XX'

    # The model may have been moved to an accelerator during training; the
    # inputs must live on the same device or generate() fails. Errors are
    # deliberately not swallowed: the previous fallback called an undefined
    # `trans` pipeline and only replaced the real error with a NameError.
    inputs = tokenizer(word, return_tensors='pt').to(model.device)
    model.eval()  # disable dropout for inference
    out = model.generate(**inputs, forced_bos_token_id=tgt_token_id, max_length=MAX_TARGET_LENGTH)
    return tokenizer.decode(out[0], skip_special_tokens=True)