In [82]:
!pip install datasets transformers

In [65]:
from datasets import load_dataset

In [83]:
# Load the transliteration corpus by its Hugging Face Hub identifier.
dataset = load_dataset("SKNahin/bengali-transliteration-data")

In [84]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle, and
# therefore every downstream loss/metric, reproducible across kernel restarts.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [85]:
# Inspect one training row: 'bn' holds the Bengali-script text, 'rm' its romanized form.
dataset["train"][0]

{'bn': 'এটা কোনো পোস্ট হলো মিয়া আবাল', 'rm': 'eta kono post holo mia abal'}

In [86]:
from transformers import AutoTokenizer

# Tokenizer belonging to the google/mt5-small checkpoint; the preprocessing step
# uses it for both the romanized inputs and the Bengali targets.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")



In [87]:
source_lang = "rm"
target_lang = "bn"
prefix = "Transliterate Romanized Bangla to Bangla: "


def preprocess_function(examples):
    """Tokenize a batch of romanized/Bengali sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a list of romanized strings under
            ``source_lang`` and the matching Bengali strings under ``target_lang``.

    Returns:
        The tokenizer output for the prefixed source sentences, with an extra
        ``"labels"`` entry holding the target token ids (pad ids mapped to -100).
    """
    # Every source sentence is prepended with the task prompt.
    prompts = []
    for romanized in examples[source_lang]:
        prompts.append(prefix + romanized)
    references = list(examples[target_lang])

    # Both sides are truncated to 128 tokens; no padding is requested here.
    model_inputs = tokenizer(prompts, max_length=128, truncation=True)
    encoded_targets = tokenizer(text_target=references, max_length=128, truncation=True)

    # Any pad id found in the targets becomes -100 so the loss skips it. Since
    # no padding is requested above, this only matters if the tokenizer itself
    # emits a pad id.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for sequence in encoded_targets["input_ids"]:
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in sequence]
        )
    model_inputs["labels"] = masked_labels

    return model_inputs




In [88]:
# Tokenize both splits; batched=True passes preprocess_function a batch of rows per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [101]:
# Show one tokenized training row: the raw text columns plus input_ids,
# attention_mask and labels.
print("Tokenized inputs and labels:", tokenized_dataset["train"][2], sep="\n")

Tokenized inputs and labels:
{'bn': 'জি ভাই অসাধারণ হইছে বাট ফুল ডিটেইলস জানতে পারলাম না, আর এটা কি বিডি তে সব জায়গাতেই পাওয়া যাবে?', 'rm': 'ji bai osadaron hoice but full detailes janty parlam na, r eta ki bd te sob jaygatei powa jabe?', 'input_ids': [14577, 67002, 1614, 10462, 10627, 91697, 288, 91697, 267, 1359, 11938, 124099, 9694, 623, 3681, 1156, 3622, 17932, 299, 7625, 1421, 22140, 282, 294, 261, 259, 286, 1384, 504, 259, 7122, 400, 27506, 432, 276, 17428, 266, 485, 969, 432, 811, 291, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [52837, 259, 30544, 92084, 58100, 85958, 2326, 1413, 3019, 4830, 4402, 259, 91431, 25393, 13420, 1413, 162225, 16481, 1573, 17831, 29422, 3215, 261, 6373, 1244, 3459, 6173, 3154, 13445, 259, 1573, 259, 8909, 7725, 4142, 30899, 55664, 5075, 5946, 6016, 2912, 291, 1]}


In [90]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained google/mt5-small checkpoint as the model to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

In [91]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer to assemble tokenized rows into batches.
# NOTE(review): presumably pads inputs and labels per batch, which is why the
# preprocessing step does no padding of its own — confirm against the transformers docs.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [92]:
import torch

# bf16 only works on GPUs that support it. Detect support instead of hard-coding
# True so this cell does not fail on other hardware.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint every 100 optimizer steps. The original passed both
    # eval_strategy="epoch" and the deprecated evaluation_strategy="steps"; the
    # recorded run evaluated every 100 steps, so a single "steps" setting is kept.
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=3,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=2,
    fp16=False,  # fp16 stays off; bf16 is used when the hardware supports it
    bf16=use_bf16,
    max_grad_norm=1.0,  # Gradient clipping
    label_smoothing_factor=0.1,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on the trainer in recent transformers releases;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
100,16.7455,11.233994
200,10.0535,7.6046
300,8.2557,6.559895
400,7.0709,5.791033
500,6.5249,5.430801
600,6.175,5.260149
700,6.1494,5.146877
800,6.1583,5.100609
900,6.1505,5.075978
1000,6.01,5.068414


TrainOutput(global_step=1002, training_loss=8.888135973802822, metrics={'train_runtime': 838.9321, 'train_samples_per_second': 9.545, 'train_steps_per_second': 1.194, 'total_flos': 288734508933120.0, 'train_loss': 8.888135973802822, 'epoch': 2.0})

In [93]:
# Save the fine-tuned model and its tokenizer into the same directory so that
# both can later be reloaded from "./results3" with from_pretrained.
trainer.save_model("./results3")
tokenizer.save_pretrained("./results3")


('./results3/tokenizer_config.json',
 './results3/special_tokens_map.json',
 './results3/spiece.model',
 './results3/added_tokens.json',
 './results3/tokenizer.json')

In [94]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Reload the fine-tuned model and tokenizer from the saved directory; this
# rebinds the in-memory `model` and `tokenizer` names to the reloaded objects.
model = AutoModelForSeq2SeqLM.from_pretrained("./results3")
tokenizer = AutoTokenizer.from_pretrained("./results3")




In [98]:
import torch

# Run on the GPU when one is present and fall back to the CPU otherwise; a
# hard-coded "cuda" raises on machines without a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The prompt starts with the same task prefix that was prepended during training.
text = "Transliterate Romanized Bangla to Bangla: eishob ki balsal ashe"

# Model and inputs must live on the same device.
model.to(device)
inputs = tokenizer(text, return_tensors="pt").to(device)

# Beam search with 5 beams, output capped at max_length=50.
outputs = model.generate(**inputs, max_length=50, num_beams=5, early_stopping=True)

# Decode the best sequence, dropping special tokens, and print it.
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)


আমার ভাই
