In [64]:
!pip install datasets transformers



In [65]:
from datasets import load_dataset

In [66]:
dataset = load_dataset("SKNahin/bengali-transliteration-data")

In [67]:
dataset = dataset["train"].train_test_split(test_size=0.2)

In [68]:
dataset["train"][0]

{'bn': 'এটা কোনো পোস্ট হলো মিয়া আবাল', 'rm': 'eta kono post holo mia abal'}

In [69]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [70]:
source_lang = "rm"
target_lang = "bn"
prefix = "Transliterate Romanized Bangla to Bangla: "

def preprocess_function(examples):
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    # Use 'text_target' for labels
    model_inputs["labels"] = tokenizer(text_target=targets, max_length=128, truncation=True)["input_ids"]
    return model_inputs



In [71]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [72]:
print("Tokenized inputs and labels:")
print(tokenized_dataset["train"][0])

Tokenized inputs and labels:
{'bn': 'এটা কোনো পোস্ট হলো মিয়া আবাল', 'rm': 'eta kono post holo mia abal', 'input_ids': [14577, 67002, 1614, 10462, 10627, 91697, 288, 91697, 267, 1384, 135793, 1427, 111012, 11174, 197593, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1244, 3459, 8265, 10968, 49525, 16867, 2326, 8508, 259, 98393, 1433, 85118, 1]}


In [73]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [74]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [75]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Updated parameter name
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,
2,0.000000,
3,0.000000,
4,0.000000,
5,0.000000,


TrainOutput(global_step=1255, training_loss=0.0, metrics={'train_runtime': 286.3066, 'train_samples_per_second': 69.925, 'train_steps_per_second': 4.383, 'total_flos': 839336433131520.0, 'train_loss': 0.0, 'epoch': 5.0})

In [76]:
# Save model and tokenizer
trainer.save_model("./results2")
tokenizer.save_pretrained("./results2")


('./results2/tokenizer_config.json',
 './results2/special_tokens_map.json',
 './results2/spiece.model',
 './results2/added_tokens.json',
 './results2/tokenizer.json')

In [77]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./results2")
tokenizer = AutoTokenizer.from_pretrained("./results2")


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [78]:
text = "Transliterate Romanized Bangla to Bangla: Ami Bhat khai"

# Prepare input and move to GPU
inputs = tokenizer(text, return_tensors="pt").to("cuda")

# Move model to GPU
model.to("cuda")

# Generate output
outputs = model.generate(**inputs, max_length=50, num_beams=5, early_stopping=True)

# Decode and print result
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)


<extra_id_0>
