In [None]:
!pip install datasets transformers

In [32]:
from datasets import load_dataset

In [33]:
# Load the "SKNahin/bengali-transliteration-data" dataset by its identifier.
# The sample shown further down has two text columns: 'bn' and 'rm'.
dataset = load_dataset("SKNahin/bengali-transliteration-data")

In [34]:
# Hold out 20% of the "train" split for evaluation.
# A fixed seed makes the split identical on every Restart & Run All, so the
# validation loss is comparable between runs.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [35]:
# Display the first training example to check the column names and text format.
dataset["train"][0]

{'bn': 'এটা কোনো পোস্ট হলো মিয়া আবাল', 'rm': 'eta kono post holo mia abal'}

In [36]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the "t5-small" checkpoint.
# NOTE(review): the tokenized sample printed later in this notebook has labels made
# up only of the ids 3 and 2 (one pair per Bengali word) plus a trailing 1. That
# suggests this vocabulary cannot represent Bengali script and collapses every
# target word to the unknown token — confirm, and if so switch BOTH this tokenizer
# and the model checkpoint to a multilingual one.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [37]:
source_lang = "rm"
target_lang = "bn"
prefix = "Transliterate Romanized Bangla to Bangla: "


def preprocess_function(examples):
    """Tokenize one batch of transliteration pairs for seq2seq training.

    Args:
        examples: Batch mapping of column name -> list of strings. Must contain
            the ``source_lang`` ("rm") and ``target_lang`` ("bn") columns.

    Returns:
        The tokenizer output for the prefixed source texts, with the token ids
        of the target texts stored under the ``"labels"`` key.
    """
    # Prepend the task prefix to every source sentence.
    inputs = [prefix + text for text in examples[source_lang]]
    targets = list(examples[target_lang])

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the tokenizer encodes the targets with the same truncation
    # settings and returns their ids under "labels".
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,
        truncation=True,
    )
    return model_inputs


In [38]:
# Apply preprocess_function to every split; with batched=True the function
# receives a batch of examples (column name -> list of values) per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4004 [00:00<?, ? examples/s]



Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [39]:
# Show the first tokenized training example under a short heading.
print("Tokenized inputs and labels:", tokenized_dataset["train"][0], sep="\n")

Tokenized inputs and labels:
{'bn': 'এটা কোনো পোস্ট হলো মিয়া আবাল', 'rm': 'eta kono post holo mia abal', 'input_ids': [4946, 9842, 342, 3385, 1601, 19330, 521, 12, 19330, 521, 10, 3, 15, 17, 9, 10447, 32, 442, 3, 2831, 32, 1337, 9, 703, 138, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 1]}


In [41]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained "t5-small" sequence-to-sequence model; it is passed to the
# trainer for fine-tuning further down.
# NOTE(review): this checkpoint name must stay in sync with the one used for the
# tokenizer — the two share a vocabulary.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [42]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to the trainer below. It is built from the tokenizer and
# the model; per the library docs it pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [43]:
# Fine-tuning configuration. Checkpoints are written under ./results and at most
# three of them are kept.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument (transformers >= 4.41): evaluate once at the end of every epoch.
    eval_strategy="epoch",
    learning_rate=5e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    # Mixed-precision training; this setting requires a CUDA device.
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (transformers >= 4.46), which raised a FutureWarning on construction.
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Run the training loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,0.384838
2,0.662400,0.292791
3,0.662400,0.274783
4,0.344300,0.267164
5,0.344300,0.264285


TrainOutput(global_step=1255, training_loss=0.4667819262500778, metrics={'train_runtime': 188.0896, 'train_samples_per_second': 106.439, 'train_steps_per_second': 6.672, 'total_flos': 304756958822400.0, 'train_loss': 0.4667819262500778, 'epoch': 5.0})

In [50]:
# Save the fine-tuned model and the tokenizer into the same "./results" directory
# so both can be reloaded from one path. The tuple of written tokenizer files
# returned by save_pretrained is displayed as the cell output.
trainer.save_model("./results")
tokenizer.save_pretrained("./results")


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/spiece.model',
 './results/added_tokens.json',
 './results/tokenizer.json')

In [51]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Reload the model and the tokenizer from the "./results" directory.
# Note: this rebinds the `model` and `tokenizer` names used earlier in the notebook.
model = AutoModelForSeq2SeqLM.from_pretrained("./results")
tokenizer = AutoTokenizer.from_pretrained("./results")


In [63]:
import torch

text = "Transliterate Romanized Bangla to Bangla: Ami Bhat khai"

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move the model to the selected device
model.to(device)

# Tokenize the prompt and place the tensors on the same device as the model
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate output. Without an explicit limit, generate() falls back to the short
# library default length and can cut the result off; 128 matches the max_length
# used when tokenizing the training data.
outputs = model.generate(**inputs, max_new_tokens=128)

# Decode and print result
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)


  
