In [2]:
!pip install datasets transformers torch


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

**1.Load the Dataset**

In [19]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
import torch

dataset = load_dataset("SKNahin/bengali-transliteration-data")
train_data = dataset["train"].train_test_split(test_size=0.2, seed=42)["train"]
val_data = dataset["train"].train_test_split(test_size=0.2, seed=42)["test"]




**2.Data Preprocessing**

In [20]:
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

def preprocess_data(dataset, tokenizer, min_length=5, max_length=32):  # Reduce max_length to 32
    inputs = []
    labels = []
    for example in dataset:
        input_text = example["rm"]
        label_text = example["bn"]
        if len(input_text.split()) < min_length or len(label_text.split()) < min_length:
            continue
        if len(input_text.split()) > max_length or len(label_text.split()) > max_length:
            continue
        input_ids = tokenizer(input_text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")["input_ids"].squeeze()
        label_ids = tokenizer(label_text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")["input_ids"].squeeze()
        inputs.append(input_ids)
        labels.append(label_ids)
    return torch.stack(inputs), torch.stack(labels)

train_inputs, train_labels = preprocess_data(train_data, tokenizer)
val_inputs, val_labels = preprocess_data(val_data, tokenizer)

print(f"Training Samples: {len(train_inputs)}")
print(f"Validation Samples: {len(val_inputs)}")

print("Sample Tokenized Banglish (Input):", tokenizer.decode(train_inputs[0]))
print("Sample Tokenized Bangla (Label):", tokenizer.decode(train_labels[0]))




Training Samples: 2644
Validation Samples: 662
Sample Tokenized Banglish (Input): eta kono post holo mia abal</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Sample Tokenized Bangla (Label): এটা কোনো পোস্ট হলো মিয়া আবাল</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


**3.Select a Model**

I chose the mT5 model because it is pre-trained on a large multilingual corpus, making it highly effective for low-resource language tasks like Banglish-to-Bangla transliteration. Its sequence-to-sequence architecture is tailored for tasks involving text generation and translation, aligning perfectly with the transliteration objective. The smaller variants of mT5, such as mT5-small, offer an excellent balance between computational efficiency and performance, making them suitable for resource-constrained environments. Additionally, its multilingual pre-training allows the model to leverage shared representations, improving accuracy for underrepresented languages. Overall, mT5 is a robust and efficient choice for transliteration due to its design and capabilities.

In [21]:
from transformers import MT5ForConditionalGeneration
import torch

model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("Model loaded successfully and moved to device:", device)


Model loaded successfully and moved to device: cuda


**4.Train the model**

In [22]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

# Define the dataset class
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs[idx],
            "labels": self.labels[idx],
        }

# Create dataset and dataloaders
train_dataset = TranslationDataset(train_inputs, train_labels)
val_dataset = TranslationDataset(val_inputs, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_loader)}")

# Validation loop
model.eval()
val_loss = 0
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, labels=labels)
        val_loss += outputs.loss.item()
print(f"Validation Loss: {val_loss / len(val_loader)}")

print("Model fine-tuned successfully!")


Training Epoch 1: 100%|██████████| 166/166 [00:55<00:00,  2.97it/s]


Epoch 1, Loss: 28.578063654612347


Training Epoch 2: 100%|██████████| 166/166 [00:56<00:00,  2.96it/s]


Epoch 2, Loss: 16.61608652321689


Training Epoch 3: 100%|██████████| 166/166 [00:56<00:00,  2.95it/s]


Epoch 3, Loss: 11.594388140253274


Evaluating: 100%|██████████| 42/42 [00:02<00:00, 16.18it/s]

Validation Loss: 7.024769306182861
Model fine-tuned successfully!



