In [1]:
from datasets import load_dataset

# Fetch the Bengali transliteration corpus via `load_dataset`
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Carve a 10% validation subset out of the "train" split (seed fixed at 42)
dataset_split = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_data, val_data = dataset_split["train"], dataset_split["test"]

# Show the first record of each subset as a quick sanity check
print("Train Example:", train_data[0])
print("Validation Example:", val_data[0])


Train Example: {'bn': 'আপনার এফবি আইডি নেম বিশাল আর এইখানে মামুন কেন ?', 'rm': 'Apnar fb id name Bishal ar Ekhane mamun keno ?'}
Validation Example: {'bn': 'ভালো করে ট্রাই করেন পাবেন..', 'rm': 'valo kore trai koren paben..'}


In [2]:
from transformers import MBart50Tokenizer
import numpy as np

# Load mBART-50 tokenizer; the source side is tagged en_XX and the target side bn_BD
tokenizer = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50",
    src_lang="en_XX",
    tgt_lang="bn_BD",
)

# Define data preprocessing function
def preprocess_data(dataset, min_length=5, max_length=128):
    """
    Preprocess data for sequence-to-sequence tasks.
    Filters overly short or long sequences and tokenizes inputs/labels.

    The romanized text (``"rm"``) is encoded as the source and the Bengali
    text (``"bn"``) as the target. Targets are passed through ``text_target``
    so the tokenizer encodes them in target-language mode (``tgt_lang``)
    instead of reusing the source-language prefix. Padding positions in the
    labels are replaced with -100 so the loss ignores them.

    Args:
        dataset: Iterable of examples, each a mapping with "rm" and "bn" strings.
        min_length: Minimum sequence length (in whitespace-separated words);
            an example is dropped if either side is shorter.
        max_length: Padded/truncated sequence length (in tokens). The same
            value is also used as an upper bound on the word count of either
            side, matching the original filtering rule.

    Returns:
        Tuple ``(inputs, labels)`` of NumPy arrays, one row per kept example.
        Each row has ``max_length`` entries; label rows contain -100 wherever
        the tokenizer produced padding.
    """
    pad_id = tokenizer.pad_token_id
    inputs = []  # Store tokenized inputs
    labels = []  # Store tokenized labels (padding masked with -100)

    # Iterate through dataset examples
    for example in dataset:
        # Extract source (Banglish) and target (Bangla) texts
        input_text = example["rm"]
        label_text = example["bn"]

        # Count words once per side, then apply both length filters
        n_src_words = len(input_text.split())
        n_tgt_words = len(label_text.split())
        if min(n_src_words, n_tgt_words) < min_length:
            continue
        if max(n_src_words, n_tgt_words) > max_length:
            continue

        # Tokenize source and target together; `text_target` makes the
        # tokenizer switch to target mode for the label side.
        encoded = tokenizer(
            input_text,
            text_target=label_text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="np",
        )

        # A single string yields a batch of one; take row 0 rather than
        # squeeze() so the result stays 1-D even when max_length == 1.
        input_ids = encoded["input_ids"][0]
        label_ids = encoded["labels"][0]

        # -100 is the ignore index of the loss, so padded label positions
        # no longer contribute to training/validation loss.
        label_ids = np.where(label_ids == pad_id, -100, label_ids)

        inputs.append(input_ids)
        labels.append(label_ids)

    # Return processed data as NumPy arrays
    return np.array(inputs), np.array(labels)

# Tokenize both splits with the same length limits
train_inputs, train_labels = preprocess_data(
    train_data,
    min_length=5,
    max_length=128,
)
val_inputs, val_labels = preprocess_data(
    val_data,
    min_length=5,
    max_length=128,
)

# Report how many examples survived filtering and show one encoded pair
print(f"Training Samples: {len(train_inputs)}")
print(f"Validation Samples: {len(val_inputs)}")
print("Sample Tokenized Banglish (Input):", train_inputs[0])
print("Sample Tokenized Bangla (Label):", train_labels[0])


Training Samples: 2985
Validation Samples: 332
Sample Tokenized Banglish (Input): [250004   5787   2322 108642   3447   9351  14851   4200    187   2751
  34414   5568    309    311    157    705      2      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1]
Sample T

In [3]:
# Show which columns the training split exposes (used as the example keys
# "bn" and "rm" during preprocessing).
print(train_data.column_names)


['bn', 'rm']


In [4]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer

# Load the mBART-50 tokenizer (en_XX source tag, bn_BD target tag) and the
# pretrained sequence-to-sequence model from the same checkpoint
tokenizer = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50",
    src_lang="en_XX",
    tgt_lang="bn_BD",
)
model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50"
)

# Reaching this line means both downloads/loads completed without raising
print("Model and Tokenizer Loaded Successfully")




Model and Tokenizer Loaded Successfully


In [5]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from tqdm import tqdm

# Define a custom PyTorch Dataset class
class TransliterationDataset(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with their labels.

    Both containers are indexed with the same position; each item is
    returned as a dict of ``torch.long`` tensors keyed by ``"input_ids"``
    and ``"labels"``.
    """

    def __init__(self, inputs, labels):
        # Keep references to the caller's containers; nothing is copied here.
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The number of examples is taken from the inputs container.
        return len(self.inputs)

    def __getitem__(self, idx):
        # Convert the idx-th row of each container into a long tensor.
        fields = (("input_ids", self.inputs), ("labels", self.labels))
        return {
            key: torch.tensor(store[idx], dtype=torch.long)
            for key, store in fields
        }

# Create DataLoaders
train_dataset = TransliterationDataset(train_inputs, train_labels)
val_dataset = TransliterationDataset(val_inputs, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Set device to GPU if available and move the model there first
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer. torch.optim.AdamW is used instead of transformers.AdamW,
# which is deprecated in favor of the PyTorch implementation; the
# `from transformers import AdamW` import is no longer needed by this cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


def _batch_loss(batch):
    """Run one forward pass on `batch` and return the model's loss tensor.

    Args:
        batch: Dict from the DataLoader with "input_ids" and "labels" tensors.

    Returns:
        The scalar loss tensor reported by the model for this batch.
    """
    input_ids = batch["input_ids"].to(device)
    labels = batch["labels"].to(device)

    # Inputs are padded to a fixed length, so build the mask from the pad id;
    # without it the encoder attends to padding positions as if they were text.
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
    )
    return outputs.loss


# Training loop
epochs = 3  # Adjust based on dataset size and performance
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Forward pass
        loss = _batch_loss(batch)
        train_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    # max(..., 1) avoids a ZeroDivisionError if filtering left no batches
    avg_train_loss = train_loss / max(len(train_loader), 1)
    print(f"Training Loss: {avg_train_loss:.4f}")

    # Validation loop (no gradients, dropout disabled via eval mode)
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            val_loss += _batch_loss(batch).item()

    avg_val_loss = val_loss / max(len(val_loader), 1)
    print(f"Validation Loss: {avg_val_loss:.4f}")

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("banglish_to_bangla_model")
tokenizer.save_pretrained("banglish_to_bangla_model")
print("Model Saved Successfully")


TypeError: compile() got an unexpected keyword argument 'optimizer'