<a href="https://colab.research.google.com/github/fatemafaria142/Banglish-to-Bangla-Machine-Translation-App-using-Streamlit/blob/main/Translation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.util import ngrams

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount(mountpoint='/content/drive')

# **Dataset Path**

In [None]:
# Load the train / test / validation CSVs from Google Drive, in that order.
train_data, test_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Banglish to Bangla/train_dataset.csv",
        "/content/drive/MyDrive/Banglish to Bangla/test_dataset.csv",
        "/content/drive/MyDrive/Banglish to Bangla/validation_dataset.csv",
    )
)

# Trim leading/trailing whitespace from the column names of every split.
for split_frame in (train_data, test_data, validation_data):
    split_frame.columns = split_frame.columns.str.strip()


In [None]:
# First five rows of the training split.
train_data.head()

In [None]:
# Last five rows of the training split.
train_data.tail()

In [None]:
# First five rows of the test split.
test_data.head()

In [None]:
# Last five rows of the test split.
test_data.tail()

In [None]:
# First five rows of the validation split.
validation_data.head()

In [None]:
# Last five rows of the validation split.
validation_data.tail()

In [None]:
!pip install transformers torch pandas

In [None]:
!pip install sacrebleu

In [None]:
!pip install rouge_score

In [None]:
!pip install sentencepiece

In [None]:
!pip install transformers[sentencepiece]

In [None]:
!transformers-cli cache clear

In [None]:
!pip install huggingface-cli

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

In [None]:
!pip install git+https://github.com/csebuetnlp/normalizer

In [None]:
!pip install --upgrade pip

In [None]:
!pip install torch transformers

In [None]:
import torch
from transformers import  MT5Model, AutoTokenizer, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from normalizer import normalize
import pandas as pd
import numpy as np
import nltk
from nltk.translate.bleu_score import corpus_bleu
from sacrebleu import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
import os


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def _read_split(csv_path):
    """Read one dataset CSV and strip surrounding whitespace from its column names."""
    split_frame = pd.read_csv(csv_path)
    split_frame.columns = split_frame.columns.str.strip()
    return split_frame


# Load the three dataset splits from Google Drive.
train_data = _read_split("/content/drive/MyDrive/Banglish to Bangla/train_dataset.csv")
test_data = _read_split("/content/drive/MyDrive/Banglish to Bangla/test_dataset.csv")
validation_data = _read_split("/content/drive/MyDrive/Banglish to Bangla/validation_dataset.csv")


In [None]:
# Show the first five rows of the training split.
train_data.head(5)

In [None]:
# Rename the Banglish source column to 'input_text' and the Bangla target
# column to 'labels', then show the first rows of the result.
train_data = train_data.rename(
    columns={'banglish_speech': 'input_text', 'bangla_speech': 'labels'}
)
train_data.head()

In [None]:
# Show the first five rows of the test split.
test_data.head(5)

In [None]:
# Rename the Banglish source column to 'input_text' and the Bangla target
# column to 'labels', then show the first rows of the result.
test_data = test_data.rename(
    columns={'banglish_speech': 'input_text', 'bangla_speech': 'labels'}
)
test_data.head()

In [None]:
# Show the first five rows of the validation split.
validation_data.head(5)

In [None]:
# Rename the Banglish source column to 'input_text' and the Bangla target
# column to 'labels', then show the first rows of the result.
validation_data = validation_data.rename(
    columns={'banglish_speech': 'input_text', 'bangla_speech': 'labels'}
)
validation_data.head()

In [None]:
!pip install huggingface-cli

# **BanglaT5 model and Its Tokenizer**

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from normalizer import normalize # pip install git+https://github.com/csebuetnlp/normalizer

# Load the tokenizer and the seq2seq model from the same BanglaT5 checkpoint.
# The sentencepiece library is required to instantiate the fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    "csebuetnlp/banglat5",
    use_fast=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/banglat5")


In [None]:
from torch.utils.data import Dataset, DataLoader

# **Custom Dataset**

In [None]:
class Seq2SeqDataset(Dataset):
    """Map-style dataset that tokenizes (source, target) text pairs on access.

    Each item is a dict with ``input_ids`` and ``attention_mask`` for the
    source text and ``labels`` holding the token ids of the target text.
    Every sequence is truncated/padded to ``max_length`` and returned without
    the leading batch axis added by ``return_tensors='pt'``.
    """

    def __init__(self, data, tokenizer, max_length=128, label_pad_token_id=None):
        """
        Args:
            data: A DataFrame containing 'input_text' and 'labels' columns.
            tokenizer: A Hugging Face tokenizer.
            max_length: Maximum sequence length (also the padded length).
            label_pad_token_id: Optional id written over the padding positions
                of ``labels`` (for example -100, the index ignored by
                PyTorch's cross-entropy loss). ``None`` (default) keeps the
                tokenizer's own padding id, i.e. the previous behavior.
        """
        # Both columns are normalized once, up front, and kept as plain lists.
        self.input_text = data['input_text'].apply(normalize).tolist()
        self.labels = data['labels'].apply(normalize).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_pad_token_id = label_pad_token_id

    def __len__(self):
        """Number of (source, target) pairs."""
        return len(self.input_text)

    def _encode(self, text):
        """Tokenize one string, truncated and padded to ``max_length``."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenized source/target pair at position ``idx``."""
        input_encodings = self._encode(self.input_text[idx])
        label_encodings = self._encode(self.labels[idx])

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        labels = label_encodings['input_ids'].squeeze(0)
        if self.label_pad_token_id is not None:
            # Overwrite padding positions so they can be excluded from the loss.
            padding_positions = label_encodings['attention_mask'].squeeze(0) == 0
            labels = labels.masked_fill(padding_positions, self.label_pad_token_id)

        return {
            'input_ids': input_encodings['input_ids'].squeeze(0),
            'attention_mask': input_encodings['attention_mask'].squeeze(0),
            'labels': labels,
        }


In [None]:
# Wrap each split in the tokenizing dataset (train, test, validation order).
train_dataset, test_dataset, validation_dataset = (
    Seq2SeqDataset(split_frame, tokenizer)
    for split_frame in (train_data, test_data, validation_data)
)

# Batches of 16 for every loader; only the training loader shuffles.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=16)
validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=16)


In [None]:
# Place the model on the selected device (GPU when available, else CPU).
model.to(device=device)

In [None]:
from transformers import TrainingArguments
from torch.optim import AdamW

# AdamW optimizer that is handed to the Trainer explicitly.
custom_optimizer = AdamW(
    params=model.parameters(),
    lr=1e-3,            # learning rate
    eps=1e-8,           # epsilon value to prevent division by zero
    weight_decay=0.01,  # weight decay
)

# Step arithmetic:
#   iterations per epoch = number of training samples / batch size
#   total iterations     = iterations per epoch * number of epochs
#   e.g. 1,000 training examples with a batch size of 100 give 10 iterations per epoch.
#
# gradient_accumulation_steps=8 accumulates gradients over 8 small batches
# before each weight update. This simulates a larger batch size without
# requiring more GPU memory, so weights are updated less often than batches
# are processed.
#
# The learning rate controls how quickly the model learns from the data. The
# scheduler type is "cosine_with_restarts"; warmup_steps is the number of
# initial training steps run with a smaller learning rate, and weight_decay
# adds weight regularization.
training_args = TrainingArguments(
    # Where checkpoints and logs go (same directory for both).
    output_dir='/content/drive/MyDrive/Machine_Translation/banglish_to_bangla_translation_BanglaT5',
    logging_dir='/content/drive/MyDrive/Machine_Translation/banglish_to_bangla_translation_BanglaT5',
    logging_steps=500,
    report_to="none",
    push_to_hub=False,
    # Run length and effective batch size.
    do_train=True,
    do_eval=True,
    num_train_epochs=15,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Evaluate and checkpoint on the same 500-step cadence.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Optimization schedule.
    learning_rate=1e-3,
    lr_scheduler_type="cosine_with_restarts",
    warmup_steps=100,
    weight_decay=0.01,
    # Keep all dataset columns when batches are built.
    remove_unused_columns=False,
)


In [None]:
from transformers import DataCollatorForSeq2Seq
# Collator for sequence-to-sequence batches: pads dynamically (padding=True)
# and pads label sequences with the tokenizer's pad token id.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    max_length=128,
    label_pad_token_id=tokenizer.pad_token_id,
)

In [None]:
# Assemble the Trainer. `optimizers` is an (optimizer, lr_scheduler) pair:
# the custom AdamW is supplied and the scheduler slot is left as None.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    optimizers=(custom_optimizer, None),
)

# **Training starts here**

In [None]:
# Fine-tune the model: run the training loop configured on `trainer`.
# The call's return value is displayed as this cell's output.
trainer.train()

# **Saving model and tokenizer**

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Persist the fine-tuned model to Google Drive.
# NOTE(review): save_pretrained presumably treats these paths as directories
# despite the .pt / .json suffixes — confirm before loading them elsewhere.
model.save_pretrained(
    save_directory='/content/drive/MyDrive/Banglish to Bangla/bangla_translation_BanglaT5.pt'
)

# Persist the tokenizer next to it.
tokenizer.save_pretrained(
    save_directory='/content/drive/MyDrive/Banglish to Bangla/bangla_tokenizer_BanglaT5.json'
)


In [None]:
!pip install jiwer

In [None]:
# Make sure the model sits on the selected device before evaluation.
model.to(device=device)

In [None]:
!pip install rouge-score
#https://github.com/google-research/google-research/tree/master/rouge
#https://huggingface.co/spaces/evaluate-metric/rouge [Different types of ROUGE scores]

In [None]:
!pip install evaluate

# **Loading evaluation metrics**

In [None]:
import torch
from evaluate import load
def move_to_device(batch, device):
    """Recursively move every tensor inside ``batch`` to ``device``.

    Lists and dicts are rebuilt with their elements converted; any value that
    is not a tensor, list, or dict is returned unchanged.
    """
    if isinstance(batch, torch.Tensor):
        return batch.to(device)
    if isinstance(batch, list):
        return [move_to_device(element, device) for element in batch]
    if isinstance(batch, dict):
        moved = {}
        for name, value in batch.items():
            moved[name] = move_to_device(value, device)
        return moved
    # Anything else (ints, strings, tuples, None, ...) passes through as-is.
    return batch

# Character Error Rate, Word Error Rate, METEOR and Exact Match metrics.
cer_metric, wer_metric, meteor, exact_match_metric = (
    load(metric_name)
    for metric_name in ("cer", "wer", 'meteor', "exact_match")
)

# BLEU and ROUGE metrics.
bleu_metric, rouge_metric = load("bleu"), load('rouge')

# Collect the decoded model outputs and the decoded references for the test set.
generated_translations = []
references = []

# Switch to inference mode so dropout is disabled while generating.
model.eval()

for batch in test_dataloader:
    # Use the notebook-wide `device` rather than a hard-coded 'cuda', so the
    # loop also runs on a CPU-only runtime.
    batch = move_to_device(batch, device)

    input_text = batch['input_ids']  # token ids of the Banglish inputs
    labels = batch['labels']  # token ids of the Bangla references

    # The inputs are padded to a fixed length, so the attention mask is passed
    # along to keep the encoder from attending to padding tokens.
    translation_ids = model.generate(
        input_text,
        attention_mask=batch['attention_mask'],
        max_length=512,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )

    # Decode on the CPU.
    translation_ids = translation_ids.to('cpu')
    generated_translation = tokenizer.batch_decode(translation_ids, skip_special_tokens=True)
    generated_translations.extend(generated_translation)

    # Negative ids (e.g. a loss-ignore value) cannot be decoded; map them back
    # to the pad token, which skip_special_tokens then drops.
    labels = labels.masked_fill(labels < 0, tokenizer.pad_token_id).to('cpu')
    references.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))


In [None]:
# Report how many predictions and references were collected.
for caption, collected in (
    ("Number of generated translations:", generated_translations),
    ("Number of references:", references),
):
    print(caption, len(collected))

In [None]:
# Print the entire list of generated translations.
# NOTE(review): this dumps every element; consider slicing if the list is large.
print(generated_translations)

In [None]:
# Print the entire list of reference translations.
# NOTE(review): this dumps every element; consider slicing if the list is large.
print(references)

In [None]:
# Every metric is scored on the same prediction/reference pairs.
metric_inputs = {"predictions": generated_translations, "references": references}

# Character Error Rate (CER) and Word Error Rate (WER).
results_CER = cer_metric.compute(**metric_inputs)
results_WER = wer_metric.compute(**metric_inputs)

# Exact Match (EM) and METEOR.
results_em = exact_match_metric.compute(**metric_inputs)
results_met = meteor.compute(**metric_inputs)

# Bilingual Evaluation Understudy (BLEU).
results_bleu = bleu_metric.compute(**metric_inputs)


# **Printing all evaluation metrics**

In [None]:
# Print each metric result with its label, one per line.
for heading, result in (
    ("Character Error Rate for Banglish to Bangla Translation:", results_CER),
    ("Word Error Rate for Banglish to Bangla Translation:", results_WER),
    ("Exact Match for Banglish to Bangla Translation:", results_em),
    ("BLEU Score for Banglish to Bangla Translation:", results_bleu),
    ("METEOR for Banglish to Bangla Translation:", results_met),
):
    print(heading, result)


In [None]:
!pip install unidecode

In [None]:
from rouge_score import rouge_scorer
from unidecode import unidecode

# ROUGE-1, ROUGE-2 and ROUGE-L scorer with stemming disabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=False,
)

def preprocess_text(text):
    """Pass ``text`` through ``unidecode`` and collapse whitespace runs to single spaces."""
    return ' '.join(unidecode(text).split())

# Per-example ROUGE scores: one list per (ROUGE variant, statistic) pair.
rouge1_f1_scores = []
rouge1_precision_scores = []
rouge1_recall_scores = []
rouge2_f1_scores = []
rouge2_precision_scores = []
rouge2_recall_scores = []
rougeL_f1_scores = []
rougeL_precision_scores = []
rougeL_recall_scores = []

for ref, pred in zip(references, generated_translations):
    candidate = preprocess_text(pred)
    # Each reference is already one decoded string. Joining it with ' ' would
    # put a space between every character and score it as one-character
    # tokens against a word-level candidate, so it is preprocessed directly.
    reference = preprocess_text(ref)
    scores = scorer.score(reference, candidate)

    rouge1_f1_scores.append(scores['rouge1'].fmeasure)
    rouge1_precision_scores.append(scores['rouge1'].precision)
    rouge1_recall_scores.append(scores['rouge1'].recall)
    rouge2_f1_scores.append(scores['rouge2'].fmeasure)
    rouge2_precision_scores.append(scores['rouge2'].precision)
    rouge2_recall_scores.append(scores['rouge2'].recall)
    rougeL_f1_scores.append(scores['rougeL'].fmeasure)
    rougeL_precision_scores.append(scores['rougeL'].precision)
    rougeL_recall_scores.append(scores['rougeL'].recall)

def _average(values):
    """Arithmetic mean of a non-empty list of numbers."""
    return sum(values) / len(values)


# Mean of every per-example ROUGE statistic.
avg_rouge1_f1 = _average(rouge1_f1_scores)
avg_rouge1_precision = _average(rouge1_precision_scores)
avg_rouge1_recall = _average(rouge1_recall_scores)
avg_rouge2_f1 = _average(rouge2_f1_scores)
avg_rouge2_precision = _average(rouge2_precision_scores)
avg_rouge2_recall = _average(rouge2_recall_scores)
avg_rougeL_f1 = _average(rougeL_f1_scores)
avg_rougeL_precision = _average(rougeL_precision_scores)
avg_rougeL_recall = _average(rougeL_recall_scores)

# Print the averaged ROUGE-1, ROUGE-2 and ROUGE-L statistics, one per line.
for caption, average in (
    ("Average Rouge-1 F1 Score:", avg_rouge1_f1),
    ("Average Rouge-1 Precision:", avg_rouge1_precision),
    ("Average Rouge-1 Recall:", avg_rouge1_recall),
    ("Average Rouge-2 F1 Score:", avg_rouge2_f1),
    ("Average Rouge-2 Precision:", avg_rouge2_precision),
    ("Average Rouge-2 Recall:", avg_rouge2_recall),
    ("Average Rouge-L F1 Score:", avg_rougeL_f1),
    ("Average Rouge-L Precision:", avg_rougeL_precision),
    ("Average Rouge-L Recall:", avg_rougeL_recall),
):
    print(caption, average)


In [None]:
# Import necessary library for Hugging Face Hub authentication
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub.
notebook_login()

In [None]:
# Upload the model to the Hub repository "Soyeda10/BanglishToBangla".
# NOTE(review): only `model` is pushed by this call; confirm whether the
# tokenizer should be uploaded to the same repository as well.
model.push_to_hub("Soyeda10/BanglishToBangla")