In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import torch
# torch.cuda.empty_cache()

In [None]:
!pip install wandb -qqq
import wandb

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the secret stored under the name "wandb_api_key" through the
# kaggle_secrets client; the value is used for the wandb login below.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_api_key")

In [None]:
# Authenticate through the Python API rather than interpolating the API key
# into a shell command line (where it can be exposed to the process list/logs).
wandb.login(key=secret_value_0)

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer
from transformers import AutoTokenizer, MT5Model, TrainingArguments, Trainer, MT5ForConditionalGeneration, AutoModelForSeq2SeqLM
import torch

# Checkpoint used for both the tokenizer and the model
# ("google/mt5-small" with MT5ForConditionalGeneration was an earlier alternative).
model_name = "csebuetnlp/banglat5_nmt_en_bn"

# Run on the GPU when one is available, otherwise fall back to the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The slow (non-fast) tokenizer implementation is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(torch_device)



In [None]:
! pip install git+https://github.com/csebuetnlp/normalizer
from normalizer import normalize

In [None]:
# Directory holding the train / test / validation CSV files of the dataset.
DATA_DIR = "/kaggle/input/pentabd-transliterated-dataset"

df_train = pd.read_csv(DATA_DIR + "/train.csv")
df_test = pd.read_csv(DATA_DIR + "/test.csv")
df_val = pd.read_csv(DATA_DIR + "/val.csv")

In [None]:
# Inspect the training split that was just loaded.
df_train

In [None]:
# df_train = df_train[:160]
# df_val = df_val[:40]
# df_test = df_test[:40]

In [None]:
# Print the per-column count of missing values for the train, test and validation splits.
for split_frame in (df_train, df_test, df_val):
    missing_per_column = split_frame.isna().sum()
    print(missing_per_column)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below (an internet connection may be required).
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

def clean_text(text, language='english'):
    """Strip punctuation, lowercase the text and drop stopwords.

    Args:
        text: The string to clean.
        language: Name of the NLTK stopword list to use (default ``'english'``).

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    import unicodedata

    def _keep(char):
        # Combining marks (Unicode category M*) are not alphanumeric, but in
        # scripts such as Bengali they carry the vowel signs and hasanta;
        # dropping them would corrupt the words, so they are preserved.
        return (
            char.isalnum()
            or char.isspace()
            or unicodedata.category(char).startswith('M')
        )

    text = ''.join(char for char in text if _keep(char))

    # Convert to lowercase
    text = text.lower()

    # Remove stopwords; a set gives constant-time membership checks per word
    stop_words = set(stopwords.words(language))
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean Banglish and Bengali text
# df['Banglish_Clean'] = df['Banglish'].apply(clean_text)
# df['Bengali_Clean'] = df['Bengali'].apply(clean_text, language='bengali')  # Specify Bengali for stopword removal

# Normalization for Bengali text (replace with your desired normalization function)
def normalize_bengali(text):
    """Return ``text`` after passing it through the imported ``normalize`` function."""
    return normalize(text)

# Add a 'normalized_bengali' column to every split, derived from 'text_bengali'.
for split_frame in (df_train, df_test, df_val):
    split_frame['normalized_bengali'] = split_frame['text_bengali'].apply(normalize_bengali)


In [None]:
# Show the normalized Bengali text of the row labelled 0 as a sanity check.
df_train.loc[0, "normalized_bengali"]

In [None]:
def find_max_length(df, column_name):
    """Return the character length of the longest string in ``df[column_name]``.

    Args:
        df: DataFrame containing the text column.
        column_name: Name of the string column to measure.

    Returns:
        ``len()`` of the longest text found in the column.
    """
    char_counts = df[column_name].str.len()
    longest_row = char_counts.idxmax()
    longest_text = df.loc[longest_row, column_name]
    return len(longest_text)

# find_max_length(df_train, 'text_bengali')
# print(df_train['text_bengali'][10454])

In [None]:
from datasets import Dataset
def pad_truncate(df, max_length=200):
    """Tokenize a parallel Banglish/Bengali frame into a seq2seq ``Dataset``.

    Args:
        df: DataFrame with a ``text_transliterated`` column (model input) and
            a ``normalized_bengali`` column (target text).
        max_length: Number of tokens every input and label sequence is padded
            or truncated to. Previously this value was assigned but never
            handed to the tokenizer.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns, one row per row of ``df``.
    """
    tokenize_options = dict(padding="max_length", truncation=True, max_length=max_length)

    banglish_tokenized = tokenizer(df['text_transliterated'].tolist(), **tokenize_options)
    bengali_tokenized = tokenizer(df['normalized_bengali'].tolist(), **tokenize_options)

    # Replace padding positions in the labels with -100 so that padding does
    # not contribute to the training loss (compute_metrics maps -100 back to
    # the pad token before decoding).
    pad_id = tokenizer.pad_token_id
    labels = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in bengali_tokenized["input_ids"]
    ]

    dataset = Dataset.from_dict({
        "input_ids": banglish_tokenized["input_ids"],
        "attention_mask": banglish_tokenized["attention_mask"],
        "labels": labels  # Labels are target language tokens
    })

    return dataset

# Tokenize the training split into the dataset consumed by the trainer.
train_dataset = pad_truncate(df_train)
# print(banglish, bengali)
# pad_truncate(df_test)
# pad_truncate(df_val)

In [None]:
# Tokenize the test and validation splits the same way as the training split.
test_dataset, val_dataset = (pad_truncate(split_frame) for split_frame in (df_test, df_val))

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# training_args = TrainingArguments(
#     output_dir="./mt5_banglish_bengali",  # Output directory for checkpoints
#     evaluation_strategy="steps",
#     overwrite_output_dir=True,  # Overwrite existing directory if it exists
#     num_train_epochs=3,  # Adjust based on dataset size and desired accuracy
#     per_device_train_batch_size=2,  # Adjust batch size based on GPU memory
#     save_steps=50,  # Save model checkpoints every 10,000 steps
#     save_total_limit=2,  # Keep only the most recent 2 checkpoints
#     logging_steps=50,  # Log training progress every 500 steps
#     fp16 = True,
#     gradient_accumulation_steps = 6,
#     load_best_model_at_end=True  # Load the best model based on validation metrics
# )

# Per-device batch size shared by training and evaluation.
batch_size = 4

# Fine-tuning configuration, collected in one mapping so each setting sits on its own line.
training_config = {
    # Where checkpoints and logs go, and where metrics are reported.
    "output_dir": "weights",
    "logging_dir": "/logs",
    "report_to": "wandb",
    "logging_steps": 50,
    # Evaluate and checkpoint once per epoch, keeping at most three checkpoints.
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "save_steps": 50,  # NOTE(review): presumably unused while save_strategy is "epoch" - confirm
    "save_total_limit": 3,
    "load_best_model_at_end": True,
    # Optimisation settings.
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "gradient_accumulation_steps": 6,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 5,
    "fp16": False,
    # Generate sequences during evaluation so compute_metrics receives token ids.
    "predict_with_generate": True,
}

args = Seq2SeqTrainingArguments(**training_config)

In [None]:
!pip install bert-score
from datasets import load_metric

# Load the BERTScore metric
bert_metric = load_metric('bertscore')

In [None]:
def compute_metrics(preds_and_labels):
    """Compute mean BERTScore precision, recall and F1 for generated predictions.

    Args:
        preds_and_labels: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also arrive as a tuple whose first element
            holds the generated token ids.

    Returns:
        A dict with the keys ``'BERT F1'``, ``'BERT Precision'`` and
        ``'BERT Recall'``, each the mean of the per-example scores.
    """
    preds, labels = preds_and_labels

    # Some model outputs are tuples; the generated token ids come first
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # is mapped back to the pad token in both arrays before decoding
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode the predictions and labels using the tokenizer, skipping special tokens
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BERTScore using decoded predictions and labels
    result = bert_metric.compute(predictions=decoded_preds, references=decoded_labels, lang='bn')

    # Return the averaged BERTScore components as a dictionary
    return {
        'BERT F1': np.mean(result['f1']),
        'BERT Precision': np.mean(result['precision']),
        'BERT Recall': np.mean(result['recall']),
    }

In [None]:
from transformers import DataCollatorForSeq2Seq

# Data collator for Seq2Seq tasks, built from the tokenizer and model loaded earlier
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Wire the model, training arguments, datasets, collator and metric function
# into a single seq2seq trainer.
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the trainer configured above.
trainer.train()

In [None]:
# Run evaluation with the trainer; no dataset is passed here, so this
# presumably uses the eval_dataset (val_dataset) given at construction - confirm.
trainer.evaluate()

In [None]:
def predict_output(input_sentence):
    """Generate the model's output for one transliterated sentence.

    Args:
        input_sentence: The input text to feed to the model.

    Returns:
        The generated text with special tokens removed, passed through
        ``normalize``.
    """
    # Place the inputs on whatever device the model lives on instead of a
    # hard-coded "cuda", so the helper also works on CPU-only sessions.
    input_ids = tokenizer(input_sentence, return_tensors="pt").input_ids.to(model.device)
    generated_tokens = model.generate(input_ids)

    # Skip special tokens so markers such as padding / end-of-sequence do not
    # end up in the saved predictions (matches the decoding in compute_metrics).
    decoded_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    return normalize(decoded_text)
    
# Generate a prediction for every test sentence and save the frame as CSV.
print("start")
test_inputs = df_test['text_transliterated']
df_test['predictions'] = test_inputs.apply(predict_output)
df_test.to_csv("banglaT5_nmt_en_bn_test.csv", index=False)
print("complete")

In [None]:
def predict_output(input_sentence):
    """Generate the model's output for one transliterated sentence.

    Declared in this cell as well so the validation run is self-contained.

    Args:
        input_sentence: The input text to feed to the model.

    Returns:
        The generated text with special tokens removed, passed through
        ``normalize``.
    """
    # Place the inputs on whatever device the model lives on instead of a
    # hard-coded "cuda", so the helper also works on CPU-only sessions.
    input_ids = tokenizer(input_sentence, return_tensors="pt").input_ids.to(model.device)
    generated_tokens = model.generate(input_ids)

    # Skip special tokens so markers such as padding / end-of-sequence do not
    # end up in the saved predictions (matches the decoding in compute_metrics).
    decoded_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    return normalize(decoded_text)
    
# Generate a prediction for every validation sentence and save the frame as CSV.
print("start")
val_inputs = df_val['text_transliterated']
df_val['predictions'] = val_inputs.apply(predict_output)
df_val.to_csv("banglaT5_nmt_en_bn_val.csv", index=False)
print("complete")