<a href="https://colab.research.google.com/github/deeksha3009/deeksha98/blob/master/mT5_Transformer_BM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets sacrebleu transformers

In [None]:
import warnings
import numpy as np
import pandas as pd
import datasets
import torch
import transformers
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import load_metric
from sacrebleu import corpus_bleu

warnings.filterwarnings("ignore")


In [None]:
# Mount Google Drive at /content/drive so the corpus files stored under
# /content/drive/MyDrive can be opened like local files.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def load_data(file_name):
  """Read a UTF-8 text file and return its lines as a list of strings.

  Line terminators are kept exactly as the file object yields them; the last
  entry has none when the file does not end with a newline.
  """
  with open(file_name, mode='r', encoding='utf-8') as handle:
    return list(handle)

# Parallel corpus: row i pairs line i of the Hindi file ('src') with line i of
# the Kannada file ('tgt').
src_lines = load_data('/content/drive/MyDrive/Thesis/NLLB_hi_kn-hi.txt')
tgt_lines = load_data('/content/drive/MyDrive/Thesis/NLLB_hi_kn-kn.txt')

# Drop exact duplicate pairs, keep a random 2% of the rows (unseeded, so the
# subset differs from run to run) and renumber the index from 0.
dataset = (
    pd.DataFrame({'src': src_lines, 'tgt': tgt_lines})
    .drop_duplicates()
    .sample(frac=0.02)
    .reset_index(drop=True)
)

In [None]:
import matplotlib.pyplot as plt

def plot_token_distribution(dataset, column, title):
    """Show a histogram of per-row word counts for one text column.

    "Tokens" here are whitespace-separated pieces (``str.split()``), not
    tokenizer sub-words.

    Args:
        dataset: Table indexed by column name whose columns support ``.apply``.
        column: Name of the text column to measure.
        title: Text appended to the "Token Distribution: " plot title.
    """
    def count_words(text):
        return len(text.split())

    lengths = dataset[column].apply(count_words)

    plt.figure(figsize=(6, 4))
    plt.hist(lengths, bins=30, color='blue', alpha=0.7)
    plt.xlabel('length of Tokens')
    plt.ylabel('Frequency')
    plt.title(f'Token Distribution: {title}')
    plt.show()

# Word-count histograms for both sides of the sampled corpus.
plot_token_distribution(dataset, 'src', 'hindi Language - Tokens')
plot_token_distribution(dataset, 'tgt', 'kannada Language - Tokens')


In [None]:
# Split the data into training, validation and test sets
def split_data(dataset, test_size=0.25, validation_size=0.35, random_state=None):
  """Split ``dataset`` into train, validation and test parts.

  The test part is carved out first; the validation part is then taken from
  what remains. With the defaults this gives roughly 48.75% train, 26.25%
  validation and 25% test.

  Args:
    dataset: Data accepted by ``train_test_split`` (here a pandas DataFrame).
    test_size: Share of ``dataset`` held out as the test part.
    validation_size: Share of the remaining rows used for validation.
    random_state: Seed forwarded to both splits; ``None`` keeps the splits
      unseeded, so they differ from run to run.

  Returns:
    A ``(train_data, validation_data, test_data)`` tuple.
  """
  train_data, test_data = train_test_split(
      dataset, test_size=test_size, random_state=random_state)
  train_data, validation_data = train_test_split(
      train_data, test_size=validation_size, random_state=random_state)
  return train_data, validation_data, test_data

train_data, validation_data, test_data = split_data(dataset)

# Wrap each pandas split in a `datasets.Dataset`, in train/validation/test order.
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(frame)
    for frame in (train_data, validation_data, test_data)
)

In [None]:
# Inspect the size of the validation split.
validation_dataset.shape

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained mT5 checkpoint and its matching tokenizer.
model_name = 'google/mt5-base'
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)


In [None]:
# Move the model onto `device` (GPU when available, otherwise CPU).
model.to(device)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs.

    Args:
        examples: Mapping with "src" and "tgt" entries; presumably lists of
            strings, since the function is applied with ``batched=True``.

    Returns:
        The tokenizer output for the batch, with the target text passed as
        ``text_target`` and sequences truncated and padded to 128 tokens.
    """
    # NOTE(review): with padding="max_length" the label ids are presumably
    # padded with the tokenizer's pad id rather than -100, so padded positions
    # may count towards the loss. Confirm, and if masking is added make sure
    # the metric code replaces -100 before decoding.
    return tokenizer(
        examples["src"],
        text_target=examples["tgt"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

In [None]:
# Tokenize all three splits in batches; each split is replaced by its tokenized version.
train_dataset = train_dataset.map(preprocess_function, batched=True, desc="Running tokenizer on train dataset")
validation_dataset = validation_dataset.map(preprocess_function, batched=True, desc="Running tokenizer on validation dataset")
test_dataset = test_dataset.map(preprocess_function, batched=True, desc="Running tokenizer on test dataset")


In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints and
    # reload the one with the best validation BLEU when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save model at the end of each epoch
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",  # the "bleu" key returned by compute_metrics
    # Optimisation: 4 sequences per step x 2 accumulation steps per update.
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # Evaluation decodes with generate() so BLEU is scored on real output text.
    per_device_eval_batch_size=4,
    eval_accumulation_steps=2,
    predict_with_generate=True,
    # Allow evaluation to generate up to the same 128-token limit used when
    # tokenizing the references. Without this, generation falls back to the
    # model's default length cap and longer hypotheses are cut short, which
    # depresses BLEU.
    generation_max_length=128,
)

# Data collator: assembles tokenized examples into batches for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# BLEU metric (sacreBLEU implementation) used by compute_metrics.
# NOTE(review): `load_metric` is imported from `datasets`; newer releases may
# no longer provide it — confirm against the installed version.
bleu_metric = load_metric("sacrebleu")

def compute_metrics(eval_preds):
    """Compute corpus BLEU for one evaluation pass.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may arrive wrapped in a tuple, in which case the first
            element is used.

    Returns:
        ``{"bleu": score}`` with the sacreBLEU corpus score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the padding/ignore index used for labels and for gathered
    # predictions; it is not a vocabulary id, so swap it for the pad token
    # before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    # The metric expects a list of references per prediction.
    decoded_labels = [
        [text.strip()]
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": bleu['score']}

# Trainer for the first (src -> tgt) training phase: trains on the train split
# and evaluates on the validation split, scoring BLEU through compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Training is started by the trainer.train() call that follows.


In [None]:
# Run the first training phase with the settings in training_args.
trainer.train()

In [None]:
import pandas as pd

# Extract the log history recorded by the trainer during training/evaluation
# (presumably one dict per logging or evaluation event).
log_history = trainer.state.log_history

# Convert the log history into a DataFrame; keys missing from an entry become NaN
metrics_df = pd.DataFrame(log_history)

# Display the first rows to understand the structure
metrics_df.head()


In [None]:
import matplotlib.pyplot as plt

# Separate the logged rows into those carrying a training loss and those
# carrying a validation loss; each keeps its step for the x-axis.
train_loss = metrics_df.loc[metrics_df['loss'].notna(), ['step', 'loss']]
eval_loss = metrics_df.loc[metrics_df['eval_loss'].notna(), ['step', 'eval_loss']]

# Draw both curves against the step count on one set of axes.
plt.figure(figsize=(5, 5))
plt.plot(train_loss['step'], train_loss['loss'], color='yellow', label='Training Loss')
plt.plot(eval_loss['step'], eval_loss['eval_loss'], color='red', label='Validation Loss')

# Axis labels, title and legend.
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Time')
plt.legend()

# Display the plot
plt.show()


In [None]:
# Evaluate the trained model on the held-out test split. In file order this
# runs after trainer.train(), so "Initial" means the first (src -> tgt)
# training phase, i.e. before the reverse-direction training further down.
# 'eval_bleu' is presumably the "bleu" value from compute_metrics with the
# trainer's "eval_" prefix.
initial_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Initial BLEU score on test set: {initial_results['eval_bleu']}")

# Save the model after the first training phase.
# NOTE(review): the argument looks like a file name but is presumably used as
# an output directory — confirm.
trainer.save_model("mt5_model.pt")

In [None]:
def generate_translation(text, max_length=128):
    """Translate ``text`` with the current ``model`` and return one string.

    Args:
        text: Source sentence. A list of sentences is tokenized as a batch,
            but only the first translation is returned.
        max_length: Token limit applied to both the tokenized input and the
            generated output. The default of 128 matches the limit used when
            tokenizing the training data.

    Returns:
        The decoded translation with special tokens removed.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(device)
    # Pass an explicit limit: without it generate() uses the model's default
    # length cap, which cuts longer translations short.
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
import sacrebleu

# Generate translations for the test set
def get_score(test_dataset):
  """Translate every example and print the corpus-level sacreBLEU score.

  Args:
    test_dataset: Iterable of examples with "src" (text to translate) and
      "tgt" (reference translation) entries.
  """
  references = []
  hypotheses = []
  for ex in test_dataset:
    references.append(ex["tgt"])
    hypotheses.append(generate_translation(ex["src"]))

  # sacrebleu.corpus_bleu takes a list of reference *streams*, each aligned
  # with the hypotheses. There is one reference per sentence here, so pass a
  # single stream holding all of them rather than one stream per sentence.
  bleu = sacrebleu.corpus_bleu(hypotheses, [references])
  print(f"Test BLEU score after fine-tuning: {bleu.score}")

In [None]:
# Print sacreBLEU for the forward-direction test split, generating one sentence at a time.
get_score(test_dataset)

In [None]:
# Spot check: translate a single source-language sentence and print the result.
input_text = "यह शुरू करने का समय है"
result = generate_translation(input_text)
print(result)


In [None]:
# Reverse-direction corpus: the same line pairs with the roles swapped, so the
# former target lines become 'src' and the former source lines become 'tgt'.
back_dataset = pd.DataFrame({'src': tgt_lines , 'tgt': src_lines})

# Drop duplicate pairs and keep a random 2% of the rows.
# NOTE(review): this sample is drawn independently of the forward `dataset`
# sample, so reverse-direction evaluation rows may coincide with pairs used in
# forward training — confirm whether that overlap is acceptable.
back_dataset = back_dataset.drop_duplicates().sample(frac=0.02).reset_index(drop=True)

In [None]:
# Preview the first three swapped pairs.
back_dataset.head(3)

In [None]:
rev_train, rev_validation, rev_test = split_data(back_dataset)

# Wrap each reverse-direction pandas split in a `datasets.Dataset`.
rev_train_dataset, rev_validation_dataset, rev_test_dataset = (
    Dataset.from_pandas(frame)
    for frame in (rev_train, rev_validation, rev_test)
)

In [None]:
# Inspect the size of the reverse-direction training split.
rev_train_dataset.shape

In [None]:
# Tokenize the reverse-direction splits with the same preprocessing as the forward ones.
rev_train_dataset = rev_train_dataset.map(preprocess_function, batched=True, desc="Running tokenizer on train dataset back translation")
rev_validation_dataset = rev_validation_dataset.map(preprocess_function, batched=True, desc="Running tokenizer on validation dataset back translation")
rev_test_dataset = rev_test_dataset.map(preprocess_function, batched=True, desc="Running tokenizer on test dataset back translation")

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before the next training run.
torch.cuda.empty_cache()

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Separate directories for the reverse-direction run. Reusing "./results"
    # and './logs' would mix its checkpoints and logs with those of the first
    # run, and with save_total_limit=2 the earlier checkpoints could be
    # overwritten or pruned.
    output_dir="./results_back",
    logging_dir='./logs_back',
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints and
    # reload the one with the best validation BLEU when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save model at the end of each epoch
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",  # the "bleu" key returned by compute_metrics
    # Optimisation: 4 sequences per step x 2 accumulation steps per update.
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # Evaluation decodes with generate() so BLEU is scored on real output text.
    per_device_eval_batch_size=4,
    eval_accumulation_steps=2,
    predict_with_generate=True,
    # Allow evaluation to generate up to the same 128-token limit used when
    # tokenizing the references. Without this, generation falls back to the
    # model's default length cap and longer hypotheses are cut short, which
    # depresses BLEU.
    generation_max_length=128,
)

# Data collator: assembles tokenized examples into batches for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# BLEU metric (sacreBLEU implementation) used by compute_metrics.
# NOTE(review): `load_metric` is imported from `datasets`; newer releases may
# no longer provide it — confirm against the installed version.
bleu_metric = load_metric("sacrebleu")

def compute_metrics(eval_preds):
    """Compute corpus BLEU for one evaluation pass.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may arrive wrapped in a tuple, in which case the first
            element is used.

    Returns:
        ``{"bleu": score}`` with the sacreBLEU corpus score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the padding/ignore index used for labels and for gathered
    # predictions; it is not a vocabulary id, so swap it for the pad token
    # before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    # The metric expects a list of references per prediction.
    decoded_labels = [
        [text.strip()]
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": bleu['score']}

# Trainer for the reverse-direction (back-translation) phase: continues training
# the same `model` object on the swapped src/tgt splits.
back_trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=rev_train_dataset,
    eval_dataset=rev_validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Training is started by the back_trainer.train() call that follows.


In [None]:
# Run the reverse-direction training phase.
back_trainer.train()

In [None]:
import sacrebleu

# Translate the reverse-direction test split with the model as it stands after
# the second training run, and print its sacreBLEU score.
get_score(rev_test_dataset)

In [None]:
# Spot check for the reverse direction: translate a single sentence and print the result.
input_text = "ಇದು ಇಂದು ಸಂಭವಿಸಿದೆ"
result = generate_translation(input_text)
print(result)
