<a href="https://colab.research.google.com/github/deeksha3009/deeksha98/blob/master/Copy_of_mT5_Domain_Mix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Implementation of language translation from Hindi to Kannada

This was implemented by fine-tuning the mT5 model on parallel data from the OPUS collection.

In [None]:
! pip install datasets sacrebleu transformers

In [None]:
import warnings
import numpy as np
import pandas as pd
import datasets
import torch
import transformers
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import load_metric
from sacrebleu import corpus_bleu

warnings.filterwarnings("ignore")


In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Load the OPUS NLLB dataset containing data from Meta AI


In [None]:
def load_data(file_name):
  """Read a UTF-8 text file and return its lines without line terminators.

  Args:
    file_name: Path of the text file to read (one sentence per line).

  Returns:
    A list with one string per line of the file, in file order. Blank lines
    are kept as empty strings so that two parallel files stay aligned
    index-by-index.
  """
  with open(file_name, 'r', encoding='utf-8') as f:
    # Iterate the file object (splits on real newlines only) rather than
    # str.splitlines(), which would also split on Unicode separators that can
    # occur inside a sentence and would shift the alignment of parallel files.
    return [line.rstrip('\r\n') for line in f]

# Hindi (source) and Kannada (target) sides of the NLLB parallel corpus;
# line i of one file is the translation of line i of the other.
src_lines = load_data('/content/drive/MyDrive/Thesis/NLLB_hi_kn-hi.txt')
tgt_lines = load_data('/content/drive/MyDrive/Thesis/NLLB_hi_kn-kn.txt')

# Fail early with a clear message if the two files are not aligned.
if len(src_lines) != len(tgt_lines):
    raise ValueError(
        f"Parallel files differ in length: {len(src_lines)} source lines "
        f"vs {len(tgt_lines)} target lines"
    )

dataset = pd.DataFrame({'src': src_lines, 'tgt': tgt_lines})

# Keep a 2% sample of the de-duplicated pairs. The fixed random_state makes the
# sample (and therefore every downstream number) reproducible across runs.
dataset = dataset.drop_duplicates().sample(frac=0.02, random_state=42).reset_index(drop=True)

In [None]:
# Sanity check: (rows, columns) of the sub-sampled NLLB frame.
dataset.shape

# Load TED2020 dataset from TEDTalk domain

The dataset is available at https://opus.nlpl.eu/TED2020/hi&kn/v1/TED2020

In [None]:
# Read the Hindi and Kannada sides of TED2020 (second, out-of-domain corpus)
unseen_hi, unseen_kn = (
    load_data(path)
    for path in (
        '/content/drive/MyDrive/Thesis/TED2020_hi-kn_hi.txt',
        '/content/drive/MyDrive/Thesis/TED2020_hi-kn_kn.txt',
    )
)

# Same two-column layout as the NLLB frame: 'src' = Hindi, 'tgt' = Kannada
unseen_data = pd.DataFrame(dict(src=unseen_hi, tgt=unseen_kn))

In [None]:
# Sanity check: (rows, columns) of the TED2020 frame.
unseen_data.shape

In [None]:
# Split the NLLB sample into train / validation / test.
# 25% is held out for test; 35% of the remaining 75% becomes validation,
# i.e. roughly 48.75% train / 26.25% validation / 25% test overall.
# random_state is fixed so the splits are identical on every run.
train_data, test_data = train_test_split(dataset, test_size=0.25, random_state=42)
train_data, validation_data = train_test_split(train_data, test_size=0.35, random_state=42)

# TED2020 is only split into train / test (75% / 25%).
small_train, small_test = train_test_split(unseen_data, test_size=0.25, random_state=42)


# Convert DataFrame to Dataset.
# preserve_index=False keeps the shuffled pandas index from being carried
# over as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
validation_dataset = Dataset.from_pandas(validation_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)

small_train = Dataset.from_pandas(small_train, preserve_index=False)
small_test = Dataset.from_pandas(small_test, preserve_index=False)

In [None]:
# Sanity check: (rows, columns) of the NLLB training DataFrame.
train_data.shape

In [None]:
# Sanity check on the TED2020 test split; small_test was converted to a
# `datasets.Dataset` in the split cell, so this is the Dataset's shape.
small_test.shape

In [None]:
# Pick the GPU when one is visible to torch, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One checkpoint name drives both the model weights and its tokenizer
model_name = 'google/mt5-base'
model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = MT5Tokenizer.from_pretrained(model_name)


In [None]:
# Move the model to the device selected above; as the last expression of the
# cell, the returned module is also displayed.
model.to(device)

In [None]:
# Process both the original and smaller dataset

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "src" list (source sentences) and a
            "tgt" list (target sentences).

    Returns:
        The tokenizer output for the batch. Source and target are both
        truncated and padded to 128 tokens. Note that padding happens here,
        so padded target positions hold the tokenizer's pad id rather than an
        ignore index.
    """
    return tokenizer(
        examples["src"],
        text_target=examples["tgt"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

def _tokenize_split(split, desc):
    """Apply preprocess_function to one split in batched mode, labelling the progress bar with `desc`."""
    return split.map(preprocess_function, batched=True, desc=desc)


# NLLB splits
train_dataset = _tokenize_split(train_dataset, "Running tokenizer on train dataset")
validation_dataset = _tokenize_split(validation_dataset, "Running tokenizer on validation dataset")
test_dataset = _tokenize_split(test_dataset, "Running tokenizer on test dataset")

# TED2020 splits
small_train = _tokenize_split(small_train, "Running tokenizer on small train dataset")
small_test = _tokenize_split(small_test, "Running tokenizer on small test dataset")


In [None]:
# Sanity check: shape of the tokenized NLLB training Dataset (the tokenizer
# output columns have been added by the map call above).
train_dataset.shape

# Train the model on the NLLB train split, validate on the validation split, and evaluate on the test split

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save at the end of each epoch (same cadence as evaluation, as load_best_model_at_end needs)
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,
    predict_with_generate=True,
    # Generate up to the same length the references were tokenized to. Without
    # this the trainer falls back to the model config's default generation
    # length (typically 20 tokens), so long sentences are cut off and BLEU is
    # dragged down by the brevity penalty.
    generation_max_length=128,
    logging_dir='./logs',
    gradient_accumulation_steps=2,
    warmup_steps=500,
    eval_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
)

# Data collator
_seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


def data_collator(features):
    """Collate a batch and mask label padding so it is ignored by the loss.

    The datasets are padded to a fixed length during tokenization, so the
    labels arrive already filled with the tokenizer's pad id. The loss only
    ignores positions equal to -100, so those pad positions are rewritten
    here. decoder_input_ids are built by the wrapped collator before the
    rewrite and are unaffected.

    Args:
        features: List of tokenized examples, as passed by the Trainer.

    Returns:
        The collated batch with pad positions in "labels" set to -100.
    """
    batch = _seq2seq_collator(features)
    labels = batch["labels"]
    batch["labels"] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
    return batch


def compute_metrics(eval_preds):
    """Compute corpus-level BLEU for one evaluation run.

    Args:
        eval_preds: Pair of (generated token ids, label token ids) as arrays.

    Returns:
        Dict with a single "bleu" entry holding the sacreBLEU corpus score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a valid token id; map it back to
    # the pad id so that decoding works and the padding is then skipped.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = [text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    # sacreBLEU expects a list of reference *streams*: one list holding the
    # reference for every hypothesis, wrapped in an outer list.
    # Called directly instead of through the deprecated datasets.load_metric.
    bleu = corpus_bleu(decoded_preds, [decoded_labels])
    return {"bleu": bleu.score}


# Initialize the trainer for the initial training phase
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Training itself is started in the next cell.


In [None]:
# Initial training phase on the NLLB train split (validated every epoch).
trainer.train()

# Persist the trained model. Despite the ".pt" suffix the argument is used as
# an output directory, which is later reloaded with from_pretrained().
trainer.save_model("./base_large_model.pt")


In [None]:
# Release cached GPU memory held by torch's allocator before the next phase.
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Reload the saved model from the same relative path that
# trainer.save_model() wrote to, so the cell does not depend on the working
# directory being /content.
model_new = AutoModelForSeq2SeqLM.from_pretrained("./base_large_model.pt")

In [None]:
# Baseline: score the initially trained model on the held-out NLLB test split,
# before any TED2020 fine-tuning takes place
initial_results = trainer.evaluate(eval_dataset=test_dataset)
initial_bleu = initial_results['eval_bleu']
print(f"Initial BLEU score on test set: {initial_bleu}")

# Fine-tune the model on the TED2020 training split and evaluate it in the same way

In [None]:
# Set training arguments for fine-tuning on the TED2020 split

fine_tune_args = Seq2SeqTrainingArguments(
    output_dir="./fine_tune_results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,  # Lower learning rate than the initial phase (5e-5)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,  # Same number of epochs as the initial phase
    predict_with_generate=True,
    # Generate up to the length the references were tokenized to; otherwise the
    # model config's default generation length (typically 20 tokens) truncates
    # the predictions that BLEU is computed on.
    generation_max_length=128,
    logging_dir='./fine_tune_logs',
    gradient_accumulation_steps=2,
    warmup_steps=200,
    eval_accumulation_steps=2,
)

# Initialize the trainer for fine-tuning. It receives the same `model` object
# that was trained in the initial phase, so fine-tuning continues from there.
fine_tune_trainer = Seq2SeqTrainer(
    model=model,
    args=fine_tune_args,
    train_dataset=small_train,
    # NOTE: TED2020 was only split into train/test, so the per-epoch evaluation
    # runs on its test split (there is no separate validation split).
    eval_dataset=small_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:

# Run the TED2020 fine-tuning phase
fine_tune_trainer.train()

# Re-score the fine-tuned model on the NLLB test split so the number is
# directly comparable with the pre-fine-tuning baseline
final_results = fine_tune_trainer.evaluate(eval_dataset=test_dataset)
final_bleu = final_results['eval_bleu']
print(f"Final BLEU score on test set after fine-tuning: {final_bleu}")

# Evaluation score and example translation

In [None]:
import sacrebleu
# Generate translations and calculate BLEU score for test set
def generate_translation(text, max_length=128):
    """Translate one source sentence with the current `model`.

    Args:
        text: Source sentence to translate.
        max_length: Maximum number of tokens for both the tokenized input and
            the generated output. Defaults to 128, the length used when the
            training data was tokenized.

    Returns:
        The decoded translation as a string, with special tokens removed.
    """
    # Truncate like the training data, and place the tensors wherever the
    # model currently lives.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(model.device)
    # An explicit max_length is needed: the default generation length is short
    # (typically 20 tokens) and would cut longer translations off.
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Generate translations for the test set
# sacreBLEU takes a list of reference *streams*: a single inner list holding
# the reference for every hypothesis, in order. One inner list per sentence
# would be read as many alternative references for the first hypothesis.
refs = [[ex["tgt"].strip() for ex in test_dataset]]
preds = [generate_translation(ex["src"]) for ex in tqdm(test_dataset, desc="Translating test set")]

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(preds, refs)
print(f"Test BLEU score after fine-tuning: {bleu.score}")

In [None]:
# Smoke test: translate a single Hindi sentence ("This is a test.") and print
# the model's output.
input_text = "यह एक परीक्षण है।"
result = generate_translation(input_text)
print(result)
