# Amharic Model Finetuning 2

This notebook contains the code for fine-tuning the mT5-small model on the Amharic-3 dataset. Because text normalization and punctuation removal were already completed when this dataset was prepared, those steps are neither needed nor included in this code.

Models trained using this flow:

- Improved-Amharic-FT-2 (using Amharic-3)

In [None]:
# Installing Packages
!pip install datasets transformers sentencepiece accelerate -U tensorflow --upgrade torch torchvision peft nltk rouge_score  > /dev/null 2>&1

In [None]:
import os
import random
import sys
import time

import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from google.colab import drive
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    MT5Tokenizer,
)

## Data Loading and Preprocessing

In [None]:
# Mounting Google Drive to the current Colab session for accessing files stored in the Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that holds the data files
folder_path = '/content/drive/My Drive/CPSC_490_Data/'

# CSV file of each split; folder_path already ends with a slash,
# so the file name is appended directly
train_path = f"{folder_path}amharic_3_train.csv"
valid_path = f"{folder_path}amharic_3_valid.csv"
test_path = f"{folder_path}amharic_3_test.csv"

In [None]:
# Preprocessing function
def preprocess_function(examples, tokenizer, max_length=512, max_summary_length=128, padding="max_length"):
  """
  Tokenizes a batch of text/summary pairs and builds the label ids used for mT5 training.

  Args:
    examples (dict): Batch with 'text' and 'summary' keys, each holding a list of strings.
    tokenizer (Tokenizer): Callable tokenizer; it must expose `pad_token_id` and return a
      mapping that contains 'input_ids'.
    max_length (int): Token limit applied to the input texts (default 512).
    max_summary_length (int): Token limit applied to the summaries (default 128).
    padding (str | bool): Padding strategy forwarded to the tokenizer (default "max_length").

  Returns:
    dict: The tokenized inputs plus a 'labels' entry holding the summary token ids, with
    every padding id replaced by -100.
  """

  # Selecting inputs and targets
  inputs = list(examples["text"])
  targets = list(examples["summary"])

  # Tokenizing inputs and labels (both are truncated to their respective limits)
  model_inputs = tokenizer(inputs, max_length=max_length, padding=padding, truncation=True)
  labels = tokenizer(targets, max_length=max_summary_length, padding=padding, truncation=True)

  # Replacing padding ids with -100, the index that PyTorch's cross-entropy loss
  # ignores by default, and storing the result as the model's training targets
  pad_id = tokenizer.pad_token_id
  model_inputs["labels"] = [
      [(token_id if token_id != pad_id else -100) for token_id in label_seq]
      for label_seq in labels["input_ids"]
  ]

  return model_inputs

In [None]:
# Mapping each split name to its CSV file, then reading all splits in a single call
split_files = {
    "train": train_path,
    "validation": valid_path,
    "test": test_path,
}
datasets = load_dataset("csv", data_files=split_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Loading mT5's tokenizer from the 'google/mt5-small' checkpoint
# (the same checkpoint the base model is loaded from further below)
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Tokenizing every split batch-wise with preprocess_function;
# the tokenizer is supplied as an extra keyword argument on each call
tokenized_datasets = datasets.map(
    preprocess_function,
    fn_kwargs = {"tokenizer": tokenizer},
    batched = True,
)

# Shuffling the training split with a fixed seed
shuffled_train_split = tokenized_datasets["train"].shuffle(seed=16)
tokenized_datasets["train"] = shuffled_train_split

Map:   0%|          | 0/23492 [00:00<?, ? examples/s]

Map:   0%|          | 0/2937 [00:00<?, ? examples/s]

Map:   0%|          | 0/2937 [00:00<?, ? examples/s]

In [None]:
# Location on Drive of the tokenized, preprocessed version of Amharic-3
tokenized_path = folder_path + 'bounded_token_length_brand_new_further_cleaned_Amharic_mT5_tokenized_datasets'

# Save the processed data only the first time it is processed, so the cell runs
# on a fresh Drive without commenting or uncommenting any line by hand
if not os.path.isdir(tokenized_path):
    tokenized_datasets.save_to_disk(tokenized_path)

# Continue from the saved copy in every case
tokenized_datasets = DatasetDict.load_from_disk(tokenized_path)

Saving the dataset (0/1 shards):   0%|          | 0/23492 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2937 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2937 [00:00<?, ? examples/s]

## Loading Model

In [None]:
# LoRA adapter configuration for sequence-to-sequence fine-tuning
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=6,
    lora_alpha=12,
    lora_dropout=0.15,
)

# Pretrained mT5-small model that serves as the base for the adapters
base_model = AutoModelForSeq2SeqLM.from_pretrained('google/mt5-small')

# Wrapping the base model according to the PEFT configuration and
# reporting how many parameters are trainable
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 258,048 || all params: 300,434,816 || trainable%: 0.0858915099906397


## Training Hyperparameters

In [None]:
# Training hyperparameters
batch_size = 24   # examples per batch in both data loaders
num_epochs = 30   # number of epochs requested from the training routine

In [None]:
# AdamW optimizer over the PEFT-wrapped model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

# Settings shared by the training and validation loaders
loader_kwargs = dict(
    collate_fn = default_data_collator,
    batch_size = batch_size,
    pin_memory = True,
)

# Training loader (shuffled)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle = True, **loader_kwargs)

# Validation loader (dataset order kept)
validation_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

# Number of training steps: batches per epoch times number of epochs
num_training_steps = num_epochs * len(train_dataloader)

# Linear schedule whose warm-up covers the first 30% of the training steps
warmup_steps = int(0.3 * num_training_steps)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = warmup_steps,
    num_training_steps = num_training_steps,
)

In [None]:
# Early stopping setting handed to the training routine
# (presumably the number of non-improving epochs tolerated; confirm in training_module)
patience = 3

# Directory on Drive where the checkpoints are saved
checkpoint_path = f"{folder_path}Improved_Amharic_FT_2_Model_Checkpoints/"

## Training Loop

In [None]:
# Choosing the training device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Setting the system file path to where training_module.py is located
# (guarded so that re-running the cell does not append the same entry again)
if folder_path not in sys.path:
    sys.path.append(folder_path)
from training_module import train_model, save_losses

In [None]:
# Path on Drive passed to train_model as the model file path
final_model_path = folder_path + "Improved_Amharic_FT_2"

# Training/Fine-tuning the model; the two returned values are kept as the
# training and validation losses
training_losses, validation_losses = train_model(
    # model and data
    model = model,
    train_dataloader = train_dataloader,
    validation_dataloader = validation_dataloader,
    # optimization
    optimizer = optimizer,
    lr_scheduler = lr_scheduler,
    num_epochs = num_epochs,
    patience = patience,
    device = device,
    # output locations
    checkpoint_filepath = checkpoint_path,
    model_filepath = final_model_path,
)

In [None]:
# CSV file on Drive that receives the recorded losses
losses_csv_path = folder_path + "improved_amharic_ft_2_losses.csv"

# Writing the training and validation losses returned by train_model to that file
save_losses(
    training_losses = training_losses,
    validation_losses = validation_losses,
    file_path = losses_csv_path,
)