# Amharic Model Finetuning

This notebook contains the code for fine-tuning the mT5-small model on the Amharic-1 and Amharic-2 datasets. The two models differ only in the dataset they start from; otherwise nearly the same flow is used for both. The one other notable difference is the normalization steps applied during preprocessing: they were added when mT5-small was fine-tuned on the Amharic-2 dataset and were not used with the Amharic-1 dataset.

Models trained using this flow are:

- Initial-Amharic-FT (using Amharic-1)
- Improved-Amharic-FT (using Amharic-2)

In [None]:
# Installing Packages
!pip install datasets transformers sentencepiece accelerate -U tensorflow --upgrade torch torchvision peft nltk rouge_score  > /dev/null 2>&1

In [None]:
# Loading packages
import re
import sys
import os
import time
import random
import torch
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    MT5Tokenizer,
)
from tqdm import tqdm
from torch.utils.data import DataLoader
from rouge_score import rouge_scorer
from google.colab import drive

## Data Loading and Preprocessing

In [None]:
# Mounting Google Drive in the current Colab session so files stored in the Drive can be accessed
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Google Drive directory that holds the data files
folder_path = '/content/drive/My Drive/CPSC_490_Data/'

# Paths of the training, validation and test CSV files inside that directory
train_path, valid_path, test_path = (
    folder_path + f'amharic_2_{split}.csv' for split in ('train', 'valid', 'test')
)

In [None]:
# Normalization steps
# The following three functions are taken from: https://abe2g.github.io/am-preprocess.html
# The full source is cited in detail in the final report
# These steps are applied to datasets that have not already been normalized (i.e., Amharic-2)
# Amharic-3 goes through these steps during its preparation phase.
# These steps weren't included during the use of Amharic-1.

# Single-character spelling variants, keyed by the canonical character each
# group collapses to. No canonical character is itself a variant, so all of
# these substitutions can be applied in one pass.
_AM_CHAR_VARIANTS = {
  'ሀ': 'ሃኃሐሓኻ',
  'ሁ': 'ሑኁዅ',
  'ሂ': 'ኂሒኺ',
  'ሄ': 'ኌሔዄ',
  # ኅ belongs here (sixth order). It used to be listed under ሀ as well, which
  # ran first and made this mapping unreachable.
  'ህ': 'ሕኅ',
  'ሆ': 'ኆሖኾ',
  'ሰ': 'ሠ',
  'ሱ': 'ሡ',
  'ሲ': 'ሢ',
  'ሳ': 'ሣ',
  'ሴ': 'ሤ',
  'ስ': 'ሥ',
  'ሶ': 'ሦ',
  'አ': 'ዓኣዐ',
  'ኡ': 'ዑ',
  'ኢ': 'ዒ',
  'ኤ': 'ዔ',
  'እ': 'ዕ',
  'ኦ': 'ዖ',
  'ፀ': 'ጸ',
  'ፁ': 'ጹ',
  'ፂ': 'ጺ',
  'ፃ': 'ጻ',
  'ፄ': 'ጼ',
  'ፅ': 'ጽ',
  'ፆ': 'ጾ',
}
_AM_CHAR_TABLE = str.maketrans({
  variant: canonical
  for canonical, variants in _AM_CHAR_VARIANTS.items()
  for variant in variants
})

# "u"-form character -> labialized character that replaces the two-character
# sequence <u-form><ዋ or አ>, e.g. በልቱዋል or በልቱአል -> በልቷል.
_AM_LABIALIZED = {
  'ሉ': 'ሏ',
  'ሙ': 'ሟ',
  'ቱ': 'ቷ',
  'ሩ': 'ሯ',
  'ሱ': 'ሷ',
  'ሹ': 'ሿ',
  'ቁ': 'ቋ',
  'ቡ': 'ቧ',
  'ቹ': 'ቿ',
  'ሁ': 'ኋ',
  'ኑ': 'ኗ',
  'ኙ': 'ኟ',
  'ኩ': 'ኳ',
  'ዙ': 'ዟ',
  'ጉ': 'ጓ',
  # Was ደ (first order), which rewrote the ordinary sequences ደዋ / ደአ.
  'ዱ': 'ዷ',
  'ጡ': 'ጧ',
  'ጩ': 'ጯ',
  # By the time this rule runs ጹ has already been unified to ፁ, so the rule
  # has to key on ፁ to ever fire.
  'ፁ': 'ጿ',
  'ፉ': 'ፏ',
}
_AM_LABIALIZED_RE = re.compile('([' + ''.join(_AM_LABIALIZED) + '])[ዋአ]')

# Applied after the labialization step, as in the original rule order:
# ቁ can be written as ቊ, and ኩ can also be written as ኵ.
_AM_LATE_TABLE = str.maketrans({'ቊ': 'ቁ', 'ኵ': 'ኩ'})


def normalize_char_level_missmatch(input_token):
  '''
    Normalize variations of Amharic characters in a given token to standardize them.

    The normalization runs in three steps:
      1. single characters that have several spellings are mapped to one
         canonical character;
      2. a "u"-form character followed by ዋ or አ is merged into the
         corresponding labialized character;
      3. ቊ and ኵ are rewritten to ቁ and ኩ.

    Characters that are not covered by any rule are returned unchanged.

    Args:
        input_token (str): The Amharic token to be normalized.

    Returns:
        str: The normalized Amharic token.
  '''

  unified = input_token.translate(_AM_CHAR_TABLE)
  labialized = _AM_LABIALIZED_RE.sub(
      lambda match: _AM_LABIALIZED[match.group(1)], unified
  )
  return labialized.translate(_AM_LATE_TABLE)

# :: No longer removed
# Every character that remove_punc_and_special_chars deletes. The list is kept
# as plain text and escaped with re.escape, so the pattern contains no
# hand-written (and invalid) string escapes such as "\!" or "\@".
_PUNC_AND_SPECIAL_CHARS = '!@#$%^«»&*()…[]{};“”›’‘"\':,.‹/<>?\\|`´~-=+፡፤፦፥፧፨፠፣'
_PUNC_AND_SPECIAL_RE = re.compile('[' + re.escape(_PUNC_AND_SPECIAL_CHARS) + ']')


def remove_punc_and_special_chars(text):
  '''
    Remove punctuation and special characters from a given text.

    Only the characters listed in _PUNC_AND_SPECIAL_CHARS are deleted: ASCII
    punctuation, typographic quotes and a set of Ethiopic punctuation marks.
    Whitespace, letters and digits are kept, and so is the Ethiopic full
    stop (።), which is not in the list.

    Args:
        text (str): The text from which to remove punctuation and special characters.

    Returns:
        str: The text stripped of punctuation and special characters.
  '''

  return _PUNC_AND_SPECIAL_RE.sub('', text)

# Remove ASCII letters (A-Z, a-z); digits and all other characters are kept
def remove_ascii_and_numbers(text_input):
  '''
    Strip the ASCII letters A-Z and a-z from the given text.

    Despite the function's name, digits are NOT removed: the pattern only
    covers the two letter ranges, so numbers, whitespace, punctuation and all
    non-ASCII characters pass through untouched.

    Args:
        text_input (str): The text from which to remove ASCII letters.

    Returns:
        str: The text with every A-Z / a-z character deleted.
  '''

  ascii_letters = re.compile('[A-Za-z]')
  return ascii_letters.sub('', text_input)

In [1]:
def preprocess_function(examples, tokenizer):
  '''
    Clean and tokenize a batch of text/summary pairs for mT5 training.

    Every text and summary goes through the same cleaning chain, in this
    order: punctuation/special-character removal, ASCII-letter removal, then
    character-level normalization. Inputs are padded/truncated to 512 tokens
    and summaries to 128 tokens.

    Args:
        examples (dict): A dictionary containing 'text' and 'summary' keys with lists of strings as values.
        tokenizer: The tokenizer to be used for tokenizing the text and summary.

    Returns:
        dict: The tokenizer output for the inputs, with an added 'labels' entry
        holding the summary token ids in which every padding id is replaced by -100.
  '''

  def clean(raw_text):
    # Same three steps, innermost call first, as a readable pipeline
    without_punc = remove_punc_and_special_chars(raw_text)
    without_letters = remove_ascii_and_numbers(without_punc)
    return normalize_char_level_missmatch(without_letters)

  source_texts = list(map(clean, examples["text"]))
  summary_texts = list(map(clean, examples["summary"]))

  # Tokenize inputs (up to 512 tokens) and summaries (up to 128 tokens)
  encoded = tokenizer(source_texts, max_length=512, padding="max_length", truncation=True)
  encoded_summaries = tokenizer(summary_texts, max_length=128, padding="max_length", truncation=True)

  # Swap padding ids for -100 in the labels (presumably so the loss skips
  # padded positions; -100 is the usual ignore index)
  pad_id = tokenizer.pad_token_id
  encoded["labels"] = [
      [-100 if token_id == pad_id else token_id for token_id in row]
      for row in encoded_summaries["input_ids"]
  ]
  return encoded

In [None]:
# Loading the dataset: the three CSV files become the "train", "validation" and "test" splits
datasets = load_dataset("csv", data_files={"train": train_path, "validation": valid_path, "test": test_path})

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Loading mT5's tokenizer from the same 'google/mt5-small' checkpoint used for the base model
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Performing preprocessing and tokenization on every split
# batched=True hands preprocess_function a batch of rows (lists under each key) per call
tokenized_datasets = datasets.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True
)

# Shuffling the training split once with a fixed seed
tokenized_datasets["train"] = tokenized_datasets["train"].shuffle(seed=16)

In [None]:
# Save processed data the first time it is processed (on that first run, uncomment the next line)
# tokenized_datasets.save_to_disk(folder_path + 'brand_new_further_cleaned_Amharic_mT5_tokenized_datasets')

# Load processed data if it has already been saved
# This is referencing the tokenized, preprocessed version of Amharic-2
# NOTE: this reassigns tokenized_datasets, so the copy loaded from disk replaces
# whatever was computed in memory by the preprocessing cell
tokenized_datasets = DatasetDict.load_from_disk(folder_path + 'brand_new_further_cleaned_Amharic_mT5_tokenized_datasets')

## Loading Model

In [None]:
# Getting the base model and setting up the LoRA PEFT configuration
# (the adapter is built with LoraConfig, i.e. LoRA rather than IA3)
# Inline "a -> b" notes indicate changes from: Initial-Amharic-FT -> Improved-Amharic-FT
base_model = AutoModelForSeq2SeqLM.from_pretrained('google/mt5-small')
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode = False,
    r = 6, # 4 -> 6
    lora_alpha = 12, # 32 -> 12
    lora_dropout = 0.15 # 0.1 -> 0.15 (to prevent overfitting)
)

# Wrapping the base model with the adapter and reporting how many parameters are trainable
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 258,048 || all params: 300,434,816 || trainable%: 0.0858915099906397


## Training Hyperparameters

In [None]:
# Initializing training hyperparameters
# num_epochs is used both for the scheduler's total step count and as the epoch count passed to train_model
num_epochs = 30
# batch_size is shared by the training and validation dataloaders
batch_size = 24

In [None]:
# Initializing the optimizer and LR scheduler for the PEFT-wrapped model
# (the adapter in this notebook is configured with LoraConfig, not IA3)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4) # Starting LR increased from 5e-5 to 2e-4 for Improved models

# Loading training data (reshuffled by the dataloader)
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle = True,
    collate_fn = default_data_collator,
    batch_size = batch_size,
    pin_memory = True,
)

# Loading validation data (no shuffling)
validation_dataloader = DataLoader(
    tokenized_datasets["validation"],
    collate_fn = default_data_collator,
    batch_size = batch_size,
    pin_memory = True
)

# Number of training steps: batches per epoch times the number of epochs
num_training_steps = len(train_dataloader) * num_epochs

# Setting up the learning rate scheduler
# Changed to 30% warmup for Improved model
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = int(0.3 * num_training_steps), # 10% warmup, increased to 30% for Improved Amharic models
    num_training_steps = num_training_steps,
)

In [None]:
# Path to save checkpoints (a folder inside the Drive data directory)
checkpoint_path = folder_path + "Improved_Amharic_FT_2_Checkpoints/"

# Early stopping parameters
patience = 3  # How many epochs to wait after last time validation loss improved

## Training Loop

In [None]:
# Setting the device for the model training: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Adding the Drive data folder to the import search path, since that is where training_module.py is located
sys.path.append(folder_path)
# train_model and save_losses come from that project-local module
from training_module import train_model, save_losses

In [None]:
# Training/Fine-tuning the model
# train_model is defined in training_module.py (not shown in this notebook); it returns the
# training and validation losses, which are saved to CSV in the next cell.
# NOTE(review): presumably model_filepath is where the final model is written and
# checkpoint_filepath/patience drive checkpointing and early stopping - confirm in training_module.py
training_losses, validation_losses = train_model(
    model = model,
    train_dataloader = train_dataloader,
    validation_dataloader = validation_dataloader,
    optimizer = optimizer,
    lr_scheduler = lr_scheduler,
    device = device,
    num_epochs = num_epochs,
    checkpoint_filepath = checkpoint_path,
    patience = patience,
    model_filepath = folder_path + "Improved_Amharic_FT"
)

In [None]:
# Saving the training and validation losses returned by train_model to a CSV file in the Drive data folder
# NOTE(review): whether the losses are recorded per step or per epoch depends on training_module.py - confirm there
save_losses(
    training_losses = training_losses,
    validation_losses = validation_losses,
    file_path = folder_path + "improved_amharic_ft_losses.csv"
)