# English Fine-tuning

This notebook contains the code used to fine-tune every model whose first fine-tuning stage uses the English data. The first training loop produces the English-FT model; the second training loop fine-tunes that model further on Arabic and/or Amharic data to create the other models. The model being loaded and the training parameters were adjusted accordingly for each fine-tuning step.

Models trained using this flow are:

- English-FT
- English-Arabic-FT
- English-Amharic-FT
- Improved-English-Arabic-Amharic-FT

In [None]:
# Installing packages
!pip install datasets transformers sentencepiece accelerate -U tensorflow --upgrade torch torchvision peft nltk rouge_score > /dev/null 2>&1

In [None]:
# Loading packages
import os
import random
import sys
import time

import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from google.colab import drive
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    MT5Tokenizer,
)

## Data Loading and Preprocessing

In [None]:
# Mounting Google Drive into the current Colab session so files stored in the Drive can be accessed
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that holds the data files (note the trailing slash:
# other paths are built from it by plain string concatenation)
folder_path = '/content/drive/My Drive/CPSC_490_Data/'

# CSV files for the training, validation and test splits
train_path = f'{folder_path}CNN_DailyMail_train.csv'
valid_path = f'{folder_path}CNN_DailyMail_validation.csv'
test_path = f'{folder_path}CNN_DailyMail_test.csv'

In [2]:
# Preprocessing function
def preprocess_function(examples, tokenizer, max_length=512, max_summary_length=128, padding="max_length"):
  """
    Preprocesses text and summary examples for model training

    Args:
      examples (dict): A dictionary containing 'text' and 'summary' keys with lists of strings.
      tokenizer (Tokenizer): The tokenizer to be used for tokenization. It is called on a list of
        strings and must expose `pad_token_id`.
      max_length (int): Maximum number of tokens kept for each input text (default 512).
      max_summary_length (int): Maximum number of tokens kept for each summary (default 128).
      padding (str or bool): Padding strategy forwarded to the tokenizer (default "max_length").

    Returns:
      dict: The tokenizer output for the texts, with an added "labels" entry holding the summary
      token ids in which every padding token id is replaced by -100.
  """

  # Split inputs and labels
  inputs = list(examples["text"])
  targets = list(examples["summary"])

  # Tokenize inputs and labels (both are truncated to their respective maximum length)
  model_inputs = tokenizer(inputs, max_length=max_length, padding=padding, truncation=True)
  labels = tokenizer(targets, max_length=max_summary_length, padding=padding, truncation=True)

  # Adjusting labels for mT5: padding positions are marked with -100
  # (presumably so the loss ignores them -- this is the ignore index used by PyTorch's cross-entropy)
  pad_token_id = tokenizer.pad_token_id
  masked_labels = [
      [(token_id if token_id != pad_token_id else -100) for token_id in label_seq]
      for label_seq in labels["input_ids"]
  ]

  # Setting the tokenized labels as the model's training targets
  model_inputs["labels"] = masked_labels
  return model_inputs

In [None]:
# Loading the three CSV files as the "train", "validation" and "test" splits
csv_files = {"train": train_path, "validation": valid_path, "test": test_path}
datasets = load_dataset("csv", data_files=csv_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Seed for reproducibility
seed = 16

# Number of entries kept from each split after shuffling:
# 80k for training, 10k for validation and 10k for testing
subset_sizes = {"train": 80000, "validation": 10000, "test": 10000}

# Shuffle each split with the same seed and keep its first `n_examples` entries
for split_name, n_examples in subset_sizes.items():
  datasets[split_name] = datasets[split_name].shuffle(seed=seed).select(range(n_examples))

In [None]:
# Function to rename columns
def rename_columns(example):
  """
    Copies the values of the original columns into the column names used by the
    preprocessing step: "article" --> "text", "highlights" --> "summary"

    Arg:
      example: a single dataset row (a mapping) with "article" and "highlights" keys

    Returns:
      The same row object with "text" and "summary" entries added. The original
      "article" and "highlights" entries are left in place.
  """

  for source_key, target_key in (("article", "text"), ("highlights", "summary")):
    example[target_key] = example[source_key]
  return example

# Apply the renaming function to each subset, then drop the 'id' column together with
# the original 'article' and 'highlights' columns
for split_name in ("train", "validation", "test"):
  renamed_split = datasets[split_name].map(rename_columns)
  datasets[split_name] = renamed_split.remove_columns(["id", "article", "highlights"])

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Loading the mT5 tokenizer from the 'google/mt5-small' checkpoint
tokenizer_checkpoint = 'google/mt5-small'
tokenizer = MT5Tokenizer.from_pretrained(tokenizer_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Applying the preprocessing and tokenization to every split, one batch of entries at a time
def tokenize_batch(examples):
  """Runs preprocess_function on one batch of examples with the notebook's tokenizer."""
  return preprocess_function(examples, tokenizer)

tokenized_datasets = datasets.map(tokenize_batch, batched=True)

# Reshuffling the tokenized training split with a fixed seed
shuffled_train = tokenized_datasets["train"].shuffle(seed=16)
tokenized_datasets["train"] = shuffled_train

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Location of the tokenized, preprocessed version of the English dataset on the Drive
tokenized_datasets_path = folder_path + 'English_mT5_tokenized_datasets'

if os.path.isdir(tokenized_datasets_path):
  # Load processed data: a cached copy exists, so it is used instead of the splits tokenized above
  tokenized_datasets = DatasetDict.load_from_disk(tokenized_datasets_path)
else:
  # First run: nothing is cached yet, so save the splits tokenized in the previous cell
  # (that cell must have been run, otherwise `tokenized_datasets` is undefined here)
  tokenized_datasets.save_to_disk(tokenized_datasets_path)

## Loading Model 1

In [None]:
# Getting the base model and setting up the LoRA PEFT configuration
base_model = AutoModelForSeq2SeqLM.from_pretrained('google/mt5-small')

# LoRA-specific settings: r=4, lora_alpha=32, lora_dropout=0.1
lora_settings = dict(r=4, lora_alpha=32, lora_dropout=0.1)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    **lora_settings,
)

# Wrapping the base model with the PEFT configuration and reporting the trainable parameter count
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 172,032 || all params: 300,348,800 || trainable%: 0.057277405469906985


## Training Hyperparameters

In [None]:
# Batch size and number of epochs (both are reused by the two training loops below)
batch_size = 24
num_epochs = 30

In [None]:
# Initialize the AdamW optimizer over the PEFT (LoRA) model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Settings shared by the training and validation loaders
loader_settings = dict(collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

# Loading training data (shuffled by the loader)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_settings)

# Loading validation data (no shuffling)
validation_dataloader = DataLoader(tokenized_datasets["validation"], **loader_settings)

# Number of training steps: batches per epoch times number of epochs
# (assumes train_model steps the scheduler once per batch -- confirm in training_module.py)
batches_per_epoch = len(train_dataloader)
num_training_steps = batches_per_epoch * num_epochs

# Setting up the learning rate scheduler: linear schedule with 10% of the steps used for warmup
num_warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Directory (inside the Drive data folder) passed to train_model for saving checkpoints
checkpoint_path = f"{folder_path}English_FT_Model_Checkpoints/"

# Early stopping parameter passed to train_model
patience = 3

## Training Loop 1

In [None]:
# Setting the device for the model training: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [None]:
# Making training_module.py (located in the Drive data folder) importable.
# The path is only added once, so re-running this cell does not pile up duplicate entries.
if folder_path not in sys.path:
  sys.path.append(folder_path)
from training_module import train_model, save_losses

In [None]:
# Training/Fine-tuning the model (first training loop)
# NOTE(review): the later loading cell reads from folder_path + "English_FT_Model" --
# confirm that train_model derives that directory from `model_filepath`.
english_ft_run = dict(
    model=model,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    device=device,
    num_epochs=num_epochs,
    checkpoint_filepath=checkpoint_path,
    patience=patience,
    model_filepath=f"{folder_path}English_FT",
)
training_losses, validation_losses = train_model(**english_ft_run)

In [None]:
# Saving the training and validation losses returned by train_model to a CSV file
english_ft_losses_path = f"{folder_path}english_ft_losses.csv"
save_losses(
    training_losses=training_losses,
    validation_losses=validation_losses,
    file_path=english_ft_losses_path,
)

## Loading Model 2

In [None]:
# Loading model to be further fine-tuned.
# The same directory is used for the base weights and for the PEFT adapter.
# NOTE(review): confirm that this directory holds both a full model and the adapter files.
english_ft_model_path = folder_path + "English_FT_Model"
model = AutoModelForSeq2SeqLM.from_pretrained(english_ft_model_path)

# is_trainable=True is passed so the loaded adapter can be trained further
model = PeftModel.from_pretrained(model, english_ft_model_path, is_trainable=True)

# Moving model to device
model = model.to(device)

model.print_trainable_parameters()

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 172,032 || all params: 300,348,800 || trainable%: 0.057277405469906985


## Training Hyperparameters

In [None]:
# Load processed data for further fine-tuning
further_ft_data_path = f"{folder_path}brand_new_further_cleaned_Amharic_mT5_tokenized_datasets"
tokenized_datasets = DatasetDict.load_from_disk(further_ft_data_path)

# Initialize the AdamW optimizer over the PEFT (LoRA) model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

# Settings shared by the training and validation loaders
loader_settings = dict(collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

# Loading training data (shuffled by the loader)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_settings)

# Loading validation data (no shuffling)
validation_dataloader = DataLoader(tokenized_datasets["validation"], **loader_settings)

# Number of training steps: batches per epoch times number of epochs
# (assumes train_model steps the scheduler once per batch -- confirm in training_module.py)
batches_per_epoch = len(train_dataloader)
num_training_steps = batches_per_epoch * num_epochs

# Setting up the learning rate scheduler: linear schedule with 10% of the steps used for warmup
num_warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Directory (inside the Drive data folder) passed to train_model for saving checkpoints
checkpoint_path = f"{folder_path}Improved_English_Amharic_FT_Model_Checkpoints/"

# Early stopping parameter passed to train_model
patience = 3

## Training Loop 2

In [None]:
# Making training_module.py (located in the Drive data folder) importable.
# The path is only added when it is not already present, so running this cell after the
# identical cell of the first training loop does not add a duplicate entry.
if folder_path not in sys.path:
  sys.path.append(folder_path)
from training_module import train_model, save_losses

In [None]:
# Training/Fine-tuning the model (second training loop: further fine-tuning of the loaded model)
further_ft_run = dict(
    model=model,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    device=device,
    num_epochs=num_epochs,
    checkpoint_filepath=checkpoint_path,
    patience=patience,
    model_filepath=f"{folder_path}Improved_English_Amharic_FT",
)
training_losses, validation_losses = train_model(**further_ft_run)

In [None]:
# Saving the training and validation losses returned by train_model to a CSV file
further_ft_losses_path = f"{folder_path}improved_english_amharic_ft_losses.csv"
save_losses(
    training_losses=training_losses,
    validation_losses=validation_losses,
    file_path=further_ft_losses_path,
)