# Arabic Fine-tuning

This notebook contains the code used to fine-tune every model whose first fine-tuning stage uses the Arabic data. The first training loop produces the Arabic-FT model; the second training loop fine-tunes that model further on Arabic and/or Amharic data to create the other models. The model paths and training parameters were adjusted accordingly for each fine-tuning step.

Models trained using this flow are:

- Arabic-FT
- Arabic-English-FT
- Arabic-Amharic-FT
- Improved-Arabic-English-Amharic-FT

In [2]:
# Installing Packages
# Installs/upgrades the listed packages in the Colab runtime.
# Both stdout and stderr are redirected to /dev/null, so a failed install is silent
# here and would only show up later as an ImportError in the import cell.
!pip install datasets transformers sentencepiece accelerate -U tensorflow --upgrade torch torchvision peft nltk rouge_score arabert > /dev/null 2>&1

In [3]:
# Loading packages
import gc
import os
import random
import re
import sys
import time

import torch
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    MT5Tokenizer,
)
from google.colab import drive
from IPython.display import clear_output
from torch.utils.data import DataLoader
from arabert.preprocess import ArabertPreprocessor

## Data Loading, Cleaning and Preprocessing

In [4]:
# Mounting Google Drive in the current Colab session so that files stored in the Drive
# are reachable under /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
## Path of the directory of the data files
# The cells below build dataset, checkpoint and model paths by plain string
# concatenation onto this prefix, so the trailing slash is required.
folder_path = '/content/drive/My Drive/CPSC_490_Data/'

In [6]:
# Read the raw Arabic CSV as a single split named "data" and immediately drop
# 'Unnamed: 0', which is only a leftover row-index column
full_Arabic_dataset = (
    load_dataset("csv", data_files={"data": f"{folder_path}ArabicMogalad_Ndeef.csv"})["data"]
    .remove_columns(["Unnamed: 0"])
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating data split: 0 examples [00:00, ? examples/s]

In [None]:
# Removing NULL and duplicates
# A function to check for NULL values
def not_null(dictionary):
  """
    Tells whether a record has both of its fields populated.

    Arg:
      dictionary (dict): A record exposing 'Text' and 'Summary' keys.

    Returns:
      bool: False as soon as one of the two fields is None, True otherwise.
  """

  # Checked in this order; evaluation stops at the first None field
  required_fields = ('Text', 'Summary')
  return all(dictionary[field] is not None for field in required_fields)

# Remove entries with NULL values
full_Arabic_dataset = full_Arabic_dataset.filter(not_null)

# Convert to pandas DataFrame to remove duplicates
Arabic_df = full_Arabic_dataset.to_pandas()

# Drop rows whose (Text, Summary) pair repeats an earlier row
Arabic_df = Arabic_df.drop_duplicates(subset=['Text', 'Summary'])

# Convert back to Dataset.
# preserve_index=False: drop_duplicates leaves gaps in the DataFrame index, and
# from_pandas would otherwise store that index as an extra '__index_level_0__'
# column that then travels with every example through tokenization and batching.
cleaned_Arabic_dataset = Dataset.from_pandas(Arabic_df, preserve_index=False)

# Saving the cleaned dataset
cleaned_Arabic_dataset.save_to_disk(folder_path + 'Cleaned_Arabic_Dataset')

Filter:   0%|          | 0/265476 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/258797 [00:00<?, ? examples/s]

In [7]:
def preprocess_function(examples, tokenizer):
  """
    Cleans and tokenizes a batch of text/summary pairs for seq2seq training.

    Every string is stripped of URLs and of all characters that are not ASCII
    letters/digits, Arabic letters (U+0621-U+064A) or whitespace, and is then run
    through the AraBERT preprocessor before being tokenized.

    Args:
      examples (dict): A dictionary containing 'Text' and 'Summary' keys with lists of strings.
      tokenizer (Tokenizer): The tokenizer used for tokenization; must expose `pad_token_id`.

    Returns:
      dict: The tokenizer output for the inputs (padded/truncated to 512 tokens) plus a
        'labels' entry holding the summary token ids (padded/truncated to 128 tokens)
        in which every pad token id is replaced by -100.
  """

  # Tokenization parameters
  padding = "max_length"
  max_length = 512
  max_summary_length = 128

  # Initialize the AraBERT preprocessor
  # NOTE(review): this is constructed on every call, i.e. once per batch when the
  # function is used with `map(batched=True)`; consider building it once if the
  # constructor turns out to be expensive.
  arabert_prep = ArabertPreprocessor("bert-base-arabert")

  # A function to clean the text
  def clean_text(text):
    """
      Cleans a given text by removing URLs and unwanted characters, and then applies AraBERT preprocessing.

      Arg:
        text (str): The text to be cleaned.

      Returns:
        str: The cleaned and preprocessed text.
    """

    # Remove URLs, punctuation, etc.
    # The Arabic range starts at U+0621 (hamza) rather than U+0622 so that the
    # letter hamza is kept; it ends at U+064A (yeh), so diacritics are still removed.
    text = re.sub(r"http\S+|www\.\S+|[^A-Za-z0-9\u0621-\u064A\s]", "", text)

    # Apply the AraBERT preprocess
    text = arabert_prep.preprocess(text)

    return text

  # Clean and preprocess the inputs and targets
  inputs = [clean_text(ex) for ex in examples["Text"]]
  targets = [clean_text(ex) for ex in examples["Summary"]]

  # Tokenize inputs and labels
  model_inputs = tokenizer(inputs, max_length=max_length, padding=padding, truncation=True)
  labels = tokenizer(targets, max_length=max_summary_length, padding=padding, truncation=True)

  # Replace pad token ids in the labels with -100, the ignore index of PyTorch's
  # cross-entropy loss, so padded positions do not contribute to the training loss
  pad_id = tokenizer.pad_token_id
  labels["input_ids"] = [
      [(label if label != pad_id else -100) for label in label_seq] for label_seq in labels["input_ids"]
  ]

  # Setting the tokenized labels as the model's training targets
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
# Loading the mT5 tokenizer from the pretrained 'google/mt5-small' checkpoint;
# this is the tokenizer later handed to preprocess_function
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Reload the cleaned dataset saved earlier and shuffle it with a fixed seed, so that
# taking rows from the front yields a reproducible random selection
cleaned_Arabic_dataset = (
    Dataset.load_from_disk(f"{folder_path}Cleaned_Arabic_Dataset")
    .shuffle(seed=19)
)

In [None]:
# Preprocessing the whole dataset in one go kept crashing the session, so the data is
# handled in fixed-size chunks, each written to disk before the next one starts.
# Only the first 200000 rows of the shuffled dataset are tokenized.
chunk_size = 10000
num_chunks = 200000 // chunk_size

for chunk_idx in range(num_chunks):

    # Rows [start, stop) of the shuffled dataset make up this chunk
    start = chunk_idx * chunk_size
    stop = start + chunk_size
    chunk_rows = cleaned_Arabic_dataset.select(range(start, stop))

    # Clean and tokenize the chunk batch by batch; the map cache is bypassed
    chunk_tokens = chunk_rows.map(
        lambda batch: preprocess_function(batch, tokenizer),
        batched=True,
        load_from_cache_file=False,
    )

    # Persist the tokenized chunk under its own numbered directory
    chunk_tokens.save_to_disk(f"{folder_path}tokenized_chunk_{chunk_idx}")

    # Release the chunk before the next iteration to keep memory usage down
    del chunk_rows, chunk_tokens
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Replace the previous chunk's progress output instead of accumulating it
    clear_output(wait=True)

# Reload every saved chunk, in order, and stitch them into one dataset
all_chunks = [
    Dataset.load_from_disk(f"{folder_path}tokenized_chunk_{idx}")
    for idx in range(num_chunks)
]
processed_dataset = concatenate_datasets(all_chunks)

19


In [None]:
# 80% / 10% / remainder split, taken as consecutive row ranges of the processed dataset
total_rows = len(processed_dataset)
train_split = int(0.8 * total_rows)  # rows used for training
val_split = int(0.1 * total_rows)  # rows used for validation
test_split = total_rows - train_split - val_split  # whatever is left goes to testing

# Row index at which the validation range ends and the test range begins
val_end = train_split + val_split

train_dataset = processed_dataset.select(range(train_split))
validation_dataset = processed_dataset.select(range(train_split, val_end))
test_dataset = processed_dataset.select(range(val_end, total_rows))

# Bundle the three splits under their conventional names
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": validation_dataset,
        "test": test_dataset,
    }
)

In [None]:
# Save processed data if necessary (on the first run, uncomment the next line to
# persist the `datasets` split built in the previous cell)
# datasets.save_to_disk(folder_path + 'mT5_Arabic_tokenized_datasets')

# Load processed data if necessary
# This is the tokenized, preprocessed version of the Arabic dataset. The training
# cells below read from `tokenized_datasets` (loaded from disk here), not from the
# in-memory `datasets` object.
tokenized_datasets = DatasetDict.load_from_disk(folder_path + 'mT5_Arabic_tokenized_datasets')

## Loading Model 1

In [None]:
# Getting the base model and setting up the LoRA PEFT configuration
# (sequence-to-sequence task, rank r=4, lora_alpha=32, dropout 0.1)
base_model = AutoModelForSeq2SeqLM.from_pretrained('google/mt5-small')
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode = False,
    r = 4,
    lora_alpha = 32,
    lora_dropout = 0.1
)

# Wrap the base model with the adapter configuration and print how many of its
# parameters are trainable
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 172,032 || all params: 300,348,800 || trainable%: 0.057277405469906985


## Training Hyperparameters

In [None]:
# Number of epochs and batch size
# Both values are reused unchanged by the second fine-tuning stage later in the notebook.
num_epochs = 30
batch_size = 24

In [None]:
# AdamW optimizer over the parameters of the LoRA-wrapped model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Settings common to the training and validation loaders
loader_options = dict(
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)

# Training batches are shuffled; validation batches keep the dataset order
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_options)
validation_dataloader = DataLoader(tokenized_datasets["validation"], **loader_options)

# Total number of training batches across all epochs
num_training_steps = len(train_dataloader) * num_epochs

# Linear learning-rate schedule whose warmup covers the first 10% of the steps
warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Define a path to save the checkpoints (a sub-directory of the Drive data folder)
checkpoint_path = folder_path + "Arabic_FT_Model_Checkpoints/"

# Early stopping parameter, forwarded to train_model as `patience`
patience = 3

## Training Loop 1

In [None]:
# Setting the device for the model training: the GPU when CUDA is available,
# otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Make the Drive data folder importable, since that is where training_module.py is located.
# The membership check keeps re-runs of this cell from stacking duplicate entries on sys.path.
if folder_path not in sys.path:
    sys.path.append(folder_path)
from training_module import train_model, save_losses

In [None]:
# Training/Fine-tuning the model
# train_model is imported from training_module.py, which is not part of this notebook;
# here it receives the model, both data loaders, the optimizer/scheduler, the device,
# the epoch count, the checkpoint directory, the early-stopping patience and a path
# for the final model, and its result is unpacked into two loss collections.
# NOTE(review): model_filepath ends in "Arabic_FT", while the "Loading Model 2" cell
# reads from folder_path + "Arabic_FT_Model" - confirm that train_model appends
# "_Model" to this path, or align the two.
# NOTE(review): the model is not moved to `device` in this notebook before this call;
# presumably train_model does that with the `device` argument - confirm.
training_losses, validation_losses = train_model(
    model = model,
    train_dataloader = train_dataloader,
    validation_dataloader = validation_dataloader,
    optimizer = optimizer,
    lr_scheduler = lr_scheduler,
    device = device,
    num_epochs = num_epochs,
    checkpoint_filepath = checkpoint_path,
    patience = patience,
    model_filepath = folder_path + "Arabic_FT"
)

In [None]:
# Saving the training and validation losses at each step to a CSV file
# (save_losses comes from training_module.py; the file is written into the Drive data folder)
save_losses(
    training_losses = training_losses,
    validation_losses = validation_losses,
    file_path = folder_path + "arabic_ft_losses.csv"
)

## Loading Model 2

In [None]:
# Loading model to be further fine-tuned
# The same directory is used twice: first to build the seq2seq model, then to load the
# saved PEFT adapter on top of it with is_trainable=True so that fine-tuning can continue.
# NOTE(review): this assumes "Arabic_FT_Model" is the directory written by training
# loop 1, whose model_filepath argument ends in "Arabic_FT" - confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(folder_path + "Arabic_FT_Model")
model = PeftModel.from_pretrained(model, folder_path + "Arabic_FT_Model", is_trainable=True)

# Moving model to device
model.to(device)

# Print how many of the reloaded model's parameters are trainable
model.print_trainable_parameters()

(…)oogle/mt5-small/resolve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

(…)mall/resolve/main/generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 172,032 || all params: 300,348,800 || trainable%: 0.057277405469906985


In [None]:
# Tokenized dataset for the second fine-tuning stage; this replaces the Arabic
# `tokenized_datasets` used by the first training loop
tokenized_datasets = DatasetDict.load_from_disk(
    f"{folder_path}brand_new_further_cleaned_Amharic_mT5_tokenized_datasets"
)

# Fresh AdamW optimizer over the reloaded LoRA model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

# Settings common to the training and validation loaders
loader_options = dict(
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)

# Training batches are shuffled; validation batches keep the dataset order
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_options)
validation_dataloader = DataLoader(tokenized_datasets["validation"], **loader_options)

# Total number of training batches across all epochs
num_training_steps = len(train_dataloader) * num_epochs

# Linear learning-rate schedule whose warmup covers the first 10% of the steps
warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Define a path to save the checkpoints of the second fine-tuning stage
# (a separate sub-directory, so the first stage's checkpoints are not mixed in)
checkpoint_path = folder_path + "Improved_Arabic_Amharic_FT_Model_Checkpoints/"

# Early stopping parameter, forwarded to train_model as `patience`
patience = 3

## Training Loop 2

In [None]:
# Make the Drive data folder importable, since that is where training_module.py is located.
# This cell lets the second stage run on its own; the membership check avoids adding the
# folder to sys.path a second time when the first training loop has already run.
if folder_path not in sys.path:
    sys.path.append(folder_path)
from training_module import train_model, save_losses

In [None]:
# Training/Fine-tuning the model
# Second stage: the same train_model helper (from training_module.py, not shown here)
# is called with the reloaded model and with the loaders, optimizer and scheduler
# rebuilt for the second-stage dataset; its result is unpacked into two loss collections.
# The checkpoint directory and model_filepath below are the values for the
# Improved-Arabic-Amharic run.
training_losses, validation_losses = train_model(
    model = model,
    train_dataloader = train_dataloader,
    validation_dataloader = validation_dataloader,
    optimizer = optimizer,
    lr_scheduler = lr_scheduler,
    device = device,
    num_epochs = num_epochs,
    checkpoint_filepath = checkpoint_path,
    patience = patience,
    model_filepath = folder_path + "Improved_Arabic_Amharic_FT"
)

In [None]:
# Saving the training and validation losses at each step to a CSV file
# (a different file name from the first stage, so the first stage's loss log is kept)
save_losses(
    training_losses = training_losses,
    validation_losses = validation_losses,
    file_path = folder_path + "improved_arabic_amharic_ft_losses.csv"
)