<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_06/blob/main/T5_fine_tune_for_doc2query_more_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook fine-tunes T5-base for the doc2query task using a bigger training dataset ― the MS MARCO passage development dataset

In [1]:
!pip install transformers -q
!pip install evaluate -q
!pip install ftfy -q
!pip install sentencepiece -q
!pip install sacrebleu -q
!pip install comet_ml -q

In [2]:
# Folder on the mounted Google Drive that becomes the working directory.
# Kept absolute (under the '/content/drive' mount point) so that
# `os.chdir(WORKING_FOLDER)` still resolves when the mount cell is re-run after
# the working directory has already been changed.
WORKING_FOLDER = "/content/drive/MyDrive/unicamp/ia368v_dd/aula_06"

# JSON file holding the API keys (the 'comet_ml' entry is read when linking with Comet).
API_KEYS_FILE = "/content/drive/MyDrive/unicamp/ia368v_dd/api_keys_20230324.json"

# Output directory for the trainer and for the best-BLEU checkpoints.
TRAIN_OUTPUT_FOLDER = "./trained_model_more_data"

# Pickle that stores the train/validation dataframes produced by the split cell.
MS_MARCO_SPLIT = "ms_marco_passage_dev_data_split.pkl"

# Pickle that caches the tokenized train/eval datasets.
MS_MARCO_PASSAGE_DEV_TOKENIZED_DATASETS = "ms_marco_passage_dev_tokenized_datasets.pkl"

# Pickled dataframe with the preprocessed MS MARCO passage dev data.
MS_MARCO_PASSAGE_DEV_FILENAME = "ms_marco_passage_dev.pkl"

# When True, a Comet experiment is created and used to track the run.
LINK_WITH_COMET = True

In [3]:
import os
from google.colab import drive
import json

import ftfy
import pandas as pd
import numpy as np

from scipy import stats

import pickle

if LINK_WITH_COMET:
    from comet_ml import Experiment

In [4]:
# Mount Google Drive (forcing a remount) and switch to the working folder, so
# that the relative file names used by the following cells resolve against it.
drive.mount('/content/drive', force_remount=True)
os.chdir(WORKING_FOLDER)

Mounted at /content/drive


Link with comet-ml

In [5]:
if LINK_WITH_COMET:
    # Read the private API keys kept in the JSON file.
    with open(API_KEYS_FILE) as keys_file:
        api_keys = json.load(keys_file)

    # Publish the Comet settings through the process environment.
    os.environ.update({"COMET_API_KEY": api_keys['comet_ml'],
                       "COMET_LOG_ASSETS": "True",
                       'COMET_MODE': "ONLINE"})

    # Create the experiment that tracks this run.
    experiment = Experiment(workspace="eduseiti",
                            project_name="causal-language-model-fine-tuning",
                            api_key=api_keys['comet_ml'])

COMET ERROR: Failed to calculate active processors count. Fall back to default CPU count 1
COMET INFO: Couldn't find a Git repository in '/content/drive/MyDrive/unicamp/ia368v_dd/aula_06' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.com https://www.comet.com/eduseiti/causal-language-model-fine-tuning/e689765b91c540b8b4dc5fd5e6bf155a



In [6]:
from transformers import (AutoTokenizer, 
                          AutoModelForSeq2SeqLM, 
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments,
                          TrainerCallback, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          DataCollatorForSeq2Seq,
                          T5Tokenizer, 
                          T5Model
                          )

import torch

import evaluate

In [7]:
# Remove the column-width limit so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

## Read the MS MARCO passage dev data split, if available

In [8]:
class Doc2queryFinetuning(torch.utils.data.Dataset):
    """Dataset pairing tokenized passages (model input) with tokenized queries (labels).

    Both text columns are tokenized once, at construction time, and kept in
    memory; items are then served by plain indexing.
    """

    def __init__(self, ms_df, tokenizer):
        """Tokenize the queries and the passages and print token-length statistics.

        Args:
            ms_df: table indexable by 'query_text' and 'passage_text'; each
                column must provide `.tolist()`.
            tokenizer: callable invoked as `tokenizer(texts, return_length=True)`
                whose result is indexable by 'input_ids', 'attention_mask'
                and 'length'.
        """
        queries = ms_df['query_text'].tolist()
        self.tokenized_topics = tokenizer(queries, return_length=True)

        passages = ms_df['passage_text'].tolist()
        self.tokenized_passage = tokenizer(passages, return_length=True)

        query_length_stats = stats.describe(self.tokenized_topics['length'])
        print("Queries tokens size stats:\n{}\n".format(query_length_stats))

        passage_length_stats = stats.describe(self.tokenized_passage['length'])
        print("Passages tokens size stats:\n{}\n".format(passage_length_stats))

    def __len__(self):
        """Return the number of (passage, query) pairs."""
        query_ids = self.tokenized_topics['input_ids']
        return len(query_ids)


    def __getitem__(self, index):
        """Return the passage ids and attention mask plus the query ids as labels."""
        passage = self.tokenized_passage
        query = self.tokenized_topics

        return dict(input_ids=passage['input_ids'][index],
                    attention_mask=passage['attention_mask'][index],
                    labels=query['input_ids'][index])

In [9]:
class CustomTrainerCallback(TrainerCallback):
    """Trainer callback that saves the model each time the evaluation BLEU improves.

    After every evaluation the reported metrics are printed. When `eval_bleu`
    is strictly greater than the best value seen so far, the model is written
    to `TRAIN_OUTPUT_FOLDER/checkpoint-<global_step>-<bleu>` and the best value
    is updated.
    """

    def __init__(self, best_validation_yet=99999, model=None) -> None:
        """
        Args:
            best_validation_yet: initial value of the best metric. The tracked
                metric is compared with `>`, so pass a low value (e.g. -1) for
                the first evaluation to produce a checkpoint.
            model: model to save on improvement. When omitted, the `model`
                received by `on_evaluate` is used instead.
        """
        super().__init__()

        self.best_validation_metric = best_validation_yet
        self.model = model


    def on_evaluate(self, args, state, control, model=None, metrics=None, **kwargs):
        """Print the evaluation metrics and checkpoint the model on a new best BLEU."""
        if not metrics:
            # No metrics were reported for this evaluation: nothing to compare.
            return

        print(metrics.keys())

        print("metrics['eval_loss']={}".format(metrics.get('eval_loss')))
        print("metrics['eval_bleu']={}".format(metrics.get('eval_bleu')))

        current_bleu = metrics.get('eval_bleu')

        if current_bleu is None or not current_bleu > self.best_validation_metric:
            return

        # Prefer the model given at construction time; otherwise fall back to the
        # one supplied with the evaluation event.
        model_to_save = self.model if self.model is not None else model

        if model_to_save is None:
            print("No model available to save for eval_bleu={}".format(current_bleu))
        else:
            model_to_save.save_pretrained(os.path.join(TRAIN_OUTPUT_FOLDER,
                                                       "checkpoint-{}-{:.4f}".format(state.global_step,
                                                                                     current_bleu)))

        self.best_validation_metric = current_bleu

In [10]:
# Reuse the tokenized datasets cached by a previous run, when the cache exists.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# cache files produced by this notebook.
if not os.path.exists(MS_MARCO_PASSAGE_DEV_TOKENIZED_DATASETS):
    print("Need to import and fix the training dataset...")
else:
    with open(MS_MARCO_PASSAGE_DEV_TOKENIZED_DATASETS, "rb") as cache_file:
        ms_marco_datasets = pickle.load(cache_file)

    train_dataset = ms_marco_datasets['train_dataset']
    eval_dataset = ms_marco_datasets['eval_dataset']

## Read the preprocessed data

The data should have been preprocessed by the [`explore_ms_marco_passage.ipynb`](explore_ms_marco_passage.ipynb) notebook.

In [11]:
if 'train_dataset' in locals():
    print("Datasets have already been loaded...")
else:
    # No cached datasets: load the preprocessed dataframe from its pickle.
    with open(MS_MARCO_PASSAGE_DEV_FILENAME, 'rb') as raw_file:
        ms_df = pickle.load(raw_file)

    # Pass every query and passage through ftfy to fix the text.
    for text_column in ('query_text', 'passage_text'):
        ms_df[text_column] = ms_df[text_column].apply(ftfy.fix_text)

Datasets have already been loaded...


Split off the evaluation (validation) set

In [12]:
if 'train_dataset' not in locals():
    print("ms_df.shape={}".format(ms_df.shape))

    # Hold out a fixed-size random validation sample. The generator is seeded so
    # the same split is produced every time this cell runs on the same data.
    VALIDATION_SIZE = 1000
    SPLIT_SEED = 42

    split_rng = np.random.default_rng(SPLIT_SEED)
    all_entries = np.arange(ms_df.shape[0])

    split_entries = split_rng.choice(all_entries, VALIDATION_SIZE, replace=False)

    # Every row not sampled for validation stays in the training set.
    train_df = ms_df.iloc[np.setdiff1d(all_entries, split_entries)].reset_index(drop=True)
    validation_df = ms_df.iloc[split_entries].reset_index(drop=True)

    print("train_df.shape={}".format(train_df.shape))
    print("validation_df.shape={}".format(validation_df.shape))

    # Persist the split so it can be inspected or reused later.
    with open(MS_MARCO_SPLIT, "wb") as outputFile:
        pickle.dump({'train': train_df,
                     'validation': validation_df}, outputFile, pickle.HIGHEST_PROTOCOL)

else:
    print("Datasets have already been loaded...")

Datasets have already been loaded...


## Fine-tune the T5-base model for query generation

# Prepare T5 model

In [13]:
# Load the t5-base tokenizer. The maximum length is passed explicitly (512, the
# value the tokenizer reports by default) instead of relying on the implicit
# default, which the library warns should not be depended upon.
tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=512)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [14]:
# Display the maximum sequence length configured on the tokenizer.
tokenizer.model_max_length

512

In [15]:
# Load the pretrained "t5-base" checkpoint as a sequence-to-sequence language model.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

Create and save the datasets, if they are not ready yet...

In [16]:
if 'train_dataset' in locals():
    print("Datasets have already been loaded...")
else:
    # Tokenize the training and validation dataframes.
    train_dataset = Doc2queryFinetuning(train_df, tokenizer)
    eval_dataset = Doc2queryFinetuning(validation_df, tokenizer)

    # Cache both datasets so later runs can skip the tokenization.
    with open(MS_MARCO_PASSAGE_DEV_TOKENIZED_DATASETS, 'wb') as cache_file:
        tokenized_datasets = {"train_dataset": train_dataset,
                              "eval_dataset": eval_dataset}
        pickle.dump(tokenized_datasets, cache_file, pickle.HIGHEST_PROTOCOL)

Datasets have already been loaded...


### This part was taken from the [`run_translation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation.py) script.

In [17]:
# Load the "sacrebleu" metric; compute_metrics uses it to score the generated queries.
metric = evaluate.load("sacrebleu")

In [18]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and labels.

    Each label is additionally wrapped in a one-element list.

    Args:
        preds: iterable of predicted strings.
        labels: iterable of reference strings.

    Returns:
        Tuple `(preds, labels)`: a list of stripped strings and a list of
        one-element lists of stripped strings.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

In [19]:
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays. `preds` may be a
            tuple, in which case only its first element is used.

    Returns:
        Dict with 'bleu' (the metric's "score") and 'gen_len' (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    print("compute_metrics. preds.shape={}".format(preds.shape))

    # -100 is used as a padding value and cannot be decoded: replace it with the
    # pad token id in the predictions as well as in the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Count only real tokens; the -100 entries were mapped to the pad id above,
    # so they no longer inflate the generated length.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result["gen_len"] = np.mean(prediction_lens)

    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [20]:
# Training hyper-parameters used by the cells below.
batch_size=22  # per-device batch size, used for both training and evaluation
gradient_accumulation_steps=8  # 22 * 8 = 176 examples per optimizer step on a single device
epochs=10  # number of training epochs

In [21]:
# Trainer configuration: evaluate every 50 steps, log every 10 steps and save a
# checkpoint every 1000 steps (save_total_limit=10). Evaluation generates text
# (predict_with_generate) using 10 beams so that BLEU can be computed on the
# decoded output; fp16 mixed precision is enabled.
training_params = Seq2SeqTrainingArguments(output_dir=TRAIN_OUTPUT_FOLDER,
                                           num_train_epochs=epochs,
                                           per_device_train_batch_size=batch_size,
                                           per_device_eval_batch_size=batch_size,
                                           gradient_accumulation_steps=gradient_accumulation_steps,
                                           evaluation_strategy='steps',
                                           eval_steps=50,
                                           save_strategy='steps',
                                           save_steps=1000,
                                           logging_strategy='steps',
                                           logging_steps=10,
                                           save_total_limit=10,
                                           # report_to='comet_ml',
                                           # dataloader_num_workers=2,
                                           dataloader_pin_memory=True,
                                           predict_with_generate=True,
                                           generation_num_beams=10,
                                           fp16=True)

In [22]:
# Value used to pad the label sequences; compute_metrics maps it back to the
# tokenizer's pad token before decoding.
label_pad_token_id = -100

# Collator that pads each batch. Lengths are padded to a multiple of 8 only when
# fp16 is enabled (presumably for hardware-friendly tensor shapes — TODO confirm).
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8 if training_params.fp16 else None,
)

In [23]:
# Start the best-metric tracker at -1 so that the first evaluation's BLEU is
# treated as an improvement and triggers a checkpoint.
trainer_callback = CustomTrainerCallback(best_validation_yet=-1, 
                                         model=model)

In [24]:
# Total number of optimizer updates: complete accumulation windows per epoch
# multiplied by the number of epochs.
examples_per_update = batch_size * gradient_accumulation_steps
num_training_steps = epochs * int(len(train_dataset) // examples_per_update)

# The name `optimzer` is kept as is: the trainer cell refers to it.
optimzer = torch.optim.AdamW(model.parameters(), weight_decay=1e-3, lr=6e-4)

# Cosine schedule with hard restarts, no warmup, 25 cycles over the whole run.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimzer,
                                                               num_warmup_steps=0,
                                                               num_training_steps=num_training_steps,
                                                               num_cycles=25)

In [25]:
# Number of optimizer steps in each of the 25 scheduler cycles.
num_training_steps // 25

1208

In [26]:
# Assemble the trainer: model, arguments, datasets, padding collator, the
# best-BLEU checkpoint callback, the custom optimizer/scheduler pair and the
# BLEU-based metric function.
trainer = Seq2SeqTrainer(model=model,
                         args=training_params,
                         train_dataset=train_dataset,
                         eval_dataset=eval_dataset,
                         data_collator=data_collator,
                         callbacks=[trainer_callback],
                         optimizers=(optimzer, scheduler),
                         tokenizer=tokenizer,
                         compute_metrics=compute_metrics
                         )

In [27]:
# Run the fine-tuning and keep the value returned by the trainer.
train_result = trainer.train()

COMET INFO: ---------------------------------------------------------------------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------------------------------------------------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.com/eduseiti/causal-language-model-fine-tuning/e689765b91c540b8b4dc5fd5e6bf155a
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     notebook            : 2
COMET INFO:     os packages         : 1
COMET INFO:     source_code         : 1
COMET INFO: 
COMET ERROR: Failed to calculate active processors count. Fall back to default CPU count 1
COMET INFO: Couldn't find a Git repository in '/content/drive/MyDrive/unicamp/ia368v_dd/aula_06' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting 

Step,Training Loss,Validation Loss,Bleu,Gen Len
50,2.1036,1.672017,16.5589,9.533
100,1.7552,1.609427,17.3526,9.649
150,1.7332,1.571516,17.8306,9.463


compute_metrics. preds.shape=(1000, 20)
dict_keys(['eval_loss', 'eval_bleu', 'eval_gen_len', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])
metrics['eval_loss']=1.6720167398452759
metrics['eval_bleu']=16.5589
compute_metrics. preds.shape=(1000, 20)
dict_keys(['eval_loss', 'eval_bleu', 'eval_gen_len', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])
metrics['eval_loss']=1.609426736831665
metrics['eval_bleu']=17.3526
compute_metrics. preds.shape=(1000, 20)
dict_keys(['eval_loss', 'eval_bleu', 'eval_gen_len', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])
metrics['eval_loss']=1.5715161561965942
metrics['eval_bleu']=17.8306


KeyboardInterrupt: ignored

In [28]:
# Close the Comet experiment. It is only created when LINK_WITH_COMET is set,
# so the call is guarded to avoid a NameError when Comet linking is disabled.
if LINK_WITH_COMET:
    experiment.end()