<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_06/blob/main/T5_fine_tune_for_doc2query.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages used by this notebook (quiet mode).
# NOTE(review): no versions are pinned, so a fresh runtime may pull newer,
# incompatible releases.
!pip install transformers -q
!pip install evaluate -q
!pip install ftfy -q
!pip install sentencepiece -q
!pip install sacrebleu -q
!pip install comet_ml -q

In [2]:
# Folder the notebook changes into after mounting Google Drive (relative path).
WORKING_FOLDER="drive/MyDrive/unicamp/ia368v_dd/aula_06"

# JSON file holding API keys; the 'comet_ml' entry is read when linking with Comet.
API_KEYS_FILE="/content/drive/MyDrive/unicamp/ia368v_dd/api_keys_20230324.json"

# Output directory for the trainer and for the best-BLEU checkpoints.
TRAIN_OUTPUT_FOLDER="./trained_model"
# Pickle file caching the train/validation split between runs.
MS_MARCO_SPLIT="ms_marco_tiny_data_split.pkl"

# TSV with (topic, positive passage, negative passage) triples.
MS_MARCO_TINY_URL="https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv"

# When True, comet_ml is imported and an online experiment is created.
LINK_WITH_COMET=True

In [3]:
import os
from google.colab import drive
import json

import ftfy
import pandas as pd
import numpy as np

from scipy import stats

import pickle

if LINK_WITH_COMET:
    from comet_ml import Experiment

In [4]:
# Mount Google Drive (forcing a remount) and switch into the working folder.
# NOTE(review): WORKING_FOLDER is relative, so this presumably relies on the
# current directory being /content; re-running the cell after the chdir would
# then fail - confirm.
drive.mount('/content/drive', force_remount=True)
os.chdir(WORKING_FOLDER)

Mounted at /content/drive


Link with comet-ml

In [5]:
if LINK_WITH_COMET:
    # Read the Comet API key from the private JSON file instead of hardcoding it.
    with open(API_KEYS_FILE) as inputFile:
        api_keys = json.load(inputFile)

    os.environ["COMET_API_KEY"] = api_keys['comet_ml']
    os.environ["COMET_LOG_ASSETS"] = "True"
    os.environ['COMET_MODE'] = "ONLINE"

    # Keep a handle on the experiment: the final cell calls `experiment.end()`,
    # which raised NameError when the instance was created and discarded.
    experiment = Experiment(api_key=api_keys['comet_ml'],
                            project_name="causal-language-model-fine-tuning",
                            workspace="eduseiti")

COMET ERROR: Failed to calculate active processors count. Fall back to default CPU count 1
COMET INFO: Couldn't find a Git repository in '/content/drive/MyDrive/unicamp/ia368v_dd/aula_06' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.com https://www.comet.com/eduseiti/causal-language-model-fine-tuning/7964c297bb2d4576bfa2caaf53772d84



In [6]:
from transformers import (AutoTokenizer, 
                          AutoModelForSeq2SeqLM, 
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments,
                          TrainerCallback, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          DataCollatorForSeq2Seq,
                          T5Tokenizer, 
                          T5Model
                          )

import torch

import evaluate

In [7]:
# Remove the column-width limit so long text columns are displayed in full.
pd.set_option('display.max_colwidth', None)

## Read the MS MARCO data split, if available

In [8]:
# Reuse the cached train/validation split when the pickle file exists;
# otherwise the following cells download the data and build the split.
# NOTE: pickle.load executes arbitrary code - only load files you created.
if not os.path.exists(MS_MARCO_SPLIT):
    print("Need to import and fix the training dataset...")
else:
    with open(MS_MARCO_SPLIT, "rb") as split_file:
        ms_marco_data = pickle.load(split_file)

    train_df, validation_df = ms_marco_data['train'], ms_marco_data['validation']

## Import and fix training dataset

In [9]:
# Only build the dataset when no cached split was loaded by the previous cell.
if not 'train_df' in locals():
    # Download the TSV once; the local file name is the last URL component.
    if not os.path.exists(os.path.basename(MS_MARCO_TINY_URL)):
        !wget {MS_MARCO_TINY_URL}
    else:
        print("Training dataset already downloaded...")

    # The file has no header row: columns are topic, positive and negative passage.
    ms_df = pd.read_csv(os.path.basename(MS_MARCO_TINY_URL), sep="\t", header=None, names=['topic', 'positive', 'negative'])
    pd.set_option('display.max_colwidth', None)

    display(ms_df.head())

    # Repair the text of the positive passages with ftfy, then drop the
    # negative passages, which this notebook does not use.
    # NOTE(review): the 'topic' column is not passed through ftfy - confirm
    # that this is intentional.
    ms_df['positive'] = ms_df['positive'].apply(lambda text: ftfy.fix_text(text))
    ms_df = ms_df.drop('negative', axis=1)

else:
    print("Data split has already been loaded...")

Data split has already been loaded...


Hold out 1,000 randomly chosen examples as the validation split

In [10]:
# Build (and cache) the train/validation split unless one is already loaded.
if 'train_df' in locals():
    print("Data split has already been loaded...")
else:
    total_rows = ms_df.shape[0]
    print("ms_df.shape={}".format(ms_df.shape))

    # Draw 1000 distinct row positions for validation; every other row trains.
    # No seed is set here: the pickle written below is what keeps the split
    # stable across runs.
    split_entries = np.random.choice(np.arange(total_rows), 1000, replace=False)
    train_positions = np.setdiff1d(np.arange(total_rows), split_entries)

    train_df = ms_df.iloc[train_positions].reset_index(drop=True)
    validation_df = ms_df.iloc[split_entries].reset_index(drop=True)

    print("train_df.shape={}".format(train_df.shape))
    print("validation_df.shape={}".format(validation_df.shape))

    # Persist both frames so later sessions reuse exactly the same split.
    split_payload = {'train': train_df, 'validation': validation_df}
    with open(MS_MARCO_SPLIT, "wb") as outputFile:
        pickle.dump(split_payload, outputFile, pickle.HIGHEST_PROTOCOL)

Data split has already been loaded...


## Fine-tune the T5-base model for query generation

In [11]:
class Doc2queryFinetuning(torch.utils.data.Dataset):
    """Map-style dataset for doc2query fine-tuning.

    The tokenized positive passage is the model input and the tokenized topic
    (query) is the label sequence. Both columns are tokenized once, up front.
    """

    def __init__(self, ms_df, tokenizer):
        """Tokenize the 'topic' and 'positive' columns and print length statistics.

        Args:
            ms_df: mapping/frame whose 'topic' and 'positive' entries expose
                `.tolist()` and hold the query and passage texts.
            tokenizer: callable accepting a list of texts and `return_length=True`,
                returning 'input_ids', 'attention_mask' and 'length'.
        """
        topic_texts = ms_df['topic'].tolist()
        passage_texts = ms_df['positive'].tolist()

        self.tokenized_topics = tokenizer(topic_texts, return_length=True)
        self.tokenized_passage = tokenizer(passage_texts, return_length=True)

        # Summaries of the token counts, useful to judge sequence-length limits.
        topic_stats = stats.describe(self.tokenized_topics['length'])
        print("Topics tokens size stats:\n{}\n".format(topic_stats))
        passage_stats = stats.describe(self.tokenized_passage['length'])
        print("Passages tokens size stats:\n{}\n".format(passage_stats))

    def __len__(self):
        """Return the number of (passage, topic) pairs."""
        topic_ids = self.tokenized_topics['input_ids']
        return len(topic_ids)

    def __getitem__(self, index):
        """Return one unpadded example; padding is left to the data collator."""
        passage = self.tokenized_passage
        return dict(input_ids=passage['input_ids'][index],
                    attention_mask=passage['attention_mask'][index],
                    labels=self.tokenized_topics['input_ids'][index])

In [12]:
class CustomTrainerCallback(TrainerCallback):
    """Save a model checkpoint whenever the evaluation BLEU improves.

    Args:
        best_validation_yet: starting value for the best BLEU seen so far. A
            checkpoint is written only when `eval_bleu` is strictly greater
            than it, so with the default (99999) nothing is ever saved; pass a
            low value such as -1 to save on the first evaluation.
        model: model to save. When omitted, the `model` argument handed to
            `on_evaluate` is used instead.
    """

    def __init__(self, best_validation_yet=99999, model=None) -> None:
        super().__init__()

        self.best_validation_metric = best_validation_yet
        self.model = model


    def on_evaluate(self, args, state, control, model=None, metrics=None, **kwargs):
        """Report the evaluation metrics and checkpoint the model on a new best BLEU."""
        # Tolerate being called without metrics instead of failing on None.
        metrics = metrics or {}
        print(metrics.keys())

        eval_bleu = metrics.get('eval_bleu')

        print("metrics['eval_loss']={}".format(metrics.get('eval_loss')))
        print("metrics['eval_bleu']={}".format(eval_bleu))

        # Without a BLEU score (e.g. no compute_metrics) there is nothing to compare.
        if eval_bleu is None:
            return

        if eval_bleu > self.best_validation_metric:
            # Prefer the model given at construction (original behavior); fall
            # back to the one passed in by the caller so a callback created
            # without a model no longer fails on `None.save_pretrained`.
            model_to_save = self.model if self.model is not None else model

            if model_to_save is None:
                print("No model available to save for step {}".format(state.global_step))
            else:
                model_to_save.save_pretrained(os.path.join(TRAIN_OUTPUT_FOLDER,
                                                           "checkpoint-{}-{:.4f}".format(state.global_step,
                                                                                         eval_bleu)))
            self.best_validation_metric = eval_bleu

# Prepare T5 model

In [13]:
# Load the tokenizer that matches the "t5-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [14]:
# Display the maximum sequence length configured on the tokenizer.
tokenizer.model_max_length

512

In [15]:
# Load the pretrained "t5-base" sequence-to-sequence model to be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

Create the datasets

In [16]:
# Tokenize both splits; each constructor call prints token-length statistics.
train_dataset = Doc2queryFinetuning(train_df, tokenizer)
eval_dataset = Doc2queryFinetuning(validation_df, tokenizer)

Topics tokens size stats:
DescribeResult(nobs=10000, minmax=(3, 57), mean=9.5105, variance=11.948684618461847, skewness=1.7547439898740915, kurtosis=10.741828171750589)

Passages tokens size stats:
DescribeResult(nobs=10000, minmax=(14, 326), mean=86.9099, variance=1264.7692589158914, skewness=1.140892273381897, kurtosis=1.6057605062545974)

Topics tokens size stats:
DescribeResult(nobs=1000, minmax=(3, 29), mean=9.434, variance=11.218862862862863, skewness=1.3393629730350802, kurtosis=4.007153813445774)

Passages tokens size stats:
DescribeResult(nobs=1000, minmax=(16, 261), mean=87.08, variance=1265.3289289289291, skewness=1.1804618158884206, kurtosis=1.6154309574375878)



### This part was taken from the [`run_translation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation.py) script.

In [17]:
# SacreBLEU metric; compute_metrics reads its "score" entry.
metric = evaluate.load("sacrebleu")

In [18]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts.

    Each label is additionally wrapped in a single-element list, giving one
    list of references per prediction.

    Returns:
        A tuple `(stripped_predictions, wrapped_stripped_labels)`.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

In [19]:
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays; `preds` may be
            a tuple, in which case its first element is used.

    Returns:
        Dict with "bleu" (the metric's score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    print("compute_metrics. preds.shape={}".format(preds.shape))

    # -100 marks padded positions and cannot be decoded. Labels were already
    # handled this way; predictions can carry -100 as well, so map both back
    # to the pad token id before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = tokens that are not padding (former -100 entries are
    # padding by now, so they no longer inflate the count).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result["gen_len"] = np.mean(prediction_lens)

    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [20]:
# Per-device batch size for both training and evaluation.
batch_size=26
# Batches accumulated per optimizer update (26 * 8 = 208 examples per update on one device).
gradient_accumulation_steps=8
# Number of training epochs.
epochs=100

In [21]:
# Trainer configuration: evaluate every 50 steps, log every 10 steps and write
# a regular checkpoint every 1000 steps (save_total_limit=10).
training_params = Seq2SeqTrainingArguments(output_dir=TRAIN_OUTPUT_FOLDER,
                                           num_train_epochs=epochs,
                                           per_device_train_batch_size=batch_size,
                                           per_device_eval_batch_size=batch_size,
                                           gradient_accumulation_steps=gradient_accumulation_steps,
                                           evaluation_strategy='steps',
                                           eval_steps=50,
                                           save_strategy='steps',
                                           save_steps=1000,
                                           logging_strategy='steps',
                                           logging_steps=10,
                                           save_total_limit=10,
                                           # report_to='comet_ml',
                                           # dataloader_num_workers=2,
                                           dataloader_pin_memory=True,
                                           # Evaluation generates sequences, so compute_metrics
                                           # receives token ids it can decode for BLEU.
                                           predict_with_generate=True,
                                           # Mixed-precision training; the data collator pads to
                                           # multiples of 8 when this is enabled.
                                           fp16=True)

In [22]:
# Labels are padded with -100; compute_metrics swaps it back to the pad token
# id before decoding.
label_pad_token_id = -100

# Pads each batch dynamically (the dataset returns unpadded examples); with
# fp16 enabled, lengths are padded to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8 if training_params.fp16 else None,
)

In [23]:
# Start the "best BLEU so far" at -1: the callback saves when eval_bleu is
# strictly greater, so any BLEU of 0 or more on the first evaluation is saved.
trainer_callback = CustomTrainerCallback(best_validation_yet=-1, 
                                         model=model)

In [24]:
# Planned optimizer updates: whole accumulated batches per epoch, times epochs.
num_training_steps = epochs * (len(train_dataset) // (batch_size * gradient_accumulation_steps))

# NOTE: the misspelled name `optimzer` is kept because the trainer cell reads it.
optimzer = torch.optim.AdamW(params=model.parameters(), lr=5e-6, weight_decay=1e-3)

# Cosine schedule with hard restarts: no warmup, 10 cycles over the whole run.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer=optimzer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
    num_cycles=10,
)

In [25]:
# Display one fifth of the planned optimizer steps.
# NOTE(review): the scheduler uses num_cycles=10, so one restart cycle would be
# num_training_steps // 10 - confirm which figure is intended here.
num_training_steps // 5

960

In [26]:
# Assemble the trainer with the custom optimizer/scheduler pair, the
# best-BLEU checkpoint callback and the BLEU-based compute_metrics.
trainer = Seq2SeqTrainer(model=model,
                         args=training_params,
                         train_dataset=train_dataset,
                         eval_dataset=eval_dataset,
                         data_collator=data_collator,
                         callbacks=[trainer_callback],
                         optimizers=(optimzer, scheduler),
                         tokenizer=tokenizer,
                         compute_metrics=compute_metrics
                         )

In [None]:
# Run the fine-tuning loop and keep the value returned by trainer.train().
train_result = trainer.train()

COMET INFO: ---------------------------------------------------------------------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------------------------------------------------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.com/eduseiti/causal-language-model-fine-tuning/7964c297bb2d4576bfa2caaf53772d84
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     notebook            : 2
COMET INFO:     os packages         : 1
COMET INFO:     source_code         : 1
COMET INFO: 
COMET ERROR: Failed to calculate active processors count. Fall back to default CPU count 1
COMET INFO: Couldn't find a Git repository in '/content/drive/MyDrive/unicamp/ia368v_dd/aula_06' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting 

Step,Training Loss,Validation Loss,Bleu,Gen Len
50,4.5852,3.973049,1.0798,17.978
100,3.484,2.92351,1.501,17.051
150,2.6823,2.377275,2.0692,13.601
200,2.4917,2.280683,2.5198,12.344
250,2.7671,2.279149,2.5332,12.312
300,2.754,2.279186,2.5336,12.31


compute_metrics. preds.shape=(1000, 20)
dict_keys(['eval_loss', 'eval_bleu', 'eval_gen_len', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])
metrics['eval_loss']=3.9730489253997803
metrics['eval_bleu']=1.0798
compute_metrics. preds.shape=(1000, 20)
dict_keys(['eval_loss', 'eval_bleu', 'eval_gen_len', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])
metrics['eval_loss']=2.9235098361968994
metrics['eval_bleu']=1.501
compute_metrics. preds.shape=(1000, 20)
dict_keys(['eval_loss', 'eval_bleu', 'eval_gen_len', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])
metrics['eval_loss']=2.377274990081787
metrics['eval_bleu']=2.0692
compute_metrics. preds.shape=(1000, 20)
dict_keys(['eval_loss', 'eval_bleu', 'eval_gen_len', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])
metrics['eval_loss']=2.2806830406188965
metrics['eval_bleu']=2.5198
compute_metrics. preds.shape=(1000, 20)
dict_k

In [None]:
# Close the Comet experiment, if one is running. The experiment object is
# looked up through comet_ml because no `experiment` variable is bound when
# the experiment is created, so calling `experiment.end()` raised NameError.
if LINK_WITH_COMET:
    import comet_ml

    running_experiment = comet_ml.get_global_experiment()
    if running_experiment is not None:
        running_experiment.end()