<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_07/blob/main/DensePassageRetriever_single_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# When True, comet_ml is installed/imported and an experiment is created to log parameters and metrics.
LINK_WITH_COMET=True

In [None]:
# Install the third-party packages this notebook imports (-q keeps pip quiet).
!pip install transformers -q
!pip install ftfy -q

# comet_ml is only needed when experiment tracking is enabled.
if LINK_WITH_COMET:
    !pip install comet_ml -q

In [None]:
import pickle
from google.colab import drive

import pandas as pd
import numpy as np

import torch

from scipy import stats

from datetime import datetime

import ftfy

from transformers import (get_constant_schedule,
                          get_linear_schedule_with_warmup, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          AutoTokenizer, 
                          AutoModel,
                          AutoModelForSequenceClassification, 
                          BatchEncoding
)

import os

from tqdm.auto import tqdm

import json

if LINK_WITH_COMET:
    from comet_ml import Experiment

In [None]:
# True: use the MS MARCO passage dev pickle; False: use the tiny triples TSV.
USE_BIGGER_DATASET=True

# Number of leading token positions of last_hidden_state that compute_loss flattens
# into each embedding (1 = only the first position).
DIMENSIONS_TO_USE=1

# Optional constant added to every score difference inside compute_loss; None disables it.
MARGIN_TO_ADD=None

In [None]:
# Relative path passed to os.chdir() once Google Drive has been mounted.
WORKING_FOLDER="drive/MyDrive/unicamp/ia368v_dd/aula_07"


# JSON file with the API keys; its 'comet_ml' entry is read when LINK_WITH_COMET is set.
API_KEYS_FILE="/content/drive/MyDrive/unicamp/ia368v_dd/api_keys_20230324.json"

if USE_BIGGER_DATASET:
    # Folder where evaluate() saves the model checkpoints.
    TRAIN_OUTPUT_FOLDER="./trained_single_model_even_more_data"
    # Pickle caching the {'train', 'validation'} DataFrame split.
    MS_MARCO_SPLIT="ms_marco_passage_dev_400k_data_split.pkl"

    # NOTE(review): not referenced by any cell visible in this notebook.
    MS_MARCO_PASSAGE_DEV_TOKENIZED_DATASETS="ms_marco_passage_dev_tokenized_datasets.pkl"

    # Pickled DataFrame with 'query_text' / 'passage_text' columns.
    MS_MARCO_PASSAGE_DEV_FILENAME="ms_marco_passage_dev.pkl"

    # Number of rows sampled from the raw data (train + validation together).
    TOTAL_NUMBER_OF_SAMPLES=400000
else:
    # Pickle caching the {'train', 'validation'} DataFrame split.
    MS_MARCO_SPLIT="ms_marco_tiny_data_split.pkl"
    # TSV read with the columns topic / positive / negative.
    MS_MARCO_TINY_URL="https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv"
    
    # Folder where evaluate() saves the model checkpoints.
    TRAIN_OUTPUT_FOLDER="./trained_single_model"

In [None]:
# Hugging Face checkpoint used for both the tokenizer and the encoder.
MODEL_NAME='microsoft/MiniLM-L12-H384-uncased'

# Number of rows held out as the validation split.
NUMBER_OF_EVALUATION_SAMPLES=1000

# Per-sample losses must exceed this value to count in compute_loss's averaging denominator.
EPSILON=1e-12

In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount asks for a fresh mount.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# From here on, every relative path (data pickles, checkpoints) resolves inside WORKING_FOLDER.
os.chdir(WORKING_FOLDER)

In [None]:
# Create the Comet ML experiment used for logging below; the API key is read from
# the JSON file at API_KEYS_FILE so it is never written into the notebook.
if LINK_WITH_COMET:
    with open(API_KEYS_FILE) as inputFile:
        api_keys = json.load(inputFile)

    experiment = Experiment(api_key=api_keys['comet_ml'], 
                            project_name="Dense Passage Retriever",
                            workspace="eduseiti")

In [None]:
# Do not truncate long text columns when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


## Read the MS MARCO data split, if available

In [None]:
# Reuse a previously saved train/validation split when its pickle exists; otherwise
# train_df stays undefined and the next cells build the split from the raw data.
if os.path.exists(MS_MARCO_SPLIT):
    # NOTE(review): pickle.load can execute arbitrary code; only load files produced by this notebook.
    with open(MS_MARCO_SPLIT, "rb") as inputFile:
        ms_marco_data = pickle.load(inputFile)

    train_df = ms_marco_data['train']
    validation_df = ms_marco_data['validation']
else:
    print("Need to import and fix the training dataset...")

## Import and fix training dataset

In [None]:
if not 'train_df' in locals():
    if USE_BIGGER_DATASET:
        with open(MS_MARCO_PASSAGE_DEV_FILENAME, 'rb') as inputFile:
            ms_df = pickle.load(inputFile)

            ms_df['query_text'] = ms_df['query_text'].apply(lambda text: ftfy.fix_text(text))
            ms_df['passage_text'] = ms_df['passage_text'].apply(lambda text: ftfy.fix_text(text))

            #
            # Rename columns to match the tiny dataset scheme...
            #

            ms_df = ms_df.rename(columns={'query_text': 'topic', 'passage_text': 'positive'})
    else:
        if not os.path.exists(os.path.basename(MS_MARCO_TINY_URL)):
            !wget {MS_MARCO_TINY_URL}
        else:
            print("Training dataset already downloaded...")

        ms_df = pd.read_csv(os.path.basename(MS_MARCO_TINY_URL), sep="\t", header=None, names=['topic', 'positive', 'negative'])

    pd.set_option('display.max_colwidth', None)

    print(ms_df.shape)

    display(ms_df.head())

    ms_df['positive'] = ms_df['positive'].apply(lambda text: ftfy.fix_text(text))
    # ms_df = ms_df.drop('negative', axis=1)

else:
    print("Data split has already been loaded...")

Data split has already been loaded...


Split off the evaluation (validation) part of the dataset

In [None]:
# Create (and cache) the train/validation split unless it was already loaded from disk.
if 'train_df' not in locals():
    print("ms_df.shape={}".format(ms_df.shape))

    total_rows = ms_df.shape[0]

    if USE_BIGGER_DATASET:
        # Sample every row that will be used at once; the head of the sample becomes
        # the validation set and the remainder the training set.
        split_entries = np.random.choice(total_rows, TOTAL_NUMBER_OF_SAMPLES, replace=False)

        validation_entries = split_entries[:NUMBER_OF_EVALUATION_SAMPLES]
        train_entries = split_entries[NUMBER_OF_EVALUATION_SAMPLES:]
    else:
        # Sample only the validation rows; every other row (in its original order) is used for training.
        split_entries = np.random.choice(total_rows, NUMBER_OF_EVALUATION_SAMPLES, replace=False)

        validation_entries = split_entries
        train_entries = np.setdiff1d(np.arange(total_rows), split_entries)

    train_df = ms_df.iloc[train_entries].reset_index(drop=True)
    validation_df = ms_df.iloc[validation_entries].reset_index(drop=True)

    for split_name, split_df in (("train_df", train_df), ("validation_df", validation_df)):
        print("{}.shape={}".format(split_name, split_df.shape))

    # Persist the split so later runs train and evaluate on exactly the same rows.
    with open(MS_MARCO_SPLIT, "wb") as outputFile:
        pickle.dump({'train': train_df, 'validation': validation_df},
                    outputFile,
                    pickle.HIGHEST_PROTOCOL)

else:
    print("Data split has already been loaded...")

Data split has already been loaded...


## Create dataset class

In [None]:
class DensePassageRetrieverDataset(torch.utils.data.Dataset):
    """Pairs of tokenized (topic, positive passage) served in a shuffled order.

    The `topic` and `positive` columns of `ms_df` are tokenized once at construction
    time. Samples are read through `samples_order`, a permutation of the row
    positions that `shuffle()` regenerates, so any loader iterating this dataset
    sequentially sees the same order until `shuffle()` is called again.
    """

    def __init__(self, ms_df, tokenizer):
        topic_texts = ms_df['topic'].tolist()
        passage_texts = ms_df['positive'].tolist()

        self.tokenized_topics = tokenizer(topic_texts, return_length=True)
        self.tokenized_passage = tokenizer(passage_texts, return_length=True)

        topics_stats = stats.describe(self.tokenized_topics['length'])
        passages_stats = stats.describe(self.tokenized_passage['length'])

        print("Topics tokens size stats:\n{}\n".format(topics_stats))
        print("Passages tokens size stats:\n{}\n".format(passages_stats))

        self.shuffle()


    def shuffle(self):
        """Draw a new random permutation of the sample positions."""
        new_order = list(range(len(self)))
        np.random.shuffle(new_order)

        self.samples_order = new_order


    def __len__(self):
        """Number of (topic, passage) pairs."""
        return len(self.tokenized_topics['input_ids'])


    def __getitem__(self, index):
        """Return the `index`-th pair of the current shuffled order, un-padded."""
        position = self.samples_order[index]

        def select(encoded):
            # Keep only the fields the collator pads and the model consumes.
            return {'input_ids': encoded['input_ids'][position],
                    'attention_mask': encoded['attention_mask'][position]}

        return {'passages': select(self.tokenized_passage),
                'topics': select(self.tokenized_topics)}

In [None]:
class DPRCollator(object):
    """Collate function that pads one side ('passages' or 'topics') of a batch.

    Each dataset item is a dict holding both sides; the collator selects the side
    named by `type` from every item and lets the tokenizer pad the selection into
    PyTorch tensors.
    """

    def __init__(self, type='passages', tokenizer=None):
        self.tokenizer = tokenizer
        self.type = type


    def __call__(self, batch):
        selected_side = [sample[self.type] for sample in batch]

        return BatchEncoding(self.tokenizer.pad(selected_side, return_tensors='pt'))

## Create the loss and evaluation functions

In [None]:
def compute_loss(passages_outputs, topics_outputs):
    """In-batch-negatives loss between passage and topic embeddings.

    Each embedding is the first DIMENSIONS_TO_USE positions of `last_hidden_state`,
    flattened into one vector. Row i of the score matrix holds the dot products of
    passage i with every topic of the batch; topic i (the diagonal) is its positive.
    The per-row loss is

        log(sum_j exp(score[i, j] - score[i, i] + margin))

    which, when MARGIN_TO_ADD is None, is the negative log-likelihood of the
    positive pair as stated in the DPR paper.

    Args:
        passages_outputs: model output exposing a rank-3 `last_hidden_state`
            (indexed here as batch, position, hidden).
        topics_outputs: same, for the topics; row i must be the topic paired with
            passage i.

    Returns:
        A 0-dim float32 tensor: the sum of the per-row losses divided by the number
        of rows whose loss exceeds EPSILON (plain mean if no row does).

    Raises:
        ValueError: if the two embedding matrices do not have the same shape.
    """
    passages_hidden = passages_outputs.last_hidden_state
    topics_hidden = topics_outputs.last_hidden_state

    # Upcast to float32: the encoder may run in bfloat16, which is too coarse for
    # exp/log over a whole batch of scores.
    passages_cls = passages_hidden[:, 0:DIMENSIONS_TO_USE, :].reshape(passages_hidden.shape[0], -1).float()
    topics_cls = topics_hidden[:, 0:DIMENSIONS_TO_USE, :].reshape(topics_hidden.shape[0], -1).float()

    if passages_cls.shape != topics_cls.shape:
        raise ValueError("Passages and topics embeddings must have the same shape; got {} and {}".format(
            tuple(passages_cls.shape), tuple(topics_cls.shape)))

    all_passages_all_topics_dot_product = torch.mm(passages_cls, topics_cls.t())
    passages_and_positive_topics = all_passages_all_topics_dot_product.diag().unsqueeze(1)

    dot_product_differences = all_passages_all_topics_dot_product - passages_and_positive_topics

    if MARGIN_TO_ADD is not None:
        dot_product_differences = dot_product_differences + MARGIN_TO_ADD

    # logsumexp is the overflow-safe form of log(sum(exp(x))): the naive expression
    # returns inf as soon as one negative outscores its positive by ~89 (float32).
    loss = torch.logsumexp(dot_product_differences, dim=1)

    non_zeroed_losses = (loss > EPSILON).float().sum()

    if non_zeroed_losses > 0.0:
        final_loss = torch.sum(loss) / non_zeroed_losses
    else:
        final_loss = torch.mean(loss)

    return final_loss

In [None]:
def evaluate(device,
             single_model, 
             passages_dataloader, 
             topics_dataloader, 
             min_eval_loss,
             current_step=0,
             current_epoch=None):
    """Compute the validation loss and checkpoint the model when warranted.

    The model is switched to eval mode (and left in it), every aligned pair of
    passages/topics batches is scored with `compute_loss`, and the mean of the batch
    losses is printed and, when LINK_WITH_COMET is set, logged as 'eval_loss'.

    A checkpoint is written under TRAIN_OUTPUT_FOLDER when the loss is a new minimum
    or when `current_epoch` is given (forced end-of-epoch checkpoint).

    Args:
        device: torch device the batches are moved to.
        single_model: encoder applied to both passages and topics.
        passages_dataloader: yields padded passage batches.
        topics_dataloader: yields the topic batches aligned with the passages.
        min_eval_loss: dict with 'loss' and 'checkpoint_name', updated in place
            whenever the evaluated loss is lower than the stored one.
        current_step: training step used when logging the metric.
        current_epoch: when not None, forces a checkpoint named after the epoch.
    """
    eval_losses = []

    single_model.eval()

    try:
        number_of_batches = min(len(passages_dataloader), len(topics_dataloader))
    except TypeError:
        # Loaders without a length still work; the progress bar just has no total.
        number_of_batches = None

    with torch.no_grad():
        # Zip lazily so only the current pair of batches is alive, instead of
        # collating the whole split into a list before scoring the first batch.
        paired_batches = zip(passages_dataloader, topics_dataloader)

        for passages_batch, topics_batch in tqdm(paired_batches,
                                                 total=number_of_batches,
                                                 mininterval=0.5,
                                                 desc="Eval",
                                                 disable=False):

            passages_outputs = single_model(**passages_batch.to(device))
            topics_outputs = single_model(**topics_batch.to(device))

            # .item() works for any floating dtype (numpy cannot hold bfloat16).
            eval_losses.append(compute_loss(passages_outputs, topics_outputs).item())

    final_loss = np.mean(eval_losses)

    print("Eval loss: {:0.3f}".format(final_loss))

    if LINK_WITH_COMET:
        experiment.log_metrics({'eval_loss': final_loss},
                               step=current_step)

    is_new_minimum = min_eval_loss['loss'] > final_loss

    if is_new_minimum or (current_epoch is not None):
        training_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        if current_epoch is not None:
            print("Forcing epoch end checkpoint...")

            checkpoint_name = "checkpoint_epoch_{}_{}_{:.4f}".format(current_epoch, training_timestamp, final_loss)
        else:
            print("New minimal validation loss; saving model...")

            checkpoint_name = "checkpoint_{}_{:.4f}".format(training_timestamp, final_loss)

        single_model.save_pretrained(os.path.join(TRAIN_OUTPUT_FOLDER, checkpoint_name))

        # Track the best checkpoint for forced epoch-end saves too; previously an
        # epoch-end evaluation never updated the running minimum.
        if is_new_minimum:
            min_eval_loss['checkpoint_name'] = checkpoint_name
            min_eval_loss['loss'] = final_loss

## Instantiate the tokenizer, the single model (shared by passages and topics), the datasets and the dataloaders

### Define the model/training hyperparameters

In [None]:

# Collected in one dict so the whole configuration can be logged to Comet in a single call;
# batch_size is the number of (topic, passage) pairs per batch in all four dataloaders.
hyperparameters = {
    'batch_size': 256
}

### Instantiate the model

In [None]:
# Tokenizer matching the pretrained encoder; also used by the collators to pad batches.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# One encoder shared by passages and topics, moved to `device` with its parameters cast to bfloat16.
single_model = AutoModel.from_pretrained(MODEL_NAME).to(device, dtype=torch.bfloat16)

In [None]:
# Show the GPU in use and its current memory usage after loading the model.
!nvidia-smi

Tue Apr 18 19:29:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    53W / 400W |   1337MiB / 40960MiB |      5%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Instantiate the datasets

In [None]:
# Tokenizes every training topic and passage up front and prints the token-length statistics.
train_dataset = DensePassageRetrieverDataset(train_df, tokenizer)

Topics tokens size stats:
DescribeResult(nobs=399000, minmax=(4, 85), mean=9.033899749373434, variance=7.875421923338093, skewness=2.0892765907800332, kurtosis=15.567681611399525)

Passages tokens size stats:
DescribeResult(nobs=399000, minmax=(11, 328), mean=80.35770426065163, variance=1035.6418512755738, skewness=1.1640816692553344, kurtosis=1.587311989362866)



In [None]:
# Same preprocessing for the held-out validation pairs.
eval_dataset = DensePassageRetrieverDataset(validation_df, tokenizer)

Topics tokens size stats:
DescribeResult(nobs=1000, minmax=(4, 32), mean=9.123, variance=8.756627627627628, skewness=2.2772873923325294, kurtosis=10.720066443617496)

Passages tokens size stats:
DescribeResult(nobs=1000, minmax=(16, 236), mean=80.971, variance=1057.685844844845, skewness=1.1348818489125028, kurtosis=1.5025265861253656)



### Now create the dataloaders, 2 for each split (train, eval) to supply the passages and the topics data

In [None]:
def _build_dataloader(dataset, side):
    """Sequential loader over `dataset` yielding padded batches of one side ('passages' or 'topics').

    shuffle=False is required: the passages and topics loaders of a split must walk
    the dataset in the same order so that row i of both batches is a matching pair
    (shuffling is done by the dataset itself).
    """
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=hyperparameters['batch_size'],
                                       shuffle=False,
                                       collate_fn=DPRCollator(side, tokenizer))


train_passages_dataloader = _build_dataloader(train_dataset, 'passages')
train_topics_dataloader = _build_dataloader(train_dataset, 'topics')

eval_passages_dataloader = _build_dataloader(eval_dataset, 'passages')
eval_topics_dataloader = _build_dataloader(eval_dataset, 'topics')

## Finally, start training

In [None]:
# Lowest validation loss seen so far and the checkpoint saved for it; evaluate() updates
# this dict in place. 1000 is a large starting value meant to be beaten by the first evaluation.
min_eval_loss = {"loss": 1000,
                 "checkpoint_name": None}

In [None]:
hyperparameters['epochs'] = 10

# One optimizer step per batch. The dataloaders keep the last, partial batch, so the
# number of steps per epoch is the ceiling (not the floor) of dataset size / batch size.
steps_per_epoch = (len(train_dataset) + hyperparameters['batch_size'] - 1) // hyperparameters['batch_size']

hyperparameters['num_training_steps'] = hyperparameters['epochs'] * steps_per_epoch
hyperparameters['num_warmup_steps'] = 0
hyperparameters['learning_rate'] = 2e-5

# Only relevant for the cosine-with-hard-restarts scheduler.
hyperparameters['num_cosine_scheduler_cycles'] = 1

hyperparameters['train_size'] = len(train_dataset)
hyperparameters['eval_size'] = len(eval_dataset)

# Evaluation interval (in steps) for the optional mid-epoch evaluation.
hyperparameters['eval_step'] = 500

# Logged so that each Comet run records the loss configuration it used.
hyperparameters['dimensions_to_use'] = DIMENSIONS_TO_USE
hyperparameters['margin'] = MARGIN_TO_ADD

In [None]:
# AdamW over all encoder parameters, with the remaining optimizer settings left at their defaults.
single_optimizer = torch.optim.AdamW(single_model.parameters(), lr=hyperparameters['learning_rate'])

In [None]:
# Alternative schedule kept for reference: cosine decay with hard restarts, driven by
# num_warmup_steps / num_training_steps / num_cosine_scheduler_cycles.
#
# single_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(single_optimizer, 
#                                                                       hyperparameters['num_warmup_steps'], 
#                                                                       hyperparameters['num_training_steps'],
#                                                                       num_cycles=hyperparameters['num_cosine_scheduler_cycles'])

# Schedule in use: the learning rate stays constant for the whole training.
single_scheduler = get_constant_schedule(single_optimizer)

In [None]:
# Record the full training configuration in the Comet experiment.
if LINK_WITH_COMET:
    experiment.log_parameters(hyperparameters)

In [None]:
# Baseline: validation loss of the pretrained model before any training step
# (logged at step -1; also saves the first "minimal loss" checkpoint).
evaluate(device,
         single_model=single_model, 
         passages_dataloader=eval_passages_dataloader,
         topics_dataloader=eval_topics_dataloader,
         min_eval_loss=min_eval_loss,
         current_step=-1)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Eval:   0%|          | 0/4 [00:00<?, ?it/s]

Eval loss: 5.526
New minimal validation loss; saving model...


In [None]:
# Global optimizer-step counter, used as the x axis of the per-step metrics.
current_training_step = 0

for epoch in tqdm(range(hyperparameters['epochs']), desc='Epochs'):
    
    single_model.train()

    train_losses = []

    # Zip the two loaders lazily. Materializing them into a list collates the whole
    # epoch before the first step and keeps every batch referenced until the next
    # epoch (BatchEncoding.to() appears to move tensors in place, so those batches
    # would also stay on the GPU).
    tqdm_batches = tqdm(zip(train_passages_dataloader, train_topics_dataloader),
                        total=len(train_passages_dataloader),
                        mininterval=0.5,
                        desc='Train',
                        disable=False)

    for passages_batch, topics_batch in tqdm_batches:

        current_training_step += 1

        single_optimizer.zero_grad()

        # The same encoder embeds both sides; row i of each batch is a matching pair.
        passages_outputs = single_model(**passages_batch.to(device))
        topics_outputs = single_model(**topics_batch.to(device))

        final_loss = compute_loss(passages_outputs, topics_outputs)

        final_loss.backward()

        single_optimizer.step()
        single_scheduler.step()

        train_losses.append(final_loss.item())

        tqdm_batches.set_description("Loss {:0.4f}".format(train_losses[-1]))

        if LINK_WITH_COMET:
            # get_last_lr() returns one value per parameter group; log the scalar.
            experiment.log_metrics({'train loss': train_losses[-1],
                                    'learning_rate': single_scheduler.get_last_lr()[0]},
                                    step=current_training_step)

    print("Epoch: {}, Training loss: {:0.4f}".format(epoch, np.mean(train_losses)))
    
    if LINK_WITH_COMET:
        experiment.log_metrics({'train loss': np.mean(train_losses)},
                               epoch=epoch)

    # End-of-epoch validation; passing current_epoch forces a checkpoint.
    evaluate(device,
             single_model=single_model, 
             passages_dataloader=eval_passages_dataloader,
             topics_dataloader=eval_topics_dataloader,
             min_eval_loss=min_eval_loss,
             current_step=current_training_step,
             current_epoch=epoch)
    
    # New sample order for the next epoch; both train loaders read it from the dataset.
    train_dataset.shuffle()

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Train:   0%|          | 0/1559 [00:00<?, ?it/s]

Epoch: 0, Training loss: 1.1787


Eval:   0%|          | 0/4 [00:00<?, ?it/s]

Eval loss: 0.629
Forcing epoch end checkpoint...


Train:   0%|          | 0/1559 [00:00<?, ?it/s]

Epoch: 1, Training loss: 0.6871


Eval:   0%|          | 0/4 [00:00<?, ?it/s]

Eval loss: 0.579
Forcing epoch end checkpoint...


Train:   0%|          | 0/1559 [00:00<?, ?it/s]

Epoch: 2, Training loss: 0.6493


Eval:   0%|          | 0/4 [00:00<?, ?it/s]

Eval loss: 0.584
Forcing epoch end checkpoint...


Train:   0%|          | 0/1559 [00:00<?, ?it/s]

Epoch: 3, Training loss: 0.6383


Eval:   0%|          | 0/4 [00:00<?, ?it/s]

Eval loss: 0.569
Forcing epoch end checkpoint...


Train:   0%|          | 0/1559 [00:00<?, ?it/s]

Epoch: 4, Training loss: 0.6253


Eval:   0%|          | 0/4 [00:00<?, ?it/s]

Eval loss: 0.566
Forcing epoch end checkpoint...


Train:   0%|          | 0/1559 [00:00<?, ?it/s]

OutOfMemoryError: ignored

In [None]:
# Close the Comet experiment; `experiment` only exists when tracking is enabled.
if LINK_WITH_COMET:
    experiment.end()