<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_07/blob/main/DensePassageRetriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Master switch for Comet.ml tracking: gates the comet_ml install/import,
# the experiment creation and every metric/parameter logging call below.
LINK_WITH_COMET = True

In [2]:
!pip install transformers -q
!pip install ftfy -q

if LINK_WITH_COMET:
    !pip install comet_ml -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.6/484.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import pickle
from google.colab import drive

import pandas as pd
import numpy as np

import torch

from scipy import stats

from datetime import datetime

import ftfy

from transformers import (get_linear_schedule_with_warmup, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          AutoTokenizer, 
                          AutoModel,
                          AutoModelForSequenceClassification, 
                          BatchEncoding
)

import os

from tqdm.auto import tqdm

import json

if LINK_WITH_COMET:
    from comet_ml import Experiment

In [4]:
# Folder the notebook switches into (via os.chdir), and the folder under which
# model checkpoints are written.
WORKING_FOLDER = "drive/MyDrive/unicamp/ia368v_dd/aula_07"
TRAIN_OUTPUT_FOLDER = "./trained_model"

# JSON file from which the Comet.ml API key is read.
API_KEYS_FILE = "/content/drive/MyDrive/unicamp/ia368v_dd/api_keys_20230324.json"

# Pickle cache of the train/validation split, and the URL of the raw triples file.
MS_MARCO_SPLIT = "ms_marco_tiny_data_split.pkl"
MS_MARCO_TINY_URL = "https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv"

In [5]:
# Hugging Face checkpoint used for the tokenizer and for both encoders.
MODEL_NAME = 'microsoft/MiniLM-L12-H384-uncased'

# Number of rows drawn out of the dataset to form the validation split.
NUMBER_OF_EVALUATION_SAMPLES = 1000

# Per-sample losses at or below this value are not counted when averaging the loss.
EPSILON = 1e-8

In [6]:
# Mount Google Drive under /content/drive; force_remount=True requests a fresh
# mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
# WORKING_FOLDER is relative to the Colab content root. Anchoring it at
# "/content" (where the drive is mounted) keeps this cell re-runnable: a plain
# relative chdir fails with FileNotFoundError once the working directory has
# already been changed. An absolute WORKING_FOLDER is passed through unchanged
# by os.path.join.
os.chdir(os.path.join("/content", WORKING_FOLDER))

In [8]:
if LINK_WITH_COMET:
    # The API key is read from a JSON file rather than being hard-coded here.
    with open(API_KEYS_FILE) as inputFile:
        api_keys = json.load(inputFile)

    # Experiment handle used by every later logging call in this notebook.
    comet_settings = {
        "api_key": api_keys['comet_ml'],
        "project_name": "Dense Passage Retriever",
        "workspace": "eduseiti",
    }
    experiment = Experiment(**comet_settings)

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content/drive/MyDrive/unicamp/ia368v_dd/aula_07' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/eduseiti/dense-passage-retriever/eb69b909a8294e71bd214eadd35b3389



In [9]:
# Remove the column-width limit so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [10]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

cuda


## Read the MS MARCO data split, if available

In [11]:
if not os.path.exists(MS_MARCO_SPLIT):
    print("Need to import and fix the training dataset...")
else:
    # Reuse the previously saved split so train/validation stay the same across runs.
    # NOTE(review): pickle.load can execute arbitrary code; only load a file
    # this notebook wrote itself.
    with open(MS_MARCO_SPLIT, "rb") as inputFile:
        ms_marco_data = pickle.load(inputFile)

    train_df, validation_df = ms_marco_data['train'], ms_marco_data['validation']

## Import and fix training dataset

In [12]:
if not 'train_df' in locals():
    if not os.path.exists(os.path.basename(MS_MARCO_TINY_URL)):
        !wget {MS_MARCO_TINY_URL}
    else:
        print("Training dataset already downloaded...")

    ms_df = pd.read_csv(os.path.basename(MS_MARCO_TINY_URL), sep="\t", header=None, names=['topic', 'positive', 'negative'])
    pd.set_option('display.max_colwidth', None)

    display(ms_df.head())

    ms_df['positive'] = ms_df['positive'].apply(lambda text: ftfy.fix_text(text))
    # ms_df = ms_df.drop('negative', axis=1)

else:
    print("Data split has already been loaded...")

Data split has already been loaded...


Split off the evaluation (validation) samples

In [13]:
if 'train_df' in locals():
    print("Data split has already been loaded...")
else:
    print("ms_df.shape={}".format(ms_df.shape))

    all_rows = np.arange(ms_df.shape[0])

    # Draw the validation rows without replacement; every remaining row is training data.
    split_entries = np.random.choice(all_rows, NUMBER_OF_EVALUATION_SAMPLES, replace=False)
    train_rows = np.setdiff1d(all_rows, split_entries)

    train_df = ms_df.iloc[train_rows].reset_index(drop=True)
    validation_df = ms_df.iloc[split_entries].reset_index(drop=True)

    print("train_df.shape={}".format(train_df.shape))
    print("validation_df.shape={}".format(validation_df.shape))

    # Persist the split so later sessions load it instead of re-drawing it.
    split_payload = {'train': train_df, 'validation': validation_df}
    with open(MS_MARCO_SPLIT, "wb") as outputFile:
        pickle.dump(split_payload, outputFile, pickle.HIGHEST_PROTOCOL)

Data split has already been loaded...


## Create dataset class

In [14]:
class DensePassageRetrieverDataset(torch.utils.data.Dataset):
    """Pairs each topic with its positive passage, both pre-tokenized.

    Items are served through ``samples_order``, a permutation of the row
    indices that ``shuffle()`` redraws. Because the order lives in the dataset,
    every consumer indexing it sees the same topic/passage alignment.
    """

    def __init__(self, ms_df, tokenizer):
        """Tokenize the 'topic' and 'positive' columns and draw an initial order.

        Args:
            ms_df: frame providing the 'topic' and 'positive' columns.
            tokenizer: callable invoked as ``tokenizer(texts, return_length=True)``;
                its result is read through the 'input_ids', 'attention_mask'
                and 'length' keys.
        """
        self.tokenized_topics = tokenizer(ms_df['topic'].tolist(), return_length=True)
        self.tokenized_passage = tokenizer(ms_df['positive'].tolist(), return_length=True)

        # Report the token-length distributions of both sides.
        topics_stats = stats.describe(self.tokenized_topics['length'])
        passages_stats = stats.describe(self.tokenized_passage['length'])
        print("Topics tokens size stats:\n{}\n".format(topics_stats))
        print("Passages tokens size stats:\n{}\n".format(passages_stats))

        self.shuffle()

    def shuffle(self):
        """Redraw the order in which samples are served (uses the global NumPy RNG)."""
        new_order = list(range(len(self.tokenized_topics['input_ids'])))
        np.random.shuffle(new_order)
        self.samples_order = new_order

    def __len__(self):
        """Number of topic/passage pairs."""
        return len(self.tokenized_topics['input_ids'])

    def __getitem__(self, index):
        """Return the tokenized passage and topic stored at position ``index`` of the current order."""
        row = self.samples_order[index]

        passage_features = {'input_ids': self.tokenized_passage['input_ids'][row],
                            'attention_mask': self.tokenized_passage['attention_mask'][row]}
        topic_features = {'input_ids': self.tokenized_topics['input_ids'][row],
                          'attention_mask': self.tokenized_topics['attention_mask'][row]}

        return {'passages': passage_features, 'topics': topic_features}

In [15]:
class DPRCollator(object):
    """Collate function that pads one side ('passages' or 'topics') of a batch.

    Each dataset item is a dict holding both sides; ``type`` selects which one
    this collator extracts and pads.
    """

    def __init__(self, type='passages', tokenizer=None):
        """Store the item key to extract and the tokenizer used for padding."""
        self.tokenizer = tokenizer
        self.type = type

    def __call__(self, batch):
        """Pad the selected side of every item in ``batch`` and return it as PyTorch tensors."""
        selected_side = []
        for item in batch:
            selected_side.append(item[self.type])

        padded_batch = self.tokenizer.pad(selected_side, return_tensors='pt')

        return BatchEncoding(padded_batch)

## Create the loss and evaluation functions

In [16]:
def compute_loss(passages_outputs, topics_outputs, epsilon=None):
    """In-batch contrastive loss between passage and topic encodings.

    Row ``i`` of both inputs is treated as a matching pair; every other topic
    in the batch acts as a negative for passage ``i``. The per-passage loss is
    ``log(sum_j exp(score[i, j] - score[i, i]))``, i.e. the negative
    log-softmax of the matching pair over all topics in the batch.

    Args:
        passages_outputs: object exposing ``last_hidden_state``, indexed here
            as a 3-D tensor whose position 0 along dim 1 is taken as the
            sequence representation (presumably the [CLS] token).
        topics_outputs: same, for the topics; must have as many rows as
            ``passages_outputs`` so the diagonal holds the matching pairs.
        epsilon: per-sample losses at or below this value are excluded from
            the averaging denominator. Defaults to the module-level EPSILON.

    Returns:
        0-d tensor: sum of the per-sample losses divided by the number of
        samples whose loss exceeds ``epsilon`` (plain mean when none does).
    """
    if epsilon is None:
        epsilon = EPSILON

    passages_cls = passages_outputs.last_hidden_state[:, 0, :]
    topics_cls = topics_outputs.last_hidden_state[:, 0, :]

    # scores[i, j] = <passage_i, topic_j>; the diagonal holds the positive pairs.
    all_passages_all_topics_dot_product = torch.mm(passages_cls, topics_cls.t())
    passages_and_positive_topics = all_passages_all_topics_dot_product.diag().unsqueeze(1)

    dot_product_differences = all_passages_all_topics_dot_product - passages_and_positive_topics

    # logsumexp is mathematically the same as log(sum(exp(.))) but does not
    # overflow to inf when a negative outscores the positive by a large margin.
    loss = torch.logsumexp(dot_product_differences, dim=1)

    # Average only over the samples that still contribute a non-zero loss.
    non_zeroed_losses = (loss > epsilon).float().sum()

    if non_zeroed_losses > 0.0:
        return torch.sum(loss) / non_zeroed_losses

    return torch.mean(loss)

In [17]:
def evaluate(device,
             passages_model,
             topics_model,
             passages_dataloader,
             topics_dataloader,
             min_eval_loss,
             current_epoch=0):
    """Compute the mean validation loss and checkpoint both encoders on improvement.

    Args:
        device: device the batches are moved to before the forward pass.
        passages_model: encoder applied to the passage batches.
        topics_model: encoder applied to the topic batches.
        passages_dataloader: yields passage batches; iterated in lock-step with
            ``topics_dataloader``, so both must yield aligned rows.
        topics_dataloader: yields the topic batches.
        min_eval_loss: dict with keys 'loss' and 'checkpoint_name'; updated in
            place when the new loss is lower than the stored one.
        current_epoch: epoch number attached to the logged metric.

    Side effects:
        Leaves both models in eval mode, prints the loss, logs it to the
        module-level Comet ``experiment`` when LINK_WITH_COMET is set, and
        saves both models under TRAIN_OUTPUT_FOLDER on a new minimum.
    """
    eval_losses = []

    passages_model.eval()
    topics_model.eval()

    # Stream the paired batches instead of materialising every batch in a list;
    # the explicit total keeps the progress bar informative.
    try:
        number_of_batches = min(len(passages_dataloader), len(topics_dataloader))
    except TypeError:
        number_of_batches = None

    paired_batches = tqdm(zip(passages_dataloader, topics_dataloader),
                          total=number_of_batches,
                          mininterval=0.5,
                          desc="Eval",
                          disable=False)

    with torch.no_grad():
        for passages_batch, topics_batch in paired_batches:

            passages_outputs = passages_model(**passages_batch.to(device))
            topics_outputs = topics_model(**topics_batch.to(device))

            eval_losses.append(compute_loss(passages_outputs, topics_outputs).item())

    # Plain Python float: safe to log, format and keep inside min_eval_loss.
    final_loss = float(np.mean(eval_losses))

    print("Eval loss: {:0.3f}".format(final_loss))

    if LINK_WITH_COMET:
        experiment.log_metrics({'eval_loss': final_loss},
                               epoch=current_epoch)

    if min_eval_loss['loss'] > final_loss:
        print("New minimal validation loss; saving model...")

        training_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Both encoders are stored in sub-folders of the same checkpoint folder.
        checkpoint_name = "checkpoint_{}_{:.4f}".format(training_timestamp, final_loss)
        passages_model.save_pretrained(os.path.join(TRAIN_OUTPUT_FOLDER, checkpoint_name, "_passages"))
        topics_model.save_pretrained(os.path.join(TRAIN_OUTPUT_FOLDER, checkpoint_name, "_topics"))

        min_eval_loss['checkpoint_name'] = checkpoint_name
        min_eval_loss['loss'] = final_loss

## Instantiate the tokenizer, models (passage and topic), dataset and dataloaders

### Define the model/training hyperparameters

In [18]:
# Hyperparameter registry; further entries are added right before training.
hyperparameters = dict(batch_size=32)

### Instantiate the model

In [19]:
# A single tokenizer is used for both topics and passages.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [20]:
# Two separate encoders, both initialised from the same pretrained checkpoint:
# one for passages and one for topics.
passages_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
topics_model = AutoModel.from_pretrained(MODEL_NAME).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

### Instantiate the datasets

In [21]:
# Tokenizes the training topics/passages and prints their token-length statistics.
train_dataset = DensePassageRetrieverDataset(train_df, tokenizer)

Topics tokens size stats:
DescribeResult(nobs=10000, minmax=(4, 43), mean=9.0995, variance=8.161415891589158, skewness=2.234958225795292, kurtosis=14.688689468456474)

Passages tokens size stats:
DescribeResult(nobs=10000, minmax=(13, 280), mean=80.6722, variance=1058.9128384438445, skewness=1.1296226868887678, kurtosis=1.4924264968953178)



In [22]:
# Tokenizes the validation topics/passages and prints their token-length statistics.
eval_dataset = DensePassageRetrieverDataset(validation_df, tokenizer)

Topics tokens size stats:
DescribeResult(nobs=1000, minmax=(4, 34), mean=9.148, variance=7.741837837837838, skewness=1.910799024012278, kurtosis=11.565017118056652)

Passages tokens size stats:
DescribeResult(nobs=1000, minmax=(19, 260), mean=80.393, variance=970.5731241241243, skewness=1.1956679092637208, kurtosis=1.8950420310041958)



### Now create the dataloaders, 2 for each split (train, eval) to supply the passages and the topics data

In [23]:
def _paired_dataloaders(dataset):
    """Build the (passages, topics) dataloaders that read the same dataset.

    shuffle=False on purpose: the sample order comes from the dataset's own
    ``samples_order``, so both loaders yield aligned topic/passage rows.
    """
    return tuple(
        torch.utils.data.DataLoader(dataset,
                                    batch_size=hyperparameters['batch_size'],
                                    shuffle=False,
                                    collate_fn=DPRCollator(side, tokenizer))
        for side in ('passages', 'topics')
    )


train_passages_dataloader, train_topics_dataloader = _paired_dataloaders(train_dataset)
eval_passages_dataloader, eval_topics_dataloader = _paired_dataloaders(eval_dataset)

## Finally, start training

In [24]:
# Best validation loss seen so far and the checkpoint saved for it. Starting
# from +inf (instead of an arbitrary large number) guarantees that the first
# finite evaluation loss always counts as an improvement.
min_eval_loss = {"loss": float("inf"),
                 "checkpoint_name": None}

In [25]:
hyperparameters['epochs'] = 10
# The scheduler is stepped once per batch, and the last (partial) batch of each
# epoch counts too. Use the real number of batches per epoch: a floor division
# of the dataset size undercounts it and makes the schedule end before training does.
hyperparameters['num_training_steps'] = hyperparameters['epochs'] * len(train_passages_dataloader)
hyperparameters['num_warmup_steps'] = 0
hyperparameters['learning_rate'] = 1e-4
hyperparameters['num_cosine_scheduler_cycles'] = 1

In [26]:
# One AdamW optimizer per encoder, both using the same learning rate.
passages_optimizer = torch.optim.AdamW(passages_model.parameters(), lr=hyperparameters['learning_rate'])
topics_optimizer = torch.optim.AdamW(topics_model.parameters(), lr=hyperparameters['learning_rate'])

In [27]:
# Learning-rate schedule for the passages encoder's optimizer.
passages_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    passages_optimizer,
    num_warmup_steps=hyperparameters['num_warmup_steps'],
    num_training_steps=hyperparameters['num_training_steps'],
    num_cycles=hyperparameters['num_cosine_scheduler_cycles'],
)

In [28]:
# Learning-rate schedule for the topics encoder's optimizer.
topics_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    topics_optimizer,
    num_warmup_steps=hyperparameters['num_warmup_steps'],
    num_training_steps=hyperparameters['num_training_steps'],
    num_cycles=hyperparameters['num_cosine_scheduler_cycles'],
)

In [29]:
# Record the full hyperparameter dict with the Comet experiment.
if LINK_WITH_COMET:
    experiment.log_parameters(hyperparameters)

In [30]:
# Baseline evaluation of the encoders before any training step; logged as
# epoch -1 so it precedes the per-epoch evaluations (which start at 0).
evaluate(device,
         passages_model=passages_model, 
         topics_model=topics_model,
         passages_dataloader=eval_passages_dataloader,
         topics_dataloader=eval_topics_dataloader,
         min_eval_loss=min_eval_loss,
         current_epoch=-1)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Eval:   0%|          | 0/32 [00:00<?, ?it/s]

Eval loss: 3.410
New minimal validation loss; saving model...


In [None]:
# Global step counter across all epochs, used as the x-axis of per-step metrics.
current_training_step = 0

for epoch in tqdm(range(hyperparameters['epochs']), desc='Epochs'):

    passages_model.train()
    topics_model.train()

    train_losses = []

    # Stream the aligned passage/topic batches rather than materialising the
    # whole epoch in a list; the explicit total keeps the progress bar sized.
    tqdm_batches = tqdm(zip(train_passages_dataloader, train_topics_dataloader),
                        total=len(train_passages_dataloader),
                        mininterval=0.5,
                        desc='Train',
                        disable=False)

    for batch in tqdm_batches:

        current_training_step += 1

        passages_optimizer.zero_grad()
        topics_optimizer.zero_grad()

        passages_outputs = passages_model(**batch[0].to(device))
        topics_outputs = topics_model(**batch[1].to(device))

        final_loss = compute_loss(passages_outputs, topics_outputs)

        # A single backward pass produces the gradients of both encoders.
        final_loss.backward()

        passages_optimizer.step()
        topics_optimizer.step()

        passages_scheduler.step()
        topics_scheduler.step()

        train_losses.append(final_loss.item())

        tqdm_batches.set_description("Loss {:0.4f}".format(train_losses[-1]))

        if LINK_WITH_COMET:
            # get_last_lr() returns one value per parameter group; log the
            # scalar of the first group instead of the whole list.
            experiment.log_metrics({'train loss': train_losses[-1],
                                    'learning_rate': passages_scheduler.get_last_lr()[0]},
                                   step=current_training_step)

    print("Epoch: {}, Training loss: {:0.4f}".format(epoch + 1, np.mean(train_losses)))

    if LINK_WITH_COMET:
        experiment.log_metrics({'train loss': np.mean(train_losses)},
                               epoch=epoch)

    evaluate(device,
             passages_model=passages_model,
             topics_model=topics_model,
             passages_dataloader=eval_passages_dataloader,
             topics_dataloader=eval_topics_dataloader,
             min_eval_loss=min_eval_loss,
             current_epoch=epoch)

    # Redraw the sample order so the next epoch sees different in-batch negatives.
    train_dataset.shuffle()

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Train:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 1, Training loss: 0.8478


Eval:   0%|          | 0/32 [00:00<?, ?it/s]

Eval loss: 0.280
New minimal validation loss; saving model...


Train:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 2, Training loss: 0.2957


Eval:   0%|          | 0/32 [00:00<?, ?it/s]

Eval loss: 0.179
New minimal validation loss; saving model...


Train:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 3, Training loss: 0.1532


Eval:   0%|          | 0/32 [00:00<?, ?it/s]

Eval loss: 0.158
New minimal validation loss; saving model...


Train:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 4, Training loss: 0.1119


Eval:   0%|          | 0/32 [00:00<?, ?it/s]

Eval loss: 0.155
New minimal validation loss; saving model...


Train:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 5, Training loss: 0.0639


Eval:   0%|          | 0/32 [00:00<?, ?it/s]

Eval loss: 0.094
New minimal validation loss; saving model...


Train:   0%|          | 0/313 [00:00<?, ?it/s]

In [None]:
# `experiment` only exists when Comet tracking is enabled; without the guard
# this cell raises NameError when LINK_WITH_COMET is False.
if LINK_WITH_COMET:
    experiment.end()