In [2]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2

import os
import torch
import datasets
import evaluate
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from time import perf_counter
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict
from transformers import (
    pipeline,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    DataCollatorWithPadding
)
device = "cuda" if torch.cuda.is_available() else "cpu"

%watermark -a 'Ethen' -d -u -p torch,datasets,transformers,evaluate,numpy,pandas

Author: Ethen

Last updated: 2022-09-18

torch       : 1.12.1
datasets    : 2.3.2
transformers: 4.20.1
evaluate    : 0.2.2
numpy       : 1.21.6
pandas      : 1.2.4



# Bi-Encoder Architecture

## Data Preprocessing

https://huggingface.co/datasets/Tevatron/msmarco-passage

In [3]:
test_size = 0.1
dataset = load_dataset("Tevatron/msmarco-passage", split='train')
dataset_dict = dataset.train_test_split(test_size=test_size)
dataset_dict

Reusing dataset msmarco-passage (/home/mingyuliu/.cache/huggingface/datasets/Tevatron___msmarco-passage/default/0.0.1/1874f5d9ae5257b9dbc7d8f89c76f8d4c321be6b660bb5df208e5e64decfa978)


DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
        num_rows: 360703
    })
    test: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
        num_rows: 40079
    })
})

In [4]:
example = dataset_dict['train'][0]
example

{'query_id': '889473',
 'query': 'what providence is bay of fundy in',
 'positive_passages': [{'docid': '5980953',
   'title': '-',
   'text': 'Over the next 150 years, a number of other French settlements and seigneuries were founded in the area occupied by present-day New Brunswick, including along the Saint John River, the upper Bay of Fundy region, in the Tantramar Marshes at Beaubassin, and finally at St. Pierre (site of present-day Bathurst).'}],
 'negative_passages': [{'docid': '7178187',
   'title': 'Divine providence',
   'text': "In theology, divine providence, or just providence, is God's intervention in the world. The term Divine Providence is also used as a title of God. A distinction is usually made between general providence, which refers to God's continuous upholding the existence and natural order of the Universe, and special providence, which refers to God's extraordinary intervention in the life of people. Miracles generally fall in the latter category."},
  {'docid'

In [5]:
def explode_fn(batch):
    batch_query = [query for query in batch['query']]
    
    batch_num_neg_passage = []
    batch_negative_text = []
    for passages in batch['negative_passages']:
        num_neg_passage = len(passages)
        batch_num_neg_passage.append(num_neg_passage)

        negative_text = [passage['text'] for passage in passages]
        batch_negative_text.append(negative_text)
    
    # for every positive passage, multiply it with number of negative passage in the same index
    queries = []
    negative_texts = []
    positive_texts = []
    for idx, passages in enumerate(batch['positive_passages']):
        num_neg_passage = batch_num_neg_passage[idx]
        negative_text = batch_negative_text[idx]
        query = batch_query[idx]
        for passage in passages:
            queries += [query] * num_neg_passage
            positive_texts += [passage['text']] * num_neg_passage
            negative_texts += negative_text

    return {'query': queries, 'pos': positive_texts, 'neg': negative_texts}

In [6]:
dataset_dict_exploded = dataset_dict.map(
    explode_fn,
    batched=True,
    remove_columns=['query_id', 'query', 'positive_passages', 'negative_passages']
)
dataset_dict_exploded

  0%|          | 0/361 [00:00<?, ?ba/s]

  0%|          | 0/41 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'pos', 'neg'],
        num_rows: 11412906
    })
    test: Dataset({
        features: ['query', 'pos', 'neg'],
        num_rows: 1269038
    })
})

In [7]:
dataset_dict_exploded['train'][0]

{'query': 'what providence is bay of fundy in',
 'pos': 'Over the next 150 years, a number of other French settlements and seigneuries were founded in the area occupied by present-day New Brunswick, including along the Saint John River, the upper Bay of Fundy region, in the Tantramar Marshes at Beaubassin, and finally at St. Pierre (site of present-day Bathurst).',
 'neg': "In theology, divine providence, or just providence, is God's intervention in the world. The term Divine Providence is also used as a title of God. A distinction is usually made between general providence, which refers to God's continuous upholding the existence and natural order of the Universe, and special providence, which refers to God's extraordinary intervention in the life of people. Miracles generally fall in the latter category."}

In [8]:
model_name = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
output = tokenizer(example['query'], padding=False, truncation='longest_first', max_length=tokenizer.model_max_length)
output

{'input_ids': [101, 2054, 11293, 2003, 3016, 1997, 4636, 2100, 1999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
def tokenize_fn(batch):
    
    query_encoded = tokenizer(
        batch['query'],
        padding=False,
        truncation='longest_first',
        max_length=tokenizer.model_max_length
    )
    # we should check the proportion of documents that are affected 
    pos_encoded = tokenizer(
        batch['pos'],
        padding=False,
        truncation='longest_first',
        max_length=tokenizer.model_max_length
    )
    
    neg_encoded = tokenizer(
        batch['neg'],
        padding=False,
        truncation='longest_first',
        max_length=tokenizer.model_max_length
    )
    return {
        'query_input_ids': query_encoded['input_ids'],
        'query_attention_mask': query_encoded['attention_mask'],
        'pos_input_ids': pos_encoded['input_ids'],
        'pos_attention_mask': pos_encoded['attention_mask'],
        'neg_input_ids': neg_encoded['input_ids'],
        'neg_attention_mask': neg_encoded['attention_mask']
    }

In [10]:
# By default datasets will try to save tokenization results as int64, which takes
# up a lot of necessary space especially for binary tokens such as
# attention_mask and token_type_ids
# In fact, when features are named appropriately it will use compact types
# refer to OptimizedTypedSequence under 
# https://github.com/huggingface/datasets/blob/main/src/datasets/arrow_writer.py
features = datasets.Features({
    'query_input_ids': datasets.Sequence(datasets.Value('int32')),
    'query_attention_mask': datasets.Sequence(datasets.Value('int8')),
    'pos_input_ids': datasets.Sequence(datasets.Value('int32')),
    'pos_attention_mask': datasets.Sequence(datasets.Value('int8')),
    'neg_input_ids': datasets.Sequence(datasets.Value('int32')),
    'neg_attention_mask': datasets.Sequence(datasets.Value('int8'))
})

In [11]:
dataset_dict_tokenized = dataset_dict_exploded.map(
    tokenize_fn,
    batched=True,
    num_proc=8,
    remove_columns=['query', 'pos', 'neg'],
    features=features
)
dataset_dict_tokenized

           

#3:   0%|          | 0/1427 [00:00<?, ?ba/s]

    

#2:   0%|          | 0/1427 [00:00<?, ?ba/s]

#1:   0%|          | 0/1427 [00:00<?, ?ba/s]

#5:   0%|          | 0/1427 [00:00<?, ?ba/s]

#0:   0%|          | 0/1427 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1427 [00:00<?, ?ba/s]

#7:   0%|          | 0/1427 [00:00<?, ?ba/s]

#4:   0%|          | 0/1427 [00:00<?, ?ba/s]

                

#0:   0%|          | 0/159 [00:00<?, ?ba/s]

#1:   0%|          | 0/159 [00:00<?, ?ba/s]

#2:   0%|          | 0/159 [00:00<?, ?ba/s]

#3:   0%|          | 0/159 [00:00<?, ?ba/s]

#4:   0%|          | 0/159 [00:00<?, ?ba/s]

#5:   0%|          | 0/159 [00:00<?, ?ba/s]

#7:   0%|          | 0/159 [00:00<?, ?ba/s]

#6:   0%|          | 0/159 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['query_input_ids', 'query_attention_mask', 'pos_input_ids', 'pos_attention_mask', 'neg_input_ids', 'neg_attention_mask'],
        num_rows: 11412906
    })
    test: Dataset({
        features: ['query_input_ids', 'query_attention_mask', 'pos_input_ids', 'pos_attention_mask', 'neg_input_ids', 'neg_attention_mask'],
        num_rows: 1269038
    })
})

In [12]:
# 30.982270552
dataset_dict_tokenized.data['train'].nbytes / 1e9

9.870764719

In [13]:
dataset_dict_tokenized['train'][0]

{'query_input_ids': [101,
  2054,
  11293,
  2003,
  3016,
  1997,
  4636,
  2100,
  1999,
  102],
 'query_attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'pos_input_ids': [101,
  2058,
  1996,
  2279,
  5018,
  2086,
  1010,
  1037,
  2193,
  1997,
  2060,
  2413,
  7617,
  1998,
  7367,
  23773,
  11236,
  3111,
  2020,
  2631,
  1999,
  1996,
  2181,
  4548,
  2011,
  2556,
  1011,
  2154,
  2047,
  9192,
  1010,
  2164,
  2247,
  1996,
  3002,
  2198,
  2314,
  1010,
  1996,
  3356,
  3016,
  1997,
  4636,
  2100,
  2555,
  1010,
  1999,
  1996,
  9092,
  6494,
  7849,
  19257,
  2012,
  17935,
  22083,
  11493,
  1010,
  1998,
  2633,
  2012,
  2358,
  1012,
  5578,
  1006,
  2609,
  1997,
  2556,
  1011,
  2154,
  21897,
  1007,
  1012,
  102],
 'pos_attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


## Modeling

In [14]:
import torch
from dataclasses import dataclass
from typing import List, Dict, Optional, Union
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.file_utils import PaddingStrategy


@dataclass
class DataCollatorForSiamese:
    """
    
    https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorWithPadding
    """
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features: List[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
        query_features = {
            'input_ids': [feature['query_input_ids'] for feature in features],
            'attention_mask': [feature['query_attention_mask'] for feature in features]
        }
        pos_features = {
            'input_ids': [feature['pos_input_ids'] for feature in features],
            'attention_mask': [feature['pos_attention_mask'] for feature in features]
        }
        neg_features = {
            'input_ids': [feature['neg_input_ids'] for feature in features],
            'attention_mask': [feature['neg_attention_mask'] for feature in features]
        }
        query_batch = self.tokenizer.pad(
            query_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        pos_batch = self.tokenizer.pad(
            pos_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        neg_batch = self.tokenizer.pad(
            neg_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {
            'query_input_ids': query_batch['input_ids'],
            'query_attention_mask': query_batch['attention_mask'],
            'pos_input_ids': pos_batch['input_ids'],
            'pos_attention_mask': pos_batch['attention_mask'],
            'neg_input_ids': neg_batch['input_ids'],
            'neg_attention_mask': neg_batch['attention_mask'],
        }
        return batch

In [15]:
data_collate_siamese = DataCollatorForSiamese(tokenizer)

In [16]:
data_loader = DataLoader(dataset_dict_tokenized['train'], batch_size=2, collate_fn=data_collate_siamese)
example = next(iter(data_loader))
example

{'query_input_ids': tensor([[  101,  2054, 11293,  2003,  3016,  1997,  4636,  2100,  1999,   102],
         [  101,  2054, 11293,  2003,  3016,  1997,  4636,  2100,  1999,   102]]),
 'query_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'pos_input_ids': tensor([[  101,  2058,  1996,  2279,  5018,  2086,  1010,  1037,  2193,  1997,
           2060,  2413,  7617,  1998,  7367, 23773, 11236,  3111,  2020,  2631,
           1999,  1996,  2181,  4548,  2011,  2556,  1011,  2154,  2047,  9192,
           1010,  2164,  2247,  1996,  3002,  2198,  2314,  1010,  1996,  3356,
           3016,  1997,  4636,  2100,  2555,  1010,  1999,  1996,  9092,  6494,
           7849, 19257,  2012, 17935, 22083, 11493,  1010,  1998,  2633,  2012,
           2358,  1012,  5578,  1006,  2609,  1997,  2556,  1011,  2154, 21897,
           1007,  1012,   102],
         [  101,  2058,  1996,  2279,  5018,  2086,  1010,  1037,  2193,  1997,
           2060,  24

In [17]:
class TransformerPooling(nn.Module):

    def __init__(self, model_name: str, pooling_mode: str = 'avg'):
        super().__init__()
        if pooling_mode not in {'avg', 'cls'}:
            raise ValueError(f'{pooling_mode} needs to one of avg, cls')

        self.base_model = AutoModel.from_pretrained(model_name)
        self.pooling_mode = pooling_mode

    def forward(self, input_ids, attention_mask):
        output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        if self.pooling_mode == 'avg':
            pooled = output.last_hidden_state.mean(dim=1)
        elif self.pooling_mode == 'cls':
            pooled = output.last_hidden_state[:, 0, :]

        return pooled

In [18]:
model = TransformerPooling(model_name).to(device)
model

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TransformerPooling(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Line

In [19]:
output = model(
    input_ids=example['query_input_ids'].to(device),
    attention_mask=example['query_attention_mask'].to(device)
)
output

tensor([[-0.4427, -0.1523, -0.1497,  ..., -0.1537, -0.0263,  0.2855],
        [-0.4427, -0.1523, -0.1497,  ..., -0.1537, -0.0263,  0.2855]],
       device='cuda:0', grad_fn=<MeanBackward1>)

In [20]:
from transformers.modeling_outputs import SequenceClassifierOutput


class SiameseModel(nn.Module):
    
    def __init__(self, model_name: str, pooling_mode: str = 'avg'):
        super().__init__()
        self.transformer_model = TransformerPooling(model_name, pooling_mode)

    def forward(
        self,
        query_input_ids,
        query_attention_mask,
        pos_input_ids,
        pos_attention_mask,
        neg_input_ids,
        neg_attention_mask
    ):
        query_embedding = self.transformer_model(
            input_ids=query_input_ids,
            attention_mask=query_attention_mask
        )
        pos_embedding = self.transformer_model(
            input_ids=pos_input_ids,
            attention_mask=pos_attention_mask
        )
        neg_embedding = self.transformer_model(
            input_ids=neg_input_ids,
            attention_mask=neg_attention_mask
        )
        doc_embedding = torch.cat([pos_embedding, neg_embedding])
        
        # cosine similarity
        query_norm = F.normalize(query_embedding, p=2, dim=1)
        doc_norm = F.normalize(doc_embedding, p=2, dim=1)
        scores = torch.mm(query_norm, doc_norm.transpose(0, 1)) # * self.scale

        # Example a[i] should match with b[i]
        labels = torch.arange(scores.size()[0], device=scores.device)
        loss = F.cross_entropy(scores, labels)
        return loss, scores

In [21]:
def model_init():
    return SiameseModel(model_name)

In [22]:
os.environ['DISABLE_MLFLOW_INTEGRATION'] = 'TRUE'

In [25]:
batch_size = 128
num_train_epochs = 3
learning_rate = 2e-5
weight_decay = 0.01

finetuned_checkpoint = f"{model_name}-siamese"
training_args = TrainingArguments(
    output_dir=finetuned_checkpoint,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    dataloader_num_workers=4,
    fp16=True,
    #gradient_accumulation_steps=4
)

trainer = Trainer(
    #siamese_model,
    model_init=model_init,
    args=training_args,
    tokenizer=tokenizer, 
    data_collator=data_collate_siamese,
    train_dataset=dataset_dict_tokenized['train'],
    eval_dataset=dataset_dict_tokenized['test'],
    #compute_metrics=compute_metrics
)
result = trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/mingyuliu/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq

Epoch,Training Loss,Validation Loss


RuntimeError: CUDA out of memory. Tried to allocate 1006.00 MiB (GPU 0; 31.75 GiB total capacity; 28.95 GiB already allocated; 229.75 MiB free; 30.38 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF