<a href="https://colab.research.google.com/github/finardi/Ranking/blob/main/3_Cobert_Eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

Tue Jun  8 11:57:21 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%capture
!pip install -q transformers

In [None]:
import gc
import os
import time
import torch
import random
import pickle
import numpy as np
import pandas as pd
from math import ceil
from functools import partial
from itertools import accumulate
from collections import defaultdict, OrderedDict
from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast

# Pandas display options: show more columns/rows and longer cell text.
pd.set_option('display.max_columns', 100)
# expand_frame_repr is an on/off switch, so pass a real boolean rather than
# the truthy integer 100 that was used before.
pd.set_option('display.expand_frame_repr', True)
# Use the fully qualified option name instead of relying on pattern matching.
pd.set_option('display.max_colwidth', 700)
pd.set_option('display.max_rows', 5000)
  
# save/load pickles
def pickle_file(path, data=None):
    """Load a pickle from ``path``, or write ``data`` to ``path`` when it is given.

    Args:
        path: file to read from (``data is None``) or to write to.
        data: object to serialize. ``None`` selects read mode, so ``None``
            itself cannot be stored with this helper.

    Returns:
        The unpickled object in read mode; ``None`` in write mode.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    reading = data is None
    with open(path, 'rb' if reading else 'wb') as stream:
        if reading:
            return pickle.load(stream)
        pickle.dump(data, stream, protocol=pickle.HIGHEST_PROTOCOL)
 
# Base directory (a Google Drive path) under which the `data/` files
# (triples, pickled dicts, model checkpoint) read by this notebook live.
path_base = '/content/drive/MyDrive/ColBERT/ColBERT - FAQ Receita Federal/'

In [None]:
# =============
# ✨ Constants
# =============
bsize = 16 # N: triples per LazyBatcher batch; also reused as the passage batch size when encoding docs
query_maxlen = 48   # fixed token length of every tensorized query
doc_maxlen = 128    # truncation length (in tokens) for passages
path_model = 'bert-base-multilingual-uncased'  # name passed to from_pretrained for tokenizers and model

# ==================
# ✨ QueryTokenizer
# ==================
class QueryTokenizer():
    """Tokenizes query strings with a ``BertTokenizerFast``.

    Queries are brought to a fixed length of ``query_maxlen`` tokens by
    filling the tail with ``[MASK]`` tokens instead of leaving padding.
    """

    def __init__(self, query_maxlen, path_tokenizer):
        """Load the tokenizer and cache the special tokens and their ids.

        Args:
            query_maxlen: target length, in tokens, of every encoded query.
            path_tokenizer: name or path given to
                ``BertTokenizerFast.from_pretrained``.
        """
        self.tok = BertTokenizerFast.from_pretrained(path_tokenizer)
        self.query_maxlen = query_maxlen

        self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
        self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
        self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id

    def _frame_and_fill(self, body, prefix, suffix, filler):
        """Return ``prefix + body + suffix`` extended with ``filler`` up to ``query_maxlen``.

        The fill count is derived from the actual framed length, so the
        result has exactly ``query_maxlen`` items whenever the framed
        sequence fits. Longer sequences are returned unfilled and are NOT
        truncated.
        """
        framed = prefix + body + suffix
        return framed + [filler] * (self.query_maxlen - len(framed))

    def tokenize(self, batch_text, add_special_tokens=False):
        """Split each text into word-piece strings.

        Args:
            batch_text: list or tuple of query strings.
            add_special_tokens: when true, wrap each sequence in
                ``[CLS]``/``[SEP]`` and fill with ``[MASK]`` up to
                ``query_maxlen``.

        Returns:
            A list with one list of token strings per input text.
        """
        assert isinstance(batch_text, (list, tuple)), (type(batch_text))

        tokens = [self.tok.tokenize(x, add_special_tokens=False) for x in batch_text]

        if not add_special_tokens:
            return tokens

        prefix, suffix = [self.cls_token], [self.sep_token]
        return [self._frame_and_fill(lst, prefix, suffix, self.mask_token) for lst in tokens]

    def encode(self, batch_text, add_special_tokens=False):
        """Same as :meth:`tokenize` but returns vocabulary ids instead of strings."""
        assert isinstance(batch_text, (list, tuple)), (type(batch_text))

        ids = self.tok(batch_text, add_special_tokens=False)['input_ids']

        if not add_special_tokens:
            return ids

        prefix, suffix = [self.cls_token_id], [self.sep_token_id]
        return [self._frame_and_fill(lst, prefix, suffix, self.mask_token_id) for lst in ids]

    def tensorize(self, batch_text, bsize=None):
        """Encode queries into fixed-length tensors.

        Args:
            batch_text: list or tuple of query strings.
            bsize: optional batch size; when truthy the result is split
                into consecutive batches.

        Returns:
            ``(ids, mask)`` tensors when ``bsize`` is falsy, otherwise a list
            of ``(ids, mask)`` batches. Padding positions in ``ids`` are
            overwritten with the ``[MASK]`` id; ``mask`` is the tokenizer's
            attention mask, returned unchanged.
        """
        assert isinstance(batch_text, (list, tuple)), (type(batch_text))

        obj = self.tok(batch_text, padding='max_length', truncation=True,
                       return_tensors='pt', max_length=self.query_maxlen)

        ids, mask = obj['input_ids'], obj['attention_mask']

        # Ask the tokenizer for its pad id instead of assuming it is 0.
        pad_id = self.tok.pad_token_id
        ids[ids == (0 if pad_id is None else pad_id)] = self.mask_token_id

        if bsize:
            batches = _split_into_batches(ids, mask, bsize)
            return batches

        return ids, mask

# ================
# ✨ DocTokenizer
# ================
class DocTokenizer():
    """Tokenizes passages with a ``BertTokenizerFast``, truncating at ``doc_maxlen``."""

    def __init__(self, doc_maxlen, path_tokenizer):
        """Load the tokenizer and remember the ``[CLS]``/``[SEP]`` tokens and ids."""
        tokenizer = BertTokenizerFast.from_pretrained(path_tokenizer)
        self.tok = tokenizer
        self.doc_maxlen = doc_maxlen

        self.cls_token = tokenizer.cls_token
        self.cls_token_id = tokenizer.cls_token_id
        self.sep_token = tokenizer.sep_token
        self.sep_token_id = tokenizer.sep_token_id

    def tokenize(self, batch_text, add_special_tokens=False):
        """Return word-piece strings per text, optionally wrapped in ``[CLS]``/``[SEP]``."""
        assert type(batch_text) in (list, tuple), (type(batch_text))

        pieces = [self.tok.tokenize(text, add_special_tokens=False) for text in batch_text]
        if add_special_tokens:
            pieces = [[self.cls_token] + seq + [self.sep_token] for seq in pieces]
        return pieces

    def encode(self, batch_text, add_special_tokens=False):
        """Return vocabulary ids per text, optionally wrapped in the ``[CLS]``/``[SEP]`` ids."""
        assert type(batch_text) in (list, tuple), (type(batch_text))

        encoded = self.tok(batch_text, add_special_tokens=False)['input_ids']
        if add_special_tokens:
            encoded = [[self.cls_token_id] + seq + [self.sep_token_id] for seq in encoded]
        return encoded

    def tensorize(self, batch_text, bsize=None):
        """Encode passages into tensors padded to the longest one in the call.

        Returns:
            ``(ids, mask)`` when ``bsize`` is falsy. Otherwise
            ``(batches, reverse_indices)``: rows are sorted by length before
            being split into batches, and ``reverse_indices`` restores the
            original order.
        """
        assert type(batch_text) in (list, tuple), (type(batch_text))

        encoding = self.tok(batch_text, padding='longest', truncation='longest_first',
                            return_tensors='pt', max_length=self.doc_maxlen)
        ids = encoding['input_ids']
        mask = encoding['attention_mask']

        if not bsize:
            return ids, mask

        ids, mask, reverse_indices = _sort_by_length(ids, mask, bsize)
        return _split_into_batches(ids, mask, bsize), reverse_indices

# =====================
# ✨ tensorize triples
# =====================
def tensorize_triples(query_tokenizer, doc_tokenizer, queries, positives, negatives, bsize):
    """Turn aligned (query, positive, negative) texts into model-ready batches.

    Args:
        query_tokenizer: object whose ``tensorize(texts)`` returns ``(ids, mask)``.
        doc_tokenizer: same contract, used for the passages.
        queries, positives, negatives: equally long lists of strings.
        bsize: batch size; asserted to equal ``len(queries)``.

    Returns:
        A list of ``(Q, D)`` pairs. ``Q`` holds the query ids/mask repeated
        twice and ``D`` holds positives followed by negatives, so row ``i``
        of ``Q`` pairs with row ``i`` of ``D``.
    """
    assert len(queries) == len(positives) == len(negatives)
    assert bsize is None or len(queries) % bsize == 0

    n_triples = len(queries)
    assert bsize == n_triples

    query_ids, query_mask = query_tokenizer.tensorize(queries)
    doc_ids, doc_mask = doc_tokenizer.tensorize(positives + negatives)
    # Positives and negatives were encoded together; split them on a new
    # leading axis (index 0 = positives, index 1 = negatives).
    doc_ids = doc_ids.view(2, n_triples, -1)
    doc_mask = doc_mask.view(2, n_triples, -1)

    # Per triple: the longer of its positive and its negative passage.
    longest_per_triple = doc_mask.sum(-1).max(0).values

    # Reorder every tensor by that length.
    order = longest_per_triple.sort().indices
    query_ids, query_mask = query_ids[order], query_mask[order]
    doc_ids, doc_mask = doc_ids[:, order], doc_mask[:, order]

    pos_ids, neg_ids = doc_ids
    pos_mask, neg_mask = doc_mask

    grouped = zip(
        _split_into_batches(query_ids, query_mask, bsize),
        _split_into_batches(pos_ids, pos_mask, bsize),
        _split_into_batches(neg_ids, neg_mask, bsize),
    )

    return [
        (
            (torch.cat((q_ids, q_ids)), torch.cat((q_mask, q_mask))),
            (torch.cat((p_ids, n_ids)), torch.cat((p_mask, n_mask))),
        )
        for (q_ids, q_mask), (p_ids, p_mask), (n_ids, n_mask) in grouped
    ]

# =============
# ✨ Aux funcs
# =============
def _sort_by_length(ids, mask, bsize):
    """Order rows by their number of attended tokens (``mask`` row sums).

    Nothing is reordered when all rows fit in a single batch.

    Returns:
        ``(ids, mask, reverse_indices)`` where indexing the sorted tensors
        with ``reverse_indices`` restores the input order.
    """
    n_rows = ids.size(0)
    if n_rows <= bsize:
        return ids, mask, torch.arange(n_rows)

    order = mask.sum(-1).sort().indices
    # Sorting the permutation itself yields its inverse.
    restore = order.sort().indices

    return ids[order], mask[order], restore

def _split_into_batches(ids, mask, bsize):
    """Cut ``ids`` and ``mask`` into consecutive ``(ids, mask)`` chunks of up to ``bsize`` rows."""
    return [
        (ids[start:start + bsize], mask[start:start + bsize])
        for start in range(0, ids.size(0), bsize)
    ]

# ===============
# ✨ LazyBatcher
# ===============
class LazyBatcher():
    """Iterator yielding tensorized (query, positive, negative) batches.

    Triples of ids are read from a parquet file and resolved to text through
    two pickled dictionaries (qid -> query, pid -> passage). All files are
    looked up under ``path + 'data/'``.
    """

    _SPLIT_TAGS = {'train': 'TRAIN', 'valid': 'VALID'}

    def __init__(self, bsize, path, path_tokenizer, query_maxlen, doc_maxlen, mode='train', accumsteps=1):
        """Build the tokenizers and load triples, queries and passages.

        Args:
            bsize: number of triples per batch.
            path: base directory containing the ``data/`` folder.
            path_tokenizer: tokenizer name/path for both tokenizers.
            query_maxlen: fixed query length in tokens.
            doc_maxlen: maximum passage length in tokens.
            mode: ``'train'`` or ``'valid'``; selects which files are read.
            accumsteps: divisor applied to ``bsize`` when tensorizing.

        Raises:
            ValueError: if ``mode`` is not ``'train'`` or ``'valid'``.
        """
        # Fail early, before tokenizers are downloaded or files are opened.
        if mode not in self._SPLIT_TAGS:
            raise ValueError(f"mode must be 'train' or 'valid', got {mode!r}")

        self.bsize, self.accumsteps = bsize, accumsteps
        self.query_tokenizer = QueryTokenizer(query_maxlen=query_maxlen, path_tokenizer=path_tokenizer)
        self.doc_tokenizer = DocTokenizer(doc_maxlen=doc_maxlen, path_tokenizer=path_tokenizer)
        self.tensorize_triples = partial(tensorize_triples, self.query_tokenizer, self.doc_tokenizer)
        self.position = 0
        self.mode = mode

        # Use the `path` argument; it was previously ignored in favour of the
        # notebook-level `path_base` global.
        self.triples = self._load_triples(path)
        self.queries = self._load_queries(path)
        self.collection = self._load_collection(path)

    def _split_tag(self):
        """Return the upper-case file suffix (``TRAIN``/``VALID``) for ``self.mode``."""
        try:
            return self._SPLIT_TAGS[self.mode]
        except KeyError:
            raise ValueError(f"mode must be 'train' or 'valid', got {self.mode!r}") from None

    def _load_triples(self, path):
        """Read the ``(qid, pos_pid, neg_pid)`` triples of the current split."""
        df_triplet = pd.read_parquet(f'{path}data/df_FAQ_triplet_IDS_{self._split_tag()}.parquet.gzip')
        return list(zip(
            df_triplet.qid.values,
            df_triplet.pos_pid.values,
            df_triplet.neg_pid.values,
        ))

    def _load_queries(self, path):
        """Load the pickled qid -> query text mapping of the current split."""
        return pickle_file(f'{path}data/qid_to_query_{self._split_tag()}')

    def _load_collection(self, path):
        """Load the pickled pid -> passage text mapping of the current split."""
        return pickle_file(f'{path}data/pid_to_doc_{self._split_tag()}')

    def __iter__(self):
        return self

    def __len__(self):
        """Number of triples (not batches)."""
        return len(self.triples)

    def __next__(self):
        """Return the next full batch; a trailing partial batch is dropped."""
        offset = self.position
        endpos = min(offset + self.bsize, len(self.triples))
        # The cursor advances even when iteration stops, as before.
        self.position = endpos

        if offset + self.bsize > len(self.triples):
            raise StopIteration

        queries, positives, negatives = [], [], []

        for position in range(offset, endpos):
            qid, pos_pid, neg_pid = self.triples[position]
            queries.append(self.queries[qid])
            positives.append(self.collection[pos_pid])
            negatives.append(self.collection[neg_pid])

        return self.collate(queries, positives, negatives)

    def collate(self, queries, positives, negatives):
        """Tensorize one batch of texts with sub-batch size ``bsize // accumsteps``."""
        assert len(queries) == len(positives) == len(negatives) == self.bsize

        return self.tensorize_triples(queries, positives, negatives, self.bsize // self.accumsteps)

# ===========
# ✨ ColBERT
# ===========
class ColBERT(BertPreTrainedModel):
    """BERT encoder plus a bias-free linear projection, scored by late interaction.

    Queries and documents are encoded token by token, projected to ``dim``
    dimensions and L2-normalized. A (query, document) score is the sum over
    query tokens of the best match among the document tokens.
    """

    def __init__(self, config, query_maxlen, doc_maxlen, mask_punctuation, dim=128, similarity_metric='cosine'):
        """Build the encoder.

        Args:
            config: BERT configuration (provides ``hidden_size``).
            query_maxlen: stored on the model; not used inside this class.
            doc_maxlen: stored on the model; not used inside this class.
            mask_punctuation: when true, punctuation tokens are zeroed out
                of document embeddings via ``self.skiplist``.
            dim: output embedding size per token.
            similarity_metric: ``'cosine'`` or ``'l2'``.
        """
        super(ColBERT, self).__init__(config)

        self.query_maxlen = query_maxlen
        self.doc_maxlen = doc_maxlen
        self.similarity_metric = similarity_metric
        self.dim = dim

        self.mask_punctuation = mask_punctuation
        self.skiplist = {}

        if self.mask_punctuation:
            # `string` is not imported at file level; import it here so that
            # mask_punctuation=True no longer fails with a NameError.
            import string

            self.tokenizer = BertTokenizerFast.from_pretrained(path_model)
            # Register both the symbol and its first token id.
            self.skiplist = {w: True
                             for symbol in string.punctuation
                             for w in [symbol, self.tokenizer.encode(symbol, add_special_tokens=False)[0]]}

        self.bert = BertModel(config)
        self.linear = torch.nn.Linear(config.hidden_size, dim, bias=False)

        self.init_weights()

    def forward(self, Q, D):
        """Score query batch ``Q`` against document batch ``D`` (each an ``(ids, mask)`` pair)."""
        return self.score(self.query(*Q), self.doc(*D))

    def query(self, input_ids, attention_mask):
        """Return L2-normalized per-token query embeddings."""
        # Follow the device the model actually lives on rather than a
        # notebook-level DEVICE global.
        device = self.device
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        Q = self.bert(input_ids, attention_mask=attention_mask)[0]
        Q = self.linear(Q)

        return torch.nn.functional.normalize(Q, p=2, dim=2)

    def doc(self, input_ids, attention_mask, keep_dims=True):
        """Return L2-normalized per-token document embeddings.

        Positions rejected by :meth:`mask` are zeroed before normalization.

        Args:
            keep_dims: when true, return one padded tensor for the batch.
                When false, return a list with one CPU float16 tensor per
                document holding only the rows kept by the mask.
        """
        device = self.device
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        D = self.bert(input_ids, attention_mask=attention_mask)[0]
        D = self.linear(D)

        mask = torch.tensor(self.mask(input_ids), device=device).unsqueeze(2).float()
        D = D * mask

        D = torch.nn.functional.normalize(D, p=2, dim=2)

        if not keep_dims:
            D, mask = D.cpu().to(dtype=torch.float16), mask.cpu().bool().squeeze(-1)
            D = [d[mask[idx]] for idx, d in enumerate(D)]

        return D

    def score(self, Q, D):
        """Late-interaction score: best document match per query token, summed.

        Assumes ``Q`` and ``D`` are 3-D (batch, tokens, dim) with matching
        batch size -- TODO confirm against callers.
        """
        if self.similarity_metric == 'cosine':
            return (Q @ D.permute(0, 2, 1)).max(2).values.sum(1)

        assert self.similarity_metric == 'l2'
        # Negative squared distance, so that larger still means more similar.
        return (-1.0 * ((Q.unsqueeze(2) - D.unsqueeze(1))**2).sum(-1)).max(-1).values.sum(-1)

    def mask(self, input_ids):
        """Return nested lists of booleans: true for ids that are neither in the skiplist nor 0.

        Id 0 is presumably the padding id of the vocabulary -- verify if the
        tokenizer changes.
        """
        mask = [[(x not in self.skiplist) and (x != 0) for x in d] for d in input_ids.cpu().tolist()]
        return mask

# - - - - -
# Build the training batcher and run through it once to count the batches.
dataloader_train = LazyBatcher(
    bsize=bsize, 
    path=path_base, 
    path_tokenizer=path_model,
    query_maxlen=query_maxlen,
    doc_maxlen=doc_maxlen,
    mode='train'
    )

print('batches:')
for i, batches in enumerate(dataloader_train):
    print(f' {i }.', end ='')

# Drop a model left over from a previous run of this cell and release its
# GPU memory. Only the "no such variable yet" case is ignored; any other
# error now surfaces instead of being silently swallowed.
try:
    del colbert
except NameError:
    pass
else:
    gc.collect()
    torch.cuda.empty_cache()

# Use the GPU when there is one; otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

print()

colbert = ColBERT.from_pretrained(
    path_model,
    query_maxlen=query_maxlen,
    doc_maxlen=doc_maxlen,
    dim=128,
    similarity_metric='cosine',
    mask_punctuation=False).to(DEVICE)        

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1715180.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…


batch:
 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15. 16. 17. 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=672271273.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing ColBERT: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ColBERT were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['linear.weight'

In [None]:
# ✨ load colbert from checkpoint
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved
# on a GPU can also be loaded on a machine without one.
colbert.load_state_dict(torch.load(path_base+'data/EPOCH_3_FAQ', map_location=DEVICE))
print('\nmodel loaded!\n')


model loaded!



# MS MARCO Passage Reranking Task

 *Given a query $q$ and the $K$ most relevant passages $P = p_1, p_2, p_3, \dots, p_K$ as retrieved by BM25, a successful system is expected to rerank the most relevant passage as high as possible. Evaluation will be done using MRR.*

> - topK_pids $\;\;\; key($qid$):$ $value($[pids]$)$
- topK_docs $\;\;key($qid$):$ $value($[text_docs]$)$
- queries: $\;\;\;\;\;\,key($qid$):$ $value($text$)$
- qrels: $\;\;\;\;\;\;\;\;\;key($qid$):$ $value($[pids]$)$
- collection: $\;\;\,list$


#### [notebook da FAQ dataprep](https://colab.research.google.com/drive/1ON0XqrOY0uyU3L9poA0sxTC6GQU3i_Yu?usp=sharing)

In [None]:
# Load the pickled validation artefacts stored under <path_base>/data/.
topK_pids_valid = pickle_file(path_base + 'data/topK_pids_VALID')
topK_docs_valid = pickle_file(path_base + 'data/topK_docs_VALID')
queries_valid = pickle_file(path_base + 'data/queries_VALID')
qrels_valid = pickle_file(path_base + 'data/qrels_VALID')
collection_valid = pickle_file(path_base + 'data/collection_VALID')

print('\nVALID OBJECTS')
# The three qid-keyed objects must have the same number of entries.
assert len(queries_valid) == len(topK_docs_valid) == len(topK_pids_valid)
print(f'\tlen(queries_valid):    {len(queries_valid)}')
print(f'\tlen(topK_docs_valid):  {len(topK_docs_valid)}')
print(f'\tlen(topK_pids_valid):  {len(topK_pids_valid)}')
print(f'\tlen(collection_valid): {len(collection_valid)}')


VALID OBJECTS
	len(queries_valid):    67
	len(topK_docs_valid):  67
	len(topK_pids_valid):  67
	len(collection_valid): 67


# Evaluate

In [None]:
# =====================================================
# ✨ step 0: collect the query ids and shuffle them
# =====================================================
# Sort first so the order before shuffling does not depend on dict order.
keys = sorted(queries_valid)
random.shuffle(keys)

In [None]:
# ================================================
# ✨ step 1: for each qid in qids, select a query
# ================================================
# Only the first iteration runs: it shows what the full evaluation loop does
# and leaves query_idx == 0 for the metric cells below.
for query_idx, qid in enumerate(keys):
    query = queries_valid[qid]
    break

# Pin one qid so the walkthrough is reproducible.
qid = 3
query = queries_valid[qid]
print(query)

qual é o tratamento tributário do valor locativo de imóvel cedido gratuitamente?


In [None]:
# =============================================================
# ✨ step 2: check whether any relevant pid is in the top-K pids
# =============================================================
# In other words: did BM25 retrieve a passage that qrels marks as relevant?
overlap = set(qrels_valid[qid]) & set(topK_pids_valid[qid])
if len(overlap) == 0:
    pass # continue

print(f'qid = {qid} | len(set.intersection(set(qrels[qid]), set(topK_pids[qid]))): ' \
f'{len(overlap)}')

# If the intersection is non-empty we go on to ranking; otherwise a new qid is selected.

qid = 3 | len(set.intersection(set(qrels[qid]), set(topK_pids[qid]))): 1


In [None]:
# ==============================================================
# ✨ step 3: gather the candidate passages that will be re-ranked
# ==============================================================
# re-rank depth
DEPTH = 5

# If a collection is provided, look the passages up by pid; otherwise use the
# pre-fetched top-K texts. The fallback branch used to drop its result, which
# left `passages` undefined.
if collection_valid is not None:
    passages = [collection_valid[pid] for pid in topK_pids_valid[qid][:DEPTH]]
else:
    passages = topK_docs_valid[qid][:DEPTH]

# The same logic as a function
def qid2passages(qid, topK_pids, depth=DEPTH, collection=None, topK_docs=None):
    """Return the first ``depth`` candidate passages for ``qid``.

    Args:
        qid: query id.
        topK_pids: mapping qid -> ranked list of passage ids.
        depth: how many candidates to keep.
        collection: optional pid -> text lookup; used when given.
        topK_docs: optional mapping qid -> ranked passage texts, used when
            ``collection`` is ``None``. Defaults to the notebook-level
            ``topK_docs_valid`` for backward compatibility.
    """
    if collection is not None:
        return [collection[pid] for pid in topK_pids[qid][:depth]]
    if topK_docs is None:
        topK_docs = topK_docs_valid
    return topK_docs[qid][:depth]

for p in passages:
    print(p)

valor locativo de imóvel cedido a terceiro é tributado na declaração de ajuste anual, devendo ser informado  em rendimentos tributáveis recebidos de pessoas jurídicas, não se sujeitando, portanto, ao recolhimento  mensal (carnê-leão).    valor tributável corresponde a 10% do valor venal do imóvel, podendo ser adotado o constante da guia do   do ano-calendário da declaração de ajuste anual. se a cessão de uso não abrangeu todo o ano- calendário, o valor tributável é apurado proporcionalmente ao período de cessão de uso de imóvel.   não há incidência do imposto quando o imóvel for ocupado por seu proprietário ou cedido gratuitamente para  uso do cônjuge ou de parentes de 1º grau (pais e filhos).   do valor tributável podem ser subtraídas as seguintes despesas, quando o ônus tenha sido do proprietário:   a) impostos, taxas e emolumentos incidentes sobre o bem que produzir o rendimento;   b) aluguel pago pela locação de imóvel sublocado;   c) despesas pagas para cobrança ou recebimento do 

In [None]:
# =====================================
# ✨ step 4: process the query's text
# =====================================
# Tokenize the single query to fixed-length id/mask tensors and encode it
# with the model; Q holds one normalized embedding per query token.
query_tokenizer = QueryTokenizer(query_maxlen, path_model)
q_input_ids, q_attention_mask = query_tokenizer.tensorize([query])
Q = colbert.query(q_input_ids, q_attention_mask)
print(query)
Q

qual é o tratamento tributário do valor locativo de imóvel cedido gratuitamente?


tensor([[[-0.0170,  0.0582, -0.0923,  ...,  0.0806,  0.1548,  0.0774],
         [ 0.0216,  0.1341, -0.1049,  ...,  0.0719, -0.0624,  0.0288],
         [ 0.0732, -0.0810, -0.0232,  ..., -0.0286,  0.1619,  0.0026],
         ...,
         [-0.0475, -0.0129,  0.0738,  ..., -0.0551, -0.1977,  0.1212],
         [-0.0109, -0.0215,  0.0397,  ...,  0.0724, -0.0868,  0.1058],
         [ 0.0443,  0.0471, -0.0584,  ...,  0.0517, -0.0862,  0.0196]]],
       device='cuda:0', grad_fn=<DivBackward0>)

In [None]:
# =====================================
# ✨ step 5: process the docs' texts
# =====================================
# Fetch the passages ranked by BM25
passages = qid2passages(qid=qid, topK_pids=topK_pids_valid, depth=DEPTH, collection=collection_valid)

# Tokenize them into length-sorted batches; reverse_indices undoes the sorting
doc_tokenizer = DocTokenizer(doc_maxlen, path_model)
batches, reverse_indices = doc_tokenizer.tensorize(passages, bsize=bsize)

# Encode each (ids, mask) batch into per-token document embeddings
batches = [colbert.doc(ids, attn, keep_dims=True) for ids, attn in batches]
print('batches[0].shape:', batches[0].size())

batches[0].shape: torch.Size([5, 128, 128])


In [None]:
# =============================
# ✨ step 6: stack doc vectors
# =============================
def stack_3D_tensors(groups):
    """Concatenate 3-D tensors along dim 0, zero-padding dim 1 to the widest one.

    Args:
        groups: non-empty list of tensors shaped (rows_i, width_i, hdim).

    Returns:
        A tensor of shape (sum(rows_i), max(width_i), hdim) with the dtype
        and device of the first tensor.
    """
    total_rows = sum(g.size(0) for g in groups)
    widest = max(g.size(1) for g in groups)
    template = groups[0]

    stacked = torch.zeros(total_rows, widest, template.size(2),
                          device=template.device, dtype=template.dtype)

    cursor = 0
    for g in groups:
        rows, width = g.size(0), g.size(1)
        stacked[cursor:cursor + rows, :width] = g
        cursor += rows

    return stacked

# Merge the per-batch embeddings and restore the original passage order.
D = stack_3D_tensors(batches)[reverse_indices]
D.size()

torch.Size([5, 128, 128])

In [None]:
# ================================================
# ✨ step 7: calculate the score between Q and Ds
# ================================================
# One score per candidate passage, in BM25 order.
scores = colbert.score(Q, D).cpu()
print('1. colbert.score(Q, D).cpu():', scores.detach(), '\n')

scores = scores.sort(descending=True)
print('2. scores.sort(descending=True):', scores.values, '\n')

# Positions (into the BM25 list) of the passages, best score first.
ranked = scores.indices.tolist()
print('3. scores.indices.tolist():', ranked, '\n')

ranked_scores = scores.values.tolist()
print('4. scores.values.tolist():', ranked_scores, '\n')

pids = topK_pids_valid[qid]
print('5. topK_pids[qid]:', pids, '\n')

ranked_pids = [pids[position] for position in ranked]
# Show the re-ranked pids; this line used to print the unranked `pids` again.
print('6. [pids[position] for position in ranked]:', ranked_pids, '\n')

ranked_passages = [passages[position] for position in ranked]
print('7. [passages[position] for position in ranked]:', ranked_passages, '\n')

# A pid must not appear twice in the final ranking.
assert len(ranked_pids) == len(set(ranked_pids))

ranking = list(zip(ranked_scores, ranked_pids, ranked_passages))
print('Final format (score, pid, text)')
for tup in ranking:
    print(f'\t > score: {tup[0]:<4.5}  pid[ {tup[1]:<4}] text: {tup[2]:>5}')

print(f'\nQuery: {query} -- Relevant pids{qrels_valid[qid]}')    

1. colbert.score(Q, D).cpu(): tensor([30.3036, 25.0713, 25.2650, 24.2342, 24.6499]) 

2. scores.sort(descending=True): tensor([30.3036, 25.2650, 25.0713, 24.6499, 24.2342], grad_fn=<SortBackward>) 

3. scores.indices.tolist(): [0, 2, 1, 4, 3] 

4. scores.values.tolist(): [30.303611755371094, 25.265033721923828, 25.071269989013672, 24.649850845336914, 24.23416519165039] 

5. topK_pids[qid]: [3, 32, 16, 54, 65, 46, 37, 48, 62, 15, 51, 24, 34, 14, 1, 0, 61, 59, 55, 43, 4, 8, 18, 10, 7, 52, 13, 39, 27, 47, 50, 19, 66, 17, 44, 42, 20, 21, 22, 23, 41, 25, 26, 40, 28, 29, 30, 31, 38, 36] 

6. [pids[position] for position in ranked]: [3, 32, 16, 54, 65, 46, 37, 48, 62, 15, 51, 24, 34, 14, 1, 0, 61, 59, 55, 43, 4, 8, 18, 10, 7, 52, 13, 39, 27, 47, 50, 19, 66, 17, 44, 42, 20, 21, 22, 23, 41, 25, 26, 40, 28, 29, 30, 31, 38, 36] 

7. [passages[position] for position in ranked]: ['valor locativo de imóvel cedido a terceiro é tributado na declaração de ajuste anual, devendo ser informado  em rendime

# MRR

In [None]:
# ==================================================================
# ✨ step 0: init the MRR@k accumulators, one dict entry per depth k
# ==================================================================
mrr_depths = {1, 2, 3}
mrr_sums = dict.fromkeys(mrr_depths, 0.0)
mrr_sums

{1: 0.0, 2: 0.0, 3: 0.0}

In [None]:
# ==================================
# ✨ step 1: get the gold_positives 
# ==================================
# gold_positives are the pids that qrels marks as relevant to the query q.
gold_positives = qrels_valid[qid]
gold_positives

[3]

In [None]:
# ========================================
# ✨ step 2: compute the positives vector
# ========================================
# 0-based ranking positions whose pid is one of the gold positives
positives = [
    rank
    for rank, (_score, pid, _text) in enumerate(ranking)
    if pid in gold_positives
]
positives

[0]

In [None]:
# ===================================================
# ✨ step 3: compute the MRR score for each k/depth
# ===================================================
# 0-based position of the first relevant passage, or None when no relevant
# passage was ranked (previously positives[0] raised IndexError in that case).
first_positive = positives[0] if positives else None

for depth in mrr_sums:
    # Reciprocal rank 1/(position+1) counts only when the first relevant
    # passage falls inside the top `depth`; otherwise nothing is added.
    if first_positive is not None and first_positive < depth:
        mrr_sums[depth] += 1.0 / (first_positive+1.0)

# mrr@K -> K in {1,2,3}; averaged over the queries seen so far (query_idx+1)
for depth in sorted(mrr_sums):
    print("MRR@" + str(depth), "=", mrr_sums[depth] / (query_idx+1.0))

MRR@1 = 1.0
MRR@2 = 1.0
MRR@3 = 1.0


# Recall

In [None]:
# Recall@k: share of the gold positives found in the top k positions.
recall_depths = {1, 2, 3}
recall_sums = dict.fromkeys(recall_depths, 0.0)

for depth in recall_sums:
    num_positives_up_to_depth = sum(1 for pos in positives if pos < depth)
    recall_sums[depth] += num_positives_up_to_depth / len(gold_positives)

# Averaged over the queries seen so far (query_idx+1).
for depth in sorted(recall_sums):
    print("Recall@" + str(depth), "=", recall_sums[depth] / (query_idx+1.0))    

Recall@1 = 1.0
Recall@2 = 1.0
Recall@3 = 1.0


# Success 
(hit or miss — "tem ou não tem")

In [None]:
# Success@k: 1 when a relevant passage appears in the top k, else 0.
success_depths={1, 2, 3}
success_sums = {depth: 0.0 for depth in success_depths}

# None when no relevant passage was ranked (previously positives[0] raised
# IndexError in that case).
first_positive = positives[0] if positives else None

for depth in success_sums:
    if first_positive is not None and first_positive < depth:
        success_sums[depth] += 1.0

# Averaged over the queries seen so far (query_idx+1).
for depth in sorted(success_sums):
    print("Success@" + str(depth), "=", success_sums[depth] / (query_idx+1.0))

Success@1 = 1.0
Success@2 = 1.0
Success@3 = 1.0


# In class format

In [None]:
# ===========
# ✨ Metrics
# ===========
class Metrics:
    """Running MRR@k, Recall@k and Success@k sums over evaluated queries."""

    def __init__(self, mrr_depths:set, recall_depths:set, success_depths:set):
        """Create one zeroed accumulator per requested depth for each metric."""
        self.results = {}
        self.mrr_sums = dict.fromkeys(mrr_depths, 0.0)
        self.recall_sums = dict.fromkeys(recall_depths, 0.0)
        self.success_sums = dict.fromkeys(success_depths, 0.0)

    def get_result(self, query_idx, query_key, ranking, gold_positives):
        """Record one query's ranking and add its contribution to every sum.

        Args:
            query_idx: 0-based index of the query in the evaluation loop.
            query_key: unique key (qid) under which the ranking is stored.
            ranking: list of ``(score, pid, passage)`` tuples, best first.
            gold_positives: relevant pids for the query, without duplicates.

        A query without any relevant pid in ``ranking`` is stored but adds
        nothing to the sums.
        """
        assert query_key not in self.results
        assert len(self.results) <= query_idx
        assert len(set(gold_positives)) == len(gold_positives)
        assert len({pid for _, pid, _ in ranking}) == len(ranking)

        self.results[query_key] = ranking

        # 0-based positions of the relevant passages, in ranking order.
        hit_ranks = [rank for rank, (_, pid, _) in enumerate(ranking) if pid in gold_positives]
        if not hit_ranks:
            return

        best = hit_ranks[0]

        for depth in self.mrr_sums:
            if best < depth:
                self.mrr_sums[depth] += 1.0 / (best + 1.0)

        for depth in self.success_sums:
            if best < depth:
                self.success_sums[depth] += 1.0

        for depth in self.recall_sums:
            found = sum(1 for rank in hit_ranks if rank < depth)
            self.recall_sums[depth] += found / len(gold_positives)

    def print_metrics(self, query_idx):
        """Print every metric averaged over ``query_idx + 1`` queries."""
        rule = '- '*10
        n_queries = query_idx + 1.0

        print(rule)
        for depth in sorted(self.mrr_sums):
            mrr_value = self.mrr_sums[depth] / n_queries
            print(f"MRR@{str(depth):<2} = {mrr_value:.3}")

        print(rule)
        for depth in sorted(self.recall_sums):
            recall_value = self.recall_sums[depth] / n_queries
            print(f"Recall@{str(depth):<2} = {recall_value:.3}")

        print(rule)
        for depth in sorted(self.success_sums):
            success_value = self.success_sums[depth] / n_queries
            print(f"Success@{str(depth):<2} = {success_value:.3}")

        print(rule)

def evaluate_recall(qrels, queries, topK_pids):
    """Print the mean share of relevant pids present in each query's candidate list.

    Args:
        qrels: mapping qid -> relevant pids, or ``None`` to skip the report.
        queries: mapping keyed by the same qids as ``qrels`` (asserted).
        topK_pids: mapping qid -> candidate pids.
    """
    if qrels is None:
        return

    assert set(qrels.keys()) == set(queries.keys())

    per_query = []
    for qid, gold in qrels.items():
        found = set(gold) & set(topK_pids[qid])
        # max(1.0, ...) avoids dividing by zero for a query without gold pids.
        per_query.append(len(found) / max(1.0, len(gold)))

    recall_at_k = round(sum(per_query) / len(qrels), 3)
    print(f"Recall @ maximum depth = {recall_at_k}")

# ==================
# ✨ ModelInference
# ==================
class ModelInference():
    """Gradient-free wrapper around a ColBERT model that works from raw text."""

    def __init__(self, colbert, path_model):
        """Wrap ``colbert`` (which must be in eval mode) and build its tokenizers.

        Args:
            colbert: model exposing ``query``, ``doc``, ``query_maxlen`` and
                ``doc_maxlen``.
            path_model: tokenizer name/path for both tokenizers.
        """
        assert colbert.training is False

        self.colbert = colbert
        self.query_tokenizer = QueryTokenizer(colbert.query_maxlen, path_tokenizer=path_model)
        self.doc_tokenizer = DocTokenizer(colbert.doc_maxlen, path_tokenizer=path_model)

    def query(self, *args, to_cpu=False, **kw_args):
        """Run ``colbert.query`` without gradients; optionally move the result to CPU."""
        with torch.no_grad():
            Q = self.colbert.query(*args, **kw_args)
            return Q.cpu() if to_cpu else Q

    def doc(self, *args, to_cpu=False, **kw_args):
        """Run ``colbert.doc`` without gradients; optionally move the result to CPU.

        The result may be a list of tensors (``keep_dims=False``); such a
        list is moved element by element instead of failing on ``list.cpu()``.
        """
        with torch.no_grad():
            D = self.colbert.doc(*args, **kw_args)
            if not to_cpu:
                return D
            if isinstance(D, list):
                return [d.cpu() for d in D]
            return D.cpu()

    def queryFromText(self, queries, bsize=None, to_cpu=False):
        """Encode query strings, in batches of ``bsize`` when given."""
        if bsize:
            batches = self.query_tokenizer.tensorize(queries, bsize=bsize)
            batches = [self.query(input_ids, attention_mask, to_cpu=to_cpu) for input_ids, attention_mask in batches]
            return torch.cat(batches)

        input_ids, attention_mask = self.query_tokenizer.tensorize(queries)
        # to_cpu used to be ignored on this unbatched path.
        return self.query(input_ids, attention_mask, to_cpu=to_cpu)

    def docFromText(self, docs, bsize=None, keep_dims=True, to_cpu=False):
        """Encode passages, in batches of ``bsize`` when given.

        Returns:
            With ``keep_dims`` true, one stacked tensor in the input order;
            otherwise a list with one tensor per passage in the input order.
        """
        if bsize:
            batches, reverse_indices = self.doc_tokenizer.tensorize(docs, bsize=bsize)

            batches = [self.doc(input_ids, attention_mask, keep_dims=keep_dims, to_cpu=to_cpu)
                       for input_ids, attention_mask in batches]

            if keep_dims:
                D = _stack_3D_tensors(batches)
                # Batches were length-sorted; restore the caller's order.
                return D[reverse_indices]

            D = [d for batch in batches for d in batch]
            return [D[idx] for idx in reverse_indices.tolist()]

        input_ids, attention_mask = self.doc_tokenizer.tensorize(docs)
        # to_cpu used to be ignored on this unbatched path.
        return self.doc(input_ids, attention_mask, keep_dims=keep_dims, to_cpu=to_cpu)

    def score(self, Q, D, mask=None, lengths=None, explain=False):
        """Score documents ``D`` against ``Q`` as ``D @ Q``, max over dim 1, sum over the last dim.

        Args:
            Q: query matrix that can be right-multiplied onto ``D``
                (presumably (dim, query_len) -- TODO confirm with callers).
            D: document tensor whose dim 1 indexes document tokens.
            mask: optional multiplicative mask over document tokens.
            lengths: optional per-document token counts from which a mask
                is built; mutually exclusive with ``mask``.
            explain: not implemented.

        Returns:
            A CPU tensor with one score per document.
        """
        if lengths is not None:
            assert mask is None, "don't supply both mask and lengths"

            # Build the mask on D's own device instead of a global DEVICE.
            mask = torch.arange(D.size(1), device=D.device) + 1
            mask = mask.unsqueeze(0) <= lengths.to(D.device).unsqueeze(-1)

        scores = (D @ Q)
        scores = scores if mask is None else scores * mask.unsqueeze(-1)
        scores = scores.max(1)

        if explain:
            assert False, "TODO"

        return scores.values.sum(-1).cpu()

def _stack_3D_tensors(groups):
    bsize = sum([x.size(0) for x in groups])
    maxlen = max([x.size(1) for x in groups])
    hdim = groups[0].size(2)

    output = torch.zeros(bsize, maxlen, hdim, device=groups[0].device, dtype=groups[0].dtype)

    offset = 0
    for x in groups:
        endpos = offset + x.size(0)
        output[offset:endpos, :x.size(1)] = x
        offset = endpos

    return output    

In [None]:
# ============
# ✨ evaluate
# ============
def evaluate(colbert, metrics, path_model, collection, queries, topK_docs, topK_pids, qrels=None, depth=50):
    """Re-rank each query's candidate passages and feed the rankings to ``metrics``.

    Queries are visited in shuffled order. When ``qrels`` is given, queries
    with no relevant pid among their candidates are skipped, every ranking is
    passed to ``metrics.get_result`` and a progress report is printed for
    every 25th query index.

    Args:
        colbert: model handed to ``ModelInference`` and used for scoring.
        metrics: object exposing ``get_result(...)`` and ``print_metrics(...)``.
        path_model: second argument of ``ModelInference``.
        collection: mapping pid -> passage text; when ``None`` the passages
            are taken from ``topK_docs`` instead.
        queries: mapping qid -> query text.
        topK_docs: mapping qid -> candidate passage texts.
        topK_pids: mapping qid -> candidate pids.
        qrels: optional mapping qid -> relevant pids. When falsy, rankings
            are computed but nothing is recorded.
        depth: maximum number of candidates re-ranked per query.
    """
    # Build the inference wrapper once and share it with every slow_rerank
    # call, so the `path_model` given here is the one actually used.
    inference = ModelInference(colbert, path_model)

    def qid2passages(qid):
        if collection is not None:
            return [collection[pid] for pid in topK_pids[qid][:depth]]
        else:
            return topK_docs[qid][:depth]

    with torch.no_grad():
        keys = sorted(queries)
        random.shuffle(keys)

        for query_idx, qid in enumerate(keys):
            query = queries[qid]
            candidate_pids = topK_pids[qid]

            # Nothing to measure if no relevant passage was retrieved.
            if qrels and not set(qrels[qid]) & set(candidate_pids):
                continue

            # Slice the pids to `depth` so they line up with the passages.
            ranking = slow_rerank(
                colbert, query, candidate_pids[:depth], qid2passages(qid),
                inference=inference,
            )

            if qrels:
                metrics.get_result(query_idx, qid, ranking, qrels[qid])
                if query_idx%25 == 0:
                    print(f'\n[{query_idx}]. Query: {query}')
                    # Show the first relevant passage and where it was ranked.
                    for i, (score, pid, passage) in enumerate(ranking, 1):
                        if pid in qrels[qid]:
                            print(f"Found at position: {i} with score {score:.3}")
                            print(passage)
                            break
                    metrics.print_metrics(query_idx)

# ===============
# ✨ slow_rerank
# ===============
def slow_rerank(colbert, query, pids, passages, inference=None):
    """Score ``passages`` against ``query`` and return them best-first.

    Args:
        colbert: model whose ``score`` method ranks the passages.
        query: query text.
        pids: passage ids, positionally aligned with ``passages``.
        passages: passage texts to re-rank.
        inference: optional pre-built ``ModelInference`` to reuse across
            calls. When omitted, one is created from ``colbert`` and the
            notebook-level ``path_model``.

    Returns:
        A list of ``(score, pid, passage)`` tuples sorted by descending score.

    Raises:
        AssertionError: if the ranked pids contain duplicates.
    """
    if inference is None:
        inference = ModelInference(colbert, path_model)

    Q = inference.queryFromText([query])

    # NOTE: `bsize` is read from the notebook's global scope.
    D_ = inference.docFromText(passages, bsize=bsize)
    scores = colbert.score(Q, D_).cpu()

    scores = scores.sort(descending=True)
    ranked = scores.indices.tolist()

    ranked_scores = scores.values.tolist()
    ranked_pids = [pids[position] for position in ranked]
    ranked_passages = [passages[position] for position in ranked]

    assert len(ranked_pids) == len(set(ranked_pids))

    return list(zip(ranked_scores, ranked_pids, ranked_passages))

# Running the evaluation on the validation set

In [None]:
# Report the recall of the validation candidate lists before re-ranking
# (printed as "Recall @ maximum depth").
evaluate_recall(qrels_valid, queries_valid, topK_pids_valid)

Recall @ maximum depth = 1.0


In [None]:
# ============
# ✨ evaluate
# ============
# MRR, recall and success are all tracked at the same cut-offs; each metric
# gets its own set object.
metrics = Metrics(**{
    f'{kind}_depths': {1, 3, 5, 10, 20}
    for kind in ('mrr', 'recall', 'success')
})

# Re-rank up to 50 candidates per validation query and report the metrics.
evaluate(
    colbert=colbert, metrics=metrics, path_model=path_model,
    collection=collection_valid, queries=queries_valid,
    topK_docs=topK_docs_valid, topK_pids=topK_pids_valid,
    qrels=qrels_valid, depth=50,
)


[0]. Query: como devem ser tributados os resultados obtidos em alienações de participações societárias  quando o preço não pode ser predeterminado?
Found at position: 1 with score 27.8
quando não houver valor determinado, por impossibilidade absoluta de quantificá-lo de imediato (ex.: a  determinação do valor das prestações e do preço depende do faturamento futuro da empresa adquirida, no  curso do período do pagamento das parcelas contratadas), o ganho de capital deve ser tributado na medida  em que o preço for determinado e as parcelas forem pagas.   não obstante ser indeterminado o preço de alienação, toma-se como data de alienação a da concretização  da operação ou a data em que foi cumprida a cláusula preestabelecida nos atos contratados sob condição  suspensiva.   contudo, alerte-se que o tratamento descrito deve ser comprovado pelas partes contratantes sempre que a  autoridade lançadora assim o determinar.
- - - - - - - - - - 
MRR@1  = 1.0
MRR@3  = 1.0
MRR@5  = 1.0
MRR@10 = 1.0

# Evaluate on the training set

In [None]:
# Switch for running the evaluation over the training split; the data below
# is loaded and summarised regardless.
EVAL_TRAIN = True

# Candidate lists, queries, relevance judgements and passages of the training split.
topK_pids_train = pickle_file(path_base + 'data/topK_pids_TRAIN')
topK_docs_train = pickle_file(path_base + 'data/topK_docs_TRAIN')
queries_train = pickle_file(path_base + 'data/queries_TRAIN')
qrels_train = pickle_file(path_base + 'data/qrels_TRAIN')
collection_train = pickle_file(path_base + 'data/collection_TRAIN')

print('TRAIN OBJECTS')
# Every query must come with its candidate docs and candidate pids.
assert len(queries_train) == len(topK_docs_train) == len(topK_pids_train)
print(
    f'\tlen(queries_train):    {len(queries_train)}',
    f'\tlen(topK_docs_train):  {len(topK_docs_train)}',
    f'\tlen(topK_pids_train):  {len(topK_pids_train)}',
    f'\tlen(collection_train): {len(collection_train)}',
    sep='\n',
)

if EVAL_TRAIN:
    # Same cut-offs for MRR, recall and success; one set object per metric.
    metrics = Metrics(**{
        f'{kind}_depths': {1, 3, 5, 10, 20}
        for kind in ('mrr', 'recall', 'success')
    })

    # Re-rank up to 50 candidates per training query and report the metrics.
    evaluate(
        colbert=colbert, metrics=metrics, path_model=path_model,
        collection=collection_train, queries=queries_train,
        topK_docs=topK_docs_train, topK_pids=topK_pids_train,
        qrels=qrels_train, depth=50,
    )

TRAIN OBJECTS
	len(queries_train):    612
	len(topK_docs_train):  612
	len(topK_pids_train):  612
	len(collection_train): 612

[0]. Query: os gastos com  móvel podem ser deduzidos como despesa hospitalar?
Found at position: 1 with score 28.9
podem ser deduzidas da base de cálculo do , desde que comprovadas por meio de documentação hábil  e idônea, as seguintes despesas:   1) atendimento domiciliar dos serviços de saúde previstos na alínea “a” do inciso  do art. 8º da lei nº 9.250,  de 26 de dezembro de 1995;   2) atendimento pré-hospitalar de urgência, desde que prestado por meio de  móvel, instalada em  ambulância de suporte avançado (tipo “”) ou em aeronave de suporte médico (tipo “”); e    3) atendimento pré-hospitalar de emergência, realizado por meio de  móvel, instalada em ambulância tipo  “”, “”, “” ou “”, quando necessariamente conte com a presença de um profissional médico e possua em  seu interior equipamentos que possibilitem oferecer ao paciente suporte avançado de vida.
- 