<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/Demo-ColBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install -q transformers

In [None]:
import gc
import os
import torch
import pickle
import numpy as np
import pandas as pd
from functools import partial

from transformers import logging
from transformers import BertPreTrainedModel, BertModel, BertTokenizer

logging.set_verbosity_error()

# better pandas viz
pd.set_option('display.max_columns', 100)  
pd.set_option('display.expand_frame_repr', 100)
pd.set_option('max_colwidth', 700)
pd.set_option('display.max_rows', 5000)
  
# save/load pickles
def pickle_file(path, data=None):
    if data is None:
        with open(path, 'rb') as f:
            return pickle.load(f)
    if data is not None:
        with open(path, 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
# path base
path_base = '/content/drive/MyDrive/BACEN/FAQ/'

# load dataframes
df_train = pd.read_parquet(path_base+'data/df_FAQ_TRAIN.parquet.gzip')

print(f'unique docs:      {df_train["Doc"].nunique()}')
print(f'unique questions: {df_train["Query"].nunique()}')
df_triplet = pd.read_parquet(path_base+'data/df_FAQ_triplet_IDS_TRAIN.parquet.gzip')

# load data dicts
query_to_qid = pickle_file(path_base+'data/query_to_qid_TRAIN' )
qid_to_query = pickle_file(path_base+'data/qid_to_query_TRAIN')
doc_to_pid   = pickle_file(path_base+'data/doc_to_pid_TRAIN' )
pid_to_doc   = pickle_file(path_base+'data/pid_to_doc_TRAIN')

df_train.head()

In [98]:
path_model = 'bert-base-multilingual-uncased'
tok = BertTokenizer.from_pretrained(path_model)

In [28]:
sample = df_train.iloc[:6]
queries = sample.Query.to_list()[:3]
pos_docs = sample.Doc.to_list()[:3]
neg_docs = sample.Doc.to_list()[3:]

for i, (query, pos_doc, neg_doc) in enumerate(zip(queries, pos_docs, neg_docs)):
    print(i, query)
    print('\t -> POSITIVE::: ', pos_doc)
    print('\t -> NEGATIVE::: ', neg_doc)

0 Quais as condições básicas para concessão dos créditos de investimento?
	 -> POSITIVE:::   Os créditos de investimento devem ser concedidos mediante apresentação de projeto técnico, o qual poderá ser substituído, a critério da instituição financeira, por proposta simplificada de crédito, desde que as inversões programadas envolvam técnicas simples e bem assimiladas pelos agricultores da região ou se trate de crédito destinado à ampliação dos investimentos já financiados. Os créditos de investimento se destinam a promover o aumento da produção e da produtividade e a redução dos custos de produção, visando a elevação da renda da família produtora rural. Os créditos de investimento estão restritos ao financiamento de itens diretamente relacionados com a implantação, ampliação ou modernização da estrutura das atividades de produção, de armazenagem, de transporte ou de serviços agropecuários ou não agropecuários, no estabelecimento rural ou em áreas comunitárias rurais próximas, sendo pas

# Query Tokenization

In [29]:
query_maxlen = 18
bsize = 3

# =============================
# ✨ build the q_obj tokenizer
# =============================
q_obj = tok(
    queries, 
    padding='max_length', 
    truncation=True,
    return_tensors='pt', 
    max_length=query_maxlen,
    )

# use only ids e mask keys in the q_obj dict
q_ids, q_mask = q_obj['input_ids'], q_obj['attention_mask']
q_ids, q_mask

(tensor([[  101, 23840, 10146, 56749, 61997, 10107, 10239, 10173, 15101, 22877,
          10426, 87174, 10102, 10104, 54093, 11605,   136,   102],
         [  101,   157, 10126,   147,   107, 62372, 20347,   107,   136,   102,
              0,     0,     0,     0,     0,     0,     0,     0],
         [  101, 12243, 45555,   157, 33212, 10132, 10102, 45276,   136,   102,
              0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]))

In [70]:
query_batches = []
# range (0, N, N)
for offset in range(0, q_ids.size(0), bsize):
    query_batches.append((q_ids[offset:offset+bsize], q_mask[offset:offset+bsize]))
query_batches    

[(tensor([[  101, 23840, 10146, 56749, 61997, 10107, 10239, 10173, 15101, 22877,
           10426, 87174, 10102, 10104, 54093, 11605,   136,   102],
          [  101,   157, 10126,   147,   107, 62372, 20347,   107,   136,   102,
               0,     0,     0,     0,     0,     0,     0,     0],
          [  101, 12243, 45555,   157, 33212, 10132, 10102, 45276,   136,   102,
               0,     0,     0,     0,     0,     0,     0,     0]]),
  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]))]

# Doc Tokenization

In [32]:
# ===========================================================
# ✨ build one batch with positive and negative docs 
# ===========================================================
doc_bsize = pos_docs + neg_docs
len(doc_bsize)

6

In [55]:
doc_maxlen = 64

d_obj = tok(
    doc_bsize, 
    padding='max_length', 
    truncation=True,
    return_tensors='pt', 
    max_length=doc_maxlen,
    )

# utiliza somente ids e mask keys
d_ids, d_mask = d_obj['input_ids'], d_obj['attention_mask']
d_ids, d_mask

(tensor([[  101, 10253, 87174, 10102, 10104, 54093, 11605, 75143, 10542, 10173,
          84677, 10107, 18255, 85267, 10102, 28903, 22760,   117,   157, 13249,
          13129, 10112, 10542, 77984,   117,   143, 24590, 57460, 10141, 78689,
          24363, 13198,   117, 10190, 36054, 33873, 64422, 41707, 10102, 62372,
            117, 11328, 10126, 10146, 10104, 78426, 10165, 13668, 11313, 10109,
          35710, 20145, 33658, 28579,   147, 19060, 17911, 14633, 11313, 18247,
          13353, 18754, 75246,   102],
         [  101, 87174, 81134, 85065, 28635, 11525, 10150, 54682, 13823, 25942,
          10173, 84677, 10107, 79673, 10405, 40230, 19841,   147, 50373, 19841,
          10239, 23955, 45426, 12574, 10102, 56651, 10391, 43189, 10252, 20041,
          35564, 10102, 25451, 21954,   119, 21256, 94744, 15408, 22670,   131,
          50373, 19841, 74850, 10181, 12369, 33575, 30114, 10112,   157, 80914,
          10620, 40230, 19841, 11373, 74850,   117, 10126, 42954,   157, 90177,
 

In [56]:
# ==========================
# ✨ sort the doc's lengths 
# ==========================
indices = d_mask.sum(-1).sort().indices
print(indices)
d_ids = d_ids[indices]
d_mask = d_mask[indices]
d_mask.sum(-1)

tensor([2, 3, 0, 1, 4, 5])


tensor([51, 56, 64, 64, 64, 64])

In [57]:
# ==========================================
# ✨ build the doc's batch: list(ids, mask)
# ==========================================
d_batches = []
# range (0, N, N)
for offset in range(0, d_ids.size(0), bsize):
    d_batches.append((d_ids[offset:offset+bsize], d_mask[offset:offset+bsize]))
d_batches    

[(tensor([[  101, 10135, 27994, 10102,   100, 14247, 10102, 58262, 12965,   100,
             117,   157, 11855, 10102, 33212, 10132, 14396, 10542, 63834, 10351,
           10620, 15841, 10102, 11822, 58262, 11115, 10405, 17734,   119, 10135,
           24588, 73296, 51215, 10141, 58262, 11115, 12014, 30736,   157, 11855,
           10154, 33212, 10132,   117, 10248,   147, 20018, 10102, 45276,   119,
             102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0],
          [  101,   143, 21471, 10102, 80914,   147, 10477, 21471, 36692, 12108,
           62876, 10239,   143, 95541, 10102, 80914, 10107,   147, 63736, 10102,
           23402, 10245,   117, 10190, 21107,   117, 80914, 10102, 21471, 10107,
             147, 74788, 13320,   117, 25990, 87852, 10102, 21674,   147, 22747,
             117, 17077, 10102, 63736, 10102, 23402, 10405, 67210, 10102, 10477,
           18777, 78689, 10102, 80914,   119,   102,     0,     0,   

# Tensorize Triplets

In [59]:
# ========================================
# ✨ assign ids e mask tokens and reshape
# ========================================
N = bsize
Q_ids, Q_mask = q_ids, q_mask
D_ids, D_mask = d_ids, d_mask
D_ids, D_mask = D_ids.view(2, N, -1), D_mask.view(2, N, -1)

print(f'Q_ids:  {q_ids.shape}')
print(f'Q_mask: {q_mask.shape}\n')

print(f'D_ids:                {d_ids.shape}')
print(f'D_ids.view(2, N, -1): {d_ids.view(2, N, -1).shape}\n')
print(f'D_mask:                {d_mask.shape}')
print(f'D_mask.view(2, N, -1): {d_mask.view(2, N, -1).shape}')

Q_ids:  torch.Size([3, 18])
Q_mask: torch.Size([3, 18])

D_ids:                torch.Size([6, 64])
D_ids.view(2, N, -1): torch.Size([2, 3, 64])

D_mask:                torch.Size([6, 64])
D_mask.view(2, N, -1): torch.Size([2, 3, 64])


In [60]:
# ==================================================================================================
# ✨ get the max value between the len of i-th positive and the len of the i-th negative for i in N
# ==================================================================================================
maxlens = D_mask.sum(-1).max(0).values
print(maxlens)
indices = maxlens.sort().indices
indices

tensor([64, 64, 64])


tensor([0, 1, 2])

In [66]:
# =============================
# ✨ sort Q_* e D_* by maxlens
# =============================
Q_ids, Q_mask = Q_ids[indices], Q_mask[indices]
D_ids, D_mask = D_ids[:, indices], D_mask[:, indices]

In [67]:
# ==================================================================
# ✨ split the positive e negative ids and mask from D_ids e D_mask
# ==================================================================
(positive_ids, negative_ids), (positive_mask, negative_mask) = D_ids, D_mask
positive_ids.shape, negative_ids.shape, positive_mask.shape, negative_mask.shape

(torch.Size([3, 64]),
 torch.Size([3, 64]),
 torch.Size([3, 64]),
 torch.Size([3, 64]))

In [72]:
# ====================================================
# ✨ build batches to positive_docs and negative_docs
# ====================================================
(positive_ids, negative_ids), (positive_mask, negative_mask) = D_ids, D_mask

positive_batches = []
for offset in range(0, positive_ids.size(0), bsize):
    positive_batches.append((positive_ids[offset:offset+bsize], positive_mask[offset:offset+bsize]))    

negative_batches = []
for offset in range(0, negative_ids.size(0), bsize):
    negative_batches.append((negative_ids[offset:offset+bsize], negative_mask[offset:offset+bsize]))    

In [73]:
# ===========================================================
# ✨ group the batches: (query_, positive_, negative)batches
# ===========================================================
batches = []
for (q_ids, q_mask), (p_ids, p_mask), (n_ids, n_mask) in zip(query_batches, positive_batches, negative_batches):
    Q = (torch.cat((q_ids, q_ids)), torch.cat((q_mask, q_mask))) # <- duplicate Q (one to pos docs and another for neg docs)
    D = (torch.cat((p_ids, n_ids)), torch.cat((p_mask, n_mask)))
    batches.append((Q, D))
batches

[((tensor([[  101, 23840, 10146, 56749, 61997, 10107, 10239, 10173, 15101, 22877,
            10426, 87174, 10102, 10104, 54093, 11605,   136,   102],
           [  101,   157, 10126,   147,   107, 62372, 20347,   107,   136,   102,
                0,     0,     0,     0,     0,     0,     0,     0],
           [  101, 12243, 45555,   157, 33212, 10132, 10102, 45276,   136,   102,
                0,     0,     0,     0,     0,     0,     0,     0],
           [  101, 23840, 10146, 56749, 61997, 10107, 10239, 10173, 15101, 22877,
            10426, 87174, 10102, 10104, 54093, 11605,   136,   102],
           [  101,   157, 10126,   147,   107, 62372, 20347,   107,   136,   102,
                0,     0,     0,     0,     0,     0,     0,     0],
           [  101, 12243, 45555,   157, 33212, 10132, 10102, 45276,   136,   102,
                0,     0,     0,     0,     0,     0,     0,     0]]),
   tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1

# Model

In [96]:
query_ids = batches[0][0][0]
query_mask = batches[0][0][1]
print('q_input_ids', query_ids.shape)
print('q_attn_mask', query_mask.shape)

doc_ids = batches[0][1][0]
doc_mask = batches[0][1][1]
print('doc_input_ids', doc_ids.shape)
print('doc_attn_mask', doc_mask.shape)

q_input_ids torch.Size([6, 18])
q_attn_mask torch.Size([6, 18])
doc_input_ids torch.Size([6, 64])
doc_attn_mask torch.Size([6, 64])


In [104]:
# =============================
# ✨ init BERT Model
# =============================
model = BertModel.from_pretrained(path_model, return_dict=True)

# =================================
# ✨ build a dense layer
# =================================
linear = torch.nn.Linear(model.config.hidden_size, 128, bias=False)

# Prepare Q 

In [105]:
# ====================================================
# ✨ input the input_ids and attention_mask into BERT
# ====================================================
outs = model(input_ids=query_ids, attention_mask=query_mask)
Q = outs['last_hidden_state']
Q.shape

torch.Size([6, 18, 768])

In [106]:
# ================================
# ✨ perform Q in the dense layer
# ================================
Q = linear(Q)
Q.shape

torch.Size([6, 18, 128])

In [107]:
# ===================
# ✨ normalize in L2
# ===================
Q = torch.nn.functional.normalize(Q, p=2, dim=2)
Q.shape

torch.Size([6, 18, 128])

# Prepare D

In [109]:
# ================================================
# ✨ input input_ids and attention_mask into BERT
# ================================================
outs = model(input_ids=doc_ids, attention_mask=doc_mask)
D = outs['last_hidden_state']
D.shape

torch.Size([6, 64, 768])

In [110]:
# ================================
# ✨ perform D in the dense layer
# ================================
D = linear(D)
D.shape

torch.Size([6, 64, 128])

In [111]:
# ======================
# ✨ filter D with mask
# ======================
mask = torch.tensor([[(x != 0) for x in d] for d in doc_ids.cpu().tolist()])
print(mask.shape)
D = D * mask.unsqueeze(2).float()
print(D.shape)

# ===================
# ✨ normalize in L2
# ===================
D = torch.nn.functional.normalize(D, p=2, dim=2)
D.shape

torch.Size([6, 64])
torch.Size([6, 64, 128])


torch.Size([6, 64, 128])

# Get the score between Q and D

In [128]:
print(f'Q shape: {Q.size()} -- D shape: {D.size()}\n')

scores = torch.einsum('nqe, nde -> nqd', Q, D) 
print(f'Score shape:                       {scores.size()}')

scores = scores.max(2)
print(f'Scores.max(2).values shape:        {scores.values.size()}')

scores = scores.values.sum(1)
print(f'Scores.max(2).values.sum(1) shape: {scores.size()}')

print(f'Scores: {[float(str(s.item())[:7]) for s in scores]:}')

Q shape: torch.Size([6, 18, 128]) -- D shape: torch.Size([6, 64, 128])

Score shape:                       torch.Size([6, 18, 64])
Scores.max(2).values shape:        torch.Size([6, 18])
Scores.max(2).values.sum(1) shape: torch.Size([6])
Scores: [10.7851, 9.7143, 9.93258, 11.5629, 10.0149, 10.114]


# Otimization

In [129]:
# ======================================================
# ✨ build pseudo-labels for the classe 0 (torch zeros)
# ======================================================
# labels has the size of the batch bsize
labels = torch.zeros(bsize, dtype=torch.long)
print('labels shape', labels.shape, '\n')

labels shape torch.Size([3]) 



In [130]:
# =======================
# ✨ reshape the  scores
# =======================
scores = scores.view(2, -1).permute(1, 0)
print(scores.shape)
scores

torch.Size([3, 2])


tensor([[10.7852, 11.5629],
        [ 9.7143, 10.0150],
        [ 9.9326, 10.1141]], grad_fn=<PermuteBackward>)

In [131]:
# ============
# ✨ get loss
# ============
# init the CE Loss
criterion = torch.nn.CrossEntropyLoss()
loss = criterion(scores, labels[:scores.size(0)])
loss.item()

0.9328517913818359