In [1]:
def install_lib(libname):
    print(f">>> {libname}")
    get_ipython().system(f"pip install -qqq {libname}")

libs = [
    "accelerate==0.29.3"
]

for lib in libs:
    install_lib(lib)

>>> accelerate==0.29.3


In [2]:
import os
import torch

from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, concatenate_datasets

from sentence_transformers.util import cos_sim
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)



In [3]:
login(token=os.environ["HUGGINGFACE_TOKEN"])

# MODEL_ID = 'PORTULAN/serafim-900m-portuguese-pt-sentence-encoder-ir'
MODEL_ID = "BAAI/bge-small-en-v1.5"
MATRYOSHKA_DIMENSIONS = [384, 256, 128, 64] # [768, 512, 256, 128, 64]
TRAIN_DATASET = "data/bacen/train_dataset.json"
TEST_DATASET = "data/bacen/test_dataset.json"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## 1. Crie e Prepare o Conjunto de Dados de Incorporação

Um conjunto de dados de embedding geralmente consiste em pares de texto (pergunta, resposta/contexto) ou tríades que representam relações ou semelhanças entre frases. O formato do conjunto de dados que você escolher ou tiver disponível também afetará a função de perda que você pode usar. Formatos comuns para conjuntos de dados de embedding:

- **Par Positivo**: Pares de texto de frases relacionadas (consulta, contexto | consulta, resposta), adequados para tarefas como pesquisa de semelhança ou busca semântica, exemplos de conjuntos de dados: `sentence-transformers/sentence-compression`, `sentence-transformers/natural-questions`.
- **Tríades**: Tríades de texto compostas por (âncora, positivo, negativo), exemplos de conjuntos de dados `sentence-transformers/quora-duplicates`, `nirantk/triplets`.
- **Par com Pontuação de Similaridade**: Pares de frases com uma pontuação de similaridade indicando o quão relacionadas são, exemplos de conjuntos de dados: `sentence-transformers/stsb`, `PhilipMay/stsb_multi_mt`

Saiba mais em [Visão Geral dos Conjuntos de Dados](https://sbert.net/docs/sentence_transformer/dataset_overview.html).

Vamos usar o dataset [Itau-Unibanco/FAQ_BACEN)](https://huggingface.co/datasets/Itau-Unibanco/FAQ_BACEN), que inclui 7.000 pares de texto positivos de perguntas e contextos correspondentes do [Relatório SEC da NVIDIA de 2023_10](https://stocklight.com/stocks/us/nasdaq-nvda/nvidia/annual-reports/nasdaq-nvda-2023-10K-23668751.pdf).

O conjunto de dados tem o seguinte formato:
```json
{"questions": "<pergunta>", "answers": "<contexto relevante para a resposta>"}
{"questions": "<pergunta>", "answers": "<contexto relevante para a resposta>"}
{"questions": "<pergunta>", "answers": "<contexto relevante para a resposta>"}
```

# Dataset

In [4]:
%%time
import re
from datasets import load_from_disk

dataset_filepath = "data/clips_mqa/pt"
dataset = load_from_disk(dataset_filepath)
print(f">>> Dataset importado com {dataset.num_rows} linhas")

# Removendo domínios sem relevância
def valid_domain(text):
    if (text[-3:] == ".br") or (text[-4:] in [".com", ".net", ".org"]):
        return True
    return False
dataset = dataset.filter(lambda row: valid_domain(row["domain"]) )
print(f">>> Apos remover domínios sem relevância, o dataset ficou com {dataset.num_rows} linhas")
        
    
# bloquear dominios 
def contains_prohibited_term_regex(text):
    blacklist = [
        "mundosugar.com.br",
        "aposta",
        "apuesta",
        "sex",
        "porn",
        "penis",
        "vagi",
        "turba",
        "sensual",
    ]
    pattern = re.compile("|".join(map(re.escape, blacklist)))
    return bool(pattern.search(text))

dataset = dataset.filter(lambda row: not contains_prohibited_term_regex(row["domain"]) )
print(f">>> Apos remover domínios com termos indesejáveis, o dataset ficou com {dataset.num_rows} linhas")

>>> Dataset importado com 4046601 linhas
>>> Apos remover domínios sem relevância, o dataset ficou com 3158116 linhas
>>> Apos remover domínios com termos indesejáveis, o dataset ficou com 3085331 linhas
CPU times: user 72.5 ms, sys: 39 ms, total: 112 ms
Wall time: 65.9 ms


In [133]:
%%time
dataset.to_pandas().sample(4)

CPU times: user 18.1 s, sys: 1.25 s, total: 19.4 s
Wall time: 19.3 s


Unnamed: 0,id,bucket,domain,text,question,answer
1708125,633ac8de66695d1671ffdea1476231fa,2021.1,hoteis.com,,quais sao as medidas de limpeza e higiene em v...,este estabelecimento confirma que sao utilizad...
466777,c678d0382e184b5d534dea19f6d6dda3,2019.47,aluguetemporada.com.br,,Posso reservar um imovel para temporada direta...,Sim. O AlugueTemporada oferece 134 imoveis par...
2474495,c4fe2ce007f3ba02412e705170008ce6,2021.25,kayak.com.br,,qual e a agencia de aluguel de carros mais pop...,premium ford focus ou similar e o tipo de carr...
88406,e6bd3705bfab4cc7722bcaaf2371d374,2020.29,edestinos.com.br,,quando os voos do aeroporto umtata airport sao...,a oferta da companhia aerea esta mudando const...


In [5]:
# Sampling
dataset = dataset.shuffle().select(range(10000))

# Selecting columns
dataset = dataset.rename_columns({"question": "anchor", "answer": "positive"}) .select_columns(["id", "anchor", "positive"])

# split dataset into a 10% test set
dataset = dataset.train_test_split(test_size=0.1)

# save datasets to disk
dataset["train"].to_json(TRAIN_DATASET, orient="records")
dataset["test"].to_json(TEST_DATASET, orient="records")

Creating json from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

347761

In [6]:
# load test dataset
train_dataset = load_dataset("json", data_files=TRAIN_DATASET, split="train")
test_dataset = load_dataset("json", data_files=TEST_DATASET, split="train")
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Convert the datasets to dictionaries
corpus = dict(
    zip(corpus_dataset["id"], corpus_dataset["positive"])
)  # Our corpus (cid => document)
queries = dict(
    zip(test_dataset["id"], test_dataset["anchor"])
)  # Our queries (qid => question)

In [8]:
# Create a mapping of relevant document (1 in our case) for each query
relevant_docs = {}  # Query ID to relevant documents (qid => set([relevant_cids])
for q_id in queries:
    relevant_docs[q_id] = [q_id]


# Baseline

In [9]:
model = SentenceTransformer(
    MODEL_ID, device="cuda" if torch.cuda.is_available() else "cpu"
)



In [11]:
%%time
matryoshka_evaluators = []
# Iterate over the different dimensions
for dim in MATRYOSHKA_DIMENSIONS:
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to a certain dimension
        score_functions={"cosine": cos_sim},
    )
    matryoshka_evaluators.append(ir_evaluator)

# Create a sequential evaluator
evaluator = SequentialEvaluator(matryoshka_evaluators)

CPU times: user 2.85 ms, sys: 0 ns, total: 2.85 ms
Wall time: 2.86 ms


In [12]:
%%time

rerun_baseline = False

if rerun_baseline:
    results = evaluator(model)
    for dim in MATRYOSHKA_DIMENSIONS:
        key = f"dim_{dim}_cosine_ndcg@10"
        print
        print(f"{key}: {results[key]}")

dim_384_cosine_ndcg@10: 0.5718883115078188
dim_256_cosine_ndcg@10: 0.5554733697644808
dim_128_cosine_ndcg@10: 0.5229214149363955
dim_64_cosine_ndcg@10: 0.4474030387901696
CPU times: user 48.6 s, sys: 1.26 s, total: 49.8 s
Wall time: 39.6 s


# Fine Tune

In [13]:
from sentence_transformers import SentenceTransformerModelCardData, SentenceTransformer

attn_implementation = "sdpa"# "eager" # sdpa

# # load model with SDPA for using Flash Attention 2
# model = SentenceTransformer(
#     MODEL_ID,
#     model_kwargs={"attn_implementation": attn_implementation},
#     model_card_data=SentenceTransformerModelCardData(
#         language="pt-br",
#         license="apache-2.0",
#         model_name="FAQ-BACEN",
#     ),
# )

# load model with SDPA for using Flash Attention 2
model = SentenceTransformer(
    MODEL_ID,
    model_kwargs={"attn_implementation": attn_implementation},
    model_card_data=SentenceTransformerModelCardData(
        language="pt-br",
        license="apache-2.0",
        model_name="QA-Brazil",
    ),
)

In [17]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=MATRYOSHKA_DIMENSIONS
)

In [18]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# load train dataset again
train_dataset = load_dataset("json", data_files=TRAIN_DATASET, split="train")

# define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="./models/bacen/embeddings", # output directory and hugging face model ID
    num_train_epochs=10,                         # number of epochs
    per_device_train_batch_size=32,             # train batch size
    gradient_accumulation_steps=16,             # for a global batch size of 512
    per_device_eval_batch_size=16,              # evaluation batch size
    warmup_ratio=0.1,                           # warmup ratio
    learning_rate=2e-5,                         # learning rate, 2e-5 is a good value
    lr_scheduler_type="cosine",                 # use constant learning rate scheduler
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    #tf32=True,                                  # use tf32 precision
    #bf16=True,                                  # use bf16 precision
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="epoch",                      # evaluate after each epoch
    save_strategy="epoch",                      # save after each epoch
    logging_steps=10,                           # log every 10 steps
    save_total_limit=3,                         # save only the last 3 models
    load_best_model_at_end=True,                # load the best model when training ends
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",  # Optimizing for the best ndcg@10 score for the 128 dimension
)

In [19]:
test = train_dataset.to_pandas().sample(1).iloc[0].to_dict()
print(f'anchor: {test["anchor"]}')
print(f'positive: {test["positive"]}')

anchor: em media, quantas conexoes estao disponiveis por dia de goiania para sao paulo?
positive: entre goiania e sao paulo existem por volta de 2 conexoes diariamente. com nosso mecanismo de busca, voce pode comparar os horarios dos onibus para encontrar a viagem perfeita.


In [20]:
from sentence_transformers import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=model, # BAAI/bge-small-en-v1.5
    args=args,  # training arguments
    train_dataset=train_dataset.select_columns(
        ["positive", "anchor"]
    ),  # training dataset
    loss=train_loss,
    evaluator=evaluator,
)

In [21]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save the best model
trainer.save_model()

Epoch,Training Loss,Validation Loss,Dim 384 Cosine Accuracy@1,Dim 384 Cosine Accuracy@3,Dim 384 Cosine Accuracy@5,Dim 384 Cosine Accuracy@10,Dim 384 Cosine Precision@1,Dim 384 Cosine Precision@3,Dim 384 Cosine Precision@5,Dim 384 Cosine Precision@10,Dim 384 Cosine Recall@1,Dim 384 Cosine Recall@3,Dim 384 Cosine Recall@5,Dim 384 Cosine Recall@10,Dim 384 Cosine Ndcg@10,Dim 384 Cosine Mrr@10,Dim 384 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
0,6.5745,No log,0.574,0.62,0.64,0.667,0.574,0.206667,0.128,0.0667,0.574,0.62,0.64,0.667,0.618404,0.60316,0.607097,0.565,0.616,0.628,0.663,0.565,0.205333,0.1256,0.0663,0.565,0.616,0.628,0.663,0.61094,0.594781,0.597889,0.532,0.584,0.607,0.63,0.532,0.194667,0.1214,0.063,0.532,0.584,0.607,0.63,0.579668,0.563689,0.567261,0.48,0.538,0.556,0.583,0.48,0.179333,0.1112,0.0583,0.48,0.538,0.556,0.583,0.530545,0.513918,0.518398,0.518398
1,2.9879,No log,0.605,0.647,0.661,0.686,0.605,0.215667,0.1322,0.0686,0.605,0.647,0.661,0.686,0.643179,0.629808,0.634561,0.596,0.64,0.653,0.676,0.596,0.213333,0.1306,0.0676,0.596,0.64,0.653,0.676,0.634542,0.621492,0.626425,0.579,0.626,0.644,0.673,0.579,0.208667,0.1288,0.0673,0.579,0.626,0.644,0.673,0.623123,0.607525,0.612013,0.553,0.591,0.61,0.644,0.553,0.197,0.122,0.0644,0.553,0.591,0.61,0.644,0.594328,0.579057,0.583505,0.583505
2,1.8714,No log,0.606,0.645,0.664,0.699,0.606,0.215,0.1328,0.0699,0.606,0.645,0.664,0.699,0.648385,0.632767,0.637731,0.6,0.646,0.661,0.689,0.6,0.215333,0.1322,0.0689,0.6,0.646,0.661,0.689,0.642306,0.627749,0.633101,0.58,0.631,0.653,0.69,0.58,0.210333,0.1306,0.069,0.58,0.631,0.653,0.69,0.629942,0.6114,0.616088,0.564,0.597,0.617,0.655,0.564,0.199,0.1234,0.0655,0.564,0.597,0.617,0.655,0.604096,0.588518,0.593913,0.593913
3,1.4865,No log,0.606,0.652,0.673,0.707,0.606,0.217333,0.1346,0.0707,0.606,0.652,0.673,0.707,0.652612,0.635823,0.640959,0.602,0.65,0.669,0.701,0.602,0.216667,0.1338,0.0701,0.602,0.65,0.669,0.701,0.648134,0.631752,0.637255,0.588,0.635,0.66,0.694,0.588,0.211667,0.132,0.0694,0.588,0.635,0.66,0.694,0.63656,0.61875,0.623459,0.569,0.607,0.631,0.669,0.569,0.202333,0.1262,0.0669,0.569,0.607,0.631,0.669,0.613487,0.596461,0.601351,0.601351
4,1.4041,No log,0.612,0.658,0.679,0.712,0.612,0.219333,0.1358,0.0712,0.612,0.658,0.679,0.712,0.658165,0.641502,0.64671,0.606,0.649,0.668,0.71,0.606,0.216333,0.1336,0.071,0.606,0.649,0.668,0.71,0.652194,0.63464,0.639714,0.591,0.632,0.662,0.693,0.591,0.210667,0.1324,0.0693,0.591,0.632,0.662,0.693,0.637288,0.62003,0.625557,0.575,0.612,0.639,0.672,0.575,0.204,0.1278,0.0672,0.575,0.612,0.639,0.672,0.618707,0.60227,0.607817,0.607817
5,1.2555,No log,0.608,0.657,0.677,0.714,0.608,0.219,0.1354,0.0714,0.608,0.657,0.677,0.714,0.656806,0.639175,0.644174,0.607,0.651,0.672,0.704,0.607,0.217,0.1344,0.0704,0.607,0.651,0.672,0.704,0.651649,0.635473,0.64124,0.595,0.638,0.664,0.698,0.595,0.212667,0.1328,0.0698,0.595,0.638,0.664,0.698,0.642017,0.624682,0.630161,0.573,0.613,0.638,0.673,0.573,0.204333,0.1276,0.0673,0.573,0.613,0.638,0.673,0.61819,0.601308,0.607057,0.607057
6,1.0881,No log,0.609,0.657,0.678,0.717,0.609,0.219,0.1356,0.0717,0.609,0.657,0.678,0.717,0.658433,0.640394,0.645689,0.613,0.651,0.67,0.71,0.613,0.217,0.134,0.071,0.613,0.651,0.67,0.71,0.655942,0.639517,0.645102,0.595,0.64,0.665,0.699,0.595,0.213333,0.133,0.0699,0.595,0.64,0.665,0.699,0.642856,0.625453,0.631111,0.576,0.618,0.639,0.678,0.576,0.206,0.1278,0.0678,0.576,0.618,0.639,0.678,0.621665,0.604345,0.609843,0.609843
8,1.0863,No log,0.612,0.66,0.678,0.717,0.612,0.22,0.1356,0.0717,0.612,0.66,0.678,0.717,0.659978,0.642429,0.647723,0.615,0.652,0.669,0.711,0.615,0.217333,0.1338,0.0711,0.615,0.652,0.669,0.711,0.656703,0.640283,0.645938,0.595,0.64,0.666,0.702,0.595,0.213333,0.1332,0.0702,0.595,0.64,0.666,0.702,0.644018,0.626145,0.631545,0.576,0.617,0.641,0.678,0.576,0.205667,0.1282,0.0678,0.576,0.617,0.641,0.678,0.621709,0.604422,0.609762,0.609762
9,1.0612,No log,0.612,0.66,0.677,0.717,0.612,0.22,0.1354,0.0717,0.612,0.66,0.677,0.717,0.659982,0.64246,0.647599,0.615,0.653,0.669,0.714,0.615,0.217667,0.1338,0.0714,0.615,0.653,0.669,0.714,0.657766,0.640795,0.646327,0.595,0.639,0.665,0.703,0.595,0.213,0.133,0.0703,0.595,0.639,0.665,0.703,0.644166,0.626052,0.631347,0.576,0.617,0.641,0.679,0.576,0.205667,0.1282,0.0679,0.576,0.617,0.641,0.679,0.622118,0.604642,0.610027,0.610027


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [22]:
from sentence_transformers import SentenceTransformer

fine_tuned_model = SentenceTransformer(
    args.output_dir, device="cuda" if torch.cuda.is_available() else "cpu"
)
# Evaluate the model
results = evaluator(fine_tuned_model)

# # COMMENT IN for full results
# print(results)

# Print the main score
for dim in MATRYOSHKA_DIMENSIONS:
    key = f"dim_{dim}_cosine_ndcg@10"
    print(f"{key}: {results[key]}")

dim_384_cosine_ndcg@10: 0.6596838439631117
dim_256_cosine_ndcg@10: 0.6576391611101314
dim_128_cosine_ndcg@10: 0.6455207361109278
dim_64_cosine_ndcg@10: 0.6230519369171184


In [23]:
from sentence_transformers.util import cos_sim

sentences = [
    "Exemplo de sentença um.",
    "Exemplo de sentença um."
]

# Obter os embeddings
embeddings = fine_tuned_model.encode(sentences)
cos_sim(embeddings[0], embeddings[1])[0][0]

tensor(1.)

In [24]:
from sentence_transformers.util import cos_sim

sentences = [
    "O gato mordeu o cachorro",
    "O cachorro mordeu o gato."
]

# Obter os embeddings
embeddings = fine_tuned_model.encode(sentences)
cos_sim(embeddings[0], embeddings[1])[0][0]

tensor(0.9671)

In [25]:
from sentence_transformers.util import cos_sim

sentences = [
    "O gato mordeu o cachorro",
    "O gato mordeu o cão."
]

# Obter os embeddings
embeddings = fine_tuned_model.encode(sentences)
cos_sim(embeddings[0], embeddings[1])[0][0]

tensor(0.7082)

In [26]:
from sentence_transformers.util import cos_sim

sentences = [
    "O gato mordeu o cachorro",
    ""
]

# Obter os embeddings
embeddings = fine_tuned_model.encode(sentences)
cos_sim(embeddings[0], embeddings[1])[0][0]

tensor(0.2540)