In [1]:
def install_lib(libname):
    """Install a pip requirement into the environment of the running kernel.

    Args:
        libname: A pip requirement specifier such as ``"accelerate==0.29.3"``.
            Extra pip arguments separated by whitespace are accepted too.

    The requirement is echoed first, then anything pip wrote is printed.
    A failed installation does not raise.
    """
    # Local imports keep this helper usable from the very first cell.
    import shlex
    import subprocess
    import sys

    print(f">>> {libname}")
    # `sys.executable -m pip` installs into the interpreter running this
    # kernel; a bare `pip` found on PATH may belong to another environment.
    # An argument list (no shell) keeps characters such as `>` or `;` in the
    # specifier from being interpreted by a shell.
    command = [sys.executable, "-m", "pip", "install", "-qqq", *shlex.split(libname)]
    completed = subprocess.run(command, capture_output=True, text=True)
    # Relay pip's output so it is visible in the cell output.
    pip_output = (completed.stdout + completed.stderr).strip()
    if pip_output:
        print(pip_output)

# Pinned requirements installed before the rest of the notebook runs.
libs = ["accelerate==0.29.3"]

for requirement in libs:
    install_lib(requirement)

>>> accelerate==0.29.3


In [1]:
import os
import torch

from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, concatenate_datasets

from sentence_transformers.util import cos_sim
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)



In [3]:
# Authenticate against the Hugging Face Hub. The token is read from the
# HUGGINGFACE_TOKEN environment variable (KeyError if it is not set).
login(token=os.environ["HUGGINGFACE_TOKEN"])

# Alternative base model, kept for reference:
# MODEL_ID = 'PORTULAN/serafim-900m-portuguese-pt-sentence-encoder-ir'
MODEL_ID = "BAAI/bge-small-en-v1.5"
# Embedding sizes used both by the evaluators (truncate_dim) and by
# MatryoshkaLoss (matryoshka_dims). The trailing list is an alternative
# setting; NOTE(review): presumably meant for a 768-dim model — confirm.
MATRYOSHKA_DIMENSIONS = [384, 256, 128] # [768, 512, 256, 128, 64]
# JSON files the train/test splits are written to and later reloaded from.
TRAIN_DATASET = "data/bacen/train_dataset.json"
TEST_DATASET = "data/bacen/test_dataset.json"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## 1. Crie e Prepare o Conjunto de Dados de Incorporação

Um conjunto de dados de embedding geralmente consiste em pares de texto (pergunta, resposta/contexto) ou tríades que representam relações ou semelhanças entre frases. O formato do conjunto de dados que você escolher ou tiver disponível também afetará a função de perda que você pode usar. Formatos comuns para conjuntos de dados de embedding:

- **Par Positivo**: Pares de texto de frases relacionadas (consulta, contexto | consulta, resposta), adequados para tarefas como pesquisa de semelhança ou busca semântica, exemplos de conjuntos de dados: `sentence-transformers/sentence-compression`, `sentence-transformers/natural-questions`.
- **Tríades**: Tríades de texto compostas por (âncora, positivo, negativo), exemplos de conjuntos de dados `sentence-transformers/quora-duplicates`, `nirantk/triplets`.
- **Par com Pontuação de Similaridade**: Pares de frases com uma pontuação de similaridade indicando o quão relacionadas são, exemplos de conjuntos de dados: `sentence-transformers/stsb`, `PhilipMay/stsb_multi_mt`

Saiba mais em [Visão Geral dos Conjuntos de Dados](https://sbert.net/docs/sentence_transformer/dataset_overview.html).

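Vamos usar o dataset [Itau-Unibanco/FAQ_BACEN](https://huggingface.co/datasets/Itau-Unibanco/FAQ_BACEN), que inclui 7.000 pares de texto positivos de perguntas e contextos correspondentes do [Relatório 10-K (SEC) da NVIDIA de 2023](https://stocklight.com/stocks/us/nasdaq-nvda/nvidia/annual-reports/nasdaq-nvda-2023-10K-23668751.pdf).

> **Nota:** as células de código abaixo carregam, na prática, um dataset local em `data/clips_mqa/pt` (colunas `question` e `answer`, renomeadas para `anchor` e `positive`), e não o FAQ_BACEN. A descrição acima e o formato mostrado a seguir devem ser revisados para refletir os dados realmente usados.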

O conjunto de dados tem o seguinte formato:
```json
{"questions": "<pergunta>", "answers": "<contexto relevante para a resposta>"}
{"questions": "<pergunta>", "answers": "<contexto relevante para a resposta>"}
{"questions": "<pergunta>", "answers": "<contexto relevante para a resposta>"}
```

# Dataset

In [4]:
%%time
import re
from datasets import load_from_disk

# Load the dataset stored on disk at dataset_filepath and report its row count.
dataset_filepath = "data/clips_mqa/pt"
dataset = load_from_disk(dataset_filepath)
print(f">>> Dataset importado com {dataset.num_rows} linhas")

# Drop domains that are not relevant
def valid_domain(text):
    """Return True when ``text`` ends with ".br", ".com", ".net" or ".org"."""
    accepted_suffixes = (".br", ".com", ".net", ".org")
    return text.endswith(accepted_suffixes)
# Keep only the rows whose "domain" value passes valid_domain, then report
# how many rows are left.
dataset = dataset.filter(lambda row: valid_domain(row["domain"]) )
print(f">>> Apos remover domínios sem relevância, o dataset ficou com {dataset.num_rows} linhas")
        
    
# Block unwanted domains
# Terms that disqualify a domain. They are matched literally (re.escape) and
# case-sensitively, anywhere inside the inspected text.
_PROHIBITED_TERMS = (
    "mundosugar.com.br",
    "aposta",
    "apuesta",
    "sex",
    "porn",
    "penis",
    "vagi",
    "turba",
    "sensual",
)
# Built once here rather than on every call: the function runs once per
# dataset row, and the pattern never changes between calls.
_PROHIBITED_PATTERN = re.compile("|".join(map(re.escape, _PROHIBITED_TERMS)))


def contains_prohibited_term_regex(text):
    """Tell whether ``text`` contains any prohibited term.

    Args:
        text: String to inspect (the notebook passes each row's "domain").

    Returns:
        True if at least one prohibited term occurs as a substring of
        ``text``, False otherwise.
    """
    return _PROHIBITED_PATTERN.search(text) is not None

# Drop the rows whose "domain" value contains a prohibited term, then report
# how many rows are left.
dataset = dataset.filter(lambda row: not contains_prohibited_term_regex(row["domain"]) )
print(f">>> Apos remover domínios com termos indesejáveis, o dataset ficou com {dataset.num_rows} linhas")

>>> Dataset importado com 4046601 linhas
>>> Apos remover domínios sem relevância, o dataset ficou com 3158116 linhas
>>> Apos remover domínios com termos indesejáveis, o dataset ficou com 3085331 linhas
CPU times: user 91.7 ms, sys: 37.8 ms, total: 130 ms
Wall time: 82.3 ms


In [133]:
%%time
dataset.to_pandas().sample(4)

CPU times: user 18.1 s, sys: 1.25 s, total: 19.4 s
Wall time: 19.3 s


Unnamed: 0,id,bucket,domain,text,question,answer
1708125,633ac8de66695d1671ffdea1476231fa,2021.1,hoteis.com,,quais sao as medidas de limpeza e higiene em v...,este estabelecimento confirma que sao utilizad...
466777,c678d0382e184b5d534dea19f6d6dda3,2019.47,aluguetemporada.com.br,,Posso reservar um imovel para temporada direta...,Sim. O AlugueTemporada oferece 134 imoveis par...
2474495,c4fe2ce007f3ba02412e705170008ce6,2021.25,kayak.com.br,,qual e a agencia de aluguel de carros mais pop...,premium ford focus ou similar e o tipo de carr...
88406,e6bd3705bfab4cc7722bcaaf2371d374,2020.29,edestinos.com.br,,quando os voos do aeroporto umtata airport sao...,a oferta da companhia aerea esta mudando const...


In [5]:
# Sampling
# A fixed seed makes the sampled subset and the train/test split identical on
# every run, so evaluation scores stay comparable between runs.
SEED = 42
SAMPLE_SIZE = 100000
# min() keeps the cell working when fewer than SAMPLE_SIZE rows are available.
dataset = dataset.shuffle(seed=SEED).select(range(min(SAMPLE_SIZE, dataset.num_rows)))

# Selecting columns
dataset = dataset.rename_columns(
    {"question": "anchor", "answer": "positive"}
).select_columns(["id", "anchor", "positive"])

# split dataset into a 10% test set
dataset = dataset.train_test_split(test_size=0.1, seed=SEED)

# save datasets to disk (create the target folders first if they are missing)
for split_path in (TRAIN_DATASET, TEST_DATASET):
    os.makedirs(os.path.dirname(split_path) or ".", exist_ok=True)
dataset["train"].to_json(TRAIN_DATASET, orient="records")
dataset["test"].to_json(TEST_DATASET, orient="records")

Creating json from Arrow format:   0%|          | 0/90 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

3623595

In [6]:
# Reload both splits from the JSON files written earlier. Each file is loaded
# on its own, and the cell output shows the rows landing in a split named
# "train", which is why split="train" is also used for the test file.
train_dataset = load_dataset("json", data_files=TRAIN_DATASET, split="train")
test_dataset = load_dataset("json", data_files=TEST_DATASET, split="train")
# Train and test rows together; used below to build the retrieval corpus.
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Convert the datasets to dictionaries
# corpus: id -> answer text, over every row of corpus_dataset (cid => document)
corpus = {
    doc_id: document
    for doc_id, document in zip(corpus_dataset["id"], corpus_dataset["positive"])
}
# queries: id -> question text, over the test rows only (qid => question)
queries = {
    query_id: question
    for query_id, question in zip(test_dataset["id"], test_dataset["anchor"])
}

In [8]:
# Create a mapping of relevant documents (1 in our case) for each query:
# qid => set of relevant cids. A query and the document that answers it come
# from the same dataset row and therefore share the same id, so the relevant
# set of a query is just {q_id}. Sets (rather than lists) match the
# qid => set mapping this structure is meant to be.
relevant_docs = {q_id: {q_id} for q_id in queries}


# Baseline

In [9]:
# Load the base model (MODEL_ID) for the baseline evaluation, on the GPU when
# CUDA is available and on the CPU otherwise.
model = SentenceTransformer(
    MODEL_ID, device="cuda" if torch.cuda.is_available() else "cpu"
)



In [10]:
%%time
matryoshka_evaluators = []
# Iterate over the different dimensions
for dim in MATRYOSHKA_DIMENSIONS:
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to a certain dimension
        score_functions={"cosine": cos_sim},
    )
    matryoshka_evaluators.append(ir_evaluator)

# Create a sequential evaluator
evaluator = SequentialEvaluator(matryoshka_evaluators)

CPU times: user 62.6 ms, sys: 1.09 ms, total: 63.7 ms
Wall time: 62.9 ms


In [None]:
%%time

rerun_baseline = True

if rerun_baseline:
    results = evaluator(model)
    for dim in MATRYOSHKA_DIMENSIONS:
        key = f"dim_{dim}_cosine_ndcg@10"
        print
        print(f"{key}: {results[key]}")

# Fine Tune

In [12]:
from sentence_transformers import SentenceTransformerModelCardData, SentenceTransformer

# Attention implementation forwarded to the underlying model through
# model_kwargs. "eager" is the alternative value that was tried here.
attn_implementation = "sdpa"

# An earlier version of this cell used model_name="FAQ-BACEN" with otherwise
# identical arguments.

# Reload the base model for fine-tuning, requesting the attention
# implementation chosen above and attaching the metadata for the model card.
# This rebinds `model`, replacing the instance used for the baseline.
model = SentenceTransformer(
    MODEL_ID,
    model_kwargs={"attn_implementation": attn_implementation},
    model_card_data=SentenceTransformerModelCardData(
        language="pt-br",
        license="apache-2.0",
        model_name="QA-Brazil",
    ),
)

In [13]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Base loss applied to the training pairs.
inner_train_loss = MultipleNegativesRankingLoss(model)
# MatryoshkaLoss wraps the base loss and receives the dimensions listed in
# MATRYOSHKA_DIMENSIONS, the same ones the evaluators truncate to.
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=MATRYOSHKA_DIMENSIONS
)

In [14]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# load train dataset again
train_dataset = load_dataset("json", data_files=TRAIN_DATASET, split="train")

# define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="./models/bacen/embeddings", # local directory for checkpoints and the saved model
    num_train_epochs=10,                         # number of epochs
    per_device_train_batch_size=32,             # train batch size
    gradient_accumulation_steps=16,             # 32 x 16 = 512 samples per optimizer step (on a single device)
    per_device_eval_batch_size=16,              # evaluation batch size
    warmup_ratio=0.1,                           # warmup ratio
    learning_rate=2e-5,                         # learning rate
    lr_scheduler_type="cosine",                 # cosine learning rate schedule (not a constant one)
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    #tf32=True,                                  # use tf32 precision (currently disabled)
    #bf16=True,                                  # use bf16 precision (currently disabled)
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="epoch",                      # evaluate after each epoch
    save_strategy="epoch",                      # save after each epoch
    logging_steps=10,                           # log every 10 steps
    save_total_limit=3,                         # limit the number of checkpoints kept on disk
    load_best_model_at_end=True,                # load the best model when training ends
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",  # "best" = highest ndcg@10 at the 128 dimension
)

In [15]:
# Print one random training pair as a sanity check. Despite its name, `test`
# holds a row sampled from train_dataset, converted to a plain dict.
test = train_dataset.to_pandas().sample(1).iloc[0].to_dict()
print(f'anchor: {test["anchor"]}')
print(f'positive: {test["positive"]}')

anchor: como adquirir bolsas de estudo para o curso de educacao fisica em joao monlevade?
positive: o quero bolsa oferece bolsas de estudo para o curso de educacao fisica em diversas instituicoes de ensino na cidade de joao monlevade. escolha a oferta que melhor se encaixa no seu bolso e garanta a sua matricula.


In [16]:
from sentence_transformers import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=model,  # the model loaded from MODEL_ID for fine-tuning
    args=args,  # training arguments
    # The loss takes its inputs by column position, not by column name: the
    # first column is treated as the anchor and the second as the positive.
    # The question ("anchor") therefore has to come first, so that training
    # ranks answers for a given question, which is the direction the
    # evaluator measures (queries = questions, corpus = answers).
    train_dataset=train_dataset.select_columns(["anchor", "positive"]),
    loss=train_loss,
    evaluator=evaluator,
)

In [17]:
%%time

# start training; with save_strategy="epoch" a checkpoint is written to args.output_dir after every epoch
# NOTE(review): nothing visible in this notebook enables pushing to the Hub — confirm before relying on an upload.
trainer.train()

# save the model held by the trainer (with load_best_model_at_end=True this is the best checkpoint)
trainer.save_model()

Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 

In [18]:
from sentence_transformers import SentenceTransformer

# Load the fine-tuned model from args.output_dir, on the GPU when CUDA is
# available and on the CPU otherwise.
fine_tuned_model = SentenceTransformer(
    args.output_dir, device="cuda" if torch.cuda.is_available() else "cpu"
)
# Evaluate the model with the same evaluator used for the baseline
results = evaluator(fine_tuned_model)

# # Uncomment for full results
# print(results)

# Print the main score (NDCG@10 with cosine similarity) for each dimension
for dim in MATRYOSHKA_DIMENSIONS:
    key = f"dim_{dim}_cosine_ndcg@10"
    print(f"{key}: {results[key]}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 46.00 MiB. GPU 

In [None]:
from sentence_transformers.util import cos_sim

# Probe: the same sentence twice.
sentences = ["Exemplo de sentença um.", "Exemplo de sentença um."]

# Encode both sentences and show their cosine similarity
embeddings = fine_tuned_model.encode(sentences)
cos_sim(embeddings[0], embeddings[1])[0, 0]

In [None]:
from sentence_transformers.util import cos_sim

# Probe: same words, subject and object swapped.
sentences = ["O gato mordeu o cachorro", "O cachorro mordeu o gato."]

# Encode both sentences and show their cosine similarity
embeddings = fine_tuned_model.encode(sentences)
cos_sim(embeddings[0], embeddings[1])[0, 0]

In [None]:
from sentence_transformers.util import cos_sim

# Probe: "cachorro" replaced by the synonym "cão".
sentences = ["O gato mordeu o cachorro", "O gato mordeu o cão."]

# Encode both sentences and show their cosine similarity
embeddings = fine_tuned_model.encode(sentences)
cos_sim(embeddings[0], embeddings[1])[0, 0]

In [None]:
from sentence_transformers.util import cos_sim

# Probe: a sentence against the empty string.
sentences = ["O gato mordeu o cachorro", ""]

# Encode both sentences and show their cosine similarity
embeddings = fine_tuned_model.encode(sentences)
cos_sim(embeddings[0], embeddings[1])[0, 0]