https://github.com/vvr-rao/Fine-tuning-a-Sentence-Transformer-for-RAG/blob/main/SentenceTransformers_FineTuning_Using_MNRLoss.ipynb

# Instalação das Libs

In [1]:
def install_lib(libname):
    print(f">>> {libname}")
    get_ipython().system(f"pip install -qqq {libname}")

libs = [
    "accelerate==0.29.3"
]

for lib in libs:
    install_lib(lib)

>>> accelerate==0.29.3


# Importação das Dependências

In [1]:
import os
import torch
import numpy as np
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, concatenate_datasets

from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
from datasets import load_dataset

from sentence_transformers.util import cos_sim
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)



In [2]:
login(token=os.environ["HUGGINGFACE_TOKEN"])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Tutorial

### Referencias

1. https://medium.com/@venkat.ramrao/fine-tuning-a-sentence-transformer-for-semantic-search-7c7a57f4db2f

2. https://docs.llamaindex.ai/en/stable/examples/finetuning/embeddings/finetune_embedding/

# Parametros

### Funções auxiliares

In [3]:
from sentence_transformers.util import cos_sim

def similarity(a, b, distance="cos"):
    if distance=="cartesian":
        return np.dot(a, b)
    elif distance == "cos":
        return float(cos_sim(a, b)[0][0])

# Dados Reais

In [4]:
model_id = 'sentence-transformers/all-MiniLM-L12-v2'
dataset_id = "emdemor/ptbr-question-and-answer"

# PORTULAN/serafim-900m-portuguese-pt-sentence-encoder-ir
# rufimelo/Legal-BERTimbau-base
# rufimelo/Legal-BERTimbau-large
# rufimelo/bert-large-portuguese-cased-sts

### Baseline

In [5]:
model = SentenceTransformer(model_id)



In [6]:
question = ['Qual é a cor do céu?']
answers = ["O céu é azul", "Eu como ovos no café da manhã","Qual é a cor do mar?","Quão alto é o céu?"]

# Obtendo os vetores de embedding
question_embedding = model.encode(question)
answers_embeddings = model.encode(answers)


emb_q = question_embedding[0]
similarities = [similarity(emb_a, emb_q) for emb_a in answers_embeddings]

for a, s in zip(answers, similarities):
  print(a + " --- " + str(s))


O céu é azul --- 0.5942363739013672
Eu como ovos no café da manhã --- 0.341561496257782
Qual é a cor do mar? --- 0.8152474164962769
Quão alto é o céu? --- 0.6779704689979553


### Fine Tune

#### Preparando o dataset

In [7]:
question_col = "question"
answer_col = "answer"

In [8]:
dataset = load_dataset(dataset_id, split="train").shuffle().select(range(1_000_000))


dataset.to_pandas().sample(4)

Unnamed: 0,id,bucket,domain,text,question,answer
491789,334189e1474d20c724fae242fd7a250b,2021.43,thetrainline.com,,qual e a distancia entre wuppertal e cologne d...,os trens que fazem o trajeto de wuppertal para...
295346,3aa3484b5bbf0ce615ff448861364190,2021.39,barcelo.com,,existe um custo para conexao de internet wi-fi...,a conexao wi-fi para clientes occidental carib...
469635,febc77bcabe7a8432d922ba9c6e07cb1,2021.31,momondo.com.br,,qual e o momento mais barato para viajar de fo...,os voos de fortaleza a brasilia sao cerca de 1...
975617,ceaa938ed3d787abac35d4c970fb5af1,2019.47,nanoil.net.br,,o oleo embelezador oleo de jojoba nanoil e ade...,oleos embelezadores diferem de acordo com sua ...


In [9]:
dataset = dataset.train_test_split(test_size=0.15)

#### Gerando o DataLoader

In [10]:
import datasets
from torch.utils.data import DataLoader

def generate_dataloader(
    dataset: datasets.arrow_dataset.Dataset,
    question_column: str,
    answer_column: str,
    shuffle: bool = True,
    batch_size: int = 16,
    *args,
    **kwargs,
):
    """ """
    return DataLoader(
        [
            InputExample(texts=[row[question_column], row[answer_column]])
            for row in dataset
        ]
        ,
        shuffle=shuffle, batch_size=batch_size,
        *args,
        **kwargs
    )


train_dataloader = generate_dataloader(dataset["train"], question_col, answer_col)
test_dataloader = generate_dataloader(dataset["test"], question_col, answer_col)

#### Treinamento

In [None]:
%%time

# model = SentenceTransformer(model_id)

# train_loss = losses.CosineSimilarityLoss(model=model)
train_loss = losses.MultipleNegativesRankingLoss(model=model) #use if you have related sentence pairs
#train_loss = losses.TripletLoss(model=model)  # use this if you have an achor, positive, negative triplets


model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=4,
    warmup_steps=500,
)

model.save("models/finetune-MiniLM")

#### Teste

In [6]:
tuned_model = SentenceTransformer("models/finetune-MiniLM")

question = ['Qual é a cor do céu?']
answers = ["O céu é azul", "Eu como ovos no café da manhã","Qual é a cor do mar?","Quão alto é o céu?"]

# Obtendo os vetores de embedding
question_embedding = tuned_model.encode(question)
answers_embeddings = tuned_model.encode(answers)


emb_q = question_embedding[0]
similarities = [similarity(emb_a, emb_q) for emb_a in answers_embeddings]

for a, s in zip(answers, similarities):
  print(a + " --- " + str(s))


O céu é azul --- 0.7463054656982422
Eu como ovos no café da manhã --- 0.23135748505592346
Qual é a cor do mar? --- 0.7098197340965271
Quão alto é o céu? --- 0.792127251625061


In [13]:
q = tuned_model.encode("onde está o rato?")
a = tuned_model.encode("o rato está no porão")

similarity(q, a)

0.6584053039550781

In [14]:
q = tuned_model.encode("onde está o rato?")
a = tuned_model.encode("o gato está atras do rato")

similarity(q, a)

0.6188016533851624