# Fine-Tune with Pair-Score

In [1]:
import logging
import warnings

# Silence a handful of known, noisy warnings. Each entry pairs the warning
# category with the regex that is matched against the warning message.
for _category, _pattern in (
    (FutureWarning, ".*TRANSFORMERS_CACHE.*"),
    (FutureWarning, ".*resume_download.*deprecated.*"),
    (UserWarning, ".*use_cache=True.*"),
    (UserWarning, ".*use_reentrant parameter should be passed explicitly.*"),
):
    warnings.filterwarnings("ignore", message=_pattern, category=_category)

# Only let WARNING (and above) through for the transformers training loggers.
for _logger_name in (
    "transformers.trainer",
    "transformers.trainer_utils",
    "transformers.training_args",
):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)

# Keep the notebook namespace free of the loop helpers.
del _category, _pattern, _logger_name


In [2]:
import torch
import bitsandbytes
import peft
import accelerate
import datasets
import trl
import warnings

# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Versions of the libraries this notebook depends on.
print("torch version:", torch.__version__)
print("bitsandbytes version:", bitsandbytes.__version__)
print("peft version:", peft.__version__)
print("accelerate version:", accelerate.__version__)
print("datasets version:", datasets.__version__)
print("trl version:", trl.__version__)

# The torch.cuda.* queries below need a CUDA device; guard them so that the
# CPU fallback selected above does not crash this cell.
if device.type == "cuda":
    print(f"Device name: '{torch.cuda.get_device_name()}'")
print("Device:", device)
if device.type == "cuda":
    print(f"Device properties: '{torch.cuda.get_device_properties(torch.cuda.current_device())}'")
    print("Suporta bfloat16." if torch.cuda.is_bf16_supported() else "Não suporta bfloat16.")

torch version: 2.3.1
bitsandbytes version: 0.43.1
peft version: 0.11.1
accelerate version: 0.31.0
datasets version: 2.19.2
trl version: 0.9.4
Device name: 'NVIDIA GeForce GTX 1650'
Device: cuda
Device properties: '_CudaDeviceProperties(name='NVIDIA GeForce GTX 1650', major=7, minor=5, total_memory=3903MB, multi_processor_count=14)'
Suporta bfloat16.


# Dependências

In [3]:
import os
import pandas as pd
import numpy as np

import torch
assert torch.cuda.is_available()

from tqdm import tqdm
import GPUtil

from huggingface_hub import hf_hub_download, login

from datasets import load_dataset

from transformers import BitsAndBytesConfig
from transformers import TrainerCallback, TrainerState, TrainerControl

from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.callbacks import get_openai_callback

from sentence_transformers import SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.util import cos_sim


# Report the active CUDA device. The `assert torch.cuda.is_available()` earlier
# in this cell guarantees a CUDA device exists before these calls run.
print(f"Device name: '{torch.cuda.get_device_name()}'")
print(f"Device properties: '{torch.cuda.get_device_properties(torch.cuda.current_device())}'")
# The message strings are Portuguese: "Supports bfloat16." / "Does not support bfloat16."
print("Suporta bfloat16." if torch.cuda.is_bf16_supported() else "Não suporta bfloat16.")

Device name: 'NVIDIA GeForce GTX 1650'
Device properties: '_CudaDeviceProperties(name='NVIDIA GeForce GTX 1650', major=7, minor=5, total_memory=3903MB, multi_processor_count=14)'
Suporta bfloat16.


# Referências de Embedding Fine Tune

- https://huggingface.co/blog/train-sentence-transformers
- https://huggingface.co/blog/abhishek/finetune-custom-embeddings-autotrain

# Funções

In [4]:
def gpu_summary():
    """Print the id and the total, used and free memory of every GPU listed by GPUtil."""
    for device_info in GPUtil.getGPUs():
        report_lines = (
            f"GPU {device_info.id}:",
            f"  Memória Total: {device_info.memoryTotal} MB",
            f"  Memória Usada: {device_info.memoryUsed} MB",
            f"  Memória Livre: {device_info.memoryFree} MB",
        )
        # One print per GPU; the emitted text is the same four lines.
        print("\n".join(report_lines))


In [5]:
def similarity(a, b, distance="cos"):
    """Compute a similarity score between two embedding vectors.

    Args:
        a: First embedding vector.
        b: Second embedding vector.
        distance: Which measure to use. ``"cos"`` (default) returns the cosine
            similarity computed by ``cos_sim`` as a Python float;
            ``"cartesian"`` returns the dot product ``np.dot(a, b)``.

    Returns:
        The similarity value for the chosen measure.

    Raises:
        ValueError: If ``distance`` is not one of the supported names.
    """
    if distance == "cartesian":
        return np.dot(a, b)
    if distance == "cos":
        # cos_sim returns a matrix of pairwise similarities; take its single entry.
        return float(cos_sim(a, b)[0][0])
    # Previously an unknown name fell through and silently returned None.
    raise ValueError(
        f"Unknown distance {distance!r}; expected 'cos' or 'cartesian'."
    )

In [6]:
def bechmark(model, test_dataset):
    """Return the mean absolute error between model similarity and the labelled score.

    For every row of ``test_dataset`` the two sentences are encoded with
    ``model``, compared with ``similarity`` (default measure), and the
    absolute difference to the row's ``"score"`` is recorded.

    Args:
        model: Object exposing ``encode(text)``.
        test_dataset: Iterable of rows with ``sentence1``, ``sentence2`` and ``score`` keys.

    Returns:
        The mean of the per-row absolute differences.
    """
    abs_errors = []
    for row in tqdm(test_dataset):
        first_embedding = model.encode(row['sentence1'])
        second_embedding = model.encode(row['sentence2'])
        predicted = similarity(first_embedding, second_embedding)
        abs_errors.append(np.abs(predicted - row["score"]))
    return np.mean(abs_errors)

In [7]:
class SaveCheckpointCallback(TrainerCallback):
    """Trainer callback that announces each checkpoint save on stdout."""

    def on_save(self, args, state, control, **kwargs):
        """Print the global step at which the save event fired."""
        current_step = state.global_step
        print(f"Saving checkpoint at step {current_step}")

class EarlyStoppingCallback(TrainerCallback):
    """Stop training once ``eval_loss`` has stopped improving.

    An evaluation counts as an improvement only when ``eval_loss`` falls below
    the best value seen so far by more than ``early_stopping_threshold``.
    After ``early_stopping_patience`` consecutive evaluations without such an
    improvement, ``control.should_training_stop`` is set to ``True``.

    Args:
        early_stopping_patience: Number of consecutive non-improving
            evaluations tolerated before training is stopped.
        early_stopping_threshold: Minimum decrease of ``eval_loss`` that is
            treated as an improvement.
    """

    def __init__(self, early_stopping_patience=3, early_stopping_threshold=0.02):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None  # lowest eval_loss accepted as an improvement so far
        self.counter = 0  # evaluations since the last improvement

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics=None, **kwargs):
        """Update the patience counter from ``metrics`` and request a stop when it runs out."""
        # `metrics` defaults to None; calling .get() on it would raise AttributeError.
        if metrics is None:
            return

        current_metric = metrics.get("eval_loss")  # Use the relevant metric for your task

        if current_metric is None:
            return

        if self.best_metric is None or current_metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = current_metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.early_stopping_patience:
                control.should_training_stop = True
                print(f"Early stopping at step {state.global_step} with best eval_loss = {self.best_metric}")


class SaveMetricsCallback(TrainerCallback):
    """Persist evaluation metrics and the trainer state under ``output_dir``.

    Metrics are written to ``metrics.json`` as JSON Lines with one record per
    step. Records already present in that file are loaded at construction time
    and kept. The trainer state is written to ``state.json`` after every step.

    Args:
        output_dir: Directory that receives ``metrics.json`` and ``state.json``;
            it is created when missing.
    """

    def __init__(self, output_dir):
        # exist_ok=True replaces the former exists()-then-makedirs() pair, which
        # could fail if the directory appeared between the check and the create.
        os.makedirs(output_dir, exist_ok=True)
        self.output_dir = output_dir
        self.output_path = os.path.join(output_dir, "metrics.json")
        self.state_path = os.path.join(output_dir, "state.json")
        print(f"Output directory initialized at {output_dir}")
        self.metrics = self.load_existing_metrics()

    def load_existing_metrics(self):
        """Return the records stored in ``metrics.json``, or an empty list if the file is absent."""
        if os.path.isfile(self.output_path):
            return pd.read_json(self.output_path, lines=True).to_dict('records')
        return []

    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Write the current trainer state to ``state.json`` after each step."""
        state.save_to_json(self.state_path)

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics=None, **kwargs):
        """Append the evaluation metrics for the current step and rewrite ``metrics.json``."""
        if metrics:
            step = state.global_step
            _metrics = {"Step": step, **metrics}
            self.metrics.append(_metrics)
            # When a step was recorded more than once, only the latest record is written.
            metrics_df = pd.DataFrame(self.metrics).drop_duplicates(subset=['Step'], keep='last')
            metrics_df.to_json(self.output_path, orient="records", lines=True)



## Baseline

In [8]:
# Load the pretrained checkpoint used as the baseline.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

In [9]:
# Print GPU memory usage now that the baseline model has been loaded.
gpu_summary()

GPU 0:
  Memória Total: 4096.0 MB
  Memória Usada: 198.0 MB
  Memória Livre: 3705.0 MB


# Dataset de treinamento

In [10]:
# Take a fixed, seeded 50,000-row sample of the all-nli "pair-score" training data.
dataset = load_dataset("sentence-transformers/all-nli", "pair-score", split="train")
dataset = dataset.shuffle(seed=42).select(range(50000))

# Split into train and test. The seed keeps the partition identical across runs;
# without it every kernel restart would put different rows in the test set.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Carve a validation set out of the training portion (seeded for the same reason).
train_validation_split = train_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = train_validation_split['train']
validation_dataset = train_validation_split['test']

# Report the size of each split.
print(f"Train dataset: {len(train_dataset)} samples")
print(f"Validation dataset: {len(validation_dataset)} samples")
print(f"Test dataset: {len(test_dataset)} samples")

Train dataset: 38000 samples
Validation dataset: 2000 samples
Test dataset: 10000 samples


# Dataset de validação

In [11]:
# Evaluator over the validation split, with cosine as the main similarity function.
# An earlier variant, kept for reference, evaluated on STS-B instead:
#   eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
#   eval_dataset = eval_dataset.shuffle(42).select(range(100,1100))
dev_evaluator = EmbeddingSimilarityEvaluator(
    name="sts-dev",
    main_similarity=SimilarityFunction.COSINE,
    scores=validation_dataset["score"],
    sentences1=validation_dataset["sentence1"],
    sentences2=validation_dataset["sentence2"],
)
# To run the evaluation by hand: print(dev_evaluator(model))

# Modelo

In [18]:
# Load a fresh copy of the pretrained checkpoint; this is the instance that gets fine-tuned.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

# Parâmetros de treinamento

In [19]:
from sentence_transformers import losses
output_dir = "models/finetune-MiniLM-pair-score"

# CosineSimilarityLoss needs text pairs labelled with a floating point similarity score.
# Alternatives for the same data format: losses.CoSENTLoss(model), losses.AnglELoss(model)
loss = losses.CosineSimilarityLoss(model)

args = SentenceTransformerTrainingArguments(
    output_dir=output_dir,
    seed=42,

    # Optimisation schedule
    num_train_epochs=2,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,

    # Batching and numeric precision
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    bf16=True,   # enable only if the GPU supports BF16
    fp16=False,  # keep disabled if the GPU can't handle FP16

    # Validation: evaluate every 100 steps
    do_eval=True,
    eval_strategy="steps",
    eval_steps=100,

    # Checkpoints: save every 1000 steps (use save_strategy="epoch" to save once
    # per epoch instead); save_total_limit caps how many checkpoints are kept
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,

    # Logging
    log_level="warning",
    logging_steps=20,
)


# Treinamento

In [20]:
# Alternative callback sets tried in earlier runs (kept for reference):
#   [SaveCheckpointCallback(), EarlyStoppingCallback( early_stopping_threshold=0.0005)]
#   [SaveCheckpointCallback(), EarlyStoppingCallback( early_stopping_threshold=0.0005), SaveMetricsCallback(output_dir)]
trainer = SentenceTransformerTrainer(
    model=model,
    loss=loss,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    evaluator=dev_evaluator,
    callbacks=[SaveCheckpointCallback(), SaveMetricsCallback(output_dir)],
)

# Run the fine-tuning loop, then write the resulting model to output_dir.
trainer.train()
model.save(output_dir)

Output directory initialized at models/finetune-MiniLM-pair-score


Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine,Sts-dev Pearson Manhattan,Sts-dev Spearman Manhattan,Sts-dev Pearson Euclidean,Sts-dev Spearman Euclidean,Sts-dev Pearson Dot,Sts-dev Spearman Dot,Sts-dev Pearson Max,Sts-dev Spearman Max
100,0.1178,0.120141,0.54074,0.543125,0.533978,0.539115,0.538133,0.543125,0.54074,0.543125,0.54074,0.543125
200,0.1157,0.109989,0.587898,0.593245,0.579038,0.589589,0.582791,0.593245,0.587898,0.593245,0.587898,0.593245
300,0.1118,0.102545,0.619357,0.627646,0.612093,0.625258,0.614952,0.627646,0.619357,0.627646,0.619357,0.627646
400,0.1074,0.099776,0.630493,0.639352,0.625103,0.637643,0.627204,0.639352,0.630493,0.639352,0.630493,0.639352
500,0.0908,0.098049,0.638798,0.646097,0.633962,0.644551,0.635893,0.646098,0.638798,0.646097,0.638798,0.646098
600,0.0959,0.09783,0.64288,0.659225,0.646922,0.65778,0.64856,0.659225,0.64288,0.659225,0.64856,0.659225
700,0.097,0.095725,0.65007,0.657853,0.644723,0.655863,0.646498,0.657853,0.65007,0.657853,0.65007,0.657853
800,0.0981,0.093685,0.659617,0.670649,0.657987,0.66878,0.659815,0.670649,0.659617,0.670649,0.659815,0.670649
900,0.0902,0.091891,0.667223,0.674608,0.661591,0.672411,0.663813,0.674608,0.667223,0.674608,0.667223,0.674608
1000,0.0922,0.092472,0.668005,0.674176,0.661632,0.672179,0.663859,0.674176,0.668005,0.674176,0.668005,0.674176


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Saving checkpoint at step 1000
Saving checkpoint at step 2000
Saving checkpoint at step 3000
Saving checkpoint at step 4000


# Avaliação

In [21]:
def bechmark(model, test_dataset):
    """Return the mean absolute error between model similarity and the labelled score.

    For every row of ``test_dataset`` the two sentences are encoded with
    ``model``, compared with ``similarity`` (default measure), and the
    absolute difference to the row's ``"score"`` is recorded.

    Args:
        model: Object exposing ``encode(text)``.
        test_dataset: Iterable of rows with ``sentence1``, ``sentence2`` and ``score`` keys.

    Returns:
        The mean of the per-row absolute differences.
    """
    abs_errors = []
    for row in tqdm(test_dataset):
        first_embedding = model.encode(row['sentence1'])
        second_embedding = model.encode(row['sentence2'])
        predicted = similarity(first_embedding, second_embedding)
        abs_errors.append(np.abs(predicted - row["score"]))
    return np.mean(abs_errors)

In [22]:
# Reload the fine-tuned model from disk and compute its mean absolute error on the test split.
tuned_model = SentenceTransformer("models/finetune-MiniLM-pair-score")
bechmark(tuned_model, test_dataset)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:20<00:00, 123.62it/s]


0.23825372176468373

In [36]:
# Same benchmark on the untouched pretrained checkpoint, for comparison with the fine-tuned model.
base_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
bechmark(base_model, test_dataset)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:22<00:00, 121.62it/s]


0.29415455153742803

In [28]:
import gc

# Drop the notebook-level references to the model and trainer so their memory
# can be reclaimed. Only NameError is tolerated (the name was never created or
# was already deleted); any other failure should surface.
try:
    del model
except NameError:
    pass

try:
    del trainer
except NameError:
    pass

# Two collection passes, as in the original cell.
# NOTE(review): presumably the second pass is meant to catch objects released
# by the first — confirm whether it is needed.
gc.collect()
gc.collect()

# Release cached CUDA memory held by PyTorch.
torch.cuda.empty_cache()

---
# Comparação

In [13]:
from sentence_transformers import losses

# Start the comparison run from a fresh copy of the base checkpoint: the `model`
# used earlier has already been fine-tuned and is deleted by the memory
# clean-up cell above, so reusing the name would fail or train the wrong weights.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
train_loss = losses.MultipleNegativesRankingLoss(model=model)

In [16]:
import datasets
from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer, losses, InputExample

def generate_dataloader(
    dataset: datasets.arrow_dataset.Dataset,
    question_column: str,
    answer_column: str,
    shuffle: bool = True,
    batch_size: int = 16,
    *args,
    **kwargs,
):
    """Build a ``DataLoader`` of two-text ``InputExample`` objects.

    Args:
        dataset: Rows to convert; each row is indexed by column name.
        question_column: Column providing the first text of each example.
        answer_column: Column providing the second text of each example.
        shuffle: Forwarded to ``DataLoader``.
        batch_size: Forwarded to ``DataLoader``.
        *args: Extra positional arguments for ``DataLoader``; they continue
            after ``batch_size`` and ``shuffle`` in its signature.
        **kwargs: Extra keyword arguments for ``DataLoader``.

    Returns:
        A ``DataLoader`` over the list of ``InputExample`` objects.
    """
    examples = [
        InputExample(texts=[row[question_column], row[answer_column]])
        for row in dataset
    ]
    # batch_size and shuffle are passed positionally (DataLoader's 2nd and 3rd
    # parameters). Passing them as keywords ahead of *args made any extra
    # positional argument land on batch_size and raise "multiple values".
    return DataLoader(examples, batch_size, shuffle, *args, **kwargs)


# Wrap the sentence pairs of each split in a DataLoader of InputExample objects.
# NOTE(review): the default shuffle=True also applies to the test loader — confirm this is intended.
train_dataloader = generate_dataloader(train_dataset, 'sentence1', 'sentence2')
test_dataloader = generate_dataloader(test_dataset, 'sentence1', 'sentence2')

In [17]:
%%time
# Comparison run: train with MultipleNegativesRankingLoss through the
# `model.fit` API and save the result to its own directory.
output_dir="models/finetune-MiniLM-comparison"



# model = SentenceTransformer(model_id)

# Alternative objective: train_loss = losses.CosineSimilarityLoss(model=model)
train_loss = losses.MultipleNegativesRankingLoss(model=model) # use if you have related sentence pairs
# Alternative objective: train_loss = losses.TripletLoss(model=model)  # for (anchor, positive, negative) triplets


model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=500,
)

model.save(output_dir)

Step,Training Loss
500,0.2867
1000,0.2428
1500,0.2376


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

CPU times: user 4min 19s, sys: 8.45 s, total: 4min 27s
Wall time: 4min 20s


In [28]:

# Reload the pair-score fine-tuned model from disk.
# NOTE(review): the benchmark call below is commented out, so the output stored
# under this cell comes from an earlier execution of it.
tuned_model = SentenceTransformer("models/finetune-MiniLM-pair-score")
# bechmark(tuned_model, test_dataset)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:32<00:00, 108.09it/s]


1.000000054460764

In [29]:
# Reload the comparison model (trained with MultipleNegativesRankingLoss) from disk.
# NOTE(review): the benchmark call below is commented out, so the output stored
# under this cell comes from an earlier execution of it.
output_dir="models/finetune-MiniLM-comparison"
tuned_model_comparison = SentenceTransformer(output_dir)
# bechmark(tuned_model_comparison, test_dataset)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:21<00:00, 121.98it/s]


0.6409838970854412

In [34]:
# Load the three models compared side by side: the pretrained base, the
# pair-score fine-tune, and the comparison fine-tune.
base_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
tuned_model = SentenceTransformer("models/finetune-MiniLM-pair-score")
tuned_model_comparison = SentenceTransformer("models/finetune-MiniLM-comparison")


In [38]:
# Draw one random row from the test split and unpack its fields.
temp = test_dataset.to_pandas().sample().iloc[0].to_dict()
sentence1 = temp["sentence1"]
sentence2 = temp["sentence2"]
score = temp["score"]

# Emit a blank separator line. The original bare `print` (no parentheses) was an
# expression statement that printed nothing.
print()
print(f"sentence1 = {sentence1}")
print(f"sentence2 = {sentence2}")
print(f"score = {score}")
# Similarity of the same pair under each model, to compare against the labelled score.
print(f"base model = {similarity(base_model.encode(sentence1), base_model.encode(sentence2))}")
print(f"tuned_model = {similarity(tuned_model.encode(sentence1), tuned_model.encode(sentence2))}")
print(f"tuned_model_comparison = {similarity(tuned_model_comparison.encode(sentence1), tuned_model_comparison.encode(sentence2))}")

sentence1 = The right cannot understand that ordinary Americans sparked the '60s cultural revolution.
sentence2 = The right understands how ordinary Americans sparked a cultural revolution in the 60s because the right was there at the time.
score = 0.0
base model = 0.8458305597305298
tuned_model = 0.9999999403953552
tuned_model_comparison = 0.9482017755508423


In [21]:
# Load the comparison fine-tune for an ad-hoc similarity check.
output_dir="models/finetune-MiniLM-comparison"
tuned_model_1 = SentenceTransformer(output_dir)

In [24]:
# Portuguese question/answer pair ("what is your father's name?" / "his name is joao").
q = "qual o nome do seu pai?"
a = "ele se chama joao"

# Similarity of the pair under the comparison fine-tune (default measure of `similarity`).
similarity(tuned_model_1.encode(q), tuned_model_1.encode(a))

0.5211070775985718

In [26]:
# Same question/answer pair (`q`, `a` from the previous cell) under the pair-score fine-tune.
output_dir="models/finetune-MiniLM-pair-score"
tuned_model_2 = SentenceTransformer(output_dir)

similarity(tuned_model_2.encode(q), tuned_model_2.encode(a))

1.0000001192092896

In [None]:
def bechmark(model, test_dataset):
    """Return the mean absolute error between model similarity and the labelled score.

    For every row of ``test_dataset`` the two sentences are encoded with
    ``model``, compared with ``similarity`` (default measure), and the
    absolute difference to the row's ``"score"`` is recorded.

    Args:
        model: Object exposing ``encode(text)``.
        test_dataset: Iterable of rows with ``sentence1``, ``sentence2`` and ``score`` keys.

    Returns:
        The mean of the per-row absolute differences.
    """
    abs_errors = []
    for row in tqdm(test_dataset):
        first_embedding = model.encode(row['sentence1'])
        second_embedding = model.encode(row['sentence2'])
        predicted = similarity(first_embedding, second_embedding)
        abs_errors.append(np.abs(predicted - row["score"]))
    return np.mean(abs_errors)

In [None]:
# Point `tuned_model` at the comparison fine-tune.
# NOTE(review): this rebinds `tuned_model`, which other cells use for the pair-score model.
output_dir="models/finetune-MiniLM-comparison"
tuned_model = SentenceTransformer(output_dir)

In [None]:
# Display the current value of `output_dir`.
output_dir

In [None]:
output_dir = "models/finetune-MiniLM-pair-score"

# CoSENTLoss needs text pairs labelled with a floating point similarity score.
loss = CoSENTLoss(model)

args = SentenceTransformerTrainingArguments(
    output_dir=output_dir,
    seed=42,

    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,

    # Batching and numeric precision
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    bf16=True,   # enable only if the GPU supports BF16
    fp16=False,  # keep disabled if the GPU can't handle FP16

    # Validation: evaluate every 100 steps
    do_eval=True,
    eval_strategy="steps",
    eval_steps=100,

    # Checkpoints: save every 1000 steps (use save_strategy="epoch" to save once
    # per epoch instead); save_total_limit caps how many checkpoints are kept
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,

    # Logging
    log_level="warning",
    logging_steps=20,
)


In [11]:
tuned_model = SentenceTransformer("models/finetune-MiniLM-pair-score")

# One Portuguese question and four candidate sentences to rank against it.
question = ['Qual é a cor do céu?']
answers = [
    "O céu é azul",
    "Eu como ovos no café da manhã",
    "Qual é a cor do mar?",
    "Quão alto é o céu?",
]

# Encode the question and the candidates.
question_embedding = tuned_model.encode(question)
answers_embeddings = tuned_model.encode(answers)

# The question was encoded as a one-element list; take its single embedding.
emb_q = question_embedding[0]
similarities = [similarity(emb_a, emb_q) for emb_a in answers_embeddings]

# Print each candidate next to its similarity with the question.
for a, s in zip(answers, similarities):
    print(f"{a} --- {s}")


O céu é azul --- 0.8213341236114502
Eu como ovos no café da manhã --- 0.7087804079055786
Qual é a cor do mar? --- 0.9428924322128296
Quão alto é o céu? --- 0.8686226606369019


In [12]:
%%time
# Run the validation evaluator on the fine-tuned model and print the metrics it returns.
print(dev_evaluator(tuned_model))

{'sts-dev_pearson_cosine': 0.7696243947343424, 'sts-dev_spearman_cosine': 0.7890590202325146, 'sts-dev_pearson_manhattan': 0.7866918847058119, 'sts-dev_spearman_manhattan': 0.788402013180287, 'sts-dev_pearson_euclidean': 0.7867983537211847, 'sts-dev_spearman_euclidean': 0.7890590202325146, 'sts-dev_pearson_dot': 0.769624392140128, 'sts-dev_spearman_dot': 0.7890590202325146, 'sts-dev_pearson_max': 0.7867983537211847, 'sts-dev_spearman_max': 0.7890590202325146}
CPU times: user 1.98 s, sys: 67.6 ms, total: 2.05 s
Wall time: 1.79 s
