# SetFit Models

SetFit does not support sentence-transformers > 4.0 yet, so we install an older version.

**WARNING: This install ignores the version pinned by Poetry for the package. If you run another notebook afterwards, run `poetry install` again.**

In [None]:
%pip install "sentence-transformers[train]==3.4.1"

In [1]:
import sys

# Add the directory two levels up (relative to the working directory) to the
# import path -- presumably the project root, so local packages can be imported.
# Guarded so that re-running this cell does not append duplicate entries.
if "../../" not in sys.path:
    sys.path.append("../../")

In [2]:
from typing import List

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
def create_multilabel_folds(dataset: Dataset, n_splits: int = 5, random_state: int = None) -> List[DatasetDict]:
    """Split a multi-label dataset into shuffled cross-validation folds.

    The row positions of ``dataset`` are partitioned with
    ``MultilabelStratifiedKFold`` (shuffling enabled), using the dataset's
    ``"labels"`` column as the stratification target.

    Args:
        dataset (Dataset): Dataset to split; it must expose a ``"labels"`` column.
        n_splits (int): Number of folds to produce. Default is 5.
        random_state (int): Seed forwarded to the splitter for reproducibility.

    Returns:
        list: One DatasetDict per fold, each holding a ``"train"`` and a ``"test"`` split.
    """
    stratifier = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    targets = dataset["labels"]
    row_positions = range(len(dataset))

    # Each split yields a pair of index arrays selecting the train and test rows.
    return [
        DatasetDict({"train": dataset.select(train_rows), "test": dataset.select(test_rows)})
        for train_rows, test_rows in stratifier.split(row_positions, targets)
    ]

In [4]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def compute_metrics(y_pred, y_test, labels: List[str] = None):
    """Compute evaluation metrics for multi-label classification.

    Args:
        y_pred (array-like): Predicted labels, shape (n_samples, n_labels).
        y_test (array-like): True labels, shape (n_samples, n_labels).
        labels (List[str], optional): List of label names used as metric-key prefixes.
            If None, or if its length does not match the number of label columns,
            numeric column indices are used instead.

    Returns:
        dict: A dictionary containing accuracy, precision, recall, and F1 score for each label
        (keys ``"<label>_avg_acc"``, ``"<label>_avg_prec"``, ``"<label>_avg_rec"``,
        ``"<label>_avg_f1"``), as well as ``"macro_acc"``, ``"macro_prec"``, ``"macro_rec"``
        and ``"macro_f1"`` computed across all labels.
    """
    # np.asarray converts without copying when possible and behaves the same on
    # NumPy 1.x and 2.x (unlike the ``copy=None`` keyword of np.array).
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)

    n_labels = y_test.shape[1]
    if (labels is None) or (len(labels) != n_labels):
        labels = list(range(n_labels))

    metrics = {}
    for i, name in enumerate(labels):
        true_col = y_test[:, i]
        pred_col = y_pred[:, i]
        metrics[f"{name}_avg_acc"] = accuracy_score(true_col, pred_col)
        metrics[f"{name}_avg_prec"] = precision_score(true_col, pred_col, zero_division=0)
        metrics[f"{name}_avg_rec"] = recall_score(true_col, pred_col, zero_division=0)
        metrics[f"{name}_avg_f1"] = f1_score(true_col, pred_col, zero_division=0)

    # NOTE: on multi-label input scikit-learn's accuracy_score is the subset accuracy
    # (a sample counts only if every label matches), not an average of per-label accuracies.
    metrics["macro_acc"] = accuracy_score(y_test, y_pred)
    # zero_division=0 mirrors the per-label calls: labels without predicted/true positives
    # score 0 (the same value as the default) without raising UndefinedMetricWarning.
    metrics["macro_prec"] = precision_score(y_test, y_pred, average="macro", zero_division=0)
    metrics["macro_rec"] = recall_score(y_test, y_pred, average="macro", zero_division=0)
    metrics["macro_f1"] = f1_score(y_test, y_pred, average="macro", zero_division=0)

    return metrics

## Default Dataset Train/Evaluation

Train and evaluate the model on the non-extended dataset, using a SetFit model with the whole data.


In [5]:
# Read the cleaned subtask 4a training data (tab-separated file).
subtask4a_df = pd.read_csv(
    "../../data/processed/task4/subtask_4a/ct_train_data_clean.tsv",
    sep="\t",
)

# Assemble the Hugging Face dataset: one text per row together with its label
# vector, taken from the three label columns in a fixed order.
ids = subtask4a_df.index.values
texts = subtask4a_df["text"].values
labels = subtask4a_df[["scientific_claim", "scientific_reference", "scientific_entities"]].values.tolist()
full_ds = Dataset.from_dict(dict(text=texts, labels=labels))

# Split into 5 cross-validation folds with a fixed seed, then display them.
folds_ds = create_multilabel_folds(full_ds, random_state=1435892670, n_splits=5)
folds_ds

[DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1091
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_rows: 273
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1091
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_rows: 273
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1091
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_rows: 273
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1091
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_rows: 273
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1092
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_row

In [None]:
from functools import partial
from pprint import pprint

from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, Trainer, TrainingArguments
from tqdm.notebook import tqdm

# Names of the label columns, in the same order used to build the "labels" vectors.
LABEL_NAMES = ["scientific_claim", "scientific_reference", "scientific_entities"]

# Number of folds evaluated per (model, strategy) pair. Only the first fold is run
# (presumably to keep the run time manageable); set to len(folds_ds) for full
# cross-validation.
MAX_FOLDS = 1

# Collected evaluation results: one dict per (model, strategy, fold) run.
evals = []

# Identify the most promising models based on MTEB v2 for text classification in English: https://huggingface.co/spaces/mteb/leaderboard
# Also include "all-MiniLM-L6-v2" as baseline
model_ids = [
    "all-MiniLM-L6-v2",
    "Salesforce/SFR-Embedding-2_R",
    "NovaSearch/stella_en_1.5B_v5",
    "BAAI/bge-en-icl",
]

# TODO also compare the "classifier-chain" strategy
# TODO https://huggingface.co/docs/setfit/how_to/hyperparameter_optimization
for model_id in tqdm(model_ids, desc="Running model"):
    print(f"Training/Evaluating: {model_id}.")

    for strategy in tqdm(["one-vs-rest", "multi-output"], desc="Evaluating strategy."):
        selected_folds = folds_ds[:MAX_FOLDS]
        for fold, ds in tqdm(enumerate(selected_folds, start=1), desc="Running folds", total=len(selected_folds)):
            # Load a fresh model for every run so no fine-tuned weights carry over.
            model = SetFitModel.from_pretrained(
                model_id,
                multi_target_strategy=strategy,
            )

            args = TrainingArguments(
                batch_size=32,
                num_epochs=1,
                loss=CosineSimilarityLoss,
                sampling_strategy="oversampling",
            )

            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=ds["train"],
                eval_dataset=ds["test"],
                # Map the dataset columns onto the "text"/"label" names given to the trainer.
                column_mapping={"text": "text", "labels": "label"},
                metric=partial(compute_metrics, labels=LABEL_NAMES),
            )
            trainer.train()

            # Tag the metrics with the run identifiers so they can be indexed later.
            eval_result = trainer.evaluate()
            eval_result["fold"] = fold
            eval_result["model"] = model_id
            eval_result["strategy"] = strategy

            pprint(eval_result)

            evals.append(eval_result)

In [7]:
# Gather the per-run metric dicts into one table keyed by model, strategy and fold.
results_df = pd.DataFrame(evals)
results_df.set_index(["model", "strategy", "fold"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,scientific_claim_avg_acc,scientific_claim_avg_prec,scientific_claim_avg_rec,scientific_claim_avg_f1,scientific_reference_avg_acc,scientific_reference_avg_prec,scientific_reference_avg_rec,scientific_reference_avg_f1,scientific_entities_avg_acc,scientific_entities_avg_prec,scientific_entities_avg_rec,scientific_entities_avg_f1,macro_acc,macro_prec,macro_rec,macro_f1
model,strategy,fold,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
all-MiniLM-L6-v2,one-vs-rest,1,0.750916,0.521127,0.521127,0.521127,0.81685,0.0,0.0,0.0,0.85348,0.833333,0.514706,0.636364,0.611722,0.451487,0.345278,0.38583
all-MiniLM-L6-v2,multi-output,1,0.750916,0.521127,0.521127,0.521127,0.81685,0.0,0.0,0.0,0.85348,0.833333,0.514706,0.636364,0.611722,0.451487,0.345278,0.38583


## Extended Dataset Evaluation

We add all the additional data as part of the model training.
