# SetFit Models


In [None]:
# parameters
# Pipeline placeholders, all None here — presumably overwritten by the notebook
# runner when the notebook is executed as a pipeline task (TODO confirm).
upstream = None
product = None
some_param = None

# Hugging Face model identifier passed to SetFitModel.from_pretrained.
model_id = "cardiffnlp/twitter-roberta-large-2022-154m"

# Values consumed by the SetFit TrainingArguments cell.
max_steps = 10000
num_epochs = 10
eval_steps = 150  # also reused as save_steps in the training cell
batch_size = 24

# Values not referenced by the cells visible in this notebook section —
# presumably consumed elsewhere; kept as-is.
warmup_ratio = 0.1
learning_rate = 2e-5
num_train_epochs = 10
weight_decay = 0.01
per_device_train_batch_size = 16
per_device_eval_batch_size = 256
gradient_checkpointing = True

In [2]:
import sys

# Add the directory two levels up to the import search path. The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
if "../../" not in sys.path:
    sys.path.append("../../")

In [None]:
import os
from functools import partial
from pprint import pprint
from typing import List

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback

In [None]:
# Use the local ./data directory when it exists; otherwise fall back to the
# data directory two levels up.
ROOT_DIR = "./data/" if os.path.exists("./data") else "../../data/"

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def compute_metrics(y_pred, y_test, labels: List[str] = None):
    """Compute evaluation metrics for multi-label classification.

    Args:
        y_pred: Predicted labels, array-like of shape (n_samples, n_labels).
        y_test: True labels, array-like of shape (n_samples, n_labels).
        labels: Optional label names used as prefixes of the per-label metric
            keys. When None, or when its length does not match the number of
            label columns in ``y_test``, the column indices are used instead.

    Returns:
        dict: For every label, the keys ``<label>_avg_acc``, ``<label>_avg_prec``,
        ``<label>_avg_rec`` and ``<label>_avg_f1``; plus ``macro_acc``,
        ``macro_prec``, ``macro_rec`` and ``macro_f1`` computed over all labels.
    """
    y_pred = np.array(y_pred)
    y_test = np.array(y_test)

    n_labels = y_test.shape[1]
    if (labels is None) or (len(labels) != n_labels):
        labels = list(range(n_labels))

    metrics = {}
    for i, name in enumerate(labels):
        true_col = y_test[:, i]
        pred_col = y_pred[:, i]
        metrics[f"{name}_avg_acc"] = accuracy_score(true_col, pred_col)
        metrics[f"{name}_avg_prec"] = precision_score(true_col, pred_col, zero_division=0)
        metrics[f"{name}_avg_rec"] = recall_score(true_col, pred_col, zero_division=0)
        metrics[f"{name}_avg_f1"] = f1_score(true_col, pred_col, zero_division=0)

    # NOTE: accuracy_score on the full label matrix is the exact-match (subset)
    # accuracy for multilabel indicator input, not an average of per-label
    # accuracies, despite the "macro_" prefix of the key.
    metrics["macro_acc"] = accuracy_score(y_test, y_pred)
    # zero_division=0 matches the per-label calls above: a label with no
    # predicted/true positives scores 0 without raising UndefinedMetricWarning.
    metrics["macro_prec"] = precision_score(y_test, y_pred, average="macro", zero_division=0)
    metrics["macro_rec"] = recall_score(y_test, y_pred, average="macro", zero_division=0)
    metrics["macro_f1"] = f1_score(y_test, y_pred, average="macro", zero_division=0)

    return metrics

In [6]:
# Read the provided train, dev and eval splits (tab-separated files under
# ROOT_DIR). The dev split is bound to the "test" dataframe name.
subtask4a_train_df, subtask4a_test_df, subtask4a_eval_df = (
    pd.read_csv(os.path.join(ROOT_DIR, relative_path), sep="\t")
    for relative_path in (
        "processed/task4/subtask_4a/ct_train_clean.tsv",
        "processed/task4/subtask_4a/ct_dev_clean.tsv",
        "processed/task4/subtask_4a/ct_eval_clean.tsv",
    )
)

## Basic Train/Dev Evaluation

We use the train/dev split from the repository rather than the folds.


In [None]:
# Wrap the dataframes into Hugging Face datasets.


def _labelled_dataset(frame):
    """Return a Dataset with the frame's text and one list of three labels per row."""
    label_rows = frame[
        ["scientific_claim", "scientific_reference", "scientific_entities"]
    ].values.tolist()
    return Dataset.from_dict({"text": frame["text"], "labels": label_rows})


ds_train = _labelled_dataset(subtask4a_train_df)
ds_dev = _labelled_dataset(subtask4a_test_df)

# The eval split is built from the text column only (no labels).
ds_eval = Dataset.from_dict({"text": subtask4a_eval_df["text"]})

# The dev split is registered under the "test" key.
ds = DatasetDict({"train": ds_train, "test": ds_dev, "eval": ds_eval})
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1228
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 137
    })
    eval: Dataset({
        features: ['text'],
        num_rows: 240
    })
})

In [None]:
import gc

import torch

# Free memory before the model is loaded in the next cell: run Python's garbage
# collector first, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Load the sentence-transformer body with a one-vs-rest multi-label head.
model = SetFitModel.from_pretrained(
    model_id,
    multi_target_strategy="one-vs-rest",
    trust_remote_code=True,
    torch_dtype="auto",
)

# Last path component of the model id, reused for the MLflow experiment name
# and for the checkpoint/log directories.
model_name = model_id.split("/")[-1]

os.environ["MLFLOW_EXPERIMENT_NAME"] = f"setfit-*-{model_name}-dev"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"

checkpoint_dir = os.path.join(ROOT_DIR, f"results/setfit/checkpoints/{model_name}")
log_dir = os.path.join(ROOT_DIR, f"results/setfit/logs/{model_name}")

callbacks = [EarlyStoppingCallback(early_stopping_patience=4)]

args = TrainingArguments(
    batch_size=batch_size,
    num_epochs=num_epochs,
    # NOTE(review): the original note reports 46,540 steps for the uncapped
    # run; max_steps (set in the parameters cell) caps the training length.
    max_steps=max_steps,
    loss=CosineSimilarityLoss,
    sampling_strategy="oversampling",
    use_amp=True,  # mixed precision to reduce GPU memory usage
    eval_strategy="steps",
    eval_steps=eval_steps,
    save_steps=eval_steps,  # checkpoints are saved at the same interval as evaluation
    load_best_model_at_end=True,
    output_dir=checkpoint_dir,
    logging_dir=log_dir,
)

# Metric callable with the label names pre-bound, so the per-label metric keys
# carry the column names.
metric_fn = partial(
    compute_metrics,
    labels=["scientific_claim", "scientific_reference", "scientific_entities"],
)

# FIXME should use an eval dataset for early stopping rather than the test set.
# Check: https://github.com/huggingface/setfit/issues/424
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],  # Dataset.from_dict(ds["train"][0:100]) for a quick smoke run
    eval_dataset=ds["test"],
    column_mapping={"text": "text", "labels": "label"},
    metric=metric_fn,
    callbacks=callbacks,
)
trainer.train()

# Evaluate on the dev ("test") split and tag the result with the model id.
eval_result = trainer.evaluate()
eval_result["model"] = model_id

pprint(eval_result)