# SetFit with a separate binary classifier per label, trained on the original plus augmented (oversampled) data


In [None]:
# Parameters
# parameters
# NOTE(review): `upstream`, `product` and `some_param` are not read by any cell
# visible in this notebook; presumably placeholders overwritten by the pipeline
# runner at execution time — confirm before removing.
upstream = None
product = None
some_param = None


# Short model label (not referenced by the cells shown in this notebook).
model_name = "bge-base-en-v1.5"
# Checkpoint identifier; passed to the training helper, which hands it to
# SetFitModel.from_pretrained.
model_id = "BAAI/bge-base-en-v1.5"
# Training hyper-parameters forwarded to fine_tune_setfit_binary_classifier.
epochs = 1
num_iterations = 1
batch_size = 32

In [None]:
import gc
import os
from pprint import pprint

import pandas as pd
import setfit
from datasets import Dataset, DatasetDict
from sklearn.utils import shuffle

In [None]:
# Pick the dataset folder relative to the current working directory: use the
# local ./data tree when it is present, otherwise the one two levels up.
ROOT_DIR = (
    "./data/processed/task4/subtask_4a/"
    if os.path.exists("./data")
    else "../../data/processed/task4/subtask_4a/"
)

In [4]:
import evaluate

# Load the `evaluate` metric objects used by
# compute_metrics_sequenceclassification (same order as the names on the left).
accuracy, f1, precision, recall = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "f1", "precision", "recall")
)

In [None]:
from typing import List

import numpy as np
import torch
from transformers import EvalPrediction


def compute_metrics_sequenceclassification(eval_predictions: EvalPrediction):
    """Compute classification metrics from a model's raw logits.

    The predicted class of every sample is the index of its largest logit
    (``np.argmax`` over axis 1). No softmax and no probability threshold are
    applied. Scores are produced by the module-level ``evaluate`` metric
    objects ``f1``, ``precision``, ``recall`` and ``accuracy``.

    Args:
        eval_predictions (EvalPrediction): Unpacked as ``(predictions, labels)``,
            where ``predictions`` holds the logits and ``labels`` the reference
            class ids.

    Returns:
        dict: Keys ``"f1"``, ``"precision"``, ``"recall"`` and ``"accuracy"``,
        each mapped to whatever the corresponding metric's ``compute`` call
        returns.
    """
    logits, labels = eval_predictions
    predicted_classes = np.argmax(logits, axis=1)

    # Compute F1 once and reuse it for both the progress print and the result.
    f1_result = f1.compute(predictions=predicted_classes, references=labels)
    print(f"f1: {f1_result}")

    return {
        "f1": f1_result,
        "precision": precision.compute(
            predictions=predicted_classes, references=labels
        ),
        "recall": recall.compute(predictions=predicted_classes, references=labels),
        "accuracy": accuracy.compute(
            predictions=predicted_classes, references=labels
        ),
    }

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def compute_binary_metrics(y_pred, y_test):
    """Score predictions against reference labels for a single binary task.

    Used as the ``metric`` callable of the SetFit trainer in this notebook.

    Args:
        y_pred: Predicted labels (anything ``np.array`` accepts).
        y_test: Reference labels, aligned with ``y_pred``.

    Returns:
        dict: ``"acc"``, ``"prec"``, ``"rec"`` and ``"f1"`` computed with
        scikit-learn's default ``average`` setting, followed by
        ``"macro_prec"``, ``"macro_rec"`` and ``"macro_f1"`` computed with
        ``average="macro"``.
    """
    predicted = np.array(y_pred)
    expected = np.array(y_test)

    # Same scorers are applied twice: once with defaults, once macro-averaged.
    scorers = (
        ("prec", precision_score),
        ("rec", recall_score),
        ("f1", f1_score),
    )

    metrics = {"acc": accuracy_score(expected, predicted)}
    for key, scorer in scorers:
        metrics[key] = scorer(expected, predicted)
    for key, scorer in scorers:
        metrics[f"macro_{key}"] = scorer(expected, predicted, average="macro")

    return metrics

In [None]:
def compute_metrics(y_pred, y_test, labels: List[str] = None):
    """Compute evaluation metrics for multi-label classification.

    Args:
        y_pred: Predicted labels, array-like of shape (n_samples, n_labels).
        y_test: True labels, array-like of shape (n_samples, n_labels).
        labels (List[str], optional): Label names used as key prefixes. When
            ``None`` or when its length differs from the number of label
            columns, the numeric column indices are used instead.

    Returns:
        dict: For every label, ``"<label>_avg_acc"``, ``"<label>_avg_prec"``,
        ``"<label>_avg_rec"`` and ``"<label>_avg_f1"``; plus ``"macro_acc"``,
        ``"macro_prec"``, ``"macro_rec"`` and ``"macro_f1"`` over all labels.
        ``"macro_acc"`` is ``accuracy_score`` applied to the full label
        matrix, which scikit-learn defines as exact-match (subset) accuracy
        for multilabel input, not an average of the per-label accuracies.
    """
    y_pred = np.array(y_pred)
    y_test = np.array(y_test)
    n_labels = y_test.shape[1]

    if labels is None or len(labels) != n_labels:
        labels = list(range(n_labels))

    metrics = {}
    for idx, label in enumerate(labels):
        truth = y_test[:, idx]
        predicted = y_pred[:, idx]
        metrics[f"{label}_avg_acc"] = accuracy_score(truth, predicted)
        metrics[f"{label}_avg_prec"] = precision_score(
            truth, predicted, zero_division=0
        )
        metrics[f"{label}_avg_rec"] = recall_score(truth, predicted, zero_division=0)
        metrics[f"{label}_avg_f1"] = f1_score(truth, predicted, zero_division=0)

    # zero_division=0 keeps the aggregate scores consistent with the per-label
    # ones when a label has no positive predictions or references.
    metrics["macro_acc"] = accuracy_score(y_test, y_pred)
    metrics["macro_prec"] = precision_score(
        y_test, y_pred, average="macro", zero_division=0
    )
    metrics["macro_rec"] = recall_score(
        y_test, y_pred, average="macro", zero_division=0
    )
    metrics["macro_f1"] = f1_score(y_test, y_pred, average="macro", zero_division=0)

    return metrics

In [None]:
# Label columns iterated over by the training loop and used for the final scoring.
columns = ["scientific_claim", "scientific_reference", "scientific_entities"]
# columns = ["scientific_reference", "scientific_entities"]


# Load the provided training data and the dev split (used as the test set
# throughout this notebook); both files are tab-separated.
subtask4a_train_df, subtask4a_test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, split_file), sep="\t")
    for split_file in ("ct_train_clean.tsv", "ct_dev_clean.tsv")
)

In [None]:
from typing import List

from setfit import SetFitModel
from transformers import (
    Trainer,
)


def fine_tune_setfit_binary_classifier(
    model_name: str,
    train_texts: List[str],
    train_labels: List[int],
    val_texts: List[str],
    val_labels: List[int],
    epochs: int = 3,
    batch_size: int = 16,
    num_iterations: int = 1,
) -> "tuple[SetFitModel, setfit.Trainer]":
    """Fine-tune a binary classifier with SetFit.

    Args:
        model_name (str): Checkpoint name or path passed to
            ``SetFitModel.from_pretrained`` (e.g. 'BAAI/bge-base-en-v1.5').
        train_texts (List[str]): Training texts.
        train_labels (List[int]): Training labels (0 or 1).
        val_texts (List[str]): Validation texts.
        val_labels (List[int]): Validation labels (0 or 1).
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        num_iterations (int): Forwarded to ``setfit.TrainingArguments`` as
            ``num_iterations`` (number of sentence-pair generation iterations
            for the contrastive phase).

    Returns:
        tuple: ``(model, trainer)`` — the trained ``SetFitModel`` and the
        ``setfit.Trainer`` that trained it.
    """
    # Create model
    model = SetFitModel.from_pretrained(model_name)

    # Create the datasets; the "labels" column is renamed to "label" by the
    # trainer's column_mapping below.
    ds_train = Dataset.from_dict({"text": train_texts, "labels": train_labels})
    ds_dev = Dataset.from_dict({"text": val_texts, "labels": val_labels})

    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # restored when training ends.
    training_args = setfit.TrainingArguments(
        num_epochs=epochs,
        batch_size=batch_size,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        load_best_model_at_end=True,
        # learning_rate=2e-5, #learning_rate=1e-6,
        # weight_decay=0.01,
        # fp16 = not torch.cuda.is_bf16_supported(),
        # bf16 = torch.cuda.is_bf16_supported(),
        # metric_for_best_model="f1",
        # greater_is_better=True,
        num_iterations=num_iterations,
    )

    # Define Trainer
    trainer = setfit.Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_dev,
        metric=compute_binary_metrics,
        column_mapping={"text": "text", "labels": "label"},
    )

    # Train the model
    trainer.train()

    return model, trainer

In [10]:
# Release cached, currently unused GPU memory held by PyTorch before the
# training cell below runs.
torch.cuda.empty_cache()

In [None]:
# Train, evaluate and save one SetFit classifier per label column.
# `predictions` collects, per label, the test-set predictions (key: label) and
# the test-set embeddings (key: "<label>_embeddings"); `train_embeddings`
# collects the training-set embeddings under "<label>_embeddings".
train_embeddings = {}
predictions = {}
for cl in columns:
    oversampling_path = os.path.join(ROOT_DIR, f"ct_train_oversamples_{cl}.tsv")

    # A label is only trained when its oversampling file exists. Report the
    # skip explicitly so a missing key in `predictions` later on is explainable.
    if not os.path.exists(oversampling_path):
        print(f"Skipping {cl}: oversampling file not found at {oversampling_path}")
        continue

    # Load the data (the oversampling file has no header row).
    oversampling_train_df = pd.read_csv(
        oversampling_path,
        sep="\t",
        header=None,
        names=["text", cl],
    )
    train_df = subtask4a_train_df[["text", cl]]
    eval_df = subtask4a_test_df[["text", cl]]

    print(f"Evaluating {cl}...")

    # Train model with oversampling + training:
    combined_train_df = pd.concat([train_df, oversampling_train_df])
    X_train = np.array(combined_train_df["text"].tolist())
    y_train = list(map(int, combined_train_df[cl]))
    X_train, y_train = shuffle(X_train, y_train)

    X_test = np.array(eval_df["text"].tolist())
    y_test = list(map(int, eval_df[cl]))

    model, trainer = fine_tune_setfit_binary_classifier(
        model_name=model_id,
        train_texts=X_train,
        train_labels=y_train,
        val_texts=X_test,
        val_labels=y_test,
        epochs=epochs,
        batch_size=batch_size,
        num_iterations=num_iterations,
    )

    # Evaluate the model:
    eval_result = trainer.evaluate()
    eval_result["model"] = model_id

    # Store predictions:
    predictions[cl] = model.predict(X_test)
    predictions[f"{cl}_embeddings"] = model.encode(X_test)
    train_embeddings[f"{cl}_embeddings"] = model.encode(X_train)

    # Save the model:
    model.save_pretrained(os.path.join(ROOT_DIR, f"results/setfit_{cl}"))

    pprint(eval_result)

    # Clear memory: the trainer also references the model, so both names must
    # be dropped before the collector can reclaim it.
    model.to("cpu")
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# Get the results.
# Build the frame from `columns` rather than hard-coded label names so that
# this cell keeps working when the label list above is changed.
predictions_df = pd.DataFrame(
    {label_col: predictions[label_col] for label_col in columns}
)
compute_metrics(
    predictions_df[columns].values.tolist(),
    subtask4a_test_df[columns].values.tolist(),
    labels=columns,
)