In [None]:
import pandas as pd
import wandb
from datasets import Dataset, DatasetDict, load_dataset
from setfit import SetFitModel, SetFitTrainer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

### Save the paths to the different datasets

In [None]:
# Locations of the train/test CSV files, grouped per language.

# English
train_en_path, test_en_path = (
    "./data_sources/train/train_en.csv",
    "./data_sources/test/test_en.csv",
)

# Italian
train_it_path, test_it_path = (
    "./data_sources/train/train_it.csv",
    "./data_sources/test/test_it.csv",
)

# Spanish
train_es_path, test_es_path = (
    "./data_sources/train/train_es.csv",
    "./data_sources/test/test_es.csv",
)

### Set up W&B

In [None]:
# Log in to Weights & Biases before any training happens.
# NOTE(review): presumably so the training run below is reported to W&B — confirm
# that reporting is actually enabled in the training arguments.
wandb.login()

### Load data into a DatasetDict

In [None]:
# Read the Italian CSVs with pandas (rather than `load_dataset`) so the training
# data can be split with stratification below.
train_full_df = pd.read_csv(train_it_path)
test_df = pd.read_csv(test_it_path)

# Hold out 20% of the training data for validation, stratified on "label" so both
# parts keep the same class balance; the fixed seed makes the split reproducible.
# The raw frame keeps its own name so the split is not confused with the input.
train_df, val_df = train_test_split(
    train_full_df, test_size=0.2, stratify=train_full_df["label"], random_state=42
)
assert len(train_df) + len(val_df) == len(train_full_df)

# Convert back to Hugging Face Datasets. The index is reset on every split so the
# shuffled pandas index is not carried over into the datasets.
dataset_it = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "val": Dataset.from_pandas(val_df.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df.reset_index(drop=True)),
})

### Define Metrics

In [None]:
def compute_metrics(p, y_test=None):
    """Compute macro-averaged F1, precision and recall.

    Two calling conventions are supported:

    * ``compute_metrics(y_pred, y_test)`` -- predictions and reference labels as
      two separate arguments, which is how SetFit's ``Trainer`` invokes a
      callable ``metric``.
    * ``compute_metrics(p)`` -- a single object exposing ``predictions`` and
      ``label_ids`` attributes (the original calling form of this function).

    Args:
        p: Either the predictions (when ``y_test`` is given) or an object with
            ``predictions`` and ``label_ids`` attributes.
        y_test: Reference labels, or ``None`` when ``p`` carries them itself.

    Returns:
        dict with the keys ``"f1_macro"``, ``"precision_macro"`` and
        ``"recall_macro"``.
    """
    if y_test is None:
        y_pred, y_true = p.predictions, p.label_ids
    else:
        y_pred, y_true = p, y_test

    # 2-D input holds per-class scores and is reduced to label ids; 1-D input is
    # taken to be label predictions already and is used unchanged.
    if getattr(y_pred, "ndim", 1) > 1:
        y_pred = y_pred.argmax(axis=1)

    return {
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
        "precision_macro": precision_score(y_true, y_pred, average="macro"),
        "recall_macro": recall_score(y_true, y_pred, average="macro")
    }

### Train and Evaluate

In [None]:
# Multilingual sentence-transformers checkpoint loaded as the SetFit model.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model = SetFitModel.from_pretrained(model_name)

# to match labels with meaning: 0 --> "offensive", 1 --> "reappropriative"
model.labels = ["offensive", "reappropriative"]

args = TrainingArguments(
    batch_size=8,
    num_epochs=5,
)

# The trainer is fed the raw-text splits of `dataset_it` built in the data-loading
# cell; no separately tokenized dataset is created anywhere in this notebook.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_it["train"],
    eval_dataset=dataset_it["val"],
    metric=compute_metrics,
    column_mapping={"text": "text", "label": "label"},
)

trainer.train()

# Last expression of the cell, so the test-split metrics are shown as its output.
trainer.evaluate(dataset_it["test"])