# Challenge Baselines

Rewritten baseline example.


In [None]:
import sys

sys.path.append("../../")

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

from climatesense_checkthat2025_task4.utils.data import compute_metrics, create_multilabel_folds

In [None]:
from typing import List

import torch
from transformers import EvalPrediction


def compute_metrics_multilabel_sequenceclassification(
    eval_predictions: EvalPrediction, threshold: float = 0.5, labels: List[str] = None
):
    """Compute metrics for a multi-label sequence classification model's predictions.

    This function applies a sigmoid activation function to the model's raw logits to calculate probabilities,
    converts probabilities to binary predictions based on a specified threshold, and computes evaluation metrics
    using the provided `compute_metrics` function.

    Args:
        eval_predictions (EvalPrediction): An object containing the model's predictions and the true labels.
            - `eval_predictions.predictions`: The raw logits output by the model.
            - `eval_predictions.label_ids`: The true labels for the predictions.
        threshold (float, optional): The threshold for converting probabilities to binary predictions. Defaults to 0.5.
        labels (List[str], optional): A list of label names for the metrics computation. Defaults to None.

    Returns:
        dict: A dictionary containing the computed evaluation metrics.
    """
    sigmoid = torch.nn.Sigmoid()
    x_test = eval_predictions.predictions
    y_test = eval_predictions.label_ids

    # Calculate probabilities and derive binary predictions:
    probs = sigmoid(torch.Tensor(x_test))
    y_pred = torch.where(probs >= threshold, 1.0, 0.0)

    if isinstance(y_pred, torch.Tensor):
        y_pred = y_pred.cpu()

    return compute_metrics(y_pred, y_test, labels)

In [3]:
# Create the dataset:
subtask4a_df = pd.read_csv("../../data/processed/task4/subtask_4a/ct_train_data_clean.tsv", sep="\t")


# Create the dataset
ids = subtask4a_df.index.values
texts = subtask4a_df.text.values
labels = subtask4a_df[["scientific_claim", "scientific_reference", "scientific_entities"]].values.tolist()
full_ds = Dataset.from_dict({"text": texts, "labels": labels})


# Create folds:
folds_ds = create_multilabel_folds(full_ds, n_splits=5, random_state=1435892670)
folds_ds

[DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1091
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_rows: 273
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1091
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_rows: 273
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1091
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_rows: 273
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1091
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_rows: 273
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'labels'],
         num_rows: 1092
     })
     test: Dataset({
         features: ['text', 'labels'],
         num_row

In [None]:
from functools import partial
from pprint import pprint

from tqdm.notebook import tqdm

evals = []
model_id = "cardiffnlp/twitter-roberta-base-2022-154m"
for fold, ds in tqdm(enumerate(folds_ds), desc="Running folds", total=len(folds_ds)):
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_id,
        num_labels=3,
        problem_type="multi_label_classification",
    )

    tokenizer_config = {"pretrained_model_name_or_path": model_id}
    if "scibert" in model_id:
        tokenizer_config["do_lower_case"] = False

    tokenizer = AutoTokenizer.from_pretrained(**tokenizer_config)

    def tokenize_function(examples, tokenizer=tokenizer):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_train_dataset = Dataset.from_dict(ds["train"][:]).map(tokenize_function, batched=True)
    tokenized_test_dataset = Dataset.from_dict(ds["test"][:]).map(tokenize_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        warmup_ratio=0.1,
        learning_rate=2e-5,
        num_train_epochs=10,
        weight_decay=0.01,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=256,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=partial(
            compute_metrics_multilabel_sequenceclassification,
            labels=["scientific_claim", "scientific_reference", "scientific_entities"],
        ),
    )

    trainer.train()
    eval_result = trainer.evaluate()
    eval_result["fold"] = fold + 1
    eval_result["model"] = model_id
    eval_result = {key.lstrip("eval_"): value for key, value in eval_result.items()}

    pprint(eval_result)

    evals.append(eval_result)

In [62]:
results_df = pd.DataFrame(evals).set_index(["model", "fold"]).rename(columns=lambda x: x.lstrip("_"))
results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,oss,scientific_claim_avg_acc,scientific_claim_avg_prec,scientific_claim_avg_rec,scientific_claim_avg_f1,scientific_reference_avg_acc,scientific_reference_avg_prec,scientific_reference_avg_rec,scientific_reference_avg_f1,scientific_entities_avg_acc,...,scientific_entities_avg_rec,scientific_entities_avg_f1,macro_acc,macro_prec,macro_rec,macro_f1,runtime,samples_per_second,steps_per_second,poch
model,fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
cardiffnlp/twitter-roberta-base-2022-154m,1,0.373783,0.875458,0.753425,0.774648,0.763889,0.919414,0.725806,0.9,0.803571,0.901099,...,0.941176,0.825806,0.772894,0.738288,0.871941,0.797756,24.1422,11.308,0.083,10.0
cardiffnlp/twitter-roberta-base-2022-154m,2,0.256274,0.897436,0.805556,0.805556,0.805556,0.930403,0.816327,0.8,0.808081,0.952381,...,0.970588,0.910345,0.820513,0.826342,0.858715,0.841327,21.428,12.74,0.093,10.0
cardiffnlp/twitter-roberta-base-2022-154m,3,0.356733,0.893773,0.7625,0.859155,0.807947,0.89011,0.692308,0.72,0.705882,0.904762,...,0.808824,0.808824,0.765568,0.754544,0.795993,0.774218,18.8865,14.455,0.106,10.0
cardiffnlp/twitter-roberta-base-2022-154m,4,0.374265,0.897436,0.761905,0.888889,0.820513,0.904762,0.75,0.72,0.734694,0.897436,...,0.808824,0.797101,0.783883,0.765873,0.805904,0.784103,19.8471,13.755,0.101,10.0
cardiffnlp/twitter-roberta-base-2022-154m,5,0.288283,0.882353,0.717391,0.916667,0.804878,0.9375,0.823529,0.84,0.831683,0.919118,...,0.882353,0.84507,0.790441,0.783911,0.879673,0.827211,23.1913,11.729,0.086,10.0


In [None]:
results_df.groupby("model").mean()

Unnamed: 0_level_0,oss,scientific_claim_avg_acc,scientific_claim_avg_prec,scientific_claim_avg_rec,scientific_claim_avg_f1,scientific_reference_avg_acc,scientific_reference_avg_prec,scientific_reference_avg_rec,scientific_reference_avg_f1,scientific_entities_avg_acc,...,scientific_entities_avg_rec,scientific_entities_avg_f1,macro_acc,macro_prec,macro_rec,macro_f1,runtime,samples_per_second,steps_per_second,poch
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cardiffnlp/twitter-roberta-base-2022-154m,0.329868,0.889291,0.760155,0.848983,0.800556,0.916438,0.761594,0.796,0.776782,0.914959,...,0.882353,0.837429,0.78666,0.773791,0.842445,0.804923,21.49902,12.7974,0.0938,10.0


: 