# Hierarchical Version of the Baseline Model


In [None]:
# parameters
# NOTE(review): this looks like a notebook "parameters" cell whose values are meant to be
# overridden by a pipeline runner (papermill/ploomber style) — confirm.
# `upstream`, `product` and `some_param` are not referenced elsewhere in the visible notebook.
upstream = None
product = None
some_param = None

# Checkpoint name passed to `from_pretrained` for both classifiers and their tokenizer.
model_id = "cardiffnlp/twitter-roberta-base-2022-154m"
# Hyper-parameters forwarded unchanged to both `TrainingArguments` instances.
warmup_ratio = 0.1
learning_rate = 2e-5
num_train_epochs = 10
weight_decay = 0.01
per_device_train_batch_size = 32
per_device_eval_batch_size = 256
# Only referenced from commented-out `TrainingArguments` lines in the visible code.
fp16 = False
fp16_full_eval = False
# Forwarded to `TrainingArguments(gradient_checkpointing=...)`.
gradient_checkpointing = False

In [None]:
import os
from functools import partial
from pprint import pprint

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

from climatesense_checkthat2025.utils.data import compute_metrics

In [3]:
import evaluate

# Load the four `evaluate` metrics used by `compute_metrics_sequenceclassification`.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1", "precision", "recall")
)

In [None]:
from typing import List

import numpy as np
import torch
from transformers import EvalPrediction


def compute_metrics_sequenceclassification(eval_predictions: EvalPrediction):
    """Compute F1, precision, recall and accuracy for single-label predictions.

    The predicted class of every example is the arg-max of its logits along axis 1.
    The module-level `evaluate` metrics (`f1`, `precision`, `recall`, `accuracy`) are
    then computed against the reference labels.

    Args:
        eval_predictions (EvalPrediction): An object containing the model's predictions and the true labels.
            - `eval_predictions.predictions`: The raw logits output by the model.
            - `eval_predictions.label_ids`: The true labels for the predictions.

    Returns:
        dict: A flat mapping from metric name (`"f1"`, `"precision"`, `"recall"`,
            `"accuracy"`) to the value reported by the corresponding metric.
    """
    logits, references = eval_predictions
    predicted_classes = np.argmax(logits, axis=1)

    named_metrics = (
        ("f1", f1),
        ("precision", precision),
        ("recall", recall),
        ("accuracy", accuracy),
    )

    results = {}
    for metric_name, metric in named_metrics:
        # `compute` returns a dict keyed by the metric's own name (e.g. {"f1": 0.9}).
        # Unwrap it so the returned mapping holds plain scalars instead of nested dicts,
        # which metric loggers cannot record.
        scores = metric.compute(predictions=predicted_classes, references=references)
        results[metric_name] = scores[metric_name]
    return results


def compute_metrics_multilabel_sequenceclassification(
    eval_predictions: EvalPrediction, threshold: float = 0.5, labels: List[str] = None
):
    """Score multi-label predictions with the project's `compute_metrics` helper.

    The raw logits are passed through a sigmoid, every probability at or above
    `threshold` becomes 1.0 and every other one 0.0, and the resulting tensor is handed
    to `compute_metrics` together with the reference labels.

    Args:
        eval_predictions (EvalPrediction): Container with the model outputs.
            - `eval_predictions.predictions`: The raw logits output by the model.
            - `eval_predictions.label_ids`: The true labels for the predictions.
        threshold (float, optional): Probability cut-off for a positive prediction. Defaults to 0.5.
        labels (List[str], optional): Label names forwarded to `compute_metrics`. Defaults to None.

    Returns:
        Whatever `compute_metrics` returns for the binarised predictions.
    """
    logits = torch.Tensor(eval_predictions.predictions)
    references = eval_predictions.label_ids

    # Per-label probabilities, binarised at `threshold`.
    probabilities = torch.nn.Sigmoid()(logits)
    binary_predictions = torch.where(probabilities >= threshold, 1.0, 0.0).cpu()

    return compute_metrics(binary_predictions, references, labels)

In [None]:
# Create the dataset:

import os.path

# Columns that form the multi-label target, in the order used for the `labels` vectors.
label_columns = ["scientific_claim", "scientific_reference", "scientific_entities"]

# Look for the processed data relative to the current working directory first and
# fall back to the same folder two directories up when the training file is not there.
data_dir = "./data/processed/task4/subtask_4a"
if not os.path.isfile(f"{data_dir}/ct_train_clean.tsv"):
    data_dir = "../../data/processed/task4/subtask_4a"

subtask4a_train_df = pd.read_csv(f"{data_dir}/ct_train_clean.tsv", sep="\t")
subtask4a_dev_df = pd.read_csv(f"{data_dir}/ct_dev_clean.tsv", sep="\t")

# One example per row: the raw text plus its three-element label vector.
ds_train = Dataset.from_dict(
    {
        "text": subtask4a_train_df["text"],
        "labels": subtask4a_train_df[label_columns].values.tolist(),
    }
)

ds_dev = Dataset.from_dict(
    {
        "text": subtask4a_dev_df["text"],
        "labels": subtask4a_dev_df[label_columns].values.tolist(),
    }
)

# The dev split is stored under the "test" key.
ds = DatasetDict({"train": ds_train, "test": ds_dev})
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1228
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 137
    })
})

## Create Zero Model


In [None]:
# Flag rows whose three label columns are all zero:
def check_all_zero(row):
    """Return True when none of the three label columns of `row` is non-zero."""
    label_values = row[["scientific_claim", "scientific_reference", "scientific_entities"]]
    for value in label_values:
        if value != 0.0:
            return False
    return True


# Add an `all_zero` column to both splits by applying `check_all_zero` row by row.
for split_df in (subtask4a_train_df, subtask4a_dev_df):
    split_df["all_zero"] = split_df.apply(check_all_zero, axis=1)

In [16]:
# Display the training frame, which now carries the `all_zero` column.
subtask4a_train_df

Unnamed: 0,index,text,labels,scientific_claim,scientific_reference,scientific_entities,all_zero
0,1046,@user those eyes are a gift send straight from...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0,True
1,638,Remember when libs attacked @user for his conc...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0,True
2,1091,Teenage Fever is a mood,"[0.0, 0.0, 0.0]",0.0,0.0,0.0,True
3,31,Steam survey shows PC gamers are still mostly ...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0,True
4,142,Key findings utilized by Watson and Crick: Fra...,"[0.0, 0.0, 1.0]",0.0,0.0,1.0,False
...,...,...,...,...,...,...,...
1223,296,"Hey everyone, here's a great thing from @user ...","[0.0, 0.0, 0.0]",0.0,0.0,0.0,True
1224,94,The War On Science: What It Is And How To Win ...,"[0.0, 0.0, 1.0]",0.0,0.0,1.0,False
1225,1240,https://tonic.vice.com/en_us/article/8xz9mz/ma...,"[1.0, 1.0, 1.0]",1.0,1.0,1.0,False
1226,637,@user Martha - stop the redundant BS!! Even if...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0,True


In [None]:
# Binary dataset for the zero model: label 1 marks a row whose `all_zero` flag is True
# (all three labels are zero), label 0 marks every other row.

ds_train_zeros = Dataset.from_dict(
    {
        "text": subtask4a_train_df["text"],
        "label": [int(flag) for flag in subtask4a_train_df["all_zero"]],
    }
)

ds_dev_zeros = Dataset.from_dict(
    {
        "text": subtask4a_dev_df["text"],
        "label": [int(flag) for flag in subtask4a_dev_df["all_zero"]],
    }
)

# The dev split is stored under the "test" key.
zero_splits = {"train": ds_train_zeros, "test": ds_dev_zeros}
ds_zeros = DatasetDict(zero_splits)
ds_zeros

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1228
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 137
    })
})

In [None]:
def compute_weighted_loss(outputs, labels, num_items_in_batch=None, return_outputs=False, class_weights=None):
    """Cross-entropy loss over the model logits with optional per-class weights.

    Args:
        outputs: Mapping-like model output; the logits are read with `outputs.get("logits")`.
        labels: Tensor of integer class indices; it is flattened before the loss is computed.
        num_items_in_batch: Accepted for signature compatibility with the caller; not used.
        return_outputs: When True, return `(loss, outputs)` instead of only the loss.
        class_weights: Optional one-dimensional sequence (NumPy array, list, tuple or tensor)
            holding one weight per class. It is applied only when its length equals the
            size of the last logits dimension; otherwise the loss is unweighted.

    Returns:
        The scalar loss tensor, or the tuple `(loss, outputs)` when `return_outputs` is True.
    """
    logits = outputs.get("logits")
    # The class dimension is the last one, whatever the number of leading dimensions.
    n_labels = logits.shape[-1]

    weight = None
    if class_weights is not None and len(class_weights) == n_labels:
        # `as_tensor` accepts arrays, sequences and tensors alike and places the weights
        # on the same device as the logits in float32.
        weight = torch.as_tensor(class_weights, dtype=torch.float, device=logits.device)

    loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
    loss = loss_fct(logits.view(-1, n_labels), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# "Balanced" class weights for the zero model, computed from the training labels.
zero_train_labels = ds_zeros["train"]["label"]
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(zero_train_labels),
    y=zero_train_labels,
)
class_weights = class_weights.astype(float)
class_weights

array([1.26859504, 0.82526882])

In [None]:
from transformers import EarlyStoppingCallback

# MLflow settings: name the experiment after the last path component of `model_id`.
# Presumably these variables are read by the Trainer's MLflow integration — confirm.
os.environ["MLFLOW_EXPERIMENT_NAME"] = f"baseline-{model_id.split('/')[-1]}"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Two-class head for the zero model. `trust_remote_code` is deliberately not enabled:
# the same checkpoint is loaded without it for the multi-label model further down, so
# the flag would only permit executing code shipped with the model repository.
zero_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_function(examples, tokenizer=tokenizer):
    """Tokenize the `text` field of a batch of examples.

    Sequences are truncated but not padded here. Padding is left to the
    `DataCollatorWithPadding` handed to the trainer, so every batch is padded only as
    far as it needs instead of every example being padded to the maximum length.

    Args:
        examples: Batch mapping that contains a `"text"` entry.
        tokenizer: Tokenizer to apply; defaults to the module-level `tokenizer` that
            exists when this function is defined.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True)


# Tokenize both splits of the zero-model dataset in batches.
tokenized_ds_zeros = ds_zeros.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyper-parameters come from the parameters cell. Evaluation and checkpointing both
# happen once per epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    warmup_ratio=warmup_ratio,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_checkpointing=gradient_checkpointing,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
)

# Trainer for the binary zero model: class-weighted cross-entropy as the loss and
# early stopping via `callbacks`. The split stored under "test" (built from the dev
# file) serves as the evaluation set.
zero_trainer = Trainer(
    model=zero_model,
    args=training_args,
    train_dataset=tokenized_ds_zeros["train"],
    eval_dataset=tokenized_ds_zeros["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_sequenceclassification,
    compute_loss_func=partial(compute_weighted_loss, class_weights=class_weights),
    callbacks=callbacks,  # FIXME should use an eval dataset for early stopping rather than the test set. Check: https://github.com/huggingface/setfit/issues/424
)

# Train, evaluate on the same evaluation split, and tag the results with the checkpoint name.
zero_trainer.train()
zero_eval_result = zero_trainer.evaluate()
zero_eval_result["model"] = model_id
pprint(zero_eval_result)

In [None]:
# Zero-model predictions for every dev example; combined with the multi-label
# predictions in the last cell.
zero_preds = zero_trainer.predict(tokenized_ds_zeros["test"])

## Create Standard Model


In [None]:
# Create subset dataset for the second sub task:

# Keep only the rows that have at least one positive label (`all_zero` is False).
subtask4a_train_df_sub = subtask4a_train_df.loc[~subtask4a_train_df["all_zero"]]
subtask4a_dev_df_sub = subtask4a_dev_df.loc[~subtask4a_dev_df["all_zero"]]

# Same text / three-element label layout as the full dataset, restricted to those rows.
sub_train_labels = subtask4a_train_df_sub[["scientific_claim", "scientific_reference", "scientific_entities"]]
ds_train_sub = Dataset.from_dict(
    {
        "text": subtask4a_train_df_sub["text"],
        "labels": sub_train_labels.values.tolist(),
    }
)

sub_dev_labels = subtask4a_dev_df_sub[["scientific_claim", "scientific_reference", "scientific_entities"]]
ds_dev_sub = Dataset.from_dict(
    {
        "text": subtask4a_dev_df_sub["text"],
        "labels": sub_dev_labels.values.tolist(),
    }
)

# The dev subset is stored under the "test" key.
ds_sub = DatasetDict({"train": ds_train_sub, "test": ds_dev_sub})
ds_sub

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 484
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 42
    })
})

In [None]:
from transformers import EarlyStoppingCallback

# Same MLflow settings as for the zero model (experiment named after the last path
# component of `model_id`).
os.environ["MLFLOW_EXPERIMENT_NAME"] = f"baseline-{model_id.split('/')[-1]}"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"
# NOTE(review): `callbacks` is rebuilt here but is not passed to the `Trainer` created
# later in this cell — confirm whether early stopping was intended for this model.
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Three-output head configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_id,
    num_labels=3,
    problem_type="multi_label_classification",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_function(examples, tokenizer=tokenizer):
    """Tokenize the `text` field of a batch of examples.

    Sequences are truncated but not padded here. Padding is left to the
    `DataCollatorWithPadding` handed to the trainer, so every batch is padded only as
    far as it needs instead of every example being padded to the maximum length.

    Args:
        examples: Batch mapping that contains a `"text"` entry.
        tokenizer: Tokenizer to apply; defaults to the module-level `tokenizer` that
            exists when this function is defined.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True)


# Tokenize both splits of the subset that contains only rows with a positive label.
tokenized_ds_sub = ds_sub.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Hyper-parameters come from the parameters cell.
# NOTE(review): unlike the zero-model cell, no evaluation/save strategy and no
# `load_best_model_at_end` are set here — confirm that this is intended.
training_args = TrainingArguments(
    warmup_ratio=warmup_ratio,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    # fp16=fp16,
    # fp16_full_eval=fp16_full_eval,
    gradient_checkpointing=gradient_checkpointing,
)

# Trainer for the multi-label model; metrics are computed per label name through
# `compute_metrics_multilabel_sequenceclassification`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds_sub["train"],
    eval_dataset=tokenized_ds_sub["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=partial(
        compute_metrics_multilabel_sequenceclassification,
        labels=["scientific_claim", "scientific_reference", "scientific_entities"],
    ),
)

# Train, evaluate on the subset's evaluation split, and tag the results with the checkpoint name.
trainer.train()
eval_result = trainer.evaluate()
eval_result["model"] = model_id

pprint(eval_result)
# Trailing expression so the notebook also displays the result dict.
eval_result

In [None]:
# Tokenize the full (unfiltered) dataset so the multi-label model can score every dev example.
tokenized_ds = ds.map(preprocess_function, batched=True)

In [None]:
# Multi-label predictions for the complete dev split (stored under "test").
preds = trainer.predict(tokenized_ds["test"])

In [None]:
# Hierarchical combination: when the zero model picks class 1 ("all labels zero") the
# final prediction is [0, 0, 0]; otherwise the multi-label logits are passed through a
# sigmoid and binarised at 0.5.
sigmoid = torch.nn.Sigmoid()
final_preds = []
for zero_logits, label_logits in zip(zero_preds.predictions, preds.predictions):
    if np.argmax(zero_logits) != 1:
        label_probs = sigmoid(torch.Tensor(label_logits))
        final_preds.append(torch.where(label_probs >= 0.5, 1.0, 0.0).cpu().numpy())
    else:
        final_preds.append([0, 0, 0])

# Score the combined predictions against the labels of the full dev split.
compute_metrics(
    np.array(final_preds),
    ds["test"]["labels"],
    labels=["scientific_claim", "scientific_reference", "scientific_entities"],
)