# Challenge Baselines

Rewritten baseline/fine tuning example.


In [None]:
# parameters
# Placeholders initialised to None in this cell.
# NOTE(review): presumably overwritten by the pipeline runner's parameter injection — confirm.
upstream = None
product = None
some_param = None

# Hugging Face identifier of the pretrained model that is fine-tuned further down.
model_id = "cardiffnlp/twitter-roberta-large-2022-154m"

# Hyper-parameters forwarded unchanged to `TrainingArguments` in the training cell.
warmup_ratio = 0.1
learning_rate = 2e-5
num_train_epochs = 10
weight_decay = 0.01
per_device_train_batch_size = 32
per_device_eval_batch_size = 256
gradient_checkpointing = True

In [114]:
import sys

# Add the directory two levels up to the import search path.
# NOTE(review): presumably so the `climatesense_checkthat2025` package resolves — confirm.
sys.path.append("../../")

In [None]:
import os
import shutil
from functools import partial
from pprint import pprint

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

from climatesense_checkthat2025.utils.data import (
    compute_metrics,
)

In [None]:
# Pick the data directory: prefer ./data when it exists, otherwise fall back to ../../data.
ROOT_DIR = "./data/" if os.path.exists("./data") else "../../data/"

In [117]:
from typing import List

import torch
from transformers import EvalPrediction


def compute_metrics_multilabel_sequenceclassification(
    eval_predictions: EvalPrediction, threshold: float = 0.5, labels: List[str] = None
):
    """Score the predictions of a multi-label sequence classification model.

    The raw logits are passed through a sigmoid so that every label gets its own
    probability, each probability is binarised against `threshold`, and the resulting
    0/1 matrix is handed to `compute_metrics` together with the reference labels.

    Args:
        eval_predictions (EvalPrediction): Container with the model output.
            - `eval_predictions.predictions`: The raw logits output by the model.
            - `eval_predictions.label_ids`: The reference labels for those logits.
        threshold (float, optional): Probabilities greater than or equal to this value
            become 1.0, all others 0.0. Defaults to 0.5.
        labels (List[str], optional): Label names forwarded to `compute_metrics`.
            Defaults to None.

    Returns:
        Whatever `compute_metrics` returns for the binarised predictions
        (expected to be a dictionary of evaluation metrics).
    """
    raw_logits = eval_predictions.predictions
    references = eval_predictions.label_ids

    # Independent per-label probabilities, then a hard 0/1 decision per label.
    probabilities = torch.sigmoid(torch.Tensor(raw_logits))
    binary_predictions = torch.where(probabilities >= threshold, 1.0, 0.0).cpu()

    return compute_metrics(binary_predictions, references, labels)

In [None]:
def compute_weighted_loss(outputs, labels, num_items_in_batch=None, return_outputs=False, class_weights=None):
    """Compute a (optionally positive-weighted) binary cross-entropy loss on the logits.

    Args:
        outputs: Mapping-like model output; the logits are read with `outputs.get("logits")`
            and are reshaped to `(-1, n_labels)` where `n_labels = logits.shape[1]`.
        labels (torch.Tensor): Target tensor, reshaped to `(-1, n_labels)`. Non-floating
            targets (e.g. integer 0/1 labels) are cast to float, because
            `BCEWithLogitsLoss` rejects integer targets.
        num_items_in_batch: Accepted for signature compatibility; not used here.
        return_outputs (bool, optional): When True, return `(loss, outputs)` instead of
            only the loss. Defaults to False.
        class_weights (array-like, optional): Per-label weights passed to
            `BCEWithLogitsLoss` as `pos_weight`. A NumPy array, sequence or tensor is
            accepted. When None (the default) the loss is unweighted.

    Returns:
        The float loss tensor, or the tuple `(loss, outputs)` if `return_outputs` is True.
    """
    logits = outputs.get("logits")
    n_labels = logits.shape[1]

    # The default of None used to crash in torch.from_numpy; treat it as "no weighting".
    pos_weight = None
    if class_weights is not None:
        pos_weight = torch.as_tensor(class_weights, dtype=torch.float32, device=logits.device)

    targets = labels.view(-1, n_labels)
    if not targets.is_floating_point():
        targets = targets.float()

    loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    # The loss already lives on the logits' device; only the float cast is kept.
    loss = loss_fct(logits.view(-1, n_labels), targets).float()

    return (loss, outputs) if return_outputs else loss

In [119]:
# Load the provided training, dev and eval splits (tab-separated files).
# The dev split is the one kept in `subtask4a_test_df`.
subtask4a_train_df, subtask4a_test_df, subtask4a_eval_df = (
    pd.read_csv(os.path.join(ROOT_DIR, relative_path), sep="\t")
    for relative_path in (
        "processed/task4/subtask_4a/ct_train_clean.tsv",
        "processed/task4/subtask_4a/ct_dev_clean.tsv",
        "processed/task4/subtask_4a/ct_eval_clean.tsv",
    )
)

## Train/Dev Evaluation

We use the train/dev split from the repository rather than the folds.


In [None]:
# Build the datasets for the train / dev / eval splits.


def _labelled_dataset(frame):
    """Wrap a labelled dataframe as a `Dataset` with a `text` and a `labels` column.

    `labels` holds, per row, the values of the three label columns as a list.
    """
    label_columns = ["scientific_claim", "scientific_reference", "scientific_entities"]
    return Dataset.from_dict(
        {
            "text": frame["text"],
            "labels": frame[label_columns].values.tolist(),
        }
    )


ds_train = _labelled_dataset(subtask4a_train_df)
ds_dev = _labelled_dataset(subtask4a_test_df)

# Only the text column is used for the eval split.
ds_eval = Dataset.from_dict({"text": subtask4a_eval_df["text"]})

# The dev split is registered under the "test" key.
ds = DatasetDict({"train": ds_train, "test": ds_dev, "eval": ds_eval})
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1228
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 137
    })
    eval: Dataset({
        features: ['text'],
        num_rows: 240
    })
})

In [None]:
# Column-wise sum of the training label vectors (one value per label).
# NOTE(review): if these values are passed to `compute_weighted_loss` they end up as
# `pos_weight` of BCEWithLogitsLoss, which PyTorch documents as a negatives/positives
# ratio rather than a raw sum — confirm before enabling the weighted loss.
class_weights = np.asarray(ds["train"]["labels"]).sum(axis=0)

In [None]:
from transformers import AutoModelForSequenceClassification, EarlyStoppingCallback

# Fine-tune `model_id` on the train split, evaluate on the dev split and save the model.

# MLflow settings are passed through environment variables; the experiment name embeds
# the last path component of `model_id`.
# NOTE(review): presumably picked up by the Trainer's MLflow integration — confirm.
os.environ["MLFLOW_EXPERIMENT_NAME"] = f"baseline-*-{model_id.split('/')[-1]}"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"

# Early stopping with a patience of 4.
callbacks = [EarlyStoppingCallback(early_stopping_patience=4)]

# Three output labels, configured as a multi-label classification problem.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_id,
    num_labels=3,
    problem_type="multi_label_classification",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_function(examples, tokenizer=tokenizer):
    """Tokenize the `text` field of a batch with max-length padding and truncation.

    The tokenizer is bound as a default argument at definition time.
    """
    # NOTE(review): padding="max_length" pads here already, which presumably makes the
    # dynamic padding of DataCollatorWithPadding below redundant — confirm intent.
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Applied to every split of `ds` in batched mode.
tokenized_ds = ds.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


training_args = TrainingArguments(
    warmup_ratio=warmup_ratio,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_checkpointing=gradient_checkpointing,
    load_best_model_at_end=True,
    # Evaluate, save and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    output_dir=os.path.join(ROOT_DIR, f"results/baselines/checkpoints/{model_id.split('/')[-1]}"),
    logging_dir=os.path.join(ROOT_DIR, f"results/baselines/logs/{model_id.split('/')[-1]}"),
    # Exactly one of fp16/bf16 is enabled: bf16 when torch reports support, fp16 otherwise.
    # NOTE(review): on a machine without CUDA this presumably selects fp16 — confirm that is acceptable.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    # The dev split is stored under the "test" key of the dataset dict.
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=partial(
        compute_metrics_multilabel_sequenceclassification,
        labels=["scientific_claim", "scientific_reference", "scientific_entities"],
    ),
    # Weighted loss is currently disabled:
    # compute_loss_func=partial(compute_weighted_loss, class_weights=class_weights),
    callbacks=callbacks,
)

trainer.train()
eval_result = trainer.evaluate()
# Tag the metrics with the model they belong to.
eval_result["model"] = model_id

pprint(eval_result)
eval_result  # noqa: B018

# Save the model into the same directory that is used as `output_dir` above.
model.save_pretrained(os.path.join(ROOT_DIR, f"results/baselines/checkpoints/{model_id.split('/')[-1]}"))

In [94]:
# Display the dataframe loaded from ct_dev_clean.tsv for inspection.
subtask4a_test_df

Unnamed: 0,index,text,labels,scientific_claim,scientific_reference,scientific_entities
0,11,@user Nabil these sickos full stop preying on ...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0
1,23,So do strippers just wait to be saved to stop ...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0
2,28,The weaponization of medical language embolden...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0
3,32,#sports #fitness Buy Now: $33.97 Ueasy Knee pa...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0
4,57,@user thx for support also trans community - a...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0
...,...,...,...,...,...,...
132,1317,Winter Tipples tasting went down a treat last ...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0
133,1327,@user Please read this research analysis https...,"[0.0, 1.0, 1.0]",0.0,1.0,1.0
134,1329,@user Stop hoarding so much and throw some shi...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0
135,1343,@user ugh. I'm really over the mezzo drama. th...,"[0.0, 0.0, 0.0]",0.0,0.0,0.0


In [None]:
# Generate embeddings using the model:
from transformers import TextClassificationPipeline

# Reload a fine-tuned checkpoint from disk and attach readable label names to it.
label_names = ["scientific_claim", "scientific_reference", "scientific_entities"]
checkpoint_dir = os.path.join(
    ROOT_DIR,
    "results/baselines/checkpoints/twitter-roberta-large-2022-154m-unbalanced/checkpoint-300",
)

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint_dir,
    num_labels=3,
    problem_type="multi_label_classification",
    id2label=dict(enumerate(label_names)),
    label2id={name: position for position, name in enumerate(label_names)},
)
# The tokenizer comes from the base model id, not from the checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Text classification pipeline built on the reloaded model.
clf = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

In [None]:
def get_last_hidden_states(text, model, tokenizer):
    """Return the mean of the model's last hidden states for a given text.

    The text is tokenized (padding and truncation enabled), moved to the model's device
    and passed through the model with `output_hidden_states=True`. The last entry of
    `hidden_states` is averaged over dimension 1 and returned as a NumPy array on the CPU.

    The forward pass runs under `torch.no_grad()`: this function is only used for
    feature extraction, so no autograd graph needs to be recorded for each call.

    Args:
        text (str): The input text to process.
        model: The pre-trained model to use for generating hidden states.
        tokenizer: The tokenizer corresponding to the pre-trained model.

    Returns:
        numpy.ndarray: The mean of the last hidden states for the input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Inference only: disable gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # NOTE(review): the mean covers every position on dim 1; if a padded batch were
    # passed instead of a single text, padding positions would presumably be included.
    last_hidden_states = outputs.hidden_states[-1].mean(dim=1).to("cpu").detach().numpy()
    return last_hidden_states

In [None]:
# One mean-pooled last-hidden-state array per training text, using the currently loaded model.
embeddings = subtask4a_train_df["text"].apply(get_last_hidden_states, args=(model, tokenizer))

In [None]:
# Best checkpoint step for each label:
best_steps = {
    "scientific_claim": 210,
    "scientific_reference": 240,
    "scientific_entities": 105,
}
label_order = ("scientific_claim", "scientific_reference", "scientific_entities")

# For every label, load that label's checkpoint and embed the training texts with it.
embeddings = {}
for label_name, step in best_steps.items():
    print(label_name, step)

    checkpoint_dir = os.path.join(
        ROOT_DIR,
        f"results/baselines/checkpoints/twitter-roberta-large-2022-154m-unbalanced/checkpoint-{step}",
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=checkpoint_dir,
        num_labels=3,
        problem_type="multi_label_classification",
        id2label=dict(enumerate(label_order)),
        label2id={name: position for position, name in enumerate(label_order)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Passing model/tokenizer through `args` binds them for this iteration explicitly.
    embeddings[label_name] = subtask4a_train_df["text"].apply(
        get_last_hidden_states, args=(model, tokenizer)
    )

In [None]:
# Best checkpoint step for each label:
best_steps = {
    "scientific_claim": 210,
    "scientific_reference": 240,
    "scientific_entities": 105,
}
label_order = ("scientific_claim", "scientific_reference", "scientific_entities")

# For every label, load that label's checkpoint and embed the dev texts with it.
test_embeddings = {}
for label_name, step in best_steps.items():
    print(label_name, step)

    checkpoint_dir = os.path.join(
        ROOT_DIR,
        f"results/baselines/checkpoints/twitter-roberta-large-2022-154m-unbalanced/checkpoint-{step}",
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=checkpoint_dir,
        num_labels=3,
        problem_type="multi_label_classification",
        id2label=dict(enumerate(label_order)),
        label2id={name: position for position, name in enumerate(label_order)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Passing model/tokenizer through `args` binds them for this iteration explicitly.
    test_embeddings[label_name] = subtask4a_test_df["text"].apply(
        get_last_hidden_states, args=(model, tokenizer)
    )

In [None]:
# Best checkpoint step for each label:
best_steps = {
    "scientific_claim": 210,
    "scientific_reference": 240,
    "scientific_entities": 105,
}
label_order = ("scientific_claim", "scientific_reference", "scientific_entities")

# For every label, load that label's checkpoint and embed the eval texts with it.
eval_embeddings = {}
for label_name, step in best_steps.items():
    print(label_name, step)

    checkpoint_dir = os.path.join(
        ROOT_DIR,
        f"results/baselines/checkpoints/twitter-roberta-large-2022-154m-unbalanced/checkpoint-{step}",
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=checkpoint_dir,
        num_labels=3,
        problem_type="multi_label_classification",
        id2label=dict(enumerate(label_order)),
        label2id={name: position for position, name in enumerate(label_order)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Passing model/tokenizer through `args` binds them for this iteration explicitly.
    eval_embeddings[label_name] = subtask4a_eval_df["text"].apply(
        get_last_hidden_states, args=(model, tokenizer)
    )

In [263]:
# Heuristic feature names; they are used as column selectors on the heuristic
# dataframes in the (currently commented-out) feature-augmentation lines.
features = [
    "is_claim_with_sciterm",
    "is_claim",
    "contains_arg",
    "contains_scientific_term",
    "has_url",
    "has_sci_domain",
    "has_sci_subdomain",
    "has_sci_mag_domain",
    "has_sci_news_domain",
    "is_related_to_research",
    "mentions_science_research_in_general",
    "mentions_scientist",
    "mentions_publications",
    "mentions_research_method",
]

# Read the heuristic tables (tab-separated) for the train, dev and eval splits.
_read_tsv = partial(pd.read_csv, sep="\t")

train_heuristic_df = _read_tsv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_train_clean_heuristics.tsv")
)
test_heuristic_df = _read_tsv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_dev_clean_heuristics.tsv")
)
eval_heuristic_df = _read_tsv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_eval_heuristics.tsv")
)

In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.utils import shuffle

def _stack_embeddings(series):
    """Turn a series of per-text embedding arrays into a 2D (n_samples, -1) matrix."""
    return np.array(series.tolist()).reshape(len(series), -1)


# Fit LazyClassifier on the embeddings of each label and show the resulting model
# table sorted by the custom f1_score metric.
for k in embeddings:
    print(k)
    print("-------------------------------------------------------------------------------")
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=f1_score)

    X_train = _stack_embeddings(embeddings[k])
    # Optional heuristic features (disabled):
    # X_train = np.hstack((X_train, train_heuristic_df[features].replace({True: 1, False: 0}).values))
    y_train = subtask4a_train_df[k].values.tolist()
    # Shuffle features and targets together so the rows stay aligned.
    X_train, y_train = shuffle(X_train, y_train)

    X_test = _stack_embeddings(test_embeddings[k])
    # Optional heuristic features (disabled):
    # X_test = np.hstack((X_test, test_heuristic_df[features].replace({True: 1, False: 0}).values))
    y_test = subtask4a_test_df[k].values.tolist()

    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    display(models.sort_values("f1_score", ascending=False))

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestCentroid

def _embedding_matrix(series):
    """Turn a series of per-text embedding arrays into a 2D (n_samples, -1) matrix."""
    return np.array(series.tolist()).reshape(len(series), -1)


# Classifier chosen for each label.
best_clf = {
    "scientific_claim": NearestCentroid(),
    "scientific_reference": GaussianNB(),
    "scientific_entities": NearestCentroid(),
}

# Per-label predictions on the dev split and on the eval split.
preds_clf = {}
eval_preds_clf = {}
for k, clf in best_clf.items():
    print(k, clf)
    print("-------------------------------------------------------------------------------")

    X_train = _embedding_matrix(embeddings[k])
    # Optional heuristic features (disabled):
    # X_train = np.hstack((X_train, train_heuristic_df[features].replace({True: 1, False: 0}).values))
    y_train = subtask4a_train_df[k].values.tolist()
    # Shuffle features and targets together so the rows stay aligned.
    X_train, y_train = shuffle(X_train, y_train)

    X_test = _embedding_matrix(test_embeddings[k])
    # Optional heuristic features (disabled):
    # X_test = np.hstack((X_test, test_heuristic_df[features].replace({True: 1, False: 0}).values))
    y_test = subtask4a_test_df[k].values.tolist()

    X_eval = _embedding_matrix(eval_embeddings[k])

    clf.fit(X_train, y_train)
    preds_clf[k] = clf.predict(X_test)
    eval_preds_clf[k] = clf.predict(X_eval)

    # F1 on the dev split for this label, from the predictions stored above.
    print(f1_score(y_test, preds_clf[k]))

In [None]:
# Inspect the dev-split predictions of the three per-label classifiers
# (array, raw stored value, plain list).
(
    np.asarray(preds_clf["scientific_claim"]),
    preds_clf["scientific_reference"],
    np.asarray(preds_clf["scientific_entities"]).tolist(),
)

In [None]:
# Score the combined per-label dev predictions against the dev labels.
label_columns = ["scientific_claim", "scientific_reference", "scientific_entities"]
dev_predictions = pd.DataFrame(preds_clf)[label_columns].values.tolist()
dev_references = subtask4a_test_df[label_columns].values

compute_metrics(dev_predictions, dev_references, labels=list(label_columns))

In [None]:
# Assemble the dev-split submission: the sample index plus one prediction column per label.
submission_columns = {
    "scientific_claim": "cat1_pred",
    "scientific_reference": "cat2_pred",
    "scientific_entities": "cat3_pred",
}
test_submission_df = pd.DataFrame(preds_clf).rename(columns=submission_columns)
test_submission_df["index"] = subtask4a_test_df["index"]
test_submission_df = test_submission_df[["index", "cat1_pred", "cat2_pred", "cat3_pred"]]

# Write predictions.csv, then pack only that file into test_predictions.zip.
baselines_dir = os.path.join(ROOT_DIR, "results/baselines/")
test_submission_df.to_csv(os.path.join(baselines_dir, "predictions.csv"), index=False)

shutil.make_archive(
    base_name=os.path.join(baselines_dir, "test_predictions"),
    format="zip",
    root_dir=baselines_dir,
    base_dir="predictions.csv",
)

In [None]:
# Eval split: assemble the submission with the sample index plus one prediction column per label.
submission_columns = {
    "scientific_claim": "cat1_pred",
    "scientific_reference": "cat2_pred",
    "scientific_entities": "cat3_pred",
}
eval_submission_df = pd.DataFrame(eval_preds_clf).rename(columns=submission_columns)
eval_submission_df["index"] = subtask4a_eval_df["index"]
eval_submission_df = eval_submission_df[["index", "cat1_pred", "cat2_pred", "cat3_pred"]]

# Write predictions.csv (same file name as the dev submission), then pack only that
# file into predictions.zip.
baselines_dir = os.path.join(ROOT_DIR, "results/baselines/")
eval_submission_df.to_csv(os.path.join(baselines_dir, "predictions.csv"), index=False)

shutil.make_archive(
    base_name=os.path.join(baselines_dir, "predictions"),
    format="zip",
    root_dir=baselines_dir,
    base_dir="predictions.csv",
)