# Baselines but using separate predictors and the augmented data


In [None]:
# Parameters
# parameters
# The three placeholders below default to None in this cell.
# NOTE(review): presumably a pipeline runner overwrites them at execution time - confirm.
upstream = None
product = None
some_param = None

# Checkpoint identifier, number of training epochs and per-device batch size
# passed to fine_tune_binary_classifier in the training loop of this notebook.
model_id = "microsoft/deberta-large-mnli"
epochs = 10
batch_size = 32

In [None]:
import os
from pprint import pprint

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.utils import shuffle
from transformers import (
    DataCollatorWithPadding,
)

In [None]:
# Pick the dataset folder relative to the working directory: use the local
# "./data" tree when it is present, otherwise fall back to the copy two
# directory levels up.
ROOT_DIR = (
    "./data/processed/task4/subtask_4a/"
    if os.path.exists("./data")
    else "../../data/processed/task4/subtask_4a/"
)

In [6]:
import evaluate

# Load the four `evaluate` metrics used by the metric callback of this
# notebook, in the same order as before: accuracy, f1, precision, recall.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1", "precision", "recall")
)

In [None]:
from typing import List

import numpy as np
import torch
from transformers import EvalPrediction


def compute_metrics_sequenceclassification(eval_predictions: EvalPrediction):
    """Compute F1, precision, recall and accuracy for a sequence classifier.

    The predicted class of every example is the index of its largest logit
    (argmax over axis 1); no softmax or probability threshold is applied,
    since argmax over raw logits selects the same class.

    Args:
        eval_predictions (EvalPrediction): An object containing the model's predictions and the true labels.
            - `eval_predictions.predictions`: The raw logits output by the model.
            - `eval_predictions.label_ids`: The true labels for the predictions.

    Returns:
        dict: A flat dictionary mapping "f1", "precision", "recall" and
        "accuracy" to their scalar values.
    """
    # Read the fields by name instead of tuple-unpacking, so the function does
    # not break if the EvalPrediction carries more than two fields.
    logits = eval_predictions.predictions
    labels = eval_predictions.label_ids
    predictions = np.argmax(logits, axis=1)

    # Each `evaluate` metric returns a one-entry dict such as {"f1": 0.8}.
    # Unwrap it so the result is a flat name -> scalar mapping rather than a
    # nested {"f1": {"f1": 0.8}} structure.
    return {
        "f1": f1.compute(predictions=predictions, references=labels)["f1"],
        "precision": precision.compute(predictions=predictions, references=labels)["precision"],
        "recall": recall.compute(predictions=predictions, references=labels)["recall"],
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
    }

In [None]:
embedding_model = "mxbai-embed-large"

# Label columns; one classifier is trained per column.
# Alternative without the claim column:
# columns = ["scientific_reference", "scientific_entities"]
columns = ["scientific_claim", "scientific_reference", "scientific_entities"]

# Load the provided training and dev splits (tab-separated files in ROOT_DIR).
# The dev split is kept under the "test" name used by the rest of the notebook.
subtask4a_train_df, subtask4a_test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, tsv_name), sep="\t")
    for tsv_name in ("ct_train_clean.tsv", "ct_dev_clean.tsv")
)

In [None]:
def compute_weighted_loss(outputs, labels, num_items_in_batch=None, return_outputs=False, class_weights=None):
    """Compute a (optionally class-weighted) cross-entropy loss from model outputs.

    Args:
        outputs: Mapping-like model output exposing the logits under the
            "logits" key; the logits are indexed as (examples, classes).
        labels: Tensor of integer class indices, flattened before use.
        num_items_in_batch: Accepted for signature compatibility; not used.
        return_outputs (bool): When True, return ``(loss, outputs)`` instead
            of just the loss.
        class_weights: Optional per-class weights as a NumPy array, a
            sequence of numbers or a tensor. They are applied only when their
            length equals the number of classes in the logits; otherwise the
            unweighted loss is used.

    Returns:
        The scalar loss tensor, or ``(loss, outputs)`` if `return_outputs`.
    """
    logits = outputs.get("logits")
    n_labels = logits.shape[1]

    weight = None
    if class_weights is not None and len(class_weights) == n_labels:
        # as_tensor accepts arrays, lists and tensors alike (from_numpy only
        # accepts NumPy arrays) and places the weights on the logits' device.
        weight = torch.as_tensor(class_weights, dtype=torch.float32, device=logits.device)

    loss = torch.nn.functional.cross_entropy(logits.view(-1, n_labels), labels.view(-1), weight=weight)
    return (loss, outputs) if return_outputs else loss

In [None]:
def my_hp_space(trial):
    """Return the hyperparameter search space sampled from `trial`.

    Only the learning rate is searched: it is drawn on a log scale between
    1e-6 and 1e-3 via ``trial.suggest_float``.
    """
    sampled_lr = trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True)
    return {"learning_rate": sampled_lr}

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


def fine_tune_binary_classifier(
    model_name: str,
    train_texts: List[str],
    train_labels: List[int],
    val_texts: List[str],
    val_labels: List[int],
    epochs: int = 3,
    batch_size: int = 16,
) -> Trainer:
    """Fine-tune a binary classifier using Hugging Face Transformers.

    The texts are tokenized with truncation, padded per batch by the data
    collator, and the model is trained and evaluated once per epoch.

    Args:
        model_name (str): Pretrained model name (e.g., 'bert-base-uncased').
        train_texts (List[str]): List of training texts.
        train_labels (List[int]): List of training labels (0 or 1).
        val_texts (List[str]): List of validation texts.
        val_labels (List[int]): List of validation labels (0 or 1).
        epochs (int): Number of training epochs.
        batch_size (int): Per-device batch size for training and evaluation.

    Returns:
        Trainer: Hugging Face Trainer object after training.
    """
    # Load tokenizer and model.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # ignore_mismatched_sizes lets checkpoints whose classification head has a
    # different number of labels (e.g. a 3-way NLI head) be loaded with a
    # freshly initialised 2-label head instead of failing on a size mismatch.
    # NOTE(review): trust_remote_code=True allows code shipped with the
    # checkpoint to run locally - confirm it is really required.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        trust_remote_code=True,
        ignore_mismatched_sizes=True,
    )

    def preprocess_function(examples):
        # Truncate only; padding is left to the data collator so each batch is
        # padded to its own longest sequence rather than the model maximum.
        return tokenizer(examples["text"], truncation=True)

    # Create Dataset:
    ds = DatasetDict(
        {
            "train": Dataset.from_dict({"text": train_texts, "labels": train_labels}),
            "test": Dataset.from_dict({"text": val_texts, "labels": val_labels}),
        }
    )

    # Tokenize the datasets
    tokenized_ds = ds.map(preprocess_function, batched=True)

    # Define data collator (pads dynamically, batch by batch)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Mixed precision only makes sense on CUDA: prefer bf16 when the GPU
    # supports it, fall back to fp16 otherwise, and use neither without CUDA.
    use_cuda = torch.cuda.is_available()
    use_bf16 = use_cuda and torch.cuda.is_bf16_supported()

    # Define training arguments. No metric_for_best_model is set, so the
    # Trainer's default criterion decides which checkpoint is reloaded at the end.
    training_args = TrainingArguments(
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        load_best_model_at_end=True,
        learning_rate=2e-5,
        weight_decay=0.01,
        fp16=use_cuda and not use_bf16,
        bf16=use_bf16,
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"].with_format("torch"),
        eval_dataset=tokenized_ds["test"].with_format("torch"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_sequenceclassification,
        data_collator=data_collator,
    )

    # Train the model
    trainer.train()

    return trainer

In [None]:
# Train and evaluate one binary classifier per label column, using the
# original training split augmented with that column's oversampled examples.
for cl in columns:
    oversampling_path = os.path.join(ROOT_DIR, f"ct_train_oversamples_{cl}.tsv")

    # Without the oversampling file there is nothing to augment with. Skip the
    # column rather than fail on an undefined frame or silently reuse the
    # oversampled rows (and labels) of a previously processed column.
    if not os.path.exists(oversampling_path):
        print(f"Skipping {cl}: oversampling file not found at {oversampling_path}")
        continue

    # The oversampling file has no header row: first the text, then the label.
    subtask4a_cat_claim_train_df = pd.read_csv(
        oversampling_path,
        sep="\t",
        header=None,
        names=["text", cl],
    )

    print(f"Evaluating {cl}...")
    oversampling_train_df = subtask4a_cat_claim_train_df
    train_df = subtask4a_train_df[["text", cl]]
    eval_df = subtask4a_test_df[["text", cl]]

    # Train model with oversampling + training:
    combined_train_df = pd.concat([train_df, oversampling_train_df])
    X_train = np.array(combined_train_df["text"].tolist())
    y_train = [int(label) for label in combined_train_df[cl]]
    X_train, y_train = shuffle(X_train, y_train)

    X_test = np.array(eval_df["text"].tolist())
    y_test = [int(label) for label in eval_df[cl]]

    trainer = fine_tune_binary_classifier(
        model_name=model_id,
        train_texts=X_train,
        train_labels=y_train,
        val_texts=X_test,
        val_labels=y_test,
        epochs=epochs,
        batch_size=batch_size,
    )

    # Evaluate the model (on the same split that was used for validation):
    eval_result = trainer.evaluate()
    eval_result["model"] = model_id

    pprint(eval_result)