In [1]:
# %pip install datasets

import ast
import copy
from functools import partial
from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

In [2]:
class DataLoader_5F_CV:
    """Tokenise the training TSV once and pre-compute K-fold train/test splits.

    On construction the TSV is read, the ``text`` column is tokenised, and a
    shuffled ``KFold`` split is materialised for every fold, so the training
    loop only has to look folds up by index.

    Args:
        tokenizer: Callable tokenizer applied to batches of ``text`` values.
        n_folds: Number of cross-validation folds.
        seed: Random state used for the shuffled ``KFold`` split.
        data_path: Tab-separated file with at least ``text`` and ``labels``
            columns; ``labels`` holds a Python literal (e.g. a list) per row.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, tokenizer, n_folds, seed=0, data_path="ct_train_data.tsv", max_length=128):
        self.tokenizer = tokenizer
        self.n_folds = n_folds
        self.seed = seed
        self.data_path = data_path
        self.max_length = max_length
        self.data = self._read_data()
        self.cv_datasets, self.cv_test_dfs = self._create_datasets()

    def _read_data(self):
        """Load the TSV and parse the ``labels`` column from its string form."""
        data = pd.read_csv(self.data_path, sep="\t")
        # literal_eval only evaluates Python literals, so it is safe to run on file content.
        data["labels"] = data["labels"].apply(ast.literal_eval)

        return data

    @staticmethod
    def _membership_mask(split_indices, example: Union[Dict, Any], indices: List[int]) -> List[bool]:
        """Return, for a batch of row indices, whether each one belongs to ``split_indices``.

        ``example`` (the batch itself) is unused; it is only part of the
        signature because ``Dataset.filter(with_indices=True)`` passes it.
        """
        return [idx in split_indices for idx in indices]

    def _create_datasets(self):
        """Tokenise the full dataset and split it into per-fold (train, test) pairs.

        Returns:
            A pair ``(cv_datasets, cv_test_dfs)``: a list of
            ``(train_dataset, test_dataset)`` tuples and a list with the
            matching slice of the raw DataFrame for each fold's test split.
        """

        def tokenize(examples):
            return self.tokenizer(examples["text"], max_length=self.max_length, truncation=True, padding="max_length")

        cv_datasets = []
        cv_test_dfs = []

        dataset = Dataset.from_pandas(self.data[["text", "labels"]])
        dataset = dataset.map(tokenize, batched=True, batch_size=32, remove_columns=["text"])

        kf = KFold(n_splits=self.n_folds, random_state=self.seed, shuffle=True)

        for train_index, test_index in kf.split(self.data):
            # Sets of plain ints give O(1) membership tests; scanning the index
            # array for every row would make filtering quadratic in dataset size.
            train_rows = {int(i) for i in train_index}
            test_rows = {int(i) for i in test_index}
            assert train_rows.isdisjoint(test_rows)

            train_dataset = dataset.filter(
                partial(self._membership_mask, train_rows), with_indices=True, batched=True, keep_in_memory=True
            )
            test_dataset = dataset.filter(
                partial(self._membership_mask, test_rows), with_indices=True, batched=True, keep_in_memory=True
            )

            cv_datasets.append((train_dataset, test_dataset))
            cv_test_dfs.append(self.data.iloc[test_index])

        return cv_datasets, cv_test_dfs

    def get_datasets_for_fold(self, fold):
        """Return the ``(train_dataset, test_dataset)`` tuple of the given fold."""
        return self.cv_datasets[fold]

    def get_test_df_for_fold(self, fold):
        """Return the raw DataFrame rows that form the given fold's test split."""
        return self.cv_test_dfs[fold]

In [3]:
def sigmoid(z):
    """Logistic function ``1 / (1 + e^-z)``, evaluated with NumPy (element-wise on arrays)."""
    exp_neg = np.exp(-z)
    return 1 / (1 + exp_neg)


def compute_metrics(data, categories=("cat1", "cat2", "cat3")):
    """Score binary multi-label predictions per category and over all categories.

    Args:
        data: DataFrame with a ``labels`` column (one sequence of 0/1 values per
            row, position ``i`` belonging to ``categories[i]``) and one
            ``<category>_pred`` column per category.
        categories: Category names, in the same order as the label positions.

    Returns:
        Dict with ``<category>_avg_acc`` / ``_avg_prec`` / ``_avg_rec`` /
        ``_avg_f1`` for every category plus ``macro_f1`` across categories.
    """
    metrics = {}
    for position, category in enumerate(categories):
        preds = data[f"{category}_pred"].apply(lambda value: int(value) == 1)
        # `position` is bound as a default so the lambda never sees a later loop value.
        labels = data["labels"].apply(lambda row, position=position: int(float(row[position])) == 1)

        metrics.update(
            {
                f"{category}_avg_acc": accuracy_score(labels, preds),
                f"{category}_avg_prec": precision_score(labels, preds),
                f"{category}_avg_rec": recall_score(labels, preds),
                f"{category}_avg_f1": f1_score(labels, preds),
            }
        )

    # Pick the prediction columns by name, in label order. Matching on the
    # "_pred" substring would depend on DataFrame column order and could pull
    # in unrelated columns, silently misaligning predictions and labels.
    pred_columns = [f"{category}_pred" for category in categories]
    preds = data[pred_columns].astype(float).to_numpy().tolist()
    labels = data["labels"].tolist()
    metrics["macro_f1"] = f1_score(labels, preds, average="macro")

    return metrics

In [4]:
def annotate_test_dataframe(pred_output, test_df=None, fold=None, results=None, categories=("cat1", "cat2", "cat3")):
    """Attach logits, hard predictions and sigmoid scores of one fold to its test rows.

    Args:
        pred_output: Object whose ``predictions`` attribute is a 2-D array of
            logits with one column per category.
        test_df: DataFrame to annotate (modified in place). Defaults to the
            notebook-level ``test_df``.
        fold: Value written to the ``fold`` column. Defaults to the
            notebook-level ``fold``.
        results: List the annotated copy is appended to. Defaults to the
            notebook-level ``annotated_test_data``.
        categories: Category names, one per logit column, in column order.

    Returns:
        The annotated copy that was appended to ``results``.

    Raises:
        NameError: If an argument is omitted and the notebook-level variable
            it falls back to does not exist.
    """

    def _resolve(value, global_name):
        # Fall back to the notebook namespace so the one-argument call keeps working.
        if value is not None:
            return value
        try:
            return globals()[global_name]
        except KeyError:
            raise NameError(f"name '{global_name}' is not defined") from None

    test_df = _resolve(test_df, "test_df")
    fold = _resolve(fold, "fold")
    results = _resolve(results, "annotated_test_data")

    logits = pred_output.predictions
    # A logit above 0 corresponds to a sigmoid score above 0.5.
    predictions = (logits > 0) * 1
    scores = sigmoid(logits)

    # Columns are written group by group to keep the original column order.
    for column, category in enumerate(categories):
        test_df[f"{category}_logits"] = logits[:, column]
    for column, category in enumerate(categories):
        test_df[f"{category}_pred"] = predictions[:, column]
    for column, category in enumerate(categories):
        test_df[f"{category}_score"] = scores[:, column]

    test_df["fold"] = fold

    annotated = test_df.copy()
    results.append(annotated)
    return annotated

In [5]:
# One checkpoint id serves as both the model and the tokenizer source.
# path_or_id = "allenai/scibert_scivocab_cased"
path_or_id = "cardiffnlp/twitter-roberta-base-2022-154m"
model_id_or_path = tokenizer_id_or_path = path_or_id

# Fine-tuning hyper-parameters.
epochs = 10
learning_rate = 2e-5
tokenizer_max_len = 128

# Batch sizes, forwarded as keyword arguments to TrainingArguments.
dataloader_config = dict(per_device_train_batch_size=16, per_device_eval_batch_size=256)

# Cross-validation setup.
n_folds = 5
seed = 0

In [6]:
print("load model and tokenizer")
# `model_max_length` is the documented tokenizer argument for the maximum input
# length; `max_len` is only a legacy alias for it.
tokenizer_config = {"pretrained_model_name_or_path": tokenizer_id_or_path, "model_max_length": tokenizer_max_len}

# Lower-casing is a tokenizer setting, so key it on the tokenizer checkpoint.
# The SciBERT checkpoint listed in the config cell is the cased vocabulary.
if "scibert" in tokenizer_id_or_path:
    tokenizer_config["do_lower_case"] = False

tokenizer = AutoTokenizer.from_pretrained(**tokenizer_config)

# Three independent binary labels: the multi-label problem type makes the model
# treat each of the three logits as its own yes/no decision.
model_config = {
    "pretrained_model_name_or_path": model_id_or_path,
    "num_labels": 3,
    "problem_type": "multi_label_classification",
}

model = AutoModelForSequenceClassification.from_pretrained(**model_config)

load model and tokenizer


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-2022-154m and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
print("load data")
# Building the loader reads the training TSV, tokenises it and materialises the
# train/test datasets of every fold up front (hence the Map/Filter progress bars).
dl = DataLoader_5F_CV(tokenizer, n_folds, seed)

load data


Map:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1366 [00:00<?, ? examples/s]

In [8]:
# One annotated test DataFrame per fold, appended by `annotate_test_dataframe`.
annotated_test_data = []

for fold in range(n_folds):
    print(f"Fold {fold+1}")
    # Each fold fine-tunes its own copy of the freshly loaded model, so nothing
    # learned on one fold carries over to the next. The Trainer moves the model
    # to its device itself, so no explicit `.cuda()` is needed and the loop also
    # runs on a machine without a GPU.
    fold_model = copy.deepcopy(model)
    train_dataset, test_dataset = dl.get_datasets_for_fold(fold)
    # `annotate_test_dataframe` picks up `test_df` and `fold` from the notebook namespace.
    test_df = dl.get_test_df_for_fold(fold).copy()

    # NOTE(review): the notebook's `seed` only drives the KFold shuffle; training
    # uses TrainingArguments' own default seed. Left as is so the reported
    # numbers stay reproducible.
    training_args = TrainingArguments(
        output_dir="results",
        num_train_epochs=epochs,
        **dataloader_config,
        warmup_ratio=0.1,
        weight_decay=0.01,
        learning_rate=learning_rate,
        logging_strategy="no",
        save_strategy="no",
        eval_strategy="no",
        report_to="none",
    )

    trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()

    test_pred_output = trainer.predict(test_dataset)
    annotate_test_dataframe(test_pred_output)
    # Metrics of this fold only; the entry appended last belongs to the current fold.
    print(compute_metrics(annotated_test_data[-1]))

Fold 1


Step,Training Loss


{'cat1_avg_acc': 0.8905109489051095, 'cat1_avg_prec': 0.7261904761904762, 'cat1_avg_rec': 0.8970588235294118, 'cat1_avg_f1': 0.8026315789473685, 'cat2_avg_acc': 0.9416058394160584, 'cat2_avg_prec': 0.7894736842105263, 'cat2_avg_rec': 0.9183673469387755, 'cat2_avg_f1': 0.8490566037735849, 'cat3_avg_acc': 0.9452554744525548, 'cat3_avg_prec': 0.8666666666666667, 'cat3_avg_rec': 0.9285714285714286, 'cat3_avg_f1': 0.896551724137931, 'macro_f1': 0.8494133022862949}
Fold 2


Step,Training Loss


{'cat1_avg_acc': 0.9304029304029304, 'cat1_avg_prec': 0.8690476190476191, 'cat1_avg_rec': 0.9012345679012346, 'cat1_avg_f1': 0.8848484848484849, 'cat2_avg_acc': 0.9047619047619048, 'cat2_avg_prec': 0.7636363636363637, 'cat2_avg_rec': 0.7636363636363637, 'cat2_avg_f1': 0.7636363636363637, 'cat3_avg_acc': 0.9084249084249084, 'cat3_avg_prec': 0.8292682926829268, 'cat3_avg_rec': 0.8607594936708861, 'cat3_avg_f1': 0.84472049689441, 'macro_f1': 0.8310684484597527}
Fold 3


Step,Training Loss


{'cat1_avg_acc': 0.8681318681318682, 'cat1_avg_prec': 0.7340425531914894, 'cat1_avg_rec': 0.8625, 'cat1_avg_f1': 0.7931034482758621, 'cat2_avg_acc': 0.8827838827838828, 'cat2_avg_prec': 0.6333333333333333, 'cat2_avg_rec': 0.7916666666666666, 'cat2_avg_f1': 0.7037037037037037, 'cat3_avg_acc': 0.9047619047619048, 'cat3_avg_prec': 0.7692307692307693, 'cat3_avg_rec': 0.8823529411764706, 'cat3_avg_f1': 0.821917808219178, 'macro_f1': 0.772908320066248}
Fold 4


Step,Training Loss


{'cat1_avg_acc': 0.8644688644688645, 'cat1_avg_prec': 0.6923076923076923, 'cat1_avg_rec': 0.8059701492537313, 'cat1_avg_f1': 0.7448275862068966, 'cat2_avg_acc': 0.9377289377289377, 'cat2_avg_prec': 0.8163265306122449, 'cat2_avg_rec': 0.8333333333333334, 'cat2_avg_f1': 0.8247422680412371, 'cat3_avg_acc': 0.9304029304029304, 'cat3_avg_prec': 0.7763157894736842, 'cat3_avg_rec': 0.9672131147540983, 'cat3_avg_f1': 0.8613138686131386, 'macro_f1': 0.8102945742870907}
Fold 5


Step,Training Loss


{'cat1_avg_acc': 0.9047619047619048, 'cat1_avg_prec': 0.7534246575342466, 'cat1_avg_rec': 0.873015873015873, 'cat1_avg_f1': 0.8088235294117647, 'cat2_avg_acc': 0.9377289377289377, 'cat2_avg_prec': 0.8235294117647058, 'cat2_avg_rec': 0.84, 'cat2_avg_f1': 0.8316831683168316, 'cat3_avg_acc': 0.9120879120879121, 'cat3_avg_prec': 0.7638888888888888, 'cat3_avg_rec': 0.8870967741935484, 'cat3_avg_f1': 0.8208955223880597, 'macro_f1': 0.8204674067055521}


In [9]:
print("Evaluate all folds")
# Stack the annotated test rows of every fold into one frame of held-out predictions.
data = pd.concat(annotated_test_data)
# Persist the annotated predictions so they can be inspected without retraining.
data.to_pickle("preds.pkl")
# Metrics are computed on the pooled rows, not averaged over the per-fold metrics.
metrics = compute_metrics(data)
print(metrics)

Evaluate all folds
{'cat1_avg_acc': 0.8916544655929722, 'cat1_avg_prec': 0.7554479418886199, 'cat1_avg_rec': 0.8690807799442897, 'cat1_avg_f1': 0.8082901554404145, 'cat2_avg_acc': 0.9209370424597365, 'cat2_avg_prec': 0.7610294117647058, 'cat2_avg_rec': 0.828, 'cat2_avg_f1': 0.7931034482758621, 'cat3_avg_acc': 0.9202049780380673, 'cat3_avg_prec': 0.8015665796344648, 'cat3_avg_rec': 0.9029411764705882, 'cat3_avg_f1': 0.8492392807745505, 'macro_f1': 0.816877628163609}
