# Ensemble Evaluation, Optimal Threshold Selection and Prediction of Test Data
In this Notebook we are going to determine the optimal decision threshold for an ensemble and then use the ensemble and optimal decision threshold to predict the test-set for the submission.

In [14]:
# Install packages if on google colab
!pip install -q pytorch-lightning==1.6.4 neptune-client transformers sentencepiece

^C


In [2]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch

from transformers import AutoTokenizer

import pytorch_lightning as pl
from pytorch_lightning.loggers import NeptuneLogger

from torchmetrics import  AUROC

from sklearn.metrics import classification_report

import seaborn as sns
from pylab import rcParams
import pickle

%matplotlib inline
%config InlineBackend.figure_format='retina'

# --- Run configuration -------------------------------------------------------
RANDOM_SEED = 42  # passed to pl.seed_everything at the end of this cell
COLAB = True      # guards the Google Drive mount cell further down

# --- Plot styling ------------------------------------------------------------
HAPPY_COLORS_PALETTE = [
    "#01BEFE",
    "#FFDD00",
    "#FF7D00",
    "#FF006D",
    "#ADFF02",
    "#8F00FF",
]
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = (12, 8)

# Kept as the last expression so the cell still displays the seed that was set.
pl.seed_everything(RANDOM_SEED)

Global seed set to 42


42

If you are running this notebook on Google Colab and want to connect it to Google Drive

In [None]:
# Check whether PyTorch sees a CUDA device; predict_unseen_data falls back to the CPU otherwise.
torch.cuda.is_available()

In [None]:
if COLAB:
    import os
    from google.colab import drive

    # Show the working directory so the relative `cd` into the Drive folder
    # in the next cell can be checked (the value was previously discarded).
    print(os.getcwd())
    # Make the Google Drive contents available under /content/drive.
    drive.mount('/content/drive')

In [None]:
cd ./drive/MyDrive/human_value/human_values_behind_arguments

In [None]:
# Pull the latest changes into the repository in the current working directory.
!git pull

## Import Modules
We use PyTorch Lightning for the training and therefore import the Lightning data and model modules, as well as other helper functions.

In [20]:
from data_modules.BertDataModule import BertDataModule, BertDataset
from torch.utils.data import Dataset, DataLoader
from models.BertFineTunerPl import BertFineTunerPl
from toolbox.bert_utils import max_for_thres

In [37]:
# Configuration of the ensemble that is evaluated and used for prediction in this notebook.
PARAMS_ENSEMBLE = {
    # Checkpoint files of the ensemble members. The "HCV-<n>" prefix of each file name
    # is parsed into an id in a later cell and used to locate the matching *_PARAMS.pkl file.
    "MODEL_CHECKPOINTS": ['./checkpoints/HCV-409-microsoft-deberta-large-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-408-microsoft-deberta-large-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-406-microsoft-deberta-large-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-402-danschr-roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-403-danschr-roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-405-danschr-roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-364-microsoft-deberta-large-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-366-microsoft-deberta-large-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-368-microsoft-deberta-large-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-371-danschr-roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-372-danschr-roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt',
                          './checkpoints/HCV-375-danschr-roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165-BS_8-LR_2e-05-HL_None-DROPOUT_None-SL_None.ckpt'
                          ],
    # Free-text description of the ensemble composition.
    "DESCRIPTION":"FULL #3xDebL_F1 3EP 3xdanRobL_F1 3EP 3xDebL_Loss 3EP 3xdanRobL_Loss 3EP",
    # NOTE(review): placeholder path; the test-data cell further down reads
    # './data/arguments-test.tsv' directly and does not appear to use this key.
    "TEST_PATH" : "./data/path_to_your_test_data.csv",
    # CSV that is loaded into val_df and used to select the decision threshold.
    "LEAVE_OUT_DATA_PATH": "./data/leave_out_dataset_300.csv",
    # Passed as `average=` to max_for_thres when the threshold is selected.
    "MAX_THRESHOLD_METRIC": "custom",
    # Prefix of the ensemble NAME that is built from the model ids.
    "ENSEMBLE": "EN",
    # The 20 label names; their order defines the column order of the submission file.
    "LABEL_COLUMNS":['Self-direction: thought',
                     'Self-direction: action',
                     'Stimulation',
                     'Hedonism',
                     'Achievement',
                     'Power: dominance',
                     'Power: resources',
                     'Face',
                     'Security: personal',
                     'Security: societal',
                     'Tradition',
                     'Conformity: rules',
                     'Conformity: interpersonal',
                     'Humility',
                     'Benevolence: caring',
                     'Benevolence: dependability',
                     'Universalism: concern',
                     'Universalism: nature',
                     'Universalism: tolerance',
                     'Universalism: objectivity']
}

We extract the identifier (e.g. "HCV-409") from each checkpoint path. We use it later to pair each checkpoint with its PARAMS (the model parameters used for training).

In [42]:
# Derive the run id (e.g. "HCV-409") from every checkpoint file name: it consists of the
# first two "-"-separated parts of the file name that follows "checkpoints/".
# The loop variable is deliberately not called `id`, which would shadow the builtin
# for the rest of the notebook session.
ids = []
for checkpoint_path in PARAMS_ENSEMBLE["MODEL_CHECKPOINTS"]:
    file_name = checkpoint_path.split("checkpoints/")[1]
    name_parts = file_name.split("-")
    model_id = name_parts[0] + "-" + name_parts[1]
    ids.append(model_id)
    print(model_id)

# Ensemble name: "<ENSEMBLE>_<id1>_<id2>_..."
NAME = PARAMS_ENSEMBLE["ENSEMBLE"] + "_" + "_".join(ids)

PARAMS_ENSEMBLE["IDS"] = ids
LABEL_COLUMNS = PARAMS_ENSEMBLE["LABEL_COLUMNS"]

HCV-3
HCV-8
HCV-11
HCV-19
HCV-2


# Load Data
Load the Leave out Dataset to determine the optimal threshold

In [23]:
# Leave-out dataset used for threshold selection; the first CSV column becomes the index.
val_df = pd.read_csv(PARAMS_ENSEMBLE["LEAVE_OUT_DATA_PATH"], index_col=0)

## The Ensemble List

Take IDs that have been generated and get the params file with the same id


In [None]:
# Loading the parameters for each model (one pickled dict per id, same order as the ids).
# NOTE(review): pickle.load can execute arbitrary code contained in the file -
# only load PARAMS files that you created yourself.
PARAMS_LIST = []
for model_id in PARAMS_ENSEMBLE["IDS"]:  # not named `id`, to avoid shadowing the builtin
    with open(f'./checkpoints/{model_id}_PARAMS.pkl', 'rb') as f:
        PARAMS_LIST.append(pickle.load(f))

We group together the checkpoint and parameters in a list

In [None]:
# One record per ensemble member, bundling its training parameters, its id and
# the path to its checkpoint (the three source lists are index-aligned).
ENSEMBLE_LIST = [
    {"PARAMS": model_params, "ID": model_id, "MODEL_CHECKPOINT": checkpoint_path}
    for model_params, model_id, checkpoint_path in zip(
        PARAMS_LIST,
        PARAMS_ENSEMBLE["IDS"],
        PARAMS_ENSEMBLE["MODEL_CHECKPOINTS"],
    )
]

# Prediction of Leave-Out-Dataset
The following sections contain a lot of logging to neptune. If you wish to track the different performance with neptune you can uncomment the corresponding lines...

In [None]:
def predict_unseen_data(trained_model, data, collect_labels=True, tokenizer=None, max_token_count=None):
    """Run ``trained_model`` item by item over ``data`` and collect its predictions.

    Args:
        trained_model: Callable model returning a ``(loss, prediction)`` pair when called
            with ``(input_ids, attention_mask)``; it is moved to the GPU if one is available.
        data: Raw data handed to ``BertDataset``.
        collect_labels: If True, the ``"labels"`` entry of every dataset item is collected too.
        tokenizer: Tokenizer handed to ``BertDataset``. Defaults to the notebook-level
            ``TOKENIZER`` global (the original behaviour).
        max_token_count: Maximum token count handed to ``BertDataset``. Defaults to
            ``PARAMS["MAX_TOKEN_COUNT"]`` of the notebook-level ``PARAMS`` global.

    Returns:
        Tuple ``(predictions, labels)``: ``predictions`` is a CPU tensor with one flattened
        prediction row per item; ``labels`` is a CPU int tensor with one row per item, or an
        empty list when ``collect_labels`` is False.
    """
    # Fall back to the notebook globals so existing calls keep working unchanged.
    if tokenizer is None:
        tokenizer = TOKENIZER
    if max_token_count is None:
        max_token_count = PARAMS["MAX_TOKEN_COUNT"]

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trained_model = trained_model.to(device)

    test_dataset = BertDataset(
        data=data,
        tokenizer=tokenizer,
        max_token_count=max_token_count,
    )

    predictions = []
    labels = []

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for item in tqdm(test_dataset):
            # The model is called with a batch of size one, hence the added leading dimension.
            _, prediction = trained_model(
                item["input_ids"].unsqueeze(dim=0).to(device),
                item["attention_mask"].unsqueeze(dim=0).to(device)
            )
            # Move every result to the CPU right away so predictions do not pile up on the GPU.
            predictions.append(prediction.flatten().detach().cpu())
            if collect_labels:
                labels.append(item["labels"].int())

    predictions = torch.stack(predictions)
    if collect_labels:
        # Fix: stack the collected labels (the predictions were stacked here before,
        # so the returned "labels" were in fact the predictions).
        labels = torch.stack(labels).detach().cpu()

    return predictions, labels

We iterate over the Models in the Ensemble List and get the predictions for the leave-out-dataset and for the test-dataset for each model (If we use a test-dataset)

In [None]:
# Iterate over elements in Ensemble_List and get predictions from each model. Collect them in predictions [] list.
# `predictions` and `labels` receive one entry per ensemble member, all computed on val_df.
predictions = []
labels = []
for idx, elem in enumerate(ENSEMBLE_LIST):
    print(f"Starting with model {elem['MODEL_CHECKPOINT']}")
    # PARAMS (and TOKENIZER below) are notebook-level globals that predict_unseen_data
    # reads, so they have to be assigned before the call at the end of the loop body.
    PARAMS = elem["PARAMS"]
    trained_model = BertFineTunerPl.load_from_checkpoint(
        elem["MODEL_CHECKPOINT"],
        params=PARAMS,
        label_columns=LABEL_COLUMNS,
        n_classes=len(LABEL_COLUMNS)
    )
    # Inference only: put the module into eval mode and freeze its parameters.
    trained_model.eval()
    trained_model.freeze()
    print(f"With Tokenizer {PARAMS['MODEL_PATH']}")
    # Each member is tokenized with the tokenizer named in its own PARAMS["MODEL_PATH"].
    TOKENIZER = AutoTokenizer.from_pretrained(PARAMS["MODEL_PATH"])
    pred, lab = predict_unseen_data(trained_model=trained_model, data=val_df)
    predictions.append(pred)
    labels.append(lab)

In [None]:
# Ground truth: every model was run on the same val_df, so the first entry is used.
labels_val = labels[0]
y_true_val = labels_val.numpy()
y_true_val_tensor = torch.tensor(y_true_val)

# Ensemble prediction: mean over the model axis (axis 0 of the stacked predictions).
y_pred_val = torch.stack(predictions).numpy()
y_pred_val_avg = y_pred_val.mean(axis=0)
y_pred_val_avg_tensor = torch.tensor(y_pred_val_avg)


In [None]:
# Select the decision threshold from the averaged leave-out predictions and the true labels.
THRESHOLD = max_for_thres(
    y_pred=y_pred_val_avg_tensor,
    y_true=y_true_val_tensor,
    label_columns=LABEL_COLUMNS,
    average=PARAMS_ENSEMBLE["MAX_THRESHOLD_METRIC"],
)

# Predicting the submission File.
Now that we have the optimal threshold, we can create the submission file. (Note that this is the same code as in predict.ipynb; further below we show how we additionally used stacking.)

In [None]:
# Test arguments are read from a tab-separated file.
# NOTE(review): the path is hard-coded here; PARAMS_ENSEMBLE["TEST_PATH"] is not used.
test_df_input = pd.read_csv('./data/arguments-test.tsv', sep='\t')

In [None]:
# Model input text: premise, stance and conclusion joined by single spaces.
test_df_input["text"] = (
    test_df_input["Premise"]
    + " "
    + test_df_input["Stance"]
    + " "
    + test_df_input["Conclusion"]
)
test_df_input.head()

In [None]:
# Collect one prediction tensor per ensemble member for the test data.
predictions_test = []
for idx, elem in enumerate(ENSEMBLE_LIST):
    print(f"Starting with model {elem['MODEL_CHECKPOINT']}")
    # PARAMS (and TOKENIZER below) are notebook-level globals that predict_unseen_data
    # reads, so they have to be assigned before the call at the end of the loop body.
    PARAMS = elem["PARAMS"]
    trained_model = BertFineTunerPl.load_from_checkpoint(
        elem["MODEL_CHECKPOINT"],
        params=PARAMS,
        label_columns=LABEL_COLUMNS,
        n_classes=len(LABEL_COLUMNS)
    )
    # Inference only: put the module into eval mode and freeze its parameters.
    trained_model.eval()
    trained_model.freeze()
    print(f"With Tokenizer {PARAMS['MODEL_PATH']}")
    TOKENIZER = AutoTokenizer.from_pretrained(PARAMS["MODEL_PATH"])

    # No labels are collected for the test data, so `lab` is an empty list here.
    pred, lab = predict_unseen_data(trained_model=trained_model, data=test_df_input, collect_labels=False)
    predictions_test.append(pred)

In [None]:
# Stack the per-model test predictions along a new leading axis and average over the models.
predictions_test_stacked = torch.stack(predictions_test).numpy()
predictions_avg = predictions_test_stacked.mean(axis=0)

In [None]:
# Binarise the averaged predictions: 1 where the score exceeds THRESHOLD, 0 otherwise.
upper, lower = 1, 0
above_threshold = predictions_avg > THRESHOLD
y_pred = np.where(above_threshold, upper, lower)

In [None]:
# Submission layout: the argument id followed by one 0/1 column per label,
# in LABEL_COLUMNS order (column idx of y_pred belongs to label idx).
prediction_dictionary = {
    "Argument ID": test_df_input["Argument ID"],
    **{l_name: y_pred[:, idx] for idx, l_name in enumerate(LABEL_COLUMNS)},
}

test_prediction_df = pd.DataFrame(prediction_dictionary)
test_prediction_df.head()

In [4]:
# Write the threshold-based ensemble predictions as a tab-separated submission file.
# (Plain string literal: the former f-string had no placeholders.)
test_prediction_df.to_csv("submissions/submission_test.tsv", sep="\t", index=False)

NameError: name 'test_prediction_df' is not defined

# Stacking
In the following we train logistic regressions to determine the decision threshold for each label. We train these models on the training dataset: we first collect the ensemble's predictions for the training dataset and then fit the logistic regressions so that they learn to predict the labels from these predictions.


In [None]:
# Training data on which the stacking models (logistic regressions) are fitted.
train_df = pd.read_csv("./data/data_training_full.csv")

In [None]:
# Collect predictions and labels of every ensemble member for the training data.
# NOTE: this overwrites the `predictions` / `labels` lists of the leave-out evaluation above.
predictions = []
labels = []
for idx, elem in enumerate(ENSEMBLE_LIST):
    print(f"Starting with model {elem['MODEL_CHECKPOINT']}")
    # PARAMS (and TOKENIZER below) are notebook-level globals that predict_unseen_data
    # reads, so they have to be assigned before the call at the end of the loop body.
    PARAMS = elem["PARAMS"]
    trained_model = BertFineTunerPl.load_from_checkpoint(
        elem["MODEL_CHECKPOINT"],
        params=PARAMS,
        label_columns=LABEL_COLUMNS,
        n_classes=len(LABEL_COLUMNS)
    )
    # Inference only: put the module into eval mode and freeze its parameters.
    trained_model.eval()
    trained_model.freeze()
    print(f"With Tokenizer {PARAMS['MODEL_PATH']}")
    TOKENIZER = AutoTokenizer.from_pretrained(PARAMS["MODEL_PATH"])

    pred, lab = predict_unseen_data(trained_model=trained_model, data=train_df, collect_labels=True)
    predictions.append(pred)
    labels.append(lab)

## Train Logistic Regression
We structure our input-data and then train the logistic regressions.


For each sample in the data we concatenate the predictions of all models column-wise. This gives an input of shape [len(data), 20*num_models_in_ensemble].

In [None]:
# NOTE: despite the "_val" suffix these hold the *training* data of the stacking models.
# Every model was run on the same train_df, so the first label tensor is used as target.
labels_val = labels[0]
# One row per sample with the predictions of all models side by side (ENSEMBLE_LIST order).
# A single torch.cat over the list replaces the repeated concatenation onto an empty
# 1-D tensor, which copied the growing result once per model.
predictions_val = torch.cat(predictions, dim=1)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Stacking model: a logistic regression wrapped in MultiOutputClassifier so that
# all label columns can be fitted and predicted with one estimator object.
logReg = MultiOutputClassifier(
    LogisticRegression(random_state=0, max_iter=200)
)
# Alternative base estimators (disabled):
# logReg=MultiOutputClassifier(MultinomialNB(alpha=0.1))
# logReg=MultiOutputClassifier(DecisionTreeClassifier(min_samples_leaf=3))

In [None]:
# Fit the stacking model: inputs are the concatenated per-model predictions, targets the labels.
logReg.fit(predictions_val.numpy(), labels_val.numpy())

Get the unstacked predictions for the test-file from above and concatenate the predictions from each model columnwise.

In [None]:
predictions_transformed = torch.Tensor([])
for record in predictions_test_stacked:
    predictions_transformed = torch.cat([predictions_test, record], dim=1)

Use the trained logReg Model to predict the labels

In [None]:
y_pred = logReg.predict(predictions_test)

Create Submission File

In [None]:
# Submission layout: the argument id followed by one 0/1 column per label,
# in LABEL_COLUMNS order (column idx of y_pred belongs to label idx).
prediction_dictionary = {
    "Argument ID": test_df_input["Argument ID"],
    **{l_name: y_pred[:, idx] for idx, l_name in enumerate(LABEL_COLUMNS)},
}

test_prediction_df = pd.DataFrame(prediction_dictionary)
test_prediction_df.head()

In [None]:
# Write the stacking (logistic regression) predictions as a tab-separated submission file.
# The file now carries the ".tsv" extension like the threshold-based submission above,
# and the placeholder-free f-string became a plain string literal.
test_prediction_df.to_csv("submissions/test-submission_logReg.tsv", sep="\t", index=False)