In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from utils.utils import load_data, remove_zero_features, load_confounders, deconfound_linear, standardize, label_freq_sorted

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.utils import resample

from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, f1_score, hamming_loss

In [2]:
plot_path = 'plots/'

In [3]:
N_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {N_CORES}")

Number of physical cores: 8


---

In [None]:
# Load data for classification task
subject_data, features, diagnoses = load_data('classification')

In [None]:
# Remove zero features
F = remove_zero_features(features.iloc[:,1:])

In [None]:
# Load confounders
C = load_confounders(subject_data)

# Apply deconfounding
#F = deconfound_linear(C, F)

In [None]:
# Standardize
X = standardize(F)
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")

Number of samples: 2815
Number of features: 922


In [None]:
# Remove ID column
Y = diagnoses.iloc[:,1:]
print(f"Number of labels: {Y.shape[1]}")

Number of labels: 13


In [None]:
# Split dataset into train and test (holdout) set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
print(f"Number of samples in training set: {len(X_train)}")
print(f"Number of samples in test set: {len(X_test)}")

Number of samples in training set: 2111
Number of samples in test set: 704


---

## 1. MultiOutputClassifier
Evaluate classification models wrapped in meta estimator MultiOutputClassifier with respect to multi-label performance metrics

### 1.1. Always zero baseline estimator

In [9]:
auprc_zero_basline = []
auroc_zero_basline = []
brier_zero_basline = []
hamm_zero_basline = []
f1_zero_basline = []

zero_baseline_clf = DummyClassifier(strategy='constant', constant=0 ,random_state=0)
zero_baseline_meta_clf = MultiOutputClassifier(zero_baseline_clf)
zero_baseline_meta_clf = zero_baseline_meta_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    y_prob = zero_baseline_meta_clf.predict_proba(X_test_resampled)
    y_pred = zero_baseline_meta_clf.predict(X_test_resampled)

    # Combine prediction probas into single ndarray
    y_prob_combined = y_prob[0][:,1].reshape(-1,1)
    for i in range(len(y_prob)):
        if i == (len(y_prob))-1:
            break
        else:
            y_prob_combined = np.concatenate([y_prob_combined, y_prob[i+1][:,1].reshape(-1,1)], axis=1)

    # Compute brier score
    brier_scores = np.zeros(len(y_prob))
    for i in range(len(y_prob)):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], y_prob[i][:,1])
    brier_zero_basline.append(brier_scores.mean()) 
    
    # Other metrics
    auprc_zero_basline.append(average_precision_score(y_test_resampled, y_prob_combined, average='macro')) 
    auroc_zero_basline.append(roc_auc_score(y_test_resampled, y_prob_combined, average='macro'))
    f1_zero_basline.append(f1_score(y_test_resampled, y_pred, average='micro'))
    hamm_zero_basline.append(hamming_loss(y_test_resampled, y_pred))

print(f"Mean scores for always zero baseline with MultiOutputClassifier with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_zero_basline), np.percentile(auprc_zero_basline, 2.5), np.percentile(auprc_zero_basline, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_zero_basline), np.percentile(auroc_zero_basline, 2.5), np.percentile(auroc_zero_basline, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_zero_basline), np.percentile(brier_zero_basline, 2.5), np.percentile(brier_zero_basline, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_zero_basline), np.percentile(hamm_zero_basline, 2.5), np.percentile(hamm_zero_basline, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_zero_basline), np.percentile(f1_zero_basline, 2.5), np.percentile(f1_zero_basline, 97.5)))

Mean scores for always zero baseline with MultiOutputClassifier with 95% confidence intervals:
    AUPRC macro: 0.17 [0.16, 0.18]
    AUROC macro: 0.50 [0.50, 0.50]
    Brier score: 0.17 [0.16, 0.18]
    Hamming loss: 0.17 [0.16, 0.18]
    Micro Avg F1 score: 0.00 [0.00, 0.00]


### 1.2. Label proportion baseline estimator

In [10]:
auprc_lprop_basline = []
auroc_lprop_basline = []
brier_lprop_basline = []
f1_lprop_basline = []
hamm_lprop_basline = []

lprop_baseline_clf = DummyClassifier(strategy='prior', random_state=0)
lprop_baseline_meta_clf = MultiOutputClassifier(lprop_baseline_clf)
lprop_baseline_meta_clf = lprop_baseline_meta_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    y_prob = lprop_baseline_meta_clf.predict_proba(X_test_resampled)
    y_pred = lprop_baseline_meta_clf.predict(X_test_resampled)

    # Combine prediction probas into single ndarray
    y_prob_combined = y_prob[0][:,1].reshape(-1,1)
    for i in range(len(y_prob)):
        if i == (len(y_prob))-1:
            break
        else:
            y_prob_combined = np.concatenate([y_prob_combined, y_prob[i+1][:,1].reshape(-1,1)], axis=1)

    # Compute brier score
    brier_scores = np.zeros(len(y_prob))
    for i in range(len(y_prob)):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], y_prob[i][:,1])
    brier_lprop_basline.append(brier_scores.mean()) 
    
    # Other metrics
    auprc_lprop_basline.append(average_precision_score(y_test_resampled, y_prob_combined, average='macro')) 
    auroc_lprop_basline.append(roc_auc_score(y_test_resampled, y_prob_combined, average='macro'))
    f1_lprop_basline.append(f1_score(y_test_resampled, y_pred, average='micro'))
    hamm_lprop_basline.append(hamming_loss(y_test_resampled, y_pred))

print(f"Mean scores for label proportion baseline with MultiOutputClassifier with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_lprop_basline), np.percentile(auprc_lprop_basline, 2.5), np.percentile(auprc_lprop_basline, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_lprop_basline), np.percentile(auroc_lprop_basline, 2.5), np.percentile(auroc_lprop_basline, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_lprop_basline), np.percentile(brier_lprop_basline, 2.5), np.percentile(brier_lprop_basline, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_lprop_basline), np.percentile(hamm_lprop_basline, 2.5), np.percentile(hamm_lprop_basline, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_lprop_basline), np.percentile(f1_lprop_basline, 2.5), np.percentile(f1_lprop_basline, 97.5)))

Mean scores for label proportion baseline with MultiOutputClassifier with 95% confidence intervals:
    AUPRC macro: 0.17 [0.16, 0.18]
    AUROC macro: 0.50 [0.50, 0.50]
    Brier score: 0.11 [0.11, 0.12]
    Hamming loss: 0.15 [0.14, 0.15]
    Micro Avg F1 score: 0.40 [0.38, 0.42]


### 1.3. Logistic regression

In [11]:
auprc_LR_meta = []
auroc_LR_meta = []
brier_LR_meta = []
f1_LR_meta = []
hamm_LR_meta = []

LR_base_clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
LR_meta_clf = MultiOutputClassifier(LR_base_clf, n_jobs=-1)
LR_meta_clf = LR_meta_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = LR_meta_clf.predict_proba(X_test_resampled)
    Y_pred = LR_meta_clf.predict(X_test_resampled)

    # Combine prediction probas into single ndarray
    Y_prob_merged = Y_prob[0][:,1].reshape(-1,1)
    for i in range(1, len(Y.columns), 1):
            Y_prob_merged = np.concatenate([Y_prob_merged, Y_prob[i][:,1].reshape(-1,1)], axis=1)
    
    # Compute brier score
    brier_scores = np.zeros(len(Y.columns))
    for i in range(len(Y.columns)):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob_merged[:, i])
    brier_LR_meta.append(brier_scores.mean())

    # Other metrics
    auprc_LR_meta.append(average_precision_score(y_test_resampled, Y_prob_merged, average='macro')) 
    auroc_LR_meta.append(roc_auc_score(y_test_resampled, Y_prob_merged, average='macro'))
    f1_LR_meta.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_LR_meta.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for LR with MultiOutputClassifier with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_LR_meta), np.percentile(auprc_LR_meta, 2.5), np.percentile(auprc_LR_meta, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_LR_meta), np.percentile(auroc_LR_meta, 2.5), np.percentile(auroc_LR_meta, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_LR_meta), np.percentile(brier_LR_meta, 2.5), np.percentile(brier_LR_meta, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_LR_meta), np.percentile(hamm_LR_meta, 2.5), np.percentile(hamm_LR_meta, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_LR_meta), np.percentile(f1_LR_meta, 2.5), np.percentile(f1_LR_meta, 97.5)))

Mean scores for LR with MultiOutputClassifier with 95% confidence intervals:
    AUPRC macro: 0.20 [0.19, 0.22]
    AUROC macro: 0.55 [0.53, 0.57]
    Brier score: 0.23 [0.22, 0.24]
    Hamming loss: 0.28 [0.27, 0.29]
    Micro Avg F1 score: 0.34 [0.32, 0.36]


### 1.4. SVM

In [12]:
auprc_SVM_meta = []
auroc_SVM_meta = []
brier_SVM_meta = []
f1_SVM_meta = []
hamm_SVM_meta = []

SVC_base_clf = SVC(class_weight='balanced', kernel='rbf', gamma='scale', probability=True, random_state=0)
SVC_meta_clf = MultiOutputClassifier(SVC_base_clf, n_jobs=-1)
SVC_meta_clf = LR_meta_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = SVC_meta_clf.predict_proba(X_test_resampled)
    Y_pred = SVC_meta_clf.predict(X_test_resampled)

    # Combine prediction probas into single ndarray
    Y_prob_merged = Y_prob[0][:,1].reshape(-1,1)
    for i in range(1, len(Y.columns), 1):
            Y_prob_merged = np.concatenate([Y_prob_merged, Y_prob[i][:,1].reshape(-1,1)], axis=1)
    
    # Compute brier score
    brier_scores = np.zeros(len(Y.columns))
    for i in range(len(Y.columns)):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob_merged[:, i])
    brier_SVM_meta.append(brier_scores.mean())

    # Other metrics
    auprc_SVM_meta.append(average_precision_score(y_test_resampled, Y_prob_merged, average='macro')) 
    auroc_SVM_meta.append(roc_auc_score(y_test_resampled, Y_prob_merged, average='macro'))
    f1_SVM_meta.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_SVM_meta.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for SVM with MultiOutputClassifier with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_SVM_meta), np.percentile(auprc_SVM_meta, 2.5), np.percentile(auprc_SVM_meta, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_SVM_meta), np.percentile(auroc_SVM_meta, 2.5), np.percentile(auroc_SVM_meta, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_SVM_meta), np.percentile(brier_SVM_meta, 2.5), np.percentile(brier_SVM_meta, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_SVM_meta), np.percentile(hamm_SVM_meta, 2.5), np.percentile(hamm_SVM_meta, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_SVM_meta), np.percentile(f1_SVM_meta, 2.5), np.percentile(f1_SVM_meta, 97.5)))

Mean scores for SVM with MultiOutputClassifier with 95% confidence intervals:
    AUPRC macro: 0.20 [0.19, 0.22]
    AUROC macro: 0.55 [0.53, 0.57]
    Brier score: 0.23 [0.22, 0.24]
    Hamming loss: 0.28 [0.27, 0.29]
    Micro Avg F1 score: 0.34 [0.32, 0.36]


### 1.5. Random Forest

In [13]:
auprc_RF_meta = []
auroc_RF_meta = []
brier_RF_meta = []
f1_RF_meta = []
hamm_RF_meta = []

RF_base_clf = RandomForestClassifier(random_state=0)
RF_meta_clf = MultiOutputClassifier(RF_base_clf, n_jobs=-1)
RF_meta_clf = LR_meta_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = RF_meta_clf.predict_proba(X_test_resampled)
    Y_pred = RF_meta_clf.predict(X_test_resampled)

    # Combine prediction probas into single ndarray
    Y_prob_merged = Y_prob[0][:,1].reshape(-1,1)
    for i in range(1, len(Y.columns), 1):
            Y_prob_merged = np.concatenate([Y_prob_merged, Y_prob[i][:,1].reshape(-1,1)], axis=1)
    
    # Compute brier score
    brier_scores = np.zeros(len(Y.columns))
    for i in range(len(Y.columns)):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob_merged[:, i])
    brier_RF_meta.append(brier_scores.mean())

    # Other metrics
    auprc_RF_meta.append(average_precision_score(y_test_resampled, Y_prob_merged, average='macro')) 
    auroc_RF_meta.append(roc_auc_score(y_test_resampled, Y_prob_merged, average='macro'))
    f1_RF_meta.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_RF_meta.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for RF with MultiOutputClassifier with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_RF_meta), np.percentile(auprc_RF_meta, 2.5), np.percentile(auprc_RF_meta, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_RF_meta), np.percentile(auroc_RF_meta, 2.5), np.percentile(auroc_RF_meta, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_RF_meta), np.percentile(brier_RF_meta, 2.5), np.percentile(brier_RF_meta, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_RF_meta), np.percentile(hamm_RF_meta, 2.5), np.percentile(hamm_RF_meta, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_RF_meta), np.percentile(f1_RF_meta, 2.5), np.percentile(f1_RF_meta, 97.5)))

Mean scores for RF with MultiOutputClassifier with 95% confidence intervals:
    AUPRC macro: 0.20 [0.19, 0.22]
    AUROC macro: 0.55 [0.53, 0.57]
    Brier score: 0.23 [0.22, 0.24]
    Hamming loss: 0.28 [0.27, 0.29]
    Micro Avg F1 score: 0.34 [0.32, 0.36]


### 1.6 Histogram-based Gradient Boosting

In [14]:
auprc_HGB_meta = []
auroc_HGB_meta = []
brier_HGB_meta = []
f1_HGB_meta = []
hamm_HGB_meta = []

HGB_base_clf = HistGradientBoostingClassifier(random_state=0)
HGB_meta_clf = MultiOutputClassifier(HGB_base_clf, n_jobs=-1)
HGB_meta_clf = HGB_meta_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = HGB_meta_clf.predict_proba(X_test_resampled)
    Y_pred = HGB_meta_clf.predict(X_test_resampled)

    # Combine prediction probas into single ndarray
    Y_prob_merged = Y_prob[0][:,1].reshape(-1,1)
    for i in range(1, len(Y.columns), 1):
            Y_prob_merged = np.concatenate([Y_prob_merged, Y_prob[i][:,1].reshape(-1,1)], axis=1)
    
    # Compute brier score
    brier_scores = np.zeros(len(Y.columns))
    for i in range(len(Y.columns)):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob_merged[:, i])
    brier_HGB_meta.append(brier_scores.mean())

    # Other metrics
    auprc_HGB_meta.append(average_precision_score(y_test_resampled, Y_prob_merged, average='macro')) 
    auroc_HGB_meta.append(roc_auc_score(y_test_resampled, Y_prob_merged, average='macro'))
    f1_HGB_meta.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_HGB_meta.append(hamming_loss(y_test_resampled, Y_pred))

print(f"Mean scores for HGB with MultiOutputClassifier with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_HGB_meta), np.percentile(auprc_HGB_meta, 2.5), np.percentile(auprc_HGB_meta, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_HGB_meta), np.percentile(auroc_HGB_meta, 2.5), np.percentile(auroc_HGB_meta, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_HGB_meta), np.percentile(brier_HGB_meta, 2.5), np.percentile(brier_HGB_meta, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_HGB_meta), np.percentile(hamm_HGB_meta, 2.5), np.percentile(hamm_HGB_meta, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_HGB_meta), np.percentile(f1_HGB_meta, 2.5), np.percentile(f1_HGB_meta, 97.5)))

Mean scores for HGB with MultiOutputClassifier with 95% confidence intervals:
    AUPRC macro: 0.22 [0.21, 0.24]
    AUROC macro: 0.59 [0.57, 0.62]
    Brier score: 0.12 [0.12, 0.13]
    Hamming loss: 0.15 [0.14, 0.16]
    Micro Avg F1 score: 0.39 [0.37, 0.41]


### 1.7. MLP

In [15]:
auprc_MLP_meta = []
auroc_MLP_meta = []
brier_MLP_meta = []
f1_MLP_meta = []
hamm_MLP_meta = []

MLP_base_clf = MLPClassifier(random_state=0)
MLP_meta_clf = MultiOutputClassifier(MLP_base_clf, n_jobs=-1)
MLP_meta_clf = MLP_meta_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = MLP_meta_clf.predict_proba(X_test_resampled)
    Y_pred = MLP_meta_clf.predict(X_test_resampled)

    # Combine prediction probas into single ndarray
    Y_prob_merged = Y_prob[0][:,1].reshape(-1,1)
    for i in range(1, len(Y.columns), 1):
            Y_prob_merged = np.concatenate([Y_prob_merged, Y_prob[i][:,1].reshape(-1,1)], axis=1)
    
    # Compute brier score
    brier_scores = np.zeros(len(Y.columns))
    for i in range(len(Y.columns)):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob_merged[:, i])
    brier_MLP_meta.append(brier_scores.mean())

    # Other metrics
    auprc_MLP_meta.append(average_precision_score(y_test_resampled, Y_prob_merged, average='macro')) 
    auroc_MLP_meta.append(roc_auc_score(y_test_resampled, Y_prob_merged, average='macro'))
    f1_MLP_meta.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_MLP_meta.append(hamming_loss(y_test_resampled, Y_pred))

print(f"Mean scores for MLP with MultiOutputClassifier with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_MLP_meta), np.percentile(auprc_MLP_meta, 2.5), np.percentile(auprc_MLP_meta, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_MLP_meta), np.percentile(auroc_MLP_meta, 2.5), np.percentile(auroc_MLP_meta, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_MLP_meta), np.percentile(brier_MLP_meta, 2.5), np.percentile(brier_MLP_meta, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_MLP_meta), np.percentile(hamm_MLP_meta, 2.5), np.percentile(hamm_MLP_meta, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_MLP_meta), np.percentile(f1_MLP_meta, 2.5), np.percentile(f1_MLP_meta, 97.5)))

Mean scores for MLP with MultiOutputClassifier with 95% confidence intervals:
    AUPRC macro: 0.21 [0.20, 0.23]
    AUROC macro: 0.58 [0.56, 0.60]
    Brier score: 0.16 [0.15, 0.16]
    Hamming loss: 0.18 [0.18, 0.19]
    Micro Avg F1 score: 0.38 [0.36, 0.40]


## 2. ClassifierChain (ordered by frequency) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

In [16]:
by_freq = label_freq_sorted(Y_train) # list for ordered ClassifierChain

### 2.1. Always zero baseline estimator

In [17]:
auprc_zero_basline_chain = []
auroc_zero_basline_chain = []
brier_zero_basline_chain = []
hamm_zero_basline_chain = []
f1_zero_basline_chain = []

zero_baseline_clf = DummyClassifier(strategy='constant', constant=0 ,random_state=0)
zero_baseline_meta_chain_clf = ClassifierChain(zero_baseline_clf, order=by_freq, random_state=0)
zero_baseline_meta_chain_clf = zero_baseline_meta_chain_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    y_prob = zero_baseline_meta_chain_clf.predict_proba(X_test_resampled)
    y_pred = zero_baseline_meta_chain_clf.predict(X_test_resampled)

    # Compute brier score
    brier_scores = np.zeros(y_prob.shape[1])
    for i in range(y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], y_prob[:,i])
    brier_zero_basline_chain.append(brier_scores.mean()) 
    
    # Other metrics
    auprc_zero_basline_chain.append(average_precision_score(y_test_resampled, y_prob, average='macro')) 
    auroc_zero_basline_chain.append(roc_auc_score(y_test_resampled, y_prob, average='macro'))
    f1_zero_basline_chain.append(f1_score(y_test_resampled, y_pred, average='micro'))
    hamm_zero_basline_chain.append(hamming_loss(y_test_resampled, y_pred))

print(f"Mean scores for always zero baseline with ClassifierChain (ordered by frequency) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_zero_basline_chain), np.percentile(auprc_zero_basline_chain, 2.5), np.percentile(auprc_zero_basline_chain, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_zero_basline_chain), np.percentile(auroc_zero_basline_chain, 2.5), np.percentile(auroc_zero_basline_chain, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_zero_basline_chain), np.percentile(brier_zero_basline_chain, 2.5), np.percentile(brier_zero_basline_chain, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_zero_basline_chain), np.percentile(hamm_zero_basline_chain, 2.5), np.percentile(hamm_zero_basline_chain, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_zero_basline_chain), np.percentile(f1_zero_basline_chain, 2.5), np.percentile(f1_zero_basline_chain, 97.5)))

Mean scores for always zero baseline with ClassifierChain (ordered by frequency) with 95% confidence intervals:
    AUPRC macro: 0.17 [0.16, 0.18]
    AUROC macro: 0.50 [0.50, 0.50]
    Brier score: 0.17 [0.16, 0.18]
    Hamming loss: 0.17 [0.16, 0.18]
    Micro Avg F1 score: 0.00 [0.00, 0.00]


### 2.2. Label proportion baseline estimator

In [18]:
auprc_lprop_basline_chain_r = []
auroc_lprop_basline_chain_r = []
brier_lprop_basline_chain_r = []
f1_lprop_basline_chain_r = []
hamm_lprop_basline_chain_r = []

lprop_baseline_clf = DummyClassifier(strategy='prior', random_state=0)
lprop_baseline_meta_chain_r_clf = ClassifierChain(zero_baseline_clf, order=by_freq, random_state=0)
lprop_baseline_meta_chain_r_clf = lprop_baseline_meta_chain_r_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)
    

    y_prob = lprop_baseline_meta_chain_r_clf.predict_proba(X_test_resampled)
    y_pred = lprop_baseline_meta_chain_r_clf.predict(X_test_resampled)

    # Compute brier score
    brier_scores = np.zeros(y_prob.shape[1])
    for i in range(y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], y_prob[:,i])
    brier_lprop_basline_chain_r.append(brier_scores.mean()) 
    
    # Other metrics
    auprc_lprop_basline_chain_r.append(average_precision_score(y_test_resampled, y_prob, average='macro')) 
    auroc_lprop_basline_chain_r.append(roc_auc_score(y_test_resampled, y_prob, average='macro'))
    f1_lprop_basline_chain_r.append(f1_score(y_test_resampled, y_pred, average='micro'))
    hamm_lprop_basline_chain_r.append(hamming_loss(y_test_resampled, y_pred))

print(f"Mean scores for label proportion baseline with ClassifierChain (ordered by frequency) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_lprop_basline_chain_r), np.percentile(auprc_lprop_basline_chain_r, 2.5), np.percentile(auprc_lprop_basline_chain_r, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_lprop_basline_chain_r), np.percentile(auroc_lprop_basline_chain_r, 2.5), np.percentile(auroc_lprop_basline_chain_r, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_lprop_basline_chain_r), np.percentile(brier_lprop_basline_chain_r, 2.5), np.percentile(brier_lprop_basline_chain_r, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_lprop_basline_chain_r), np.percentile(hamm_lprop_basline_chain_r, 2.5), np.percentile(hamm_lprop_basline_chain_r, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_lprop_basline_chain_r), np.percentile(f1_lprop_basline_chain_r, 2.5), np.percentile(f1_lprop_basline_chain_r, 97.5)))

Mean scores for label proportion baseline with ClassifierChain (ordered by frequency) with 95% confidence intervals:
    AUPRC macro: 0.17 [0.16, 0.18]
    AUROC macro: 0.50 [0.50, 0.50]
    Brier score: 0.17 [0.16, 0.18]
    Hamming loss: 0.17 [0.16, 0.18]
    Micro Avg F1 score: 0.00 [0.00, 0.00]


### 2.3. Logistic regression

In [19]:
auprc_LR_meta_chain = []
auroc_LR_meta_chain = []
brier_LR_meta_chain = []
f1_LR_meta_chain = []
hamm_LR_meta_chain = []

LR_base_clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0, n_jobs=-1)
LR_meta_chain_clf = ClassifierChain(LR_base_clf, order=by_freq, random_state=0)
LR_meta_chain_clf = LR_meta_chain_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = LR_meta_chain_clf.predict_proba(X_test_resampled)
    Y_pred = LR_meta_chain_clf.predict(X_test_resampled)
    
    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_LR_meta_chain.append(brier_scores.mean())

    # Other metrics
    auprc_LR_meta_chain.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_LR_meta_chain.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_LR_meta_chain.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_LR_meta_chain.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for LR with ClassifierChain (ordered by frequency) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_LR_meta_chain), np.percentile(auprc_LR_meta_chain, 2.5), np.percentile(auprc_LR_meta_chain, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_LR_meta_chain), np.percentile(auroc_LR_meta_chain, 2.5), np.percentile(auroc_LR_meta_chain, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_LR_meta_chain), np.percentile(brier_LR_meta_chain, 2.5), np.percentile(brier_LR_meta_chain, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_LR_meta_chain), np.percentile(hamm_LR_meta_chain, 2.5), np.percentile(hamm_LR_meta_chain, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_LR_meta_chain), np.percentile(f1_LR_meta_chain, 2.5), np.percentile(f1_LR_meta_chain, 97.5)))

Mean scores for LR with ClassifierChain (ordered by frequency) with 95% confidence intervals:
    AUPRC macro: 0.20 [0.19, 0.22]
    AUROC macro: 0.55 [0.53, 0.58]
    Brier score: 0.23 [0.22, 0.23]
    Hamming loss: 0.28 [0.27, 0.28]
    Micro Avg F1 score: 0.35 [0.33, 0.37]


### 2.4. SVM

In [20]:
auprc_SVM_meta_chain = []
auroc_SVM_meta_chain = []
brier_SVM_meta_chain = []
f1_SVM_meta_chain = []
hamm_SVM_meta_chain = []

SVC_base_clf = SVC(class_weight='balanced', kernel='rbf', gamma='scale', probability=True, random_state=0)
SVC_meta_chain_clf = ClassifierChain(SVC_base_clf, order=by_freq, random_state=0)
SVC_meta_chain_clf = LR_meta_chain_clf.fit(X_train, Y_train)

for i in range(10):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = SVC_meta_chain_clf.predict_proba(X_test_resampled)
    Y_pred = SVC_meta_chain_clf.predict(X_test_resampled)
    
    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_SVM_meta_chain.append(brier_scores.mean())

    # Other metrics
    auprc_SVM_meta_chain.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_SVM_meta_chain.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_SVM_meta_chain.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_SVM_meta_chain.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for SVM with ClassifierChain (ordered by frequency) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_SVM_meta_chain), np.percentile(auprc_SVM_meta_chain, 2.5), np.percentile(auprc_SVM_meta_chain, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_SVM_meta_chain), np.percentile(auroc_SVM_meta_chain, 2.5), np.percentile(auroc_SVM_meta_chain, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_SVM_meta_chain), np.percentile(brier_SVM_meta_chain, 2.5), np.percentile(brier_SVM_meta_chain, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_SVM_meta_chain), np.percentile(hamm_SVM_meta_chain, 2.5), np.percentile(hamm_SVM_meta_chain, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_SVM_meta_chain), np.percentile(f1_SVM_meta_chain, 2.5), np.percentile(f1_SVM_meta_chain, 97.5)))

Mean scores for SVM with ClassifierChain (ordered by frequency) with 95% confidence intervals:
    AUPRC macro: 0.20 [0.20, 0.21]
    AUROC macro: 0.55 [0.53, 0.57]
    Brier score: 0.23 [0.22, 0.23]
    Hamming loss: 0.28 [0.27, 0.28]
    Micro Avg F1 score: 0.35 [0.34, 0.36]


### 2.5. Random Forest

In [21]:
auprc_RF_meta_chain = []
auroc_RF_meta_chain = []
brier_RF_meta_chain = []
f1_RF_meta_chain = []
hamm_RF_meta_chain = []

RF_base_clf = RandomForestClassifier(n_jobs=-1, random_state=0)
RF_meta_chain_clf = ClassifierChain(RF_base_clf, order=by_freq, random_state=0)
RF_meta_chain_clf = LR_meta_chain_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = RF_meta_chain_clf.predict_proba(X_test_resampled)
    Y_pred = RF_meta_chain_clf.predict(X_test_resampled)
    
    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_RF_meta_chain.append(brier_scores.mean())

    # Other metrics
    auprc_RF_meta_chain.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_RF_meta_chain.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_RF_meta_chain.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_RF_meta_chain.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for RF meta with ClassifierChain (ordered by frequency) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_RF_meta_chain), np.percentile(auprc_RF_meta_chain, 2.5), np.percentile(auprc_RF_meta_chain, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_RF_meta_chain), np.percentile(auroc_RF_meta_chain, 2.5), np.percentile(auroc_RF_meta_chain, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_RF_meta_chain), np.percentile(brier_RF_meta_chain, 2.5), np.percentile(brier_RF_meta_chain, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_RF_meta_chain), np.percentile(hamm_RF_meta_chain, 2.5), np.percentile(hamm_RF_meta_chain, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_RF_meta_chain), np.percentile(f1_RF_meta_chain, 2.5), np.percentile(f1_RF_meta_chain, 97.5)))

Mean scores for RF meta with ClassifierChain (ordered by frequency) with 95% confidence intervals:
    AUPRC macro: 0.20 [0.19, 0.22]
    AUROC macro: 0.55 [0.53, 0.58]
    Brier score: 0.23 [0.22, 0.23]
    Hamming loss: 0.28 [0.27, 0.28]
    Micro Avg F1 score: 0.35 [0.33, 0.37]


### 2.6 Histogram-based Gradient Boosting

In [22]:
auprc_HGB_meta_chain = []
auroc_HGB_meta_chain = []
brier_HGB_meta_chain = []
f1_HGB_meta_chain = []
hamm_HGB_meta_chain = []


HGB_base_clf = HistGradientBoostingClassifier(random_state=0)
HGB_meta_chain_clf = ClassifierChain(HGB_base_clf, order=by_freq, random_state=0)
HGB_meta_chain_clf = HGB_meta_chain_clf.fit(X_train, Y_train)

for i in range(10):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = HGB_meta_chain_clf.predict_proba(X_test_resampled)
    Y_pred = HGB_meta_chain_clf.predict(X_test_resampled)
    
    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_HGB_meta_chain.append(brier_scores.mean())

    # Other metrics
    auprc_HGB_meta_chain.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_HGB_meta_chain.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_HGB_meta_chain.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_HGB_meta_chain.append(hamming_loss(y_test_resampled, Y_pred))

print(f"Mean scores for HGB with ClassifierChain (ordered by frequency) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_HGB_meta_chain), np.percentile(auprc_HGB_meta_chain, 2.5), np.percentile(auprc_HGB_meta_chain, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_HGB_meta_chain), np.percentile(auroc_HGB_meta_chain, 2.5), np.percentile(auroc_HGB_meta_chain, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_HGB_meta_chain), np.percentile(brier_HGB_meta_chain, 2.5), np.percentile(brier_HGB_meta_chain, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_HGB_meta_chain), np.percentile(hamm_HGB_meta_chain, 2.5), np.percentile(hamm_HGB_meta_chain, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_HGB_meta_chain), np.percentile(f1_HGB_meta_chain, 2.5), np.percentile(f1_HGB_meta_chain, 97.5)))

Mean scores for HGB with ClassifierChain (ordered by frequency) with 95% confidence intervals:
    AUPRC macro: 0.22 [0.20, 0.23]
    AUROC macro: 0.58 [0.55, 0.60]
    Brier score: 0.12 [0.12, 0.13]
    Hamming loss: 0.15 [0.14, 0.15]
    Micro Avg F1 score: 0.39 [0.38, 0.40]


### 2.7. MLP

In [23]:
auprc_MLP_meta_chain = []
auroc_MLP_meta_chain = []
brier_MLP_meta_chain = []
f1_MLP_meta_chain = []
hamm_MLP_meta_chain = []

MLP_base_clf = MLPClassifier(random_state=0)
MLP_meta_chain_clf = ClassifierChain(MLP_base_clf, order=by_freq, random_state=0)
MLP_meta_chain_clf = MLP_meta_chain_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = MLP_meta_chain_clf.predict_proba(X_test_resampled)
    Y_pred = MLP_meta_chain_clf.predict(X_test_resampled)

    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_MLP_meta_chain.append(brier_scores.mean())

    # Other metrics
    auprc_MLP_meta_chain.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_MLP_meta_chain.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_MLP_meta_chain.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_MLP_meta_chain.append(hamming_loss(y_test_resampled, Y_pred))

print(f"Mean scores for MLP with ClassifierChain (ordered by frequency) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_MLP_meta_chain), np.percentile(auprc_MLP_meta_chain, 2.5), np.percentile(auprc_MLP_meta_chain, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_MLP_meta_chain), np.percentile(auroc_MLP_meta_chain, 2.5), np.percentile(auroc_MLP_meta_chain, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_MLP_meta_chain), np.percentile(brier_MLP_meta_chain, 2.5), np.percentile(brier_MLP_meta_chain, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_MLP_meta_chain), np.percentile(hamm_MLP_meta_chain, 2.5), np.percentile(hamm_MLP_meta_chain, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_MLP_meta_chain), np.percentile(f1_MLP_meta_chain, 2.5), np.percentile(f1_MLP_meta_chain, 97.5)))

Mean scores for MLP with ClassifierChain (ordered by frequency) with 95% confidence intervals:
    AUPRC macro: 0.21 [0.20, 0.23]
    AUROC macro: 0.57 [0.55, 0.60]
    Brier score: 0.16 [0.15, 0.16]
    Hamming loss: 0.18 [0.18, 0.19]
    Micro Avg F1 score: 0.37 [0.35, 0.39]


## 3. ClassifierChain (random order) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

### 3.1. Always zero baseline estimator

In [24]:
auprc_zero_basline_chain_r = []
auroc_zero_basline_chain_r = []
brier_zero_basline_chain_r = []
hamm_zero_basline_chain_r = []
f1_zero_basline_chain_r = []

zero_baseline_clf = DummyClassifier(strategy='constant', constant=0 ,random_state=0)
zero_baseline_meta_chain_r_clf = ClassifierChain(zero_baseline_clf, random_state=0) # random order
zero_baseline_meta_chain_r_clf = zero_baseline_meta_chain_r_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    y_prob = zero_baseline_meta_chain_r_clf.predict_proba(X_test_resampled)
    y_pred = zero_baseline_meta_chain_r_clf.predict(X_test_resampled)

    # Compute brier score
    brier_scores = np.zeros(y_prob.shape[1])
    for i in range(y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], y_prob[:,i])
    brier_zero_basline_chain_r.append(brier_scores.mean()) 
    
    # Other metrics
    auprc_zero_basline_chain_r.append(average_precision_score(y_test_resampled, y_prob, average='macro')) 
    auroc_zero_basline_chain_r.append(roc_auc_score(y_test_resampled, y_prob, average='macro'))
    f1_zero_basline_chain_r.append(f1_score(y_test_resampled, y_pred, average='micro'))
    hamm_zero_basline_chain_r.append(hamming_loss(y_test_resampled, y_pred))

print(f"Mean scores for always zero baseline with ClassifierChain (random order) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_zero_basline_chain_r), np.percentile(auprc_zero_basline_chain_r, 2.5), np.percentile(auprc_zero_basline_chain_r, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_zero_basline_chain_r), np.percentile(auroc_zero_basline_chain_r, 2.5), np.percentile(auroc_zero_basline_chain_r, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_zero_basline_chain_r), np.percentile(brier_zero_basline_chain_r, 2.5), np.percentile(brier_zero_basline_chain_r, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_zero_basline_chain_r), np.percentile(hamm_zero_basline_chain_r, 2.5), np.percentile(hamm_zero_basline_chain_r, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_zero_basline_chain_r), np.percentile(f1_zero_basline_chain_r, 2.5), np.percentile(f1_zero_basline_chain_r, 97.5)))

Mean scores for always zero baseline with ClassifierChain (random order) with 95% confidence intervals:
    AUPRC macro: 0.17 [0.16, 0.18]
    AUROC macro: 0.50 [0.50, 0.50]
    Brier score: 0.17 [0.16, 0.18]
    Hamming loss: 0.17 [0.16, 0.18]
    Micro Avg F1 score: 0.00 [0.00, 0.00]


### 3.2. Label proportion baseline estimator

In [25]:
auprc_lprop_basline_chain = []
auroc_lprop_basline_chain = []
brier_lprop_basline_chain = []
f1_lprop_basline_chain = []
hamm_lprop_basline_chain = []

lprop_baseline_clf = DummyClassifier(strategy='prior', random_state=0)
lprop_baseline_meta_chain_clf = ClassifierChain(zero_baseline_clf, random_state=0) # random order
lprop_baseline_meta_chain_clf = lprop_baseline_meta_chain_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)
    

    y_prob = lprop_baseline_meta_chain_clf.predict_proba(X_test_resampled)
    y_pred = lprop_baseline_meta_chain_clf.predict(X_test_resampled)

    # Compute brier score
    brier_scores = np.zeros(y_prob.shape[1])
    for i in range(y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], y_prob[:,i])
    brier_lprop_basline_chain.append(brier_scores.mean()) 
    
    # Other metrics
    auprc_lprop_basline_chain.append(average_precision_score(y_test_resampled, y_prob, average='macro')) 
    auroc_lprop_basline_chain.append(roc_auc_score(y_test_resampled, y_prob, average='macro'))
    f1_lprop_basline_chain.append(f1_score(y_test_resampled, y_pred, average='micro'))
    hamm_lprop_basline_chain.append(hamming_loss(y_test_resampled, y_pred))

print(f"Mean scores for label proportion baseline with ClassifierChain (random order) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_lprop_basline_chain), np.percentile(auprc_lprop_basline_chain, 2.5), np.percentile(auprc_lprop_basline_chain, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_lprop_basline_chain), np.percentile(auroc_lprop_basline_chain, 2.5), np.percentile(auroc_lprop_basline_chain, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_lprop_basline_chain), np.percentile(brier_lprop_basline_chain, 2.5), np.percentile(brier_lprop_basline_chain, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_lprop_basline_chain), np.percentile(hamm_lprop_basline_chain, 2.5), np.percentile(hamm_lprop_basline_chain, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_lprop_basline_chain), np.percentile(f1_lprop_basline_chain, 2.5), np.percentile(f1_lprop_basline_chain, 97.5)))

Mean scores for label proportion baseline with ClassifierChain (random order) with 95% confidence intervals:
    AUPRC macro: 0.17 [0.16, 0.18]
    AUROC macro: 0.50 [0.50, 0.50]
    Brier score: 0.17 [0.16, 0.18]
    Hamming loss: 0.17 [0.16, 0.18]
    Micro Avg F1 score: 0.00 [0.00, 0.00]


### 3.3. Logistic regression

In [26]:
auprc_LR_meta_chain_r = []
auroc_LR_meta_chain_r = []
brier_LR_meta_chain_r = []
f1_LR_meta_chain_r = []
hamm_LR_meta_chain_r = []

LR_base_clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0, n_jobs=-1)
LR_meta_chain_r_clf = ClassifierChain(LR_base_clf, random_state=0) # random order
LR_meta_chain_r_clf = LR_meta_chain_r_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = LR_meta_chain_r_clf.predict_proba(X_test_resampled)
    Y_pred = LR_meta_chain_r_clf.predict(X_test_resampled)
    
    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_LR_meta_chain_r.append(brier_scores.mean())

    # Other metrics
    auprc_LR_meta_chain_r.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_LR_meta_chain_r.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_LR_meta_chain_r.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_LR_meta_chain_r.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for LR with ClassifierChain (random order) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_LR_meta_chain_r), np.percentile(auprc_LR_meta_chain_r, 2.5), np.percentile(auprc_LR_meta_chain_r, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_LR_meta_chain_r), np.percentile(auroc_LR_meta_chain_r, 2.5), np.percentile(auroc_LR_meta_chain_r, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_LR_meta_chain_r), np.percentile(brier_LR_meta_chain_r, 2.5), np.percentile(brier_LR_meta_chain_r, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_LR_meta_chain_r), np.percentile(hamm_LR_meta_chain_r, 2.5), np.percentile(hamm_LR_meta_chain_r, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_LR_meta_chain_r), np.percentile(f1_LR_meta_chain_r, 2.5), np.percentile(f1_LR_meta_chain_r, 97.5)))

Mean scores for LR with ClassifierChain (random order) with 95% confidence intervals:
    AUPRC macro: 0.20 [0.19, 0.22]
    AUROC macro: 0.55 [0.53, 0.57]
    Brier score: 0.23 [0.22, 0.24]
    Hamming loss: 0.28 [0.27, 0.29]
    Micro Avg F1 score: 0.35 [0.33, 0.36]


### 3.4. SVM

In [27]:
auprc_SVM_meta_chain_r = []
auroc_SVM_meta_chain_r = []
brier_SVM_meta_chain_r = []
f1_SVM_meta_chain_r = []
hamm_SVM_meta_chain_r = []

SVC_base_clf = SVC(class_weight='balanced', kernel='rbf', gamma='scale', probability=True, random_state=0)
SVC_meta_chain_r_clf = ClassifierChain(SVC_base_clf, random_state=0) # random order
SVC_meta_chain_r_clf = LR_meta_chain_r_clf.fit(X_train, Y_train)

for i in range(10):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = SVC_meta_chain_r_clf.predict_proba(X_test_resampled)
    Y_pred = SVC_meta_chain_r_clf.predict(X_test_resampled)
    
    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_SVM_meta_chain_r.append(brier_scores.mean())

    # Other metrics
    auprc_SVM_meta_chain_r.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_SVM_meta_chain_r.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_SVM_meta_chain_r.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_SVM_meta_chain_r.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for SVM with ClassifierChain (random order) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_SVM_meta_chain_r), np.percentile(auprc_SVM_meta_chain_r, 2.5), np.percentile(auprc_SVM_meta_chain_r, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_SVM_meta_chain_r), np.percentile(auroc_SVM_meta_chain_r, 2.5), np.percentile(auroc_SVM_meta_chain_r, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_SVM_meta_chain_r), np.percentile(brier_SVM_meta_chain_r, 2.5), np.percentile(brier_SVM_meta_chain_r, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_SVM_meta_chain_r), np.percentile(hamm_SVM_meta_chain_r, 2.5), np.percentile(hamm_SVM_meta_chain_r, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_SVM_meta_chain_r), np.percentile(f1_SVM_meta_chain_r, 2.5), np.percentile(f1_SVM_meta_chain_r, 97.5)))

Mean scores for SVM with ClassifierChain (random order) with 95% confidence intervals:
    AUPRC macro: 0.20 [0.20, 0.21]
    AUROC macro: 0.55 [0.53, 0.56]
    Brier score: 0.23 [0.22, 0.23]
    Hamming loss: 0.28 [0.27, 0.28]
    Micro Avg F1 score: 0.34 [0.33, 0.35]


### 3.5. Random Forest

In [28]:
auprc_RF_meta_chain_r = []
auroc_RF_meta_chain_r = []
brier_RF_meta_chain_r = []
f1_RF_meta_chain_r = []
hamm_RF_meta_chain_r = []

RF_base_clf = RandomForestClassifier(n_jobs=-1, random_state=0)
RF_meta_chain_r_clf = ClassifierChain(RF_base_clf, random_state=0) # random order
RF_meta_chain_r_clf = LR_meta_chain_r_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = RF_meta_chain_r_clf.predict_proba(X_test_resampled)
    Y_pred = RF_meta_chain_r_clf.predict(X_test_resampled)
    
    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_RF_meta_chain_r.append(brier_scores.mean())

    # Other metrics
    auprc_RF_meta_chain_r.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_RF_meta_chain_r.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_RF_meta_chain_r.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_RF_meta_chain_r.append(hamming_loss(y_test_resampled, Y_pred))


print(f"Mean scores for RF meta with ClassifierChain (random order) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_RF_meta_chain_r), np.percentile(auprc_RF_meta_chain_r, 2.5), np.percentile(auprc_RF_meta_chain_r, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_RF_meta_chain_r), np.percentile(auroc_RF_meta_chain_r, 2.5), np.percentile(auroc_RF_meta_chain_r, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_RF_meta_chain_r), np.percentile(brier_RF_meta_chain_r, 2.5), np.percentile(brier_RF_meta_chain_r, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_RF_meta_chain_r), np.percentile(hamm_RF_meta_chain_r, 2.5), np.percentile(hamm_RF_meta_chain_r, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_RF_meta_chain_r), np.percentile(f1_RF_meta_chain_r, 2.5), np.percentile(f1_RF_meta_chain_r, 97.5)))

Mean scores for RF meta with ClassifierChain (random order) with 95% confidence intervals:
    AUPRC macro: 0.20 [0.19, 0.22]
    AUROC macro: 0.55 [0.53, 0.57]
    Brier score: 0.23 [0.22, 0.24]
    Hamming loss: 0.28 [0.27, 0.29]
    Micro Avg F1 score: 0.35 [0.33, 0.36]


### 3.6 Histogram-based Gradient Boosting

In [29]:
auprc_HGB_meta_chain_r = []
auroc_HGB_meta_chain_r = []
brier_HGB_meta_chain_r = []
f1_HGB_meta_chain_r = []
hamm_HGB_meta_chain_r = []


HGB_base_clf = HistGradientBoostingClassifier(random_state=0)
HGB_meta_chain_r_clf = ClassifierChain(HGB_base_clf, random_state=0) # random order
HGB_meta_chain_r_clf = HGB_meta_chain_r_clf.fit(X_train, Y_train)

for i in range(10):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = HGB_meta_chain_r_clf.predict_proba(X_test_resampled)
    Y_pred = HGB_meta_chain_r_clf.predict(X_test_resampled)
    
    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_HGB_meta_chain_r.append(brier_scores.mean())

    # Other metrics
    auprc_HGB_meta_chain_r.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_HGB_meta_chain_r.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_HGB_meta_chain_r.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_HGB_meta_chain_r.append(hamming_loss(y_test_resampled, Y_pred))

print(f"Mean scores for HGB with ClassifierChain (random order) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_HGB_meta_chain_r), np.percentile(auprc_HGB_meta_chain_r, 2.5), np.percentile(auprc_HGB_meta_chain_r, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_HGB_meta_chain_r), np.percentile(auroc_HGB_meta_chain_r, 2.5), np.percentile(auroc_HGB_meta_chain_r, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_HGB_meta_chain_r), np.percentile(brier_HGB_meta_chain_r, 2.5), np.percentile(brier_HGB_meta_chain_r, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_HGB_meta_chain_r), np.percentile(hamm_HGB_meta_chain_r, 2.5), np.percentile(hamm_HGB_meta_chain_r, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_HGB_meta_chain_r), np.percentile(f1_HGB_meta_chain_r, 2.5), np.percentile(f1_HGB_meta_chain_r, 97.5)))

Mean scores for HGB with ClassifierChain (random order) with 95% confidence intervals:
    AUPRC macro: 0.21 [0.20, 0.23]
    AUROC macro: 0.59 [0.57, 0.60]
    Brier score: 0.13 [0.12, 0.13]
    Hamming loss: 0.15 [0.15, 0.16]
    Micro Avg F1 score: 0.38 [0.35, 0.39]


### 3.7. MLP

In [30]:
auprc_MLP_meta_chain_r = []
auroc_MLP_meta_chain_r = []
brier_MLP_meta_chain_r = []
f1_MLP_meta_chain_r = []
hamm_MLP_meta_chain_r = []

MLP_base_clf = MLPClassifier(random_state=0)
MLP_meta_chain_r_clf = ClassifierChain(MLP_base_clf, random_state=0) # random order
MLP_meta_chain_r_clf = MLP_meta_chain_r_clf.fit(X_train, Y_train)

for i in range(100):
    X_test_resampled, y_test_resampled = resample(X_test, Y_test, replace=True, n_samples=len(Y_test), random_state=0+i)

    Y_prob = MLP_meta_chain_r_clf.predict_proba(X_test_resampled)
    Y_pred = MLP_meta_chain_r_clf.predict(X_test_resampled)

    # Compute brier score
    brier_scores = np.zeros(Y_prob.shape[1])
    for i in range(Y_prob.shape[1]):
        brier_scores[i] = brier_score_loss(y_test_resampled.iloc[:,i], Y_prob[:, i])
    brier_MLP_meta_chain_r.append(brier_scores.mean())

    # Other metrics
    auprc_MLP_meta_chain_r.append(average_precision_score(y_test_resampled, Y_prob, average='macro')) 
    auroc_MLP_meta_chain_r.append(roc_auc_score(y_test_resampled, Y_prob, average='macro'))
    f1_MLP_meta_chain_r.append(f1_score(y_test_resampled, Y_pred, average='micro'))
    hamm_MLP_meta_chain_r.append(hamming_loss(y_test_resampled, Y_pred))

print(f"Mean scores for MLP with ClassifierChain (random order) with 95% confidence intervals:")
print("    AUPRC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auprc_MLP_meta_chain_r), np.percentile(auprc_MLP_meta_chain_r, 2.5), np.percentile(auprc_MLP_meta_chain_r, 97.5)))
print("    AUROC macro: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(auroc_MLP_meta_chain_r), np.percentile(auroc_MLP_meta_chain_r, 2.5), np.percentile(auroc_MLP_meta_chain_r, 97.5)))
print("    Brier score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(brier_MLP_meta_chain_r), np.percentile(brier_MLP_meta_chain_r, 2.5), np.percentile(brier_MLP_meta_chain_r, 97.5)))
print("    Hamming loss: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(hamm_MLP_meta_chain_r), np.percentile(hamm_MLP_meta_chain_r, 2.5), np.percentile(hamm_MLP_meta_chain_r, 97.5)))
print("    Micro Avg F1 score: {:.2f} [{:.2f}, {:.2f}]".format(np.mean(f1_MLP_meta_chain_r), np.percentile(f1_MLP_meta_chain_r, 2.5), np.percentile(f1_MLP_meta_chain_r, 97.5)))

Mean scores for MLP with ClassifierChain (random order) with 95% confidence intervals:
    AUPRC macro: 0.21 [0.20, 0.23]
    AUROC macro: 0.58 [0.55, 0.60]
    Brier score: 0.16 [0.15, 0.17]
    Hamming loss: 0.19 [0.18, 0.20]
    Micro Avg F1 score: 0.37 [0.35, 0.39]
