In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from utils.utils import load_data, remove_zero_features, load_confounders, standardize, label_freq_sorted, pca_transform
from utils.utils import generate_oversampled_set, generate_undersampled_set, generate_label_stats
from utils.utils import compute_scores

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
N_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {N_CORES}")

Number of physical cores: 8


---

In [3]:
# Load data for classification task
subject_data, features, diagnoses = load_data('classification')

In [4]:
# Remove zero features
F = remove_zero_features(features.iloc[:,1:])

In [5]:
# Load confounders
C = load_confounders(subject_data)

In [6]:
# Standardize
X = standardize(F)
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")

Number of samples: 2815
Number of features: 922


In [7]:
# Remove ID column
Y = diagnoses.iloc[:,1:]
print(f"Number of labels: {Y.shape[1]}")

Number of labels: 13


In [8]:
boot_iter = 100

---

# 1. Use dataset with original label distribution (no resampling)

In [32]:
label_stats, mean_ir = generate_label_stats(Y, True)
print(f"Mean imbalance ratio: {mean_ir}")
label_stats

Mean imbalance ratio: 7.995236107963529


Unnamed: 0,Absolute frequency,Relative frequency,Imbalance ratio
Trauma_And_Stress_RelatedDisorders,152,0.053996,11.519737
DepressiveDisorders,300,0.106572,5.836667
Attention_Deficit_HyperactivityDisorder,1751,0.622025,1.0
MotorDisorder,229,0.08135,7.646288
AutismSpectrumDisorder,398,0.141385,4.399497
CommunicationDisorder,493,0.175133,3.551724
OtherDisorders,135,0.047957,12.97037
SpecificLearningDisorder,761,0.270337,2.30092
Obsessive_Compulsive_And_RelatedDisorders,137,0.048668,12.781022
Disruptive,436,0.154885,4.016055


In [31]:
# Split dataset into train and test (holdout) set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

# Confounder 
C_train, C_test = C.loc[X_train.index], C.loc[X_test.index]

# PCA features
X_pca = pca_transform(F)
X_pca_train, X_pca_test = X_pca.loc[X_train.index], X_pca.loc[X_test.index]

print(f"Number of samples in training set: {len(X_train)}")
print(f"Number of samples in test set: {len(X_test)}")

Number of samples in training set: 2111
Number of samples in test set: 704


## 1.1. MultiOutputClassifier
Evaluate classification models wrapped in meta estimator MultiOutputClassifier with respect to multi-label performance metrics

### 1.1.1. Dummy estimators

#### 1.1.1.1. Always zero baseline estimator

In [14]:
clf = DummyClassifier(strategy='constant', constant=0 ,random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.17 (0.00) [0.16, 0.17]
auprc_weighted:               0.31 (0.01) [0.30, 0.32]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.17 (0.00) [0.16, 0.17]
brier_weighted:               0.05 (0.00) [0.05, 0.05]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.17 (0.00) [0.16, 0.17]
subset_accuracy:              0.10 (0.01) [0.08, 0.12]


#### 1.1.1.2. Label proportion baseline estimator

In [15]:
clf = DummyClassifier(strategy='prior', random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.17 (0.00) [0.16, 0.17]
auprc_weighted:               0.31 (0.01) [0.30, 0.32]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.12 (0.00) [0.11, 0.12]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.38 (0.01) [0.36, 0.40]
hamming:                      0.15 (0.00) [0.14, 0.16]
subset_accuracy:              0.10 (0.01) [0.08, 0.12]


### 1.1.2. Confounders as features

#### 1.1.1.1. Logistic regression

In [16]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(C_train, Y_train)
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.18 (0.01) [0.17, 0.19]
auprc_weighted:               0.32 (0.01) [0.31, 0.34]
auroc_macro:                  0.51 (0.01) [0.49, 0.53]
auroc_weighted:               0.51 (0.01) [0.49, 0.53]
brier_macro:                  0.25 (0.00) [0.25, 0.25]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.51 (0.01) [0.49, 0.53]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.09]
f1_micro:                     0.25 (0.01) [0.24, 0.27]
hamming:                      0.47 (0.00) [0.46, 0.48]
subset_accuracy:              0.00 (0.00) [0.00, 0.00]


#### 1.1.1.1. Histogram-based Gradient Boosting

In [17]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(C_train,Y_train)
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.18 (0.01) [0.17, 0.20]
auprc_weighted:               0.32 (0.01) [0.30, 0.34]
auroc_macro:                  0.49 (0.01) [0.47, 0.51]
auroc_weighted:               0.50 (0.01) [0.49, 0.52]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.09]
f1_micro:                     0.34 (0.01) [0.32, 0.36]
hamming:                      0.16 (0.00) [0.16, 0.17]
subset_accuracy:              0.08 (0.01) [0.06, 0.09]


### 1.1.3. PCA-projected data (top-10 components) as features 

#### 1.1.1.1. Logistic regression

In [18]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_pca_train, Y_train)
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.37 (0.01) [0.35, 0.40]
auroc_macro:                  0.60 (0.01) [0.58, 0.61]
auroc_weighted:               0.58 (0.01) [0.56, 0.60]
brier_macro:                  0.24 (0.00) [0.23, 0.24]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.57 (0.01) [0.55, 0.59]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.10]
f1_micro:                     0.31 (0.01) [0.30, 0.33]
hamming:                      0.43 (0.00) [0.42, 0.44]
subset_accuracy:              0.00 (0.00) [0.00, 0.00]


#### 1.1.1.1. Histogram-based Gradient Boosting

In [19]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_pca_train,Y_train)
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.19, 0.23]
auprc_weighted:               0.35 (0.01) [0.33, 0.37]
auroc_macro:                  0.55 (0.01) [0.53, 0.57]
auroc_weighted:               0.54 (0.01) [0.53, 0.56]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.09 (0.00) [0.08, 0.09]
f1_micro:                     0.37 (0.01) [0.35, 0.39]
hamming:                      0.16 (0.00) [0.16, 0.17]
subset_accuracy:              0.08 (0.01) [0.07, 0.10]


### 1.1.4. Original features (standardized)

#### 1.1.4.1. Logistic regression

In [20]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.19, 0.22]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.57 (0.01) [0.55, 0.59]
auroc_weighted:               0.56 (0.01) [0.54, 0.58]
brier_macro:                  0.21 (0.00) [0.21, 0.22]
brier_weighted:               0.05 (0.00) [0.04, 0.05]
balanced_accuracy_macro:      0.53 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.33 (0.01) [0.31, 0.35]
hamming:                      0.27 (0.00) [0.26, 0.28]
subset_accuracy:              0.01 (0.00) [0.01, 0.02]


#### 1.1.4.2. SVM

In [21]:
clf = SVC(class_weight='balanced', kernel='rbf', gamma='scale', probability=True, random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, 10)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.38 (0.00) [0.37, 0.39]
auroc_macro:                  0.60 (0.01) [0.58, 0.62]
auroc_weighted:               0.60 (0.00) [0.59, 0.60]
brier_macro:                  0.11 (0.00) [0.11, 0.12]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.54 (0.01) [0.53, 0.55]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.38 (0.01) [0.37, 0.39]
hamming:                      0.23 (0.00) [0.22, 0.24]
subset_accuracy:              0.04 (0.01) [0.02, 0.05]


#### 1.1.4.3. Histogram-based Gradient Boosting

In [22]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.36 (0.01) [0.34, 0.39]
auroc_macro:                  0.59 (0.01) [0.57, 0.62]
auroc_weighted:               0.58 (0.01) [0.55, 0.60]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.37 (0.01) [0.36, 0.39]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.13]


#### 1.1.4.4. MLP

In [23]:
clf = MLPClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.22 (0.01) [0.20, 0.23]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.58 (0.01) [0.55, 0.60]
auroc_weighted:               0.57 (0.01) [0.55, 0.59]
brier_macro:                  0.16 (0.00) [0.15, 0.16]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.52 (0.00) [0.52, 0.53]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.37 (0.01) [0.35, 0.39]
hamming:                      0.18 (0.00) [0.18, 0.19]
subset_accuracy:              0.08 (0.01) [0.07, 0.10]


## 1.2. ClassifierChain (ordered by frequency) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

In [11]:
by_freq = label_freq_sorted(Y_train)

### 1.2.1. Logistic regression

In [12]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, order=by_freq, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.19, 0.22]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.56 (0.01) [0.55, 0.59]
auroc_weighted:               0.56 (0.01) [0.54, 0.58]
brier_macro:                  0.21 (0.00) [0.20, 0.22]
brier_weighted:               0.04 (0.00) [0.04, 0.05]
balanced_accuracy_macro:      0.53 (0.01) [0.52, 0.55]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.34 (0.01) [0.32, 0.36]
hamming:                      0.26 (0.00) [0.26, 0.27]
subset_accuracy:              0.03 (0.01) [0.01, 0.04]


### 1.2.2. Histogram-based Gradient Boosting

In [14]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, order=by_freq, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.26]
auprc_weighted:               0.36 (0.01) [0.34, 0.39]
auroc_macro:                  0.59 (0.01) [0.57, 0.61]
auroc_weighted:               0.57 (0.01) [0.55, 0.60]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.37 (0.01) [0.36, 0.39]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.13]


## 1.3. ClassifierChain (random order) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

### 1.3.1. Logistic regression

In [15]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.20, 0.22]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.57 (0.01) [0.55, 0.59]
auroc_weighted:               0.56 (0.01) [0.54, 0.58]
brier_macro:                  0.21 (0.00) [0.21, 0.22]
brier_weighted:               0.05 (0.00) [0.04, 0.05]
balanced_accuracy_macro:      0.53 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.34 (0.01) [0.32, 0.36]
hamming:                      0.26 (0.00) [0.26, 0.28]
subset_accuracy:              0.02 (0.01) [0.01, 0.03]


### 1.3.2. Histogram-based Gradient Boosting

In [16]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.36 (0.01) [0.34, 0.39]
auroc_macro:                  0.59 (0.01) [0.57, 0.62]
auroc_weighted:               0.57 (0.01) [0.55, 0.60]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.09 (0.00) [0.08, 0.09]
f1_micro:                     0.35 (0.01) [0.34, 0.38]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.14]


# 2. Use undersampled dataset

In [9]:
# Resample data (undersampling)
X_under, Y_under = generate_undersampled_set(X, Y)
label_stats, mean_ir = generate_label_stats(Y_under, True)
print(f"Mean imbalance ratio: {mean_ir}")
label_stats

Mean imbalance ratio: 5.355254324589975


Unnamed: 0,Absolute frequency,Relative frequency,Imbalance ratio
Trauma_And_Stress_RelatedDisorders,107,0.056435,7.775701
DepressiveDisorders,175,0.0923,4.754286
Attention_Deficit_HyperactivityDisorder,832,0.438819,1.0
MotorDisorder,147,0.077532,5.659864
AutismSpectrumDisorder,288,0.151899,2.888889
CommunicationDisorder,389,0.205169,2.138817
OtherDisorders,106,0.055907,7.849057
SpecificLearningDisorder,564,0.297468,1.475177
Obsessive_Compulsive_And_RelatedDisorders,81,0.042722,10.271605
Disruptive,306,0.161392,2.718954


In [10]:
# Split dataset into train and test (holdout) set
X_train, X_test, Y_train, Y_test = train_test_split(X_under, Y_under, test_size=0.25, random_state=0)

# Confounder 
C_train, C_test = C.loc[X_train.index], C.loc[X_test.index]

# PCA features
X_pca = pca_transform(F)
X_pca_train, X_pca_test = X_pca.loc[X_train.index], X_pca.loc[X_test.index]

print(f"Number of samples in training set: {len(X_train)}")
print(f"Number of samples in test set: {len(X_test)}")

Number of samples in training set: 1422
Number of samples in test set: 474


## 2.1. MultiOutputClassifier
Evaluate classification models wrapped in meta estimator MultiOutputClassifier with respect to multi-label performance metrics

### 2.1.1. Dummy estimators

#### 2.1.1.1. Always zero baseline estimator

In [35]:
clf = DummyClassifier(strategy='constant', constant=0 ,random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.15 (0.00) [0.14, 0.16]
auprc_weighted:               0.25 (0.01) [0.23, 0.26]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.15 (0.00) [0.14, 0.16]
brier_weighted:               0.04 (0.00) [0.03, 0.04]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.07 (0.00) [0.07, 0.07]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.15 (0.00) [0.14, 0.16]
subset_accuracy:              0.14 (0.01) [0.11, 0.18]


#### 2.1.1.2. Label proportion baseline estimator

In [36]:
clf = DummyClassifier(strategy='prior', random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.15 (0.00) [0.14, 0.16]
auprc_weighted:               0.25 (0.01) [0.23, 0.26]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.11 (0.00) [0.11, 0.12]
brier_weighted:               0.02 (0.00) [0.02, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.07 (0.00) [0.07, 0.07]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.15 (0.00) [0.14, 0.16]
subset_accuracy:              0.14 (0.01) [0.11, 0.18]


### 2.1.2. Confounders as features

#### 2.1.1.1. Logistic regression

In [37]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(C_train, Y_train)
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.17 (0.01) [0.15, 0.18]
auprc_weighted:               0.27 (0.01) [0.25, 0.29]
auroc_macro:                  0.49 (0.01) [0.47, 0.52]
auroc_weighted:               0.50 (0.01) [0.48, 0.52]
brier_macro:                  0.11 (0.00) [0.11, 0.12]
brier_weighted:               0.02 (0.00) [0.02, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.08 (0.00) [0.07, 0.08]
f1_micro:                     0.03 (0.01) [0.02, 0.05]
hamming:                      0.15 (0.00) [0.14, 0.16]
subset_accuracy:              0.14 (0.01) [0.11, 0.17]


#### 2.1.1.1. Histogram-based Gradient Boosting

In [38]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(C_train,Y_train)
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.17 (0.01) [0.16, 0.19]
auprc_weighted:               0.26 (0.01) [0.24, 0.29]
auroc_macro:                  0.50 (0.01) [0.47, 0.52]
auroc_weighted:               0.50 (0.01) [0.47, 0.52]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.49, 0.51]
balanced_accuracy_weighted:   0.07 (0.00) [0.07, 0.08]
f1_micro:                     0.16 (0.02) [0.13, 0.19]
hamming:                      0.16 (0.00) [0.16, 0.17]
subset_accuracy:              0.09 (0.02) [0.06, 0.12]


### 2.1.3. PCA-projected data (top-10 components) as features 

#### 2.1.1.1. Logistic regression

In [39]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_pca_train, Y_train)
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.22 (0.01) [0.20, 0.24]
auprc_weighted:               0.31 (0.01) [0.28, 0.33]
auroc_macro:                  0.59 (0.01) [0.57, 0.61]
auroc_weighted:               0.57 (0.01) [0.55, 0.59]
brier_macro:                  0.11 (0.00) [0.11, 0.12]
brier_weighted:               0.02 (0.00) [0.02, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.07 (0.01) [0.05, 0.09]
hamming:                      0.15 (0.00) [0.14, 0.16]
subset_accuracy:              0.13 (0.01) [0.10, 0.16]


#### 2.1.1.1. Histogram-based Gradient Boosting

In [40]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_pca_train,Y_train)
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.20 (0.01) [0.18, 0.22]
auprc_weighted:               0.29 (0.01) [0.26, 0.31]
auroc_macro:                  0.57 (0.01) [0.54, 0.60]
auroc_weighted:               0.55 (0.01) [0.53, 0.57]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.52]
balanced_accuracy_weighted:   0.08 (0.00) [0.07, 0.08]
f1_micro:                     0.20 (0.01) [0.18, 0.23]
hamming:                      0.17 (0.00) [0.16, 0.17]
subset_accuracy:              0.07 (0.01) [0.05, 0.09]


### 2.1.4. Original features (standardized)

#### 2.1.4.1. Logistic regression

In [41]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.19 (0.01) [0.18, 0.21]
auprc_weighted:               0.29 (0.01) [0.27, 0.32]
auroc_macro:                  0.56 (0.01) [0.53, 0.58]
auroc_weighted:               0.55 (0.01) [0.52, 0.57]
brier_macro:                  0.21 (0.00) [0.20, 0.22]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.52 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.29 (0.01) [0.27, 0.31]
hamming:                      0.25 (0.01) [0.24, 0.26]
subset_accuracy:              0.03 (0.01) [0.02, 0.05]


#### 2.1.4.2. SVM

In [42]:
clf = SVC(kernel='rbf', gamma='scale', probability=True, random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, 10)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.24]
auprc_weighted:               0.32 (0.01) [0.31, 0.33]
auroc_macro:                  0.61 (0.01) [0.60, 0.63]
auroc_weighted:               0.60 (0.01) [0.59, 0.61]
brier_macro:                  0.11 (0.00) [0.11, 0.11]
brier_weighted:               0.02 (0.00) [0.02, 0.02]
balanced_accuracy_macro:      0.55 (0.01) [0.54, 0.56]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.09]
f1_micro:                     0.35 (0.01) [0.33, 0.36]
hamming:                      0.21 (0.00) [0.20, 0.21]
subset_accuracy:              0.04 (0.01) [0.03, 0.07]


#### 2.1.4.3. Histogram-based Gradient Boosting

In [13]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train) # n_jobs=-1
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.20, 0.25]
auprc_weighted:               0.33 (0.01) [0.30, 0.35]
auroc_macro:                  0.61 (0.01) [0.58, 0.63]
auroc_weighted:               0.59 (0.01) [0.57, 0.61]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.52]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.17 (0.01) [0.15, 0.20]
hamming:                      0.15 (0.00) [0.14, 0.16]
subset_accuracy:              0.12 (0.01) [0.09, 0.14]


#### 2.1.4.4. MLP

In [15]:
clf = MLPClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train) # n_jobs=-1
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.20 (0.01) [0.18, 0.22]
auprc_weighted:               0.30 (0.01) [0.28, 0.33]
auroc_macro:                  0.57 (0.01) [0.55, 0.60]
auroc_weighted:               0.56 (0.01) [0.53, 0.58]
brier_macro:                  0.16 (0.00) [0.15, 0.16]
brier_weighted:               0.04 (0.00) [0.03, 0.04]
balanced_accuracy_macro:      0.52 (0.01) [0.51, 0.53]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.25 (0.01) [0.23, 0.28]
hamming:                      0.18 (0.00) [0.17, 0.19]
subset_accuracy:              0.06 (0.01) [0.04, 0.08]


## 2.2. ClassifierChain (ordered by frequency) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

In [16]:
by_freq = label_freq_sorted(Y_train)

### 2.2.1. Logistic regression

In [17]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, order=by_freq, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.20 (0.01) [0.18, 0.21]
auprc_weighted:               0.30 (0.01) [0.27, 0.32]
auroc_macro:                  0.57 (0.01) [0.54, 0.59]
auroc_weighted:               0.55 (0.01) [0.53, 0.58]
brier_macro:                  0.18 (0.00) [0.17, 0.19]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.52 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.29 (0.01) [0.27, 0.32]
hamming:                      0.21 (0.01) [0.20, 0.22]
subset_accuracy:              0.06 (0.01) [0.04, 0.07]


### 2.2.2. Histogram-based Gradient Boosting

In [18]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, order=by_freq, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.33 (0.01) [0.30, 0.36]
auroc_macro:                  0.61 (0.01) [0.59, 0.64]
auroc_weighted:               0.60 (0.01) [0.57, 0.62]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.52 (0.00) [0.51, 0.52]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.20 (0.01) [0.17, 0.23]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.14]


## 2.3. ClassifierChain (random order) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

### 2.3.1. Logistic regression

In [19]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.19 (0.01) [0.18, 0.21]
auprc_weighted:               0.29 (0.01) [0.27, 0.32]
auroc_macro:                  0.56 (0.01) [0.54, 0.58]
auroc_weighted:               0.55 (0.01) [0.53, 0.57]
brier_macro:                  0.18 (0.00) [0.17, 0.19]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.52 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.28 (0.01) [0.26, 0.31]
hamming:                      0.21 (0.01) [0.21, 0.22]
subset_accuracy:              0.05 (0.01) [0.03, 0.07]


### 2.3.2. Histogram-based Gradient Boosting

In [20]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.33 (0.01) [0.30, 0.35]
auroc_macro:                  0.61 (0.01) [0.59, 0.63]
auroc_weighted:               0.60 (0.01) [0.58, 0.62]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.51, 0.52]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.19 (0.01) [0.17, 0.22]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.14]


# 3. Use oversampled dataset

In [21]:
# Resample data (undersampling)
X_over, Y_over = generate_oversampled_set(X, Y)
label_stats, mean_ir = generate_label_stats(Y_over, True)
print(f"Mean imbalance ratio: {mean_ir}")
label_stats

Mean imbalance ratio: 1.6092872677464145


Unnamed: 0,Absolute frequency,Relative frequency,Imbalance ratio
Trauma_And_Stress_RelatedDisorders,2580,0.116174,1.923256
DepressiveDisorders,3170,0.142741,1.5653
Attention_Deficit_HyperactivityDisorder,4582,0.206322,1.082933
MotorDisorder,3134,0.14112,1.58328
AutismSpectrumDisorder,3689,0.166111,1.34508
CommunicationDisorder,4431,0.199523,1.119838
OtherDisorders,2320,0.104467,2.138793
SpecificLearningDisorder,4962,0.223433,1.0
Obsessive_Compulsive_And_RelatedDisorders,2668,0.120137,1.85982
Disruptive,2801,0.126126,1.77151


In [22]:
# Split dataset into train and test (holdout) set
X_train, X_test, Y_train, Y_test = train_test_split(X_over, Y_over, test_size=0.25, random_state=0)

# Confounder 
C_train, C_test = C.loc[X_train.index], C.loc[X_test.index]

# PCA features
X_pca = pca_transform(F)
X_pca_train, X_pca_test = X_pca.loc[X_train.index], X_pca.loc[X_test.index]

print(f"Number of samples in training set: {len(X_train)}")
print(f"Number of samples in test set: {len(X_test)}")

Number of samples in training set: 16656
Number of samples in test set: 5552


## 3.1. MultiOutputClassifier
Evaluate classification models wrapped in meta estimator MultiOutputClassifier with respect to multi-label performance metrics

### 3.1.1. Dummy estimators

#### 3.1.1.1. Always zero baseline estimator

In [23]:
clf = DummyClassifier(strategy='constant', constant=0 ,random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.15 (0.00) [0.15, 0.15]
auprc_weighted:               0.16 (0.00) [0.16, 0.17]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.15 (0.00) [0.15, 0.15]
brier_weighted:               0.02 (0.00) [0.02, 0.02]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.07 (0.00) [0.07, 0.07]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.15 (0.00) [0.15, 0.15]
subset_accuracy:              0.01 (0.00) [0.01, 0.02]


#### 3.1.1.2. Label proportion baseline estimator

In [24]:
clf = DummyClassifier(strategy='prior', random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.15 (0.00) [0.15, 0.15]
auprc_weighted:               0.16 (0.00) [0.16, 0.17]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.02 (0.00) [0.02, 0.02]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.07 (0.00) [0.07, 0.07]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.15 (0.00) [0.15, 0.15]
subset_accuracy:              0.01 (0.00) [0.01, 0.02]


### 3.1.2. Confounders as features

#### 3.1.1.1. Logistic regression

In [25]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(C_train, Y_train)
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.20 (0.00) [0.19, 0.20]
auprc_weighted:               0.21 (0.00) [0.20, 0.21]
auroc_macro:                  0.58 (0.00) [0.58, 0.59]
auroc_weighted:               0.57 (0.00) [0.57, 0.58]
brier_macro:                  0.12 (0.00) [0.12, 0.13]
brier_weighted:               0.02 (0.00) [0.02, 0.02]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.07 (0.00) [0.07, 0.07]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.15 (0.00) [0.15, 0.15]
subset_accuracy:              0.01 (0.00) [0.01, 0.02]


#### 3.1.1.1. Histogram-based Gradient Boosting

In [26]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(C_train,Y_train)
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.84 (0.00) [0.84, 0.85]
auprc_weighted:               0.83 (0.00) [0.82, 0.84]
auroc_macro:                  0.95 (0.00) [0.94, 0.95]
auroc_weighted:               0.94 (0.00) [0.94, 0.94]
brier_macro:                  0.05 (0.00) [0.05, 0.05]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.84 (0.00) [0.83, 0.84]
balanced_accuracy_weighted:   0.12 (0.00) [0.12, 0.12]
f1_micro:                     0.75 (0.01) [0.74, 0.76]
hamming:                      0.06 (0.00) [0.06, 0.07]
subset_accuracy:              0.65 (0.01) [0.64, 0.66]


### 3.1.3. PCA-projected data (top-10 components) as features 

#### 3.1.1.1. Logistic regression

In [27]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_pca_train, Y_train)
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.28 (0.01) [0.27, 0.29]
auprc_weighted:               0.28 (0.01) [0.27, 0.29]
auroc_macro:                  0.65 (0.00) [0.65, 0.66]
auroc_weighted:               0.64 (0.00) [0.63, 0.65]
brier_macro:                  0.12 (0.00) [0.12, 0.12]
brier_weighted:               0.02 (0.00) [0.02, 0.02]
balanced_accuracy_macro:      0.51 (0.00) [0.51, 0.51]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.04 (0.00) [0.04, 0.05]
hamming:                      0.15 (0.00) [0.15, 0.15]
subset_accuracy:              0.02 (0.00) [0.02, 0.02]


#### 3.1.1.1. Histogram-based Gradient Boosting

In [29]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_pca_train,Y_train) # n_jobs=-1
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.97 (0.00) [0.97, 0.97]
auprc_weighted:               0.97 (0.00) [0.96, 0.97]
auroc_macro:                  0.99 (0.00) [0.99, 0.99]
auroc_weighted:               0.99 (0.00) [0.99, 0.99]
brier_macro:                  0.02 (0.00) [0.02, 0.02]
brier_weighted:               0.00 (0.00) [0.00, 0.00]
balanced_accuracy_macro:      0.94 (0.00) [0.93, 0.94]
balanced_accuracy_weighted:   0.14 (0.00) [0.14, 0.14]
f1_micro:                     0.92 (0.00) [0.91, 0.92]
hamming:                      0.02 (0.00) [0.02, 0.02]
subset_accuracy:              0.83 (0.00) [0.82, 0.84]


### 3.1.4. Original features (standardized)

#### 3.1.4.1. Logistic regression

In [30]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train) # n_jobs=-1
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.88 (0.00) [0.87, 0.89]
auprc_weighted:               0.87 (0.00) [0.86, 0.88]
auroc_macro:                  0.96 (0.00) [0.96, 0.96]
auroc_weighted:               0.96 (0.00) [0.95, 0.96]
brier_macro:                  0.03 (0.00) [0.03, 0.03]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.92 (0.00) [0.91, 0.92]
balanced_accuracy_weighted:   0.14 (0.00) [0.14, 0.14]
f1_micro:                     0.86 (0.00) [0.86, 0.87]
hamming:                      0.04 (0.00) [0.04, 0.04]
subset_accuracy:              0.76 (0.01) [0.75, 0.77]


#### 3.1.4.2. SVM

In [None]:
clf = SVC(class_weight='balanced', kernel='rbf', gamma='scale', probability=True, random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, 10)

#### 3.1.4.3. Histogram-based Gradient Boosting

In [None]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train) # n_jobs=-1
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.20, 0.25]
auprc_weighted:               0.33 (0.01) [0.30, 0.35]
auroc_macro:                  0.61 (0.01) [0.58, 0.63]
auroc_weighted:               0.59 (0.01) [0.57, 0.61]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.52]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.17 (0.01) [0.15, 0.20]
hamming:                      0.15 (0.00) [0.14, 0.16]
subset_accuracy:              0.12 (0.01) [0.09, 0.14]


#### 3.1.4.4. MLP

In [None]:
clf = MLPClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train) # n_jobs=-1
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.20 (0.01) [0.18, 0.22]
auprc_weighted:               0.30 (0.01) [0.28, 0.33]
auroc_macro:                  0.57 (0.01) [0.55, 0.60]
auroc_weighted:               0.56 (0.01) [0.53, 0.58]
brier_macro:                  0.16 (0.00) [0.15, 0.16]
brier_weighted:               0.04 (0.00) [0.03, 0.04]
balanced_accuracy_macro:      0.52 (0.01) [0.51, 0.53]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.25 (0.01) [0.23, 0.28]
hamming:                      0.18 (0.00) [0.17, 0.19]
subset_accuracy:              0.06 (0.01) [0.04, 0.08]


## 3.2. ClassifierChain (ordered by frequency) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

In [None]:
by_freq = label_freq_sorted(Y_train)

### 3.2.1. Logistic regression

In [None]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, order=by_freq, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.20 (0.01) [0.18, 0.21]
auprc_weighted:               0.30 (0.01) [0.27, 0.32]
auroc_macro:                  0.57 (0.01) [0.54, 0.59]
auroc_weighted:               0.55 (0.01) [0.53, 0.58]
brier_macro:                  0.18 (0.00) [0.17, 0.19]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.52 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.29 (0.01) [0.27, 0.32]
hamming:                      0.21 (0.01) [0.20, 0.22]
subset_accuracy:              0.06 (0.01) [0.04, 0.07]


### 3.2.2. Histogram-based Gradient Boosting

In [None]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, order=by_freq, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

## 3.3. ClassifierChain (random order) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

### 3.3.1. Logistic regression

In [None]:
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.20, 0.22]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.57 (0.01) [0.55, 0.59]
auroc_weighted:               0.56 (0.01) [0.54, 0.58]
brier_macro:                  0.21 (0.00) [0.21, 0.22]
brier_weighted:               0.05 (0.00) [0.04, 0.05]
balanced_accuracy_macro:      0.53 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.34 (0.01) [0.32, 0.36]
hamming:                      0.26 (0.00) [0.26, 0.28]
subset_accuracy:              0.02 (0.01) [0.01, 0.03]


### 3.3.2. Histogram-based Gradient Boosting

In [None]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)