In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from utils.utils import load_data, remove_zero_features, load_confounders, standardize, label_freq_sorted, pca_transform
from utils.utils import generate_oversampled_set, generate_undersampled_set, generate_label_stats
from utils.utils import compute_scores

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain

from sklearn.model_selection import train_test_split

In [None]:
# Ask joblib for the number of physical CPU cores and report it.
# NOTE(review): N_CORES is not referenced by any cell visible here (the parallel
# estimators below pass n_jobs=-1 instead) — confirm whether it is still needed.
N_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {N_CORES}")

Number of physical cores: 8


---

In [None]:
# Load data for the classification task.
# load_data returns three objects, unpacked here as subject-level data, the feature
# table and the diagnosis labels (contents presumed from the names — TODO confirm).
subject_data, features, diagnoses = load_data('classification')

In [None]:
# Remove zero features.
# The first column of `features` is dropped by position before filtering.
# NOTE(review): that column is presumably a subject ID — confirm against load_data.
F = remove_zero_features(features.iloc[:,1:])

In [None]:
# Load confounders from the subject-level data.
# C is later row-indexed with the train/test indices to build confounder-only feature sets.
C = load_confounders(subject_data)

In [None]:
# Standardize the filtered features and report the dimensions of the result.
# NOTE(review): this runs before the train/test split; if standardize() estimates its
# statistics from the data it receives, held-out rows contribute to them — confirm this is intended.
X = standardize(F)
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")

Number of samples: 2815
Number of features: 922


In [None]:
# Remove ID column: keep every column of `diagnoses` except the first.
# The remaining columns are counted as the labels.
Y = diagnoses.iloc[:,1:]
print(f"Number of labels: {Y.shape[1]}")

Number of labels: 13


In [None]:
# Passed as the fourth argument of compute_scores in the model cells below.
# Presumably the number of bootstrap resamples behind the reported SE and
# confidence intervals — TODO confirm against utils.compute_scores.
boot_iter = 100

# 2. Use undersampled dataset

In [None]:
# Build the undersampled features/labels from the standardized features X and labels Y.
X_under, Y_under = generate_undersampled_set(X, Y)
# The second positional argument is True; its meaning is defined in utils — TODO confirm.
label_stats, mean_ir = generate_label_stats(Y_under, True)
print(f"Mean imbalance ratio: {mean_ir}")
# Bare last expression so the notebook renders label_stats as the cell output.
label_stats

In [None]:
# Split dataset into train and test (holdout) set: 25% held out, fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X_under, Y_under, test_size=0.25, random_state=0)

# Confounders for the same rows, selected by index label so they stay aligned with X_train / X_test.
C_train, C_test = C.loc[X_train.index], C.loc[X_test.index]

# PCA features, aligned to the split through the same index labels.
# NOTE(review): pca_transform receives the full, unsplit F (the output of remove_zero_features,
# not the standardized X). If it fits the projection on its input, held-out rows influence
# the components — confirm whether it should be fit on training rows only.
X_pca = pca_transform(F)
X_pca_train, X_pca_test = X_pca.loc[X_train.index], X_pca.loc[X_test.index]

print(f"Number of samples in training set: {len(X_train)}")
print(f"Number of samples in test set: {len(X_test)}")

## 2.1. MultiOutputClassifier
Evaluate classification models wrapped in the `MultiOutputClassifier` meta-estimator (one independent classifier per label) with respect to multi-label performance metrics.

### 2.1.1. Dummy estimators

#### 2.1.1.1. Always zero baseline estimator

In [None]:
# Always-zero baseline: a constant classifier fitted once per label.
clf = DummyClassifier(
    strategy='constant',
    constant=0,
    random_state=0,
)
meta_clf = MultiOutputClassifier(clf)
meta_clf.fit(X_train, Y_train)
# Score on the holdout set; compute_scores prints the metric summary.
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.10 (0.01) [0.09, 0.11]
auprc_weighted:               0.13 (0.01) [0.11, 0.15]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.10 (0.01) [0.09, 0.11]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.05]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.10 (0.01) [0.09, 0.11]
subset_accuracy:              0.34 (0.03) [0.28, 0.41]


#### 2.1.1.2. Label proportion baseline estimator

In [None]:
# Label-proportion baseline: DummyClassifier with the 'prior' strategy, one per label.
clf = DummyClassifier(
    strategy='prior',
    random_state=0,
)
meta_clf = MultiOutputClassifier(clf)
meta_clf.fit(X_train, Y_train)
# Score on the holdout set; compute_scores prints the metric summary.
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.10 (0.01) [0.09, 0.11]
auprc_weighted:               0.13 (0.01) [0.11, 0.15]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.09 (0.00) [0.08, 0.10]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.05]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.10 (0.01) [0.09, 0.11]
subset_accuracy:              0.34 (0.03) [0.28, 0.41]


### 2.1.2. Confounders as features

#### 2.1.2.1. Logistic regression

In [None]:
# Logistic regression per label, trained on the confounders only.
clf = LogisticRegression(
    max_iter=10000,
    random_state=0,
)
meta_clf = MultiOutputClassifier(clf)
meta_clf.fit(C_train, Y_train)
# Evaluate on the confounders of the holdout rows.
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.14 (0.01) [0.12, 0.17]
auprc_weighted:               0.18 (0.02) [0.15, 0.22]
auroc_macro:                  0.53 (0.02) [0.49, 0.56]
auroc_weighted:               0.54 (0.02) [0.51, 0.58]
brier_macro:                  0.09 (0.00) [0.08, 0.10]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.05]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.10 (0.01) [0.09, 0.11]
subset_accuracy:              0.34 (0.03) [0.28, 0.41]


#### 2.1.2.2. Histogram-based Gradient Boosting

In [None]:
# Histogram-based gradient boosting per label, trained on the confounders only.
clf = HistGradientBoostingClassifier(random_state=0)
# n_jobs=-1: the per-label fits are dispatched in parallel.
meta_clf = MultiOutputClassifier(clf, n_jobs=-1)
meta_clf.fit(C_train, Y_train)
# Evaluate on the confounders of the holdout rows.
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.13 (0.01) [0.11, 0.16]
auprc_weighted:               0.17 (0.02) [0.13, 0.21]
auroc_macro:                  0.50 (0.02) [0.47, 0.53]
auroc_weighted:               0.52 (0.02) [0.48, 0.56]
brier_macro:                  0.11 (0.01) [0.10, 0.12]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.50 (0.00) [0.49, 0.51]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.05]
f1_micro:                     0.09 (0.02) [0.06, 0.12]
hamming:                      0.13 (0.01) [0.11, 0.14]
subset_accuracy:              0.20 (0.02) [0.17, 0.25]


### 2.1.3. PCA-projected data (top-10 components) as features 

#### 2.1.3.1. Logistic regression

In [None]:
# Logistic regression per label, trained on the PCA-projected features.
clf = LogisticRegression(
    max_iter=10000,
    random_state=0,
)
meta_clf = MultiOutputClassifier(clf)
meta_clf.fit(X_pca_train, Y_train)
# Evaluate on the PCA-projected holdout rows.
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.16 (0.02) [0.13, 0.19]
auprc_weighted:               0.19 (0.02) [0.16, 0.23]
auroc_macro:                  0.57 (0.02) [0.52, 0.62]
auroc_weighted:               0.58 (0.02) [0.54, 0.62]
brier_macro:                  0.09 (0.00) [0.08, 0.10]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.05]
f1_micro:                     0.01 (0.01) [0.00, 0.03]
hamming:                      0.10 (0.01) [0.09, 0.11]
subset_accuracy:              0.34 (0.03) [0.28, 0.40]


#### 2.1.3.2. Histogram-based Gradient Boosting

In [None]:
# Histogram-based gradient boosting per label, trained on the PCA-projected features.
clf = HistGradientBoostingClassifier(random_state=0)
# n_jobs=-1: the per-label fits are dispatched in parallel.
meta_clf = MultiOutputClassifier(clf, n_jobs=-1)
meta_clf.fit(X_pca_train, Y_train)
# Evaluate on the PCA-projected holdout rows.
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.14 (0.01) [0.12, 0.17]
auprc_weighted:               0.17 (0.02) [0.14, 0.20]
auroc_macro:                  0.54 (0.02) [0.49, 0.59]
auroc_weighted:               0.54 (0.02) [0.50, 0.58]
brier_macro:                  0.10 (0.01) [0.09, 0.11]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.50 (0.00) [0.49, 0.51]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.05]
f1_micro:                     0.08 (0.02) [0.05, 0.11]
hamming:                      0.12 (0.01) [0.11, 0.13]
subset_accuracy:              0.25 (0.03) [0.20, 0.30]


### 2.1.4. Original features (standardized)

#### 2.1.4.1. Logistic regression

In [None]:
# Logistic regression per label, trained on the standardized original features.
clf = LogisticRegression(
    max_iter=10000,
    random_state=0,
)
meta_clf = MultiOutputClassifier(clf)
meta_clf.fit(X_train, Y_train)
# Evaluate on the holdout set.
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.17 (0.01) [0.14, 0.19]
auprc_weighted:               0.20 (0.02) [0.17, 0.24]
auroc_macro:                  0.55 (0.02) [0.51, 0.60]
auroc_weighted:               0.56 (0.02) [0.53, 0.59]
brier_macro:                  0.14 (0.01) [0.13, 0.15]
brier_weighted:               0.02 (0.00) [0.02, 0.02]
balanced_accuracy_macro:      0.53 (0.01) [0.51, 0.55]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.06]
f1_micro:                     0.19 (0.02) [0.16, 0.22]
hamming:                      0.17 (0.01) [0.15, 0.18]
subset_accuracy:              0.13 (0.02) [0.10, 0.18]


#### 2.1.4.2. SVM

In [None]:
# RBF-kernel SVM per label, trained on the standardized original features.
# probability=True makes SVC expose predict_proba, which the probability-based
# scores (AUROC, AUPRC, Brier) presumably rely on — TODO confirm against compute_scores.
clf = SVC(kernel='rbf', gamma='scale', probability=True, random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
# Use the shared boot_iter instead of a hard-coded 10, so the SE and 95% confidence
# intervals rest on the same number of resamples as every other model in this notebook.
# The scores recorded under this cell came from the 10-iteration run; re-run to refresh them.
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.19 (0.02) [0.15, 0.21]
auprc_weighted:               0.23 (0.03) [0.18, 0.27]
auroc_macro:                  0.55 (0.02) [0.51, 0.58]
auroc_weighted:               0.55 (0.02) [0.52, 0.57]
brier_macro:                  0.09 (0.00) [0.08, 0.09]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.05]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.10 (0.00) [0.09, 0.11]
subset_accuracy:              0.33 (0.03) [0.28, 0.37]


#### 2.1.4.3. Histogram-based Gradient Boosting

In [None]:
# Histogram-based gradient boosting per label, trained on the standardized original features.
clf = HistGradientBoostingClassifier(random_state=0)
# n_jobs=-1: the per-label fits are dispatched in parallel.
meta_clf = MultiOutputClassifier(clf, n_jobs=-1)
meta_clf.fit(X_train, Y_train)
# Evaluate on the holdout set.
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.17 (0.02) [0.14, 0.20]
auprc_weighted:               0.20 (0.02) [0.17, 0.24]
auroc_macro:                  0.59 (0.02) [0.55, 0.63]
auroc_weighted:               0.59 (0.02) [0.55, 0.62]
brier_macro:                  0.10 (0.01) [0.09, 0.11]
brier_weighted:               0.01 (0.00) [0.01, 0.01]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.05 (0.00) [0.05, 0.05]
f1_micro:                     0.03 (0.01) [0.01, 0.06]
hamming:                      0.10 (0.01) [0.09, 0.12]
subset_accuracy:              0.32 (0.03) [0.26, 0.37]


#### 2.1.4.4. MLP

In [None]:
# Multi-layer perceptron per label, trained on the standardized original features.
clf = MLPClassifier(random_state=0)
# n_jobs is left at its default here: the original cell carried a commented-out
# n_jobs=-1 with no recorded reason — TODO confirm whether parallel fitting can be enabled.
meta_clf = MultiOutputClassifier(clf)
meta_clf.fit(X_train, Y_train)
# Evaluate on the holdout set.
compute_scores(meta_clf, X_test, Y_test, boot_iter)



## 2.2. ClassifierChain (ordered by frequency) 
Evaluate classification models wrapped in the `ClassifierChain` meta-estimator, with the chain ordered by label frequency, with respect to multi-label performance metrics.

In [None]:
# Label ordering derived from the training labels; passed as `order` to ClassifierChain below.
# NOTE(review): ClassifierChain expects `order` to be a sequence of column indices —
# presumably that is what label_freq_sorted returns; confirm, including the sort direction.
by_freq = label_freq_sorted(Y_train)

### 2.2.1. Logistic regression

In [None]:
# Logistic regression chained over the labels in the by_freq order.
clf = LogisticRegression(
    max_iter=10000,
    random_state=0,
)
meta_clf = ClassifierChain(
    clf,
    order=by_freq,
    random_state=0,
)
meta_clf.fit(X_train, Y_train)
# chain=True is forwarded to compute_scores for chain estimators (handling defined in utils).
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.20 (0.01) [0.18, 0.21]
auprc_weighted:               0.30 (0.01) [0.27, 0.32]
auroc_macro:                  0.57 (0.01) [0.54, 0.59]
auroc_weighted:               0.55 (0.01) [0.53, 0.58]
brier_macro:                  0.18 (0.00) [0.17, 0.19]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.52 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.29 (0.01) [0.27, 0.32]
hamming:                      0.21 (0.01) [0.20, 0.22]
subset_accuracy:              0.06 (0.01) [0.04, 0.07]


### 2.2.2. Histogram-based Gradient Boosting

In [None]:
# Histogram-based gradient boosting chained over the labels in the by_freq order.
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(
    clf,
    order=by_freq,
    random_state=0,
)
meta_clf.fit(X_train, Y_train)
# chain=True is forwarded to compute_scores for chain estimators (handling defined in utils).
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.33 (0.01) [0.30, 0.36]
auroc_macro:                  0.61 (0.01) [0.59, 0.64]
auroc_weighted:               0.60 (0.01) [0.57, 0.62]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.52 (0.00) [0.51, 0.52]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.20 (0.01) [0.17, 0.23]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.14]


## 2.3. ClassifierChain (random order) 
Evaluate classification models wrapped in the `ClassifierChain` meta-estimator, with the chain in random label order, with respect to multi-label performance metrics.

### 2.3.1. Logistic regression

In [None]:
# Logistic regression chained over the labels in a random order.
# order='random' is required for that: with the default order=None, ClassifierChain
# chains the labels in their column order, so this section would not test a random chain.
# random_state=0 keeps the drawn permutation reproducible.
# Scores recorded under this cell before the fix reflect column order; re-run to refresh them.
clf = LogisticRegression(max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, order='random', random_state=0).fit(X_train, Y_train)
# chain=True is forwarded to compute_scores for chain estimators (handling defined in utils).
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.19 (0.01) [0.18, 0.21]
auprc_weighted:               0.29 (0.01) [0.27, 0.32]
auroc_macro:                  0.56 (0.01) [0.54, 0.58]
auroc_weighted:               0.55 (0.01) [0.53, 0.57]
brier_macro:                  0.18 (0.00) [0.17, 0.19]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.52 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.28 (0.01) [0.26, 0.31]
hamming:                      0.21 (0.01) [0.21, 0.22]
subset_accuracy:              0.05 (0.01) [0.03, 0.07]


### 2.3.2. Histogram-based Gradient Boosting

In [None]:
# Histogram-based gradient boosting chained over the labels in a random order.
# order='random' is required for that: with the default order=None, ClassifierChain
# chains the labels in their column order, so this section would not test a random chain.
# random_state=0 keeps the drawn permutation reproducible.
# Scores recorded under this cell before the fix reflect column order; re-run to refresh them.
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, order='random', random_state=0).fit(X_train, Y_train)
# chain=True is forwarded to compute_scores for chain estimators (handling defined in utils).
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.33 (0.01) [0.30, 0.35]
auroc_macro:                  0.61 (0.01) [0.59, 0.63]
auroc_weighted:               0.60 (0.01) [0.58, 0.62]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.51, 0.52]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.19 (0.01) [0.17, 0.22]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.14]
