In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from utils.utils import load_data, remove_zero_features, load_confounders, standardize, label_freq_sorted, pca_transform
from utils.utils import generate_oversampled_set, generate_undersampled_set, generate_label_stats
from utils.utils import compute_scores

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain

from sklearn.model_selection import train_test_split

In [2]:
N_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {N_CORES}")

Number of physical cores: 8


---

In [3]:
# Load data for classification task
subject_data, features, diagnoses = load_data('classification')

In [4]:
# Remove zero features
F = remove_zero_features(features.iloc[:,1:])

In [5]:
# Load confounders
C = load_confounders(subject_data)

In [6]:
# Standardize
X = standardize(F)
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")

Number of samples: 2815
Number of features: 922


In [7]:
# Remove ID column
Y = diagnoses.iloc[:,1:]
print(f"Number of labels: {Y.shape[1]}")

Number of labels: 13


In [8]:
boot_iter = 100

---

# 1. Use dataset with original label distribution (no resampling)

In [9]:
label_stats, mean_ir = generate_label_stats(Y, True)
print(f"Mean imbalance ratio: {mean_ir}")
label_stats

Mean imbalance ratio: 7.995236107963529


Unnamed: 0,Absolute frequency,Relative frequency,Imbalance ratio
Trauma_And_Stress_RelatedDisorders,152,0.053996,11.519737
DepressiveDisorders,300,0.106572,5.836667
Attention_Deficit_HyperactivityDisorder,1751,0.622025,1.0
MotorDisorder,229,0.08135,7.646288
AutismSpectrumDisorder,398,0.141385,4.399497
CommunicationDisorder,493,0.175133,3.551724
OtherDisorders,135,0.047957,12.97037
SpecificLearningDisorder,761,0.270337,2.30092
Obsessive_Compulsive_And_RelatedDisorders,137,0.048668,12.781022
Disruptive,436,0.154885,4.016055


In [10]:
# Split dataset into train and test (holdout) set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

# Confounder 
C_train, C_test = C.loc[X_train.index], C.loc[X_test.index]

# PCA features
X_pca = pca_transform(F)
X_pca_train, X_pca_test = X_pca.loc[X_train.index], X_pca.loc[X_test.index]

print(f"Number of samples in training set: {len(X_train)}")
print(f"Number of samples in test set: {len(X_test)}")

Number of samples in training set: 2111
Number of samples in test set: 704


## 1.1. MultiOutputClassifier
Evaluate classification models wrapped in meta estimator MultiOutputClassifier with respect to multi-label performance metrics

### 1.1.1. Dummy estimators

#### 1.1.1.1. Always zero baseline estimator

In [11]:
clf = DummyClassifier(strategy='constant', constant=0 ,random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.17 (0.00) [0.16, 0.17]
auprc_weighted:               0.31 (0.01) [0.30, 0.32]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.17 (0.00) [0.16, 0.17]
brier_weighted:               0.05 (0.00) [0.05, 0.05]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.00 (0.00) [0.00, 0.00]
hamming:                      0.17 (0.00) [0.16, 0.17]
subset_accuracy:              0.10 (0.01) [0.08, 0.12]


#### 1.1.1.2. Label proportion baseline estimator

In [12]:
clf = DummyClassifier(strategy='prior', random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.17 (0.00) [0.16, 0.17]
auprc_weighted:               0.31 (0.01) [0.30, 0.32]
auroc_macro:                  0.50 (0.00) [0.50, 0.50]
auroc_weighted:               0.50 (0.00) [0.50, 0.50]
brier_macro:                  0.12 (0.00) [0.11, 0.12]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.50]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.08]
f1_micro:                     0.38 (0.01) [0.36, 0.40]
hamming:                      0.15 (0.00) [0.14, 0.16]
subset_accuracy:              0.10 (0.01) [0.08, 0.12]


### 1.1.2. Confounders as features

#### 1.1.2.1. Logistic regression

In [13]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(C_train, Y_train)
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.18 (0.01) [0.17, 0.19]
auprc_weighted:               0.32 (0.01) [0.31, 0.34]
auroc_macro:                  0.51 (0.01) [0.49, 0.53]
auroc_weighted:               0.51 (0.01) [0.49, 0.53]
brier_macro:                  0.25 (0.00) [0.25, 0.25]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.51 (0.01) [0.49, 0.53]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.09]
f1_micro:                     0.25 (0.01) [0.24, 0.27]
hamming:                      0.47 (0.00) [0.46, 0.48]
subset_accuracy:              0.00 (0.00) [0.00, 0.00]


#### 1.1.2.2. Histogram-based Gradient Boosting

In [14]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(C_train,Y_train)
compute_scores(meta_clf, C_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.18 (0.01) [0.17, 0.20]
auprc_weighted:               0.32 (0.01) [0.30, 0.34]
auroc_macro:                  0.49 (0.01) [0.47, 0.51]
auroc_weighted:               0.50 (0.01) [0.49, 0.52]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.08 (0.00) [0.08, 0.09]
f1_micro:                     0.34 (0.01) [0.32, 0.36]
hamming:                      0.16 (0.00) [0.16, 0.17]
subset_accuracy:              0.08 (0.01) [0.06, 0.09]


### 1.1.3. PCA-projected data (top-10 components) as features 

#### 1.1.3.1. Logistic regression

In [15]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_pca_train, Y_train)
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.37 (0.01) [0.35, 0.40]
auroc_macro:                  0.60 (0.01) [0.58, 0.61]
auroc_weighted:               0.58 (0.01) [0.56, 0.60]
brier_macro:                  0.24 (0.00) [0.23, 0.24]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.57 (0.01) [0.55, 0.59]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.10]
f1_micro:                     0.31 (0.01) [0.30, 0.33]
hamming:                      0.44 (0.00) [0.42, 0.44]
subset_accuracy:              0.00 (0.00) [0.00, 0.00]


#### 1.1.3.2. Histogram-based Gradient Boosting

In [16]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_pca_train,Y_train)
compute_scores(meta_clf, X_pca_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.19, 0.23]
auprc_weighted:               0.35 (0.01) [0.33, 0.37]
auroc_macro:                  0.55 (0.01) [0.53, 0.57]
auroc_weighted:               0.54 (0.01) [0.53, 0.56]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.09 (0.00) [0.08, 0.09]
f1_micro:                     0.37 (0.01) [0.35, 0.39]
hamming:                      0.16 (0.00) [0.16, 0.17]
subset_accuracy:              0.08 (0.01) [0.07, 0.10]


### 1.1.4. Original features (standardized)

#### 1.1.4.1. Logistic regression

In [17]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.19, 0.22]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.57 (0.01) [0.55, 0.59]
auroc_weighted:               0.56 (0.01) [0.54, 0.58]
brier_macro:                  0.21 (0.00) [0.21, 0.22]
brier_weighted:               0.05 (0.00) [0.04, 0.05]
balanced_accuracy_macro:      0.53 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.33 (0.01) [0.31, 0.35]
hamming:                      0.27 (0.00) [0.26, 0.28]
subset_accuracy:              0.01 (0.00) [0.01, 0.02]


#### 1.1.4.2. SVM

In [18]:
clf = SVC(class_weight='balanced', kernel='rbf', gamma='scale', probability=True, random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.22, 0.25]
auprc_weighted:               0.39 (0.01) [0.36, 0.41]
auroc_macro:                  0.60 (0.01) [0.58, 0.63]
auroc_weighted:               0.60 (0.01) [0.58, 0.63]
brier_macro:                  0.11 (0.00) [0.11, 0.12]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.54 (0.01) [0.53, 0.55]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.10]
f1_micro:                     0.38 (0.01) [0.36, 0.40]
hamming:                      0.23 (0.00) [0.22, 0.24]
subset_accuracy:              0.04 (0.01) [0.03, 0.06]


#### 1.1.4.3. Histogram-based Gradient Boosting

In [26]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.36 (0.01) [0.34, 0.39]
auroc_macro:                  0.59 (0.01) [0.57, 0.62]
auroc_weighted:               0.58 (0.01) [0.55, 0.60]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.37 (0.01) [0.36, 0.39]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.13]


#### 1.1.4.4. MLP

In [20]:
clf = MLPClassifier(random_state=0)
meta_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.22 (0.01) [0.20, 0.23]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.58 (0.01) [0.55, 0.60]
auroc_weighted:               0.57 (0.01) [0.55, 0.59]
brier_macro:                  0.16 (0.00) [0.15, 0.16]
brier_weighted:               0.04 (0.00) [0.04, 0.04]
balanced_accuracy_macro:      0.52 (0.00) [0.52, 0.53]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.37 (0.01) [0.35, 0.39]
hamming:                      0.18 (0.00) [0.18, 0.19]
subset_accuracy:              0.08 (0.01) [0.07, 0.10]


## 1.2. ClassifierChain (ordered by frequency) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

In [21]:
by_freq = label_freq_sorted(Y_train)

### 1.2.1. Logistic regression

In [22]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, order=by_freq, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.19, 0.22]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.56 (0.01) [0.55, 0.59]
auroc_weighted:               0.56 (0.01) [0.54, 0.58]
brier_macro:                  0.21 (0.00) [0.20, 0.22]
brier_weighted:               0.04 (0.00) [0.04, 0.05]
balanced_accuracy_macro:      0.53 (0.01) [0.52, 0.55]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.34 (0.01) [0.32, 0.36]
hamming:                      0.26 (0.00) [0.26, 0.27]
subset_accuracy:              0.03 (0.01) [0.01, 0.04]


### 1.2.2. Histogram-based Gradient Boosting

In [23]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, order=by_freq, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.26]
auprc_weighted:               0.36 (0.01) [0.34, 0.39]
auroc_macro:                  0.59 (0.01) [0.57, 0.61]
auroc_weighted:               0.57 (0.01) [0.55, 0.60]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.51 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.37 (0.01) [0.36, 0.39]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.13]


## 1.3. ClassifierChain (random order) 
Evaluate classification models wrapped in meta estimator ClassifierChain with respect to multi-label performance metrics

### 1.3.1. Logistic regression

In [24]:
clf = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=0)
meta_clf = ClassifierChain(clf, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.21 (0.01) [0.20, 0.22]
auprc_weighted:               0.36 (0.01) [0.34, 0.38]
auroc_macro:                  0.57 (0.01) [0.55, 0.59]
auroc_weighted:               0.56 (0.01) [0.54, 0.58]
brier_macro:                  0.21 (0.00) [0.21, 0.22]
brier_weighted:               0.05 (0.00) [0.04, 0.05]
balanced_accuracy_macro:      0.53 (0.01) [0.51, 0.54]
balanced_accuracy_weighted:   0.09 (0.00) [0.09, 0.09]
f1_micro:                     0.34 (0.01) [0.32, 0.36]
hamming:                      0.26 (0.00) [0.26, 0.28]
subset_accuracy:              0.02 (0.01) [0.01, 0.03]


### 1.3.2. Histogram-based Gradient Boosting

In [25]:
clf = HistGradientBoostingClassifier(random_state=0)
meta_clf = ClassifierChain(clf, random_state=0).fit(X_train, Y_train)
compute_scores(meta_clf, X_test, Y_test, boot_iter, chain=True)

Mean scores with SE and 95% confidence intervals:

auprc_macro:                  0.23 (0.01) [0.21, 0.25]
auprc_weighted:               0.36 (0.01) [0.34, 0.39]
auroc_macro:                  0.59 (0.01) [0.57, 0.62]
auroc_weighted:               0.57 (0.01) [0.55, 0.60]
brier_macro:                  0.13 (0.00) [0.12, 0.13]
brier_weighted:               0.03 (0.00) [0.03, 0.03]
balanced_accuracy_macro:      0.50 (0.00) [0.50, 0.51]
balanced_accuracy_weighted:   0.09 (0.00) [0.08, 0.09]
f1_micro:                     0.35 (0.01) [0.34, 0.38]
hamming:                      0.15 (0.00) [0.15, 0.16]
subset_accuracy:              0.11 (0.01) [0.09, 0.14]
