In [1]:
import pandas as pd
import numpy as np

import autorootcwd  # noqa
from hamilton import driver

from src.data import data_pipeline
from src.data.pydantic_models import BearingDataset
from functools import partial
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import (
    RandomizedSearchCV,
    GroupKFold,
    cross_validate,
    cross_val_predict,
)
from scipy.stats import randint, loguniform
import logging
import random
from src.utils.add_signal_data import add_signal_data_cwru
from typing import Literal, Tuple
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score

In [2]:
from src.data.data_splits import (split_cwru_proposed, cwru_optimization_split)
from itertools import combinations, product
import random

In [3]:
def split_cwru_proposed(
    df: pd.DataFrame,
    normal_train_side: Literal["DE", "FE"],
    random_state: int = 42,
    HP0_ontest: bool = False,
    HP0_ontrain: bool = False,
    include_normal_config_on_test: bool = False,
    test_size: float = 1 / 3,
) -> Tuple[pd.DataFrame, None, pd.DataFrame]:
    """Prepare CWRU dataset for model training and testing.

    - LT Split (Fault location and type): stratified, (Size): random
    """
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()

    df["fault_type"] = df["fault_type"].str.replace("OR@3", "OR")
    df["fault_type"] = df["fault_type"].str.replace("OR@6", "OR")
    df["group"] = (
        df["fault_location"].astype(str)
        + "_"
        + df["fault_type"].astype(str)
        + "_"
        + df["fault_size"].astype(str)
    )

    # We will not use all 411 signals from CWRU, this CSV filter to only used (144 signals)
    df_filtered_ids = pd.read_csv("data/splits/cwru/filtered_cwru.csv")
    df = df[df["waveform_id"].isin(df_filtered_ids["waveform_id"].tolist())]

    test_ids_optimization = []

    for optimization_fold in range(1, 4):
        for train_side in ["DE", "FE"]:
            test_ids = pd.read_csv(
                f"data/splits/cwru/fold_{optimization_fold}/test_{train_side}.csv"
            )["waveform_id"].tolist()
            test_ids_optimization.append(test_ids)

    # For each location-type pair, split into train and test by randomly selecting a size
    for enum, (_group, df_group) in enumerate(df.groupby(["fault_location", "fault_type"])):
        gss = GroupShuffleSplit(
            n_splits=1, test_size=test_size, random_state=random_state + enum * 1000
        )
        df_train_idx, df_test_idx = next(gss.split(df_group, groups=df_group.group))
        df_train = pd.concat([df_train, df_group.iloc[df_train_idx]], ignore_index=True)
        df_test = pd.concat([df_test, df_group.iloc[df_test_idx]], ignore_index=True)

        print(df_train.shape, df_test.shape)

    if normal_train_side == "DE":
        df_train = df_train[
            ~(
                (df_train["signal_location"] == "FE")
                & (df_train["fault_location"] == "DE")
            )
        ]
        df_test = df_test[
            ~(
                (df_test["signal_location"] == "DE")
                & (df_test["fault_location"] == "FE")
            )
        ]
    elif normal_train_side == "FE":
        df_train = df_train[
            ~(
                (df_train["signal_location"] == "DE")
                & (df_train["fault_location"] == "FE")
            )
        ]
        df_test = df_test[
            ~(
                (df_test["signal_location"] == "FE")
                & (df_test["fault_location"] == "DE")
            )
        ]
    elif normal_train_side == "both":
        pass

    # Check if the test set is equal to some of the optimization test sets and if so, create a new test set
    test_waveform_ids = set(df_test["waveform_id"].tolist())

    # Function to check if test set is equal to any optimization set
    def check_equality(test_ids, opt_ids_list):
        for i, opt_ids in enumerate(opt_ids_list):
            opt_ids_set = set(opt_ids)
            if test_ids == opt_ids_set:
                return True
        return False

    # Initial check before any additional splits
    is_equal = check_equality(test_waveform_ids, test_ids_optimization)
    attempt = 0

    while is_equal:
        attempt += 1
        new_random_state = random_state + attempt * 100

        # Repeat the splitting process with a new random seed
        df_train = pd.DataFrame()
        df_test = pd.DataFrame()

        for enum, (_group, df_group) in enumerate(df.groupby(["fault_location", "fault_type"])):
            gss = GroupShuffleSplit(
                n_splits=1, test_size=test_size, random_state=new_random_state + enum * 1000
            )
            df_train_idx, df_test_idx = next(gss.split(df_group, groups=df_group.group))
            df_train = pd.concat(
                [df_train, df_group.iloc[df_train_idx]], ignore_index=True
            )
            df_test = pd.concat(
                [df_test, df_group.iloc[df_test_idx]], ignore_index=True
            )

        # Re-apply the filtering based on normal_train_side
        if normal_train_side == "DE":
            df_train = df_train[
                ~(
                    (df_train["signal_location"] == "FE")
                    & (df_train["fault_location"] == "DE")
                )
            ]
            df_test = df_test[
                ~(
                    (df_test["signal_location"] == "DE")
                    & (df_test["fault_location"] == "FE")
                )
            ]
        elif normal_train_side == "FE":
            df_train = df_train[
                ~(
                    (df_train["signal_location"] == "DE")
                    & (df_train["fault_location"] == "FE")
                )
            ]
            df_test = df_test[
                ~(
                    (df_test["signal_location"] == "FE")
                    & (df_test["fault_location"] == "DE")
                )
            ]
        elif normal_train_side == "both":
            pass

        # Check equality again
        test_waveform_ids = set(df_test["waveform_id"].tolist())
        is_equal = check_equality(test_waveform_ids, test_ids_optimization)

    #df_train = df_train.drop(columns=["group"])
    #df_test = df_test.drop(columns=["group"])

    if not HP0_ontest:
        print("Removing HP0 from test set")
        df_test = df_test[~df_test["load"].isin([0])]
    else:
        print("Keeping HP0 in test set")
    if not HP0_ontrain:
        print("Removing HP0 from train set")
        df_train = df_train[~df_train["load"].isin([0])]
    else:
        print("Keeping HP0 in train set")

    if include_normal_config_on_test:
        normal_df = pd.read_csv("data/splits/cwru/normal_signals.csv")
        df_test = pd.concat([df_test, normal_df], ignore_index=True)
    print("Test set size: ", df_test.shape[0])

    df_val = df_test
    return df_train, df_val, df_test

In [4]:
def get_cv_splits(
    metadata_df: pd.DataFrame,
    run: int,
    train_side: str = "DE",
    pct = 1,
    test_size = 1 / 3
):
    
    no_duplicate = metadata_df.drop_duplicates(subset=["waveform_id"]).copy()

    train, _ , test = split_cwru_proposed(
        df=no_duplicate,
        random_state=run,
        HP0_ontrain= True,
        HP0_ontest= True,
        normal_train_side= train_side,
        test_size = test_size,
    )
    
    train_idx = train.index.tolist()
    test_idx = test.index.tolist()

    train_df = metadata_df[metadata_df["waveform_id"].isin(train["waveform_id"])]
    # for each waveform_id, select pct of the samples
    if pct < 1:
        train_df = train_df.groupby("waveform_id").apply(lambda x: x.sample(frac=pct, random_state=42)).drop(columns=['waveform_id']).reset_index()
        train_idx = train_df.level_1.values.tolist()
    elif pct > 1:
        # repeat samples if pct > 1
        train_df = train_df.groupby("waveform_id_seg").apply(lambda x: x.sample(n=int(pct), replace=True, random_state=42)).drop(columns=['waveform_id_seg']).reset_index()
        train_idx = train_df.level_1.values.tolist()
    else:
        train_idx = train_df.index.tolist()
    
    test_df = metadata_df[metadata_df["waveform_id"].isin(test["waveform_id"])]
    test_idx = test_df.index.tolist()

    cv = (train_idx, test_idx)

    return cv

def get_val_cv_splits(
    metadata_df,
    test_fold,
    train_side,
    withHP0: bool = True,
    pct = 1
    ):

    no_duplicate = metadata_df.drop_duplicates(subset=["waveform_id"]).copy()


    train, _, test = cwru_optimization_split(
        df=no_duplicate,
        test_fold=test_fold,
        random_state=42,
        withHP0=withHP0,
        train_side=train_side
        )
    
    train_df = metadata_df[metadata_df["waveform_id"].isin(train["waveform_id"])]
    test_df = metadata_df[metadata_df["waveform_id"].isin(test["waveform_id"])]

    if pct < 1:
        train_df = train_df.groupby("waveform_id").apply(lambda x: x.sample(frac=pct, random_state=42)).drop(columns=['waveform_id']).reset_index()
        train_idx = train_df.level_1.values.tolist()
    else:
        train_idx = train_df.index.tolist()

    test_idx = test_df.index.tolist()

    cv = (train_idx, test_idx)

    return cv

In [5]:
metadata = pd.read_pickle("/data/bearing_datasets/cwru/processed/files_metadata.bz2")
features = pd.read_pickle("data/features/cwru_features.pkl") # Features with 10 segments of 1s per signal
#features = pd.read_pickle("data/features/cwru_features_segmented.pkl") # Features with 20 segments of 1s per signal (53% overlap)

In [6]:
metadata.head()

Unnamed: 0,waveform_id,fault_location,load,rpm,fault_type,fault_size,signal_location,fs,duration,bpfo,bpfi,bsf,ftf,inner,outer,ball,multiclass_label
0,OR@3_7_DE_DE_48_3HP,DE,3,1730,OR@3,7,DE,48000,10.129667,3.5848,5.4152,4.7135,0.39828,0,1,0,2
1,B_21_FE_FE_12_3HP,FE,3,1730,B,21,FE,12000,10.06675,3.053,4.9469,3.9874,0.3817,0,0,1,3
2,B_7_DE_BA_12_3HP,DE,3,1730,B,7,BA,12000,10.129667,,,,,0,0,0,0
3,OR@3_7_FE_FE_12_3HP,FE,3,1730,OR@3,7,FE,12000,10.14325,3.053,4.9469,3.9874,0.3817,0,1,0,2
4,OR@6_14_DE_FE_12_3HP,DE,3,1730,OR@6,14,FE,12000,10.165917,3.053,4.9469,3.9874,0.3817,0,0,0,0


In [7]:
df = features.merge(metadata, on="waveform_id", how="left")

In [8]:
val_cvs = [get_val_cv_splits(
    metadata_df=df,
    test_fold=run,
    train_side=side)
 for (run, side) in product(
    range(1,4), ["FE", "DE"])]

#val_cvs = [get_cv_splits(
#    metadata_df=df,
#    run=run,
#    train_side=side, pct=1, test_size = 1 / 3)
# for (run, side) in product(
#    range(1,4), ["FE", "DE"])]

cvs = [get_cv_splits(
    metadata_df=df,
    run=run,
    train_side=side, pct=1, test_size = 1 / 3)
 for (run, side) in product(
    range(4,54), ["FE", "DE"])]

(16, 44) (8, 44)
(32, 44) (16, 44)
(48, 44) (24, 44)
(64, 44) (32, 44)
(80, 44) (40, 44)
(96, 44) (48, 44)
Keeping HP0 in test set
Keeping HP0 in train set
Test set size:  36
(16, 44) (8, 44)
(32, 44) (16, 44)
(48, 44) (24, 44)
(64, 44) (32, 44)
(80, 44) (40, 44)
(96, 44) (48, 44)
Keeping HP0 in test set
Keeping HP0 in train set
Test set size:  36
(16, 44) (8, 44)
(32, 44) (16, 44)
(48, 44) (24, 44)
(64, 44) (32, 44)
(80, 44) (40, 44)
(96, 44) (48, 44)
Keeping HP0 in test set
Keeping HP0 in train set
Test set size:  36
(16, 44) (8, 44)
(32, 44) (16, 44)
(48, 44) (24, 44)
(64, 44) (32, 44)
(80, 44) (40, 44)
(96, 44) (48, 44)
Keeping HP0 in test set
Keeping HP0 in train set
Test set size:  36
(16, 44) (8, 44)
(32, 44) (16, 44)
(48, 44) (24, 44)
(64, 44) (32, 44)
(80, 44) (40, 44)
(96, 44) (48, 44)
Keeping HP0 in test set
Keeping HP0 in train set
Test set size:  36
(16, 44) (8, 44)
(32, 44) (16, 44)
(48, 44) (24, 44)
(64, 44) (32, 44)
(80, 44) (40, 44)
(96, 44) (48, 44)
Keeping HP0 in tes

In [9]:
len(cvs[0][1])

360

In [10]:
len(val_cvs[0][1])

360

In [11]:
df.iloc[cvs[0][1]].waveform_id_seg.nunique(), df.iloc[cvs[0][0]].waveform_id_seg.value_counts()

(360,
 waveform_id_seg
 B_7_DE_FE_12_1HP_3836        1
 B_7_DE_FE_12_1HP_3835        1
 B_7_DE_FE_12_1HP_3834        1
 B_7_DE_FE_12_1HP_3833        1
 OR@6_21_DE_DE_12_1HP_3732    1
                             ..
 B_21_FE_FE_12_3HP_15         1
 B_21_FE_FE_12_3HP_14         1
 B_21_FE_FE_12_3HP_13         1
 B_21_FE_FE_12_3HP_12         1
 B_21_FE_FE_12_3HP_11         1
 Name: count, Length: 720, dtype: int64)

In [12]:
features = [ 'acceleration/rms/global', 'acceleration/pk-pk/global',
       'acceleration/kurt/global', 'acceleration/skewness/global',
       'acceleration/fc/global',
       'envelope/spectralPeak/1.0x-bpfo/500-6000',
       'envelope/spectralPeak/2.0x-bpfo/500-6000',
       'envelope/spectralPeak/3.0x-bpfo/500-6000',
       'envelope/spectralPeak/4.0x-bpfo/500-6000',
       'envelope/spectralPeak/5.0x-bpfo/500-6000',
       'envelope/spectralPeak/1.0x-bpfi/500-6000',
       'envelope/spectralPeak/2.0x-bpfi/500-6000',
       'envelope/spectralPeak/3.0x-bpfi/500-6000',
       'envelope/spectralPeak/4.0x-bpfi/500-6000',
       'envelope/spectralPeak/5.0x-bpfi/500-6000',
       'envelope/spectralPeak/1.0x-bsf/500-6000',
       'envelope/spectralPeak/2.0x-bsf/500-6000',
       'envelope/spectralPeak/3.0x-bsf/500-6000',
       'envelope/spectralPeak/4.0x-bsf/500-6000',
       'envelope/spectralPeak/5.0x-bsf/500-6000']

In [13]:
X = df[features].copy()
y = df[['inner', 'outer', 'ball']].copy()

# Features (segmented)

## RF

In [14]:
random_search = RandomizedSearchCV(
    estimator=MultiOutputClassifier(RandomForestClassifier()),
    param_distributions={
        "estimator__n_estimators": [200],
        "estimator__max_features": ["sqrt", "log2"],
        "estimator__criterion": ["gini", "entropy", "log_loss"],
        "estimator__max_depth": randint(low=2, high=60),
        "estimator__min_samples_split": randint(low=2, high=20),
        "estimator__min_samples_leaf": randint(low=1, high=20),
        "estimator__ccp_alpha": loguniform(1e-5, 1),
    },
    n_iter=250,
    cv=val_cvs,
    scoring="roc_auc",
    verbose=2,
    n_jobs=-1,
)

random_search.fit(X, y)

Fitting 6 folds for each of 250 candidates, totalling 1500 fits
multilabel-indicator
[CV] END estimator__ccp_alpha=0.012030471738908978, estimator__criterion=gini, estimator__max_depth=41, estimator__max_features=log2, estimator__min_samples_leaf=16, estimator__min_samples_split=12, estimator__n_estimators=200; total time=   1.0s
multilabel-indicator
[CV] END estimator__ccp_alpha=0.012030471738908978, estimator__criterion=gini, estimator__max_depth=41, estimator__max_features=log2, estimator__min_samples_leaf=16, estimator__min_samples_split=12, estimator__n_estimators=200; total time=   1.0s
multilabel-indicatormultilabel-indicator

[CV] END estimator__ccp_alpha=0.012030471738908978, estimator__criterion=gini, estimator__max_depth=41, estimator__max_features=log2, estimator__min_samples_leaf=16, estimator__min_samples_split=12, estimator__n_estimators=200; total time=   1.0s
[CV] END estimator__ccp_alpha=0.012030471738908978, estimator__criterion=gini, estimator__max_depth=41, estimat

KeyboardInterrupt: 

In [14]:
random_search = RandomizedSearchCV(
    estimator=MultiOutputClassifier(RandomForestClassifier()),
    param_distributions={
        "estimator__n_estimators": [200],
        "estimator__max_features": ["sqrt", "log2"],
        "estimator__criterion": ["gini", "entropy", "log_loss"],
        "estimator__max_depth": randint(low=2, high=60),
        "estimator__min_samples_split": randint(low=2, high=20),
        "estimator__min_samples_leaf": randint(low=1, high=20),
        "estimator__ccp_alpha": loguniform(1e-5, 1),
    },
    n_iter=250,
    cv=val_cvs,
    scoring="roc_auc",
    verbose=2,
    n_jobs=-1,
)

random_search.fit(X, y)

Fitting 6 folds for each of 250 candidates, totalling 1500 fits
[CV] END estimator__ccp_alpha=0.014497115610445442, estimator__criterion=gini, estimator__max_depth=7, estimator__max_features=sqrt, estimator__min_samples_leaf=2, estimator__min_samples_split=14, estimator__n_estimators=200; total time=   1.0s
[CV] END estimator__ccp_alpha=0.014497115610445442, estimator__criterion=gini, estimator__max_depth=7, estimator__max_features=sqrt, estimator__min_samples_leaf=2, estimator__min_samples_split=14, estimator__n_estimators=200; total time=   1.0s
[CV] END estimator__ccp_alpha=0.014497115610445442, estimator__criterion=gini, estimator__max_depth=7, estimator__max_features=sqrt, estimator__min_samples_leaf=2, estimator__min_samples_split=14, estimator__n_estimators=200; total time=   1.0s
[CV] END estimator__ccp_alpha=0.014497115610445442, estimator__criterion=gini, estimator__max_depth=7, estimator__max_features=sqrt, estimator__min_samples_leaf=2, estimator__min_samples_split=14, esti

In [25]:
results = pd.DataFrame(random_search.cv_results_).sort_values(by="rank_test_score")

In [26]:
results.head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__ccp_alpha,param_estimator__criterion,param_estimator__max_depth,param_estimator__max_features,param_estimator__min_samples_leaf,param_estimator__min_samples_split,...,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,mean_test_score,std_test_score,rank_test_score
234,1.153726,0.028248,0.040417,0.001695,1.2e-05,log_loss,46,sqrt,1,19,...,{'estimator__ccp_alpha': 1.2307794443225254e-0...,0.826079,0.785655,0.827634,0.842068,0.888661,0.915327,0.847571,0.042822,1
118,1.134645,0.023972,0.041924,0.000471,0.000616,entropy,49,log2,5,9,...,{'estimator__ccp_alpha': 0.0006158364706060997...,0.804405,0.793921,0.835952,0.843467,0.891786,0.909762,0.846549,0.042249,2
104,1.095266,0.016355,0.039839,0.001489,0.003446,log_loss,23,log2,16,15,...,{'estimator__ccp_alpha': 0.0034456955291814716...,0.801406,0.795685,0.839375,0.848378,0.878408,0.905759,0.844835,0.039122,3
11,1.098512,0.024649,0.040961,0.001486,0.003032,log_loss,35,sqrt,9,2,...,{'estimator__ccp_alpha': 0.0030317753628044124...,0.812902,0.78253,0.834747,0.845446,0.887619,0.905193,0.84474,0.041788,4
168,1.126171,0.020797,0.039187,0.000973,0.023225,log_loss,56,sqrt,8,16,...,"{'estimator__ccp_alpha': 0.02322495775207635, ...",0.813229,0.784658,0.829985,0.848065,0.881994,0.910357,0.844715,0.041883,5
66,1.082367,0.018014,0.040107,0.00114,0.001389,log_loss,36,log2,13,4,...,{'estimator__ccp_alpha': 0.0013887947765110764...,0.809234,0.792939,0.831503,0.840893,0.887708,0.904568,0.844474,0.039915,6
43,1.132599,0.024665,0.03928,0.001077,5.2e-05,entropy,52,sqrt,3,19,...,{'estimator__ccp_alpha': 5.234662929322621e-05...,0.80372,0.793118,0.835967,0.837835,0.894807,0.90119,0.844439,0.04116,7
193,1.073218,0.021758,0.040796,0.002988,9.9e-05,entropy,7,sqrt,16,9,...,{'estimator__ccp_alpha': 9.937202795927419e-05...,0.793452,0.792798,0.84631,0.84058,0.888437,0.902723,0.84405,0.042065,8
180,1.096316,0.017224,0.040546,0.001923,1.2e-05,entropy,36,log2,13,9,...,{'estimator__ccp_alpha': 1.1542809520392911e-0...,0.800647,0.79157,0.831354,0.843378,0.890179,0.905461,0.843765,0.042226,9
65,1.07151,0.015825,0.040923,0.001128,0.00015,entropy,11,sqrt,17,16,...,{'estimator__ccp_alpha': 0.0001504133219383310...,0.800655,0.79875,0.83872,0.840104,0.886012,0.898318,0.84376,0.038045,10


In [27]:
best_params = results.iloc[0]["params"]

In [28]:
best_params.values()

dict_values([np.float64(1.2307794443225254e-05), 'log_loss', 46, 'sqrt', 1, 19, 200])

In [29]:
keys = []
for i in best_params.keys():
    keys.append(i.replace("estimator__", ""))

In [30]:
best_params = {k: i for (k,i) in zip(keys, best_params.values())}

In [31]:
best_params

{'ccp_alpha': np.float64(1.2307794443225254e-05),
 'criterion': 'log_loss',
 'max_depth': 46,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 19,
 'n_estimators': 200}

In [32]:

aucs = []
for cv in cvs:

    X_train = X.iloc[cv[0]]
    X_test = X.iloc[cv[1]]

    y_train = y.iloc[cv[0]]
    y_test = y.iloc[cv[1]]

    model = MultiOutputClassifier(RandomForestClassifier(random_state=42, **best_params), n_jobs=-1)

    model.fit(X_train, y_train)
    y_probas = model.predict_proba(X_test)
    proba_outer = y_probas[1][:, 1]
    proba_inner = y_probas[0][:, 1]
    proba_ball = y_probas[2][:, 1]

    auroc_outer = roc_auc_score(y_test["outer"], proba_outer)
    auroc_inner = roc_auc_score(y_test["inner"], proba_inner)
    auroc_ball = roc_auc_score(y_test["ball"], proba_ball)

    macro_auc = np.mean([auroc_outer, auroc_inner, auroc_ball])
    print(f"Macro AUC: {macro_auc:.4f} | Outer AUC: {auroc_outer:.4f} | Inner AUC: {auroc_inner:.4f} | Ball AUC: {auroc_ball:.4f}")
    aucs.append([macro_auc, auroc_outer, auroc_inner, auroc_ball])

Macro AUC: 0.7811 | Outer AUC: 0.6285 | Inner AUC: 0.9596 | Ball AUC: 0.7553
Macro AUC: 0.8354 | Outer AUC: 0.6854 | Inner AUC: 0.9964 | Ball AUC: 0.8245
Macro AUC: 0.7924 | Outer AUC: 0.6078 | Inner AUC: 0.8938 | Ball AUC: 0.8758
Macro AUC: 0.8533 | Outer AUC: 0.7410 | Inner AUC: 0.9775 | Ball AUC: 0.8413
Macro AUC: 0.9715 | Outer AUC: 0.9968 | Inner AUC: 0.9966 | Ball AUC: 0.9211
Macro AUC: 0.9721 | Outer AUC: 1.0000 | Inner AUC: 0.9978 | Ball AUC: 0.9186
Macro AUC: 0.7327 | Outer AUC: 0.5487 | Inner AUC: 0.9530 | Ball AUC: 0.6964
Macro AUC: 0.7983 | Outer AUC: 0.5973 | Inner AUC: 0.9293 | Ball AUC: 0.8683
Macro AUC: 0.9076 | Outer AUC: 0.9812 | Inner AUC: 0.9988 | Ball AUC: 0.7428
Macro AUC: 0.9179 | Outer AUC: 0.9893 | Inner AUC: 0.9266 | Ball AUC: 0.8377
Macro AUC: 0.7487 | Outer AUC: 0.4224 | Inner AUC: 0.9946 | Ball AUC: 0.8291
Macro AUC: 0.7048 | Outer AUC: 0.3943 | Inner AUC: 0.9889 | Ball AUC: 0.7310
Macro AUC: 0.9139 | Outer AUC: 0.9935 | Inner AUC: 0.9168 | Ball AUC: 0.8316

In [33]:
macro_aucs = [i[0] for i in aucs]
np.mean(macro_aucs), np.std(macro_aucs)

(np.float64(0.8552291666666667), np.float64(0.07431407033093687))

## SVM

In [34]:
from sklearn.svm import SVC

random_search = RandomizedSearchCV(
    estimator=MultiOutputClassifier(SVC(probability=True)),
    param_distributions={
        "estimator__C": loguniform(1e-3, 1e3),
        "estimator__gamma": ["scale", "auto"],
        "estimator__kernel": ["rbf"],
    },
    n_iter=250,
    cv=val_cvs,
    scoring="roc_auc",
    verbose=2,
    n_jobs=-1,
)

random_search.fit(X, y)

Fitting 6 folds for each of 250 candidates, totalling 1500 fits
[CV] END estimator__C=20.533628453375698, estimator__gamma=auto, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=20.533628453375698, estimator__gamma=auto, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=20.533628453375698, estimator__gamma=auto, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=20.533628453375698, estimator__gamma=auto, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=78.68894795807083, estimator__gamma=scale, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=20.533628453375698, estimator__gamma=auto, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=78.68894795807083, estimator__gamma=scale, estimator__kernel=rbf; total time=   0.2s
[CV] END estimator__C=78.68894795807083, estimator__gamma=scale, estimator__kernel=rbf; total time=   0.2s
[CV] END estimator__C=0.13747695547447253, estimator__gamma=auto, estimator__ker

In [35]:
results = pd.DataFrame(random_search.cv_results_).sort_values(by="rank_test_score")

In [36]:
results.iloc[0]

mean_fit_time                                                       0.156918
std_fit_time                                                        0.016612
mean_score_time                                                     0.013902
std_score_time                                                      0.000992
param_estimator__C                                                165.881105
param_estimator__gamma                                                 scale
param_estimator__kernel                                                  rbf
params                     {'estimator__C': 165.88110500515725, 'estimato...
split0_test_score                                                   0.897582
split1_test_score                                                   0.823006
split2_test_score                                                   0.720558
split3_test_score                                                   0.847314
split4_test_score                                                   0.773408

In [37]:
best_params = results.iloc[0]["params"]

In [38]:
best_params.values()

dict_values([np.float64(165.88110500515725), 'scale', 'rbf'])

In [39]:
keys = []
for i in best_params.keys():
    keys.append(i.replace("estimator__", ""))

In [40]:
best_params = {k: i for (k,i) in zip(keys, best_params.values())}

In [41]:
best_params

{'C': np.float64(165.88110500515725), 'gamma': 'scale', 'kernel': 'rbf'}

In [42]:
len(cvs[0][1])

360

In [43]:
aucs = []
for cv in cvs:

    X_train = X.iloc[cv[0]]
    X_test = X.iloc[cv[1]]

    y_train = y.iloc[cv[0]]
    y_test = y.iloc[cv[1]]

    model = MultiOutputClassifier(SVC(random_state=42, **best_params, probability=True), n_jobs=-1)

    model.fit(X_train, y_train)
    y_probas = model.predict_proba(X_test)
    proba_outer = y_probas[1][:, 1]
    proba_inner = y_probas[0][:, 1]
    proba_ball = y_probas[2][:, 1]

    auroc_outer = roc_auc_score(y_test["outer"], proba_outer)
    auroc_inner = roc_auc_score(y_test["inner"], proba_inner)
    auroc_ball = roc_auc_score(y_test["ball"], proba_ball)

    macro_auc = np.mean([auroc_outer, auroc_inner, auroc_ball])
    print(f"Macro AUC: {macro_auc:.4f} | Outer AUC: {auroc_outer:.4f} | Inner AUC: {auroc_inner:.4f} | Ball AUC: {auroc_ball:.4f}")
    aucs.append([macro_auc, auroc_outer, auroc_inner, auroc_ball])


Macro AUC: 0.6596 | Outer AUC: 0.8502 | Inner AUC: 0.5667 | Ball AUC: 0.5620
Macro AUC: 0.7245 | Outer AUC: 0.8538 | Inner AUC: 0.6536 | Ball AUC: 0.6662
Macro AUC: 0.7246 | Outer AUC: 0.8075 | Inner AUC: 0.5346 | Ball AUC: 0.8316
Macro AUC: 0.7166 | Outer AUC: 0.9063 | Inner AUC: 0.6526 | Ball AUC: 0.5908
Macro AUC: 0.9211 | Outer AUC: 0.9918 | Inner AUC: 0.9652 | Ball AUC: 0.8061
Macro AUC: 0.8750 | Outer AUC: 0.9982 | Inner AUC: 0.9861 | Ball AUC: 0.6406
Macro AUC: 0.7153 | Outer AUC: 0.6348 | Inner AUC: 0.7572 | Ball AUC: 0.7538
Macro AUC: 0.8547 | Outer AUC: 0.6700 | Inner AUC: 0.9821 | Ball AUC: 0.9121
Macro AUC: 0.9368 | Outer AUC: 0.9941 | Inner AUC: 0.9977 | Ball AUC: 0.8185
Macro AUC: 0.9021 | Outer AUC: 0.9948 | Inner AUC: 0.9976 | Ball AUC: 0.7140
Macro AUC: 0.7209 | Outer AUC: 0.4117 | Inner AUC: 0.9954 | Ball AUC: 0.7555
Macro AUC: 0.7128 | Outer AUC: 0.5150 | Inner AUC: 1.0000 | Ball AUC: 0.6233
Macro AUC: 0.4614 | Outer AUC: 0.1386 | Inner AUC: 0.5847 | Ball AUC: 0.6609

In [45]:
macro_aucs = [i[0] for i in aucs] 
np.mean(macro_aucs), np.std(macro_aucs)

(np.float64(0.7600532738095238), np.float64(0.10910965953833945))