In [36]:
import pandas as pd
import numpy as np

import autorootcwd  # noqa
from hamilton import driver

from src.data import data_pipeline
from src.data.pydantic_models import BearingDataset
from functools import partial
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import (
    RandomizedSearchCV,
    GroupKFold,
    cross_validate,
    cross_val_predict,
)
from scipy.stats import randint, loguniform
import logging
import random
from src.utils.add_signal_data import add_signal_data_paderborn, add_signal_data_ottawa, add_signal_data_cwru

In [37]:
from src.data.data_splits import train_test_split_proposed
from itertools import combinations, product
import random

In [38]:
healthy_bearing_ids = ["K001", "K002", "K003", "K004", "K005", "K006"]
outer_bearing_ids = ["KA04", "KA15", "KA16", "KA22", "KA30"]
inner_bearing_ids = ["KI04", "KI14", "KI16", "KI18", "KI21", "KI17"]


class CombinationSampler:
    def __init__(self, list1, list2, list3, random_seed=42):
        self.combinations = list(product(list1, list2, list3))
        random.seed(random_seed)
        random.shuffle(self.combinations)

    def get_combination(self, run: int):
        if not self.combinations:
            raise ValueError("No more combinations available")
        combination = self.combinations[run]

        return combination


list1 = [
    ["K001", "K003"],
    ["K001", "K002"],
    ["K001", "K004"],
    ["K001", "K005"],
    ["K001", "K006"],
    ["K002", "K003"],
    ["K002", "K004"],
    ["K002", "K005"],
    ["K002", "K006"],
    ["K003", "K004"],
    ["K003", "K005"],
    ["K003", "K006"],
    ["K004", "K005"],
    ["K004", "K006"],
    ["K005", "K006"],
]

list2 = [
    ["KA04", "KA16"],
    ["KA04", "KA15"],
    ["KA04", "KA22"],
    ["KA04", "KA30"],
    ["KA15", "KA16"],
    ["KA15", "KA22"],
    ["KA15", "KA30"],
    ["KA16", "KA22"],
    ["KA16", "KA30"],
    ["KA22", "KA30"],
]

list3 = [
    ["KI04", "KI16"],
    ["KI04", "KI14"],
    ["KI04", "KI21"],
    ["KI04", "KI18"],
    ["KI04", "KI17"],
    ["KI14", "KI16"],
    ["KI14", "KI18"],
    ["KI14", "KI21"],
    ["KI14", "KI17"],
    ["KI16", "KI18"],
    ["KI16", "KI21"],
    ["KI16", "KI17"],
    ["KI18", "KI21"],
    ["KI18", "KI17"],
    ["KI21", "KI17"],
]

sampler = CombinationSampler(list1, list2, list3)

def get_cv_splits(
    df: pd.DataFrame,
    sampler: CombinationSampler,
    run: int,
    healthy_bearing_ids: list,
    outer_bearing_ids: list,
    inner_bearing_ids: list,
    use_combined: bool = False,
    on_train: bool = False,
):

    combination = sampler.get_combination(run=run)

    train_ids = (
        list(set(healthy_bearing_ids).difference(set(combination[0])))
        + list(set(outer_bearing_ids).difference(set(combination[1])))
        + list(set(inner_bearing_ids).difference(set(combination[2])))
    )

    print(f"Train IDs: {train_ids}")

    test_ids = list(combination[0] + combination[1] + combination[2])

    print(f"Test IDs: {test_ids}")

    if use_combined:
        combined_bearings_ids = ["KB23", "KB24", "KB27"]
        if on_train:
            print(f"Using combined bearings on train: {combined_bearings_ids}")
            train_ids += combined_bearings_ids
        else:
            print(f"Using combined bearings on test: {combined_bearings_ids}")
            test_ids += combined_bearings_ids

    train_idx = df[df["bearing_id"].isin(train_ids)].index.values.tolist()
    test_idx = df[df["bearing_id"].isin(test_ids)].index.values.tolist()

    cv = (train_idx, test_idx)

    return cv

In [39]:
metadata = pd.read_pickle("/data/bearing_datasets/paderborn/processed/files_metadata.bz2")
features = pd.read_pickle("data/features/paderborn_features_segmented.pkl")

In [40]:
df = features.merge(metadata, on="waveform_id", how="left").reset_index(drop=True)

In [41]:
df.bearing_id.unique()

array(['KI14', 'KI18', 'K005', 'KA16', 'KI16', 'K002', 'KA30', 'K001',
       'KA22', 'KA04', 'KA15', 'KI17', 'K006', 'K003', 'KI04', 'K004',
       'KI21'], dtype=object)

In [42]:
#df = df.drop_duplicates(subset='waveform_id').reset_index(drop=True)

In [43]:
cvs = [get_cv_splits(sampler=sampler, df=df, run=run,
 healthy_bearing_ids=healthy_bearing_ids, outer_bearing_ids=outer_bearing_ids,
  inner_bearing_ids=inner_bearing_ids, use_combined=False, on_train=False)
 for run in range(5)]

Train IDs: ['K003', 'K001', 'K006', 'K005', 'KA04', 'KA22', 'KA30', 'KI04', 'KI18', 'KI21', 'KI14']
Test IDs: ['K002', 'K004', 'KA15', 'KA16', 'KI16', 'KI17']
Train IDs: ['K003', 'K001', 'K005', 'K004', 'KA04', 'KA22', 'KA30', 'KI16', 'KI18', 'KI21', 'KI14']
Test IDs: ['K002', 'K006', 'KA15', 'KA16', 'KI04', 'KI17']
Train IDs: ['K001', 'K002', 'K006', 'K004', 'KA16', 'KA04', 'KA22', 'KI16', 'KI17', 'KI21', 'KI14']
Test IDs: ['K003', 'K005', 'KA15', 'KA30', 'KI04', 'KI18']
Train IDs: ['K005', 'K001', 'K002', 'K004', 'KA16', 'KA04', 'KA30', 'KI16', 'KI04', 'KI17', 'KI14']
Test IDs: ['K003', 'K006', 'KA15', 'KA22', 'KI18', 'KI21']
Train IDs: ['K005', 'K001', 'K002', 'K004', 'KA16', 'KA22', 'KA30', 'KI16', 'KI18', 'KI17', 'KI21']
Test IDs: ['K003', 'K006', 'KA04', 'KA15', 'KI04', 'KI14']


In [44]:
features_list = ['acceleration/rms/global', 'acceleration/pk-pk/global',
       'acceleration/kurt/global', 'acceleration/skewness/global',
       'acceleration/fc/global',
       'envelope/spectralPeak/1.0x-bpfo/500-10000',
       'envelope/spectralPeak/2.0x-bpfo/500-10000',
       'envelope/spectralPeak/3.0x-bpfo/500-10000',
       'envelope/spectralPeak/4.0x-bpfo/500-10000',
       'envelope/spectralPeak/5.0x-bpfo/500-10000',
       'envelope/spectralPeak/1.0x-bpfi/500-10000',
       'envelope/spectralPeak/2.0x-bpfi/500-10000',
       'envelope/spectralPeak/3.0x-bpfi/500-10000',
       'envelope/spectralPeak/4.0x-bpfi/500-10000',
       'envelope/spectralPeak/5.0x-bpfi/500-10000']

In [45]:
X = df[features_list].copy()
y = df[['inner', 'outer']].copy()

# Features (segmented)

## RF

In [46]:
random_search = RandomizedSearchCV(
    estimator=MultiOutputClassifier(RandomForestClassifier()),
    param_distributions={
        "estimator__n_estimators": [200],
        "estimator__max_features": ["sqrt", "log2"],
        "estimator__criterion": ["gini", "entropy", "log_loss"],
        "estimator__max_depth": randint(low=2, high=60),
        "estimator__min_samples_split": randint(low=2, high=20),
        "estimator__min_samples_leaf": randint(low=1, high=20),
        "estimator__ccp_alpha": loguniform(1e-5, 1),
    },
    n_iter=250,
    cv=cvs,
    scoring="roc_auc",
    verbose=2,
    n_jobs=-1,
)

random_search.fit(X, y)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


[CV] END estimator__ccp_alpha=0.18701836931614066, estimator__criterion=gini, estimator__max_depth=8, estimator__max_features=sqrt, estimator__min_samples_leaf=4, estimator__min_samples_split=12, estimator__n_estimators=200; total time=   2.1s
[CV] END estimator__ccp_alpha=0.001023434036393425, estimator__criterion=gini, estimator__max_depth=30, estimator__max_features=sqrt, estimator__min_samples_leaf=5, estimator__min_samples_split=6, estimator__n_estimators=200; total time=   2.1s
[CV] END estimator__ccp_alpha=0.18701836931614066, estimator__criterion=gini, estimator__max_depth=8, estimator__max_features=sqrt, estimator__min_samples_leaf=4, estimator__min_samples_split=12, estimator__n_estimators=200; total time=   2.2s
[CV] END estimator__ccp_alpha=0.18701836931614066, estimator__criterion=gini, estimator__max_depth=8, estimator__max_features=sqrt, estimator__min_samples_leaf=4, estimator__min_samples_split=12, estimator__n_estimators=200; total time=   2.2s
[CV] END estimator__ccp

In [47]:
results = pd.DataFrame(random_search.cv_results_).sort_values(by="rank_test_score")

In [48]:
results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__ccp_alpha,param_estimator__criterion,param_estimator__max_depth,param_estimator__max_features,param_estimator__min_samples_leaf,param_estimator__min_samples_split,param_estimator__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
119,1.312615,0.047406,0.030852,0.000275,0.001346,gini,3,log2,3,18,200,{'estimator__ccp_alpha': 0.0013461245580047097...,0.814251,0.826325,0.649423,0.500917,0.880018,0.734187,0.139833,1
65,1.629362,0.029852,0.031249,0.000221,0.000257,entropy,3,log2,4,13,200,{'estimator__ccp_alpha': 0.0002565831698803033...,0.819781,0.818192,0.608871,0.513,0.885259,0.729021,0.142619,2
111,1.275859,0.031368,0.029172,0.00045,4e-05,log_loss,2,sqrt,6,7,200,{'estimator__ccp_alpha': 3.995614155070056e-05...,0.854018,0.869746,0.51254,0.514044,0.89409,0.728888,0.176496,3
70,1.896552,0.056983,0.03349,0.000537,3.1e-05,entropy,4,log2,19,5,200,"{'estimator__ccp_alpha': 3.07535888085428e-05,...",0.827776,0.813989,0.612943,0.505305,0.883839,0.72877,0.144543,4
147,1.266224,0.026807,0.028776,0.00016,0.000147,log_loss,2,log2,8,13,200,{'estimator__ccp_alpha': 0.0001472195782339504...,0.868414,0.853652,0.520931,0.508711,0.891597,0.728661,0.175061,5


In [49]:
results.loc[22]

mean_fit_time                                                                   2.67123
std_fit_time                                                                   0.138501
mean_score_time                                                                0.034098
std_score_time                                                                 0.000917
param_estimator__ccp_alpha                                                     0.026476
param_estimator__criterion                                                     log_loss
param_estimator__max_depth                                                           24
param_estimator__max_features                                                      log2
param_estimator__min_samples_leaf                                                    18
param_estimator__min_samples_split                                                   15
param_estimator__n_estimators                                                       200
params                          

In [50]:
best_params = results.loc[22]["params"]

In [51]:
best_params.values()

dict_values([np.float64(0.026475639105883027), 'log_loss', 24, 'log2', 18, 15, 200])

In [52]:
keys = []
for i in best_params.keys():
    keys.append(i.replace("estimator__", ""))

In [53]:
best_params = {k: i for (k,i) in zip(keys, best_params.values())}

In [54]:
best_params

{'ccp_alpha': np.float64(0.026475639105883027),
 'criterion': 'log_loss',
 'max_depth': 24,
 'max_features': 'log2',
 'min_samples_leaf': 18,
 'min_samples_split': 15,
 'n_estimators': 200}

In [55]:
from sklearn.metrics import roc_auc_score

#### WITHOUT COMBINEDS
cvs = [get_cv_splits(sampler=sampler, df=df, run=run,
 healthy_bearing_ids=healthy_bearing_ids, outer_bearing_ids=outer_bearing_ids,
  inner_bearing_ids=inner_bearing_ids, use_combined=False, on_train=False)
 for run in range(5,105)]

aucs = []
for cv in cvs:

    X_train = X.iloc[cv[0]]
    X_test = X.iloc[cv[1]]

    y_train = y.iloc[cv[0]]
    y_test = y.iloc[cv[1]]

    model = MultiOutputClassifier(RandomForestClassifier(random_state=42, **best_params), n_jobs=-1)

    model.fit(X_train, y_train)
    y_probas = model.predict_proba(X_test)
    proba_outer = y_probas[1][:, 1]
    proba_inner = y_probas[0][:, 1]

    auroc_outer = roc_auc_score(y_test["outer"], proba_outer)
    auroc_inner = roc_auc_score(y_test["inner"], proba_inner)

    macro_auc = np.mean([auroc_outer, auroc_inner])
    print(f"Macro AUC: {macro_auc:.4f} | Outer AUC: {auroc_outer:.4f} | Inner AUC: {auroc_inner:.4f}")
    aucs.append([macro_auc, auroc_outer, auroc_inner])


Train IDs: ['K005', 'K001', 'K006', 'K004', 'KA15', 'KA22', 'KA16', 'KI16', 'KI04', 'KI18', 'KI21']
Test IDs: ['K002', 'K003', 'KA04', 'KA30', 'KI14', 'KI17']
Train IDs: ['K003', 'K001', 'K006', 'K005', 'KA16', 'KA22', 'KA30', 'KI16', 'KI17', 'KI21', 'KI14']
Test IDs: ['K002', 'K004', 'KA04', 'KA15', 'KI04', 'KI18']
Train IDs: ['K005', 'K001', 'K006', 'K004', 'KA15', 'KA04', 'KA16', 'KI04', 'KI17', 'KI21', 'KI14']
Test IDs: ['K002', 'K003', 'KA22', 'KA30', 'KI16', 'KI18']
Train IDs: ['K003', 'K001', 'K006', 'K004', 'KA15', 'KA16', 'KA30', 'KI16', 'KI18', 'KI17', 'KI21']
Test IDs: ['K002', 'K005', 'KA04', 'KA22', 'KI04', 'KI14']
Train IDs: ['K003', 'K006', 'K004', 'K005', 'KA15', 'KA04', 'KA16', 'KI16', 'KI04', 'KI18', 'KI14']
Test IDs: ['K001', 'K002', 'KA22', 'KA30', 'KI21', 'KI17']
Train IDs: ['K003', 'K001', 'K002', 'K005', 'KA15', 'KA16', 'KA30', 'KI04', 'KI17', 'KI21', 'KI14']
Test IDs: ['K004', 'K006', 'KA04', 'KA22', 'KI16', 'KI18']
Train IDs: ['K001', 'K002', 'K006', 'K004', 'K

In [56]:
macro_aucs = [i[0] for i in aucs]
np.mean(macro_aucs), np.std(macro_aucs)

(np.float64(0.6967018737792966), np.float64(0.1573412296115115))

## SVM

In [57]:
cvs = [get_cv_splits(sampler=sampler, df=df, run=run,
 healthy_bearing_ids=healthy_bearing_ids, outer_bearing_ids=outer_bearing_ids,
  inner_bearing_ids=inner_bearing_ids, use_combined=False, on_train=False)
 for run in range(5)]

Train IDs: ['K003', 'K001', 'K006', 'K005', 'KA04', 'KA22', 'KA30', 'KI04', 'KI18', 'KI21', 'KI14']
Test IDs: ['K002', 'K004', 'KA15', 'KA16', 'KI16', 'KI17']
Train IDs: ['K003', 'K001', 'K005', 'K004', 'KA04', 'KA22', 'KA30', 'KI16', 'KI18', 'KI21', 'KI14']
Test IDs: ['K002', 'K006', 'KA15', 'KA16', 'KI04', 'KI17']
Train IDs: ['K001', 'K002', 'K006', 'K004', 'KA16', 'KA04', 'KA22', 'KI16', 'KI17', 'KI21', 'KI14']
Test IDs: ['K003', 'K005', 'KA15', 'KA30', 'KI04', 'KI18']
Train IDs: ['K005', 'K001', 'K002', 'K004', 'KA16', 'KA04', 'KA30', 'KI16', 'KI04', 'KI17', 'KI14']
Test IDs: ['K003', 'K006', 'KA15', 'KA22', 'KI18', 'KI21']
Train IDs: ['K005', 'K001', 'K002', 'K004', 'KA16', 'KA22', 'KA30', 'KI16', 'KI18', 'KI17', 'KI21']
Test IDs: ['K003', 'K006', 'KA04', 'KA15', 'KI04', 'KI14']


In [58]:
from sklearn.svm import SVC

random_search = RandomizedSearchCV(
    estimator=MultiOutputClassifier(SVC(probability=True)),
    param_distributions={
        "estimator__C": loguniform(1e-3, 1e3),
        "estimator__gamma": ["scale", "auto"],
        "estimator__kernel": ["rbf"],
    },
    n_iter=250,
    cv=cvs,
    scoring="roc_auc",
    verbose=2,
    n_jobs=-1,
)

random_search.fit(X, y)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits
[CV] END estimator__C=604.9357943432291, estimator__gamma=auto, estimator__kernel=rbf; total time=   0.6s
[CV] END estimator__C=11.985587895140974, estimator__gamma=auto, estimator__kernel=rbf; total time=   0.8s
[CV] END estimator__C=2.9809101146124157, estimator__gamma=auto, estimator__kernel=rbf; total time=   1.1s
[CV] END estimator__C=604.9357943432291, estimator__gamma=auto, estimator__kernel=rbf; total time=   1.3s
[CV] END estimator__C=1.2430449302761093, estimator__gamma=auto, estimator__kernel=rbf; total time=   1.3s
[CV] END estimator__C=604.9357943432291, estimator__gamma=auto, estimator__kernel=rbf; total time=   1.4s
[CV] END estimator__C=11.985587895140974, estimator__gamma=auto, estimator__kernel=rbf; total time=   1.5s
[CV] END estimator__C=11.985587895140974, estimator__gamma=auto, estimator__kernel=rbf; total time=   1.5s
[CV] END estimator__C=191.7890192657904, estimator__gamma=scale, estimator__kernel=

In [59]:
results = pd.DataFrame(random_search.cv_results_).sort_values(by="rank_test_score")

In [60]:
results.iloc[0]

mean_fit_time                                                       2.357889
std_fit_time                                                        0.540296
mean_score_time                                                     0.171337
std_score_time                                                      0.033656
param_estimator__C                                                440.768732
param_estimator__gamma                                                 scale
param_estimator__kernel                                                  rbf
params                     {'estimator__C': 440.7687319211647, 'estimator...
split0_test_score                                                   0.671256
split1_test_score                                                   0.753857
split2_test_score                                                   0.612257
split3_test_score                                                   0.729487
split4_test_score                                                   0.797661

In [61]:
best_params = results.iloc[0]["params"]

In [62]:
best_params.values()

dict_values([np.float64(440.7687319211647), 'scale', 'rbf'])

In [63]:
keys = []
for i in best_params.keys():
    keys.append(i.replace("estimator__", ""))

In [64]:
best_params = {k: i for (k,i) in zip(keys, best_params.values())}

In [65]:
best_params

{'C': np.float64(440.7687319211647), 'gamma': 'scale', 'kernel': 'rbf'}

In [66]:
from sklearn.metrics import roc_auc_score

#### WITHOUT COMBINEDS
cvs = [get_cv_splits(sampler=sampler, df=df, run=run,
 healthy_bearing_ids=healthy_bearing_ids, outer_bearing_ids=outer_bearing_ids,
  inner_bearing_ids=inner_bearing_ids, use_combined=False, on_train=False)
 for run in range(5,105)]

aucs = []
for cv in cvs:

    X_train = X.iloc[cv[0]]
    X_test = X.iloc[cv[1]]

    y_train = y.iloc[cv[0]]
    y_test = y.iloc[cv[1]]

    model = MultiOutputClassifier(SVC(random_state=42, **best_params, probability=True), n_jobs=-1)

    model.fit(X_train, y_train)
    y_probas = model.predict_proba(X_test)
    proba_outer = y_probas[1][:, 1]
    proba_inner = y_probas[0][:, 1]

    auroc_outer = roc_auc_score(y_test["outer"], proba_outer)
    auroc_inner = roc_auc_score(y_test["inner"], proba_inner)

    macro_auc = np.mean([auroc_outer, auroc_inner])
    print(f"Macro AUC: {macro_auc:.4f} | Outer AUC: {auroc_outer:.4f} | Inner AUC: {auroc_inner:.4f}")
    aucs.append([macro_auc, auroc_outer, auroc_inner])


Train IDs: ['K005', 'K001', 'K006', 'K004', 'KA15', 'KA22', 'KA16', 'KI16', 'KI04', 'KI18', 'KI21']
Test IDs: ['K002', 'K003', 'KA04', 'KA30', 'KI14', 'KI17']
Train IDs: ['K003', 'K001', 'K006', 'K005', 'KA16', 'KA22', 'KA30', 'KI16', 'KI17', 'KI21', 'KI14']
Test IDs: ['K002', 'K004', 'KA04', 'KA15', 'KI04', 'KI18']
Train IDs: ['K005', 'K001', 'K006', 'K004', 'KA15', 'KA04', 'KA16', 'KI04', 'KI17', 'KI21', 'KI14']
Test IDs: ['K002', 'K003', 'KA22', 'KA30', 'KI16', 'KI18']
Train IDs: ['K003', 'K001', 'K006', 'K004', 'KA15', 'KA16', 'KA30', 'KI16', 'KI18', 'KI17', 'KI21']
Test IDs: ['K002', 'K005', 'KA04', 'KA22', 'KI04', 'KI14']
Train IDs: ['K003', 'K006', 'K004', 'K005', 'KA15', 'KA04', 'KA16', 'KI16', 'KI04', 'KI18', 'KI14']
Test IDs: ['K001', 'K002', 'KA22', 'KA30', 'KI21', 'KI17']
Train IDs: ['K003', 'K001', 'K002', 'K005', 'KA15', 'KA16', 'KA30', 'KI04', 'KI17', 'KI21', 'KI14']
Test IDs: ['K004', 'K006', 'KA04', 'KA22', 'KI16', 'KI18']
Train IDs: ['K001', 'K002', 'K006', 'K004', 'K

In [None]:
macro_aucs = [i[0] for i in aucs]
np.mean(macro_aucs), np.std(macro_aucs)

(np.float64(0.6442562164306641), np.float64(0.15833498681887748))