## Compute feature importances for the best ML-Pipeline

### Setup and Helper Functions

#### Imports

In [1]:

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp


from biopsykit.classification.model_selection import SklearnPipelinePermuter

from pepbench.io import load_best_performing_algos_b_point, load_preprocessed_training_data, compute_abs_error, \
     get_best_pipeline_results, get_best_estimator, get_pipeline_steps, compute_mae_std_from_permuter

import pingouin as pg

import matplotlib.pyplot as plt

#%matplotlib widget
%load_ext autoreload
%autoreload 2

#### Datapaths

In [2]:
# All result artefacts live below one common results folder (relative to this notebook).
result_path = Path("../../../results")
model_path = result_path / "models"
data_path = result_path / "data"

# Identifier of the rater; used below to build the model and data file paths.
rater = "rater_01"

### Load pipeline permuter with best ML-Pipeline

In [11]:
# Load the stored pipeline permuter for the selected rater.
# The rater is interpolated into both the directory and the file name so that
# changing `rater` keeps the two in sync (previously the file name hard-coded "rater_01").
# NOTE(review): `from_pickle` presumably unpickles the file; unpickling can execute
# arbitrary code, so only load result files that come from a trusted source.
permuter_include_nan = SklearnPipelinePermuter.from_pickle(
    model_path.joinpath(
        f'b-point/rr-interval/{rater}/b_point_regression_hpc_0_baseline_result_rr_include_nan_{rater}.pkl'
    )
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


#### Select best ML-Pipeline

In [None]:
# Maps a readable pipeline label to the (scaler, clf) tuple that is used further
# down to index the permuter's summary tables.
algo_dict_include_nan = {
    "MinMax_None_RFR_RR-Interval-Include-Nan": (
        "MinMaxScaler",
        "RandomForestRegressor",
    ),
}

In [5]:
# Pick the (scaler, clf) tuple of the pipeline that is analysed in this notebook.
best_pipeline_label = "MinMax_None_RFR_RR-Interval-Include-Nan"
algo_include_nan = algo_dict_include_nan[best_pipeline_label]

In [6]:
# Metric summary of all evaluated pipelines, then restricted to the row whose
# (scaler, clf) index equals the selected pipeline.
permuter_include_nan_summary = permuter_include_nan.metric_summary()
is_selected_pipeline = permuter_include_nan_summary.index == algo_include_nan
permuter_include_nan_summary[is_selected_pipeline]

Unnamed: 0_level_0,Unnamed: 1_level_0,conf_matrix,conf_matrix_folds,true_labels,true_labels_folds,predicted_labels,predicted_labels_folds,train_indices,train_indices_folds,test_indices,test_indices_folds,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error,test_neg_mean_absolute_error_fold_0,test_neg_mean_absolute_error_fold_1,test_neg_mean_absolute_error_fold_2,test_neg_mean_absolute_error_fold_3,test_neg_mean_absolute_error_fold_4
pipeline_scaler,pipeline_clf,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
MinMaxScaler,RandomForestRegressor,[0.0],[],"[516.0, 520.0, 492.0, 492.0, 498.0, 334.0, 516...","[[516.0, 520.0, 492.0, 492.0, 498.0, 334.0, 51...","[525.699485398262, 520.0283937211624, 493.0129...","[[525.699485398262, 520.0283937211624, 493.012...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[321, 322, 323, 324, 325, 326, 327, 328, 329, ...","[[321, 322, 323, 324, 325, 326, 327, 328, 329,...",8.153145,1.138026,6.413824,8.511139,7.921295,7.96797,9.951496


#### Show summary of best estimator per fold

In [7]:
# Look up the selected pipeline in the best-estimator summary and unwrap the
# pipelines stored in its `best_estimator` entry (displayed below as one pipeline per fold).
best_estimator_summary_include_nan = permuter_include_nan.best_estimator_summary()
best_estimator_include_nan = best_estimator_summary_include_nan.loc[algo_include_nan]['best_estimator']
pipeline_include_nan = best_estimator_include_nan.pipeline
pipeline_include_nan

[Pipeline(memory=Memory(location=cachedir/joblib),
          steps=[('scaler', MinMaxScaler()),
                 ('clf',
                  RandomForestRegressor(ccp_alpha=0.001,
                                        criterion='friedman_mse',
                                        max_features=0.8, n_estimators=250,
                                        random_state=RandomState(MT19937) at 0x1EC158F0D40))]),
 Pipeline(memory=Memory(location=cachedir/joblib),
          steps=[('scaler', MinMaxScaler()),
                 ('clf',
                  RandomForestRegressor(ccp_alpha=0.001,
                                        criterion='friedman_mse',
                                        max_depth=np.int64(40), max_features=0.8,
                                        min_impurity_decrease=0.01,
                                        n_estimators=250,
                                        random_state=RandomState(MT19937) at 0x1EC158F0F40))]),
 Pipeline(memory=Memory(location=cac

#### Get feature importances per fold

In [8]:
# Feature names are the columns of the training data; the first six CSV columns form the index.
train_data_file = data_path.joinpath(f'b-point/rr-interval/{rater}/train_data_b_point_rr_interval_include_nan.csv')
features_include_nan = np.array(pd.read_csv(train_data_file, index_col=[0,1,2,3,4,5]).columns)

# One list of (feature name, importance) tuples per fold.
# The folds are taken from the pipelines themselves instead of a hard-coded count of 5,
# so a different number of folds is neither truncated nor causes an IndexError.
# Each fold's list is sorted by feature name (descending) so that every fold uses the
# same feature order; the averaging step further down relies on that alignment.
# NOTE(review): `zip` stops at the shorter input - this assumes the CSV columns are exactly
# the features the regressor was trained on, in the same order. TODO confirm.
all_feature_importances = [
    sorted(
        (
            (feature_name, float(importance))
            for feature_name, importance in zip(features_include_nan, fold_pipeline['clf'].feature_importances_)
        ),
        key=lambda x: x[0],
        reverse=True,
    )
    for fold_pipeline in pipeline_include_nan
]
all_feature_importances

[[('stern1985', 0.005605473834162119),
  ('sherwood1990', 0.0025500386217459686),
  ('rr_interval_ms_estimated', 0.017658697820983615),
  ('pale2021', 0.001300697082925951),
  ('miljkovic2022', 0.0033290955257245652),
  ('lozano2007-quadratic-regression', 0.00803547490442394),
  ('lozano2007-linear-regression', 0.14936996764799845),
  ('forouzanfar2018', 0.007239213990632266),
  ('drost2022', 0.7501286501566603),
  ('debski1993-second-derivative', 0.04457554895294814),
  ('arbol2017-third-derivative', 0.0029045270672191604),
  ('arbol2017-second-derivative', 0.005983867807072049),
  ('arbol2017-isoelectric-crossings', 0.001318746587503501)],
 [('stern1985', 0.005346452573357876),
  ('sherwood1990', 0.0024944152819322426),
  ('rr_interval_ms_estimated', 0.008122627581595755),
  ('pale2021', 0.0010292787360665732),
  ('miljkovic2022', 0.00263742121414177),
  ('lozano2007-quadratic-regression', 0.00793637535476849),
  ('lozano2007-linear-regression', 0.17372168115837128),
  ('forouzanfar2

### Create general feature importance summary

In [9]:
# Every fold in `all_feature_importances` must be sorted by feature name, because the
# values are combined position-wise and the names are taken from the first fold only.
feature_names = [name for name, _ in all_feature_importances[0]]
importances_per_fold = [[value for _, value in fold_importances] for fold_importances in all_feature_importances]

# Mean and standard deviation of each feature's importance across the folds.
mean_importances = np.mean(importances_per_fold, axis=0)
std_importances = np.std(importances_per_fold, axis=0)

# (feature, mean, std) tuples, most important feature first.
averaged_feature_importances = sorted(
    zip(feature_names, mean_importances, std_importances),
    key=lambda x: x[1],
    reverse=True,
)
averaged_feature_importances

[('drost2022',
  np.float64(0.7488399588391463),
  np.float64(0.01762885538131582)),
 ('lozano2007-linear-regression',
  np.float64(0.15857869393644744),
  np.float64(0.01452927382562722)),
 ('debski1993-second-derivative',
  np.float64(0.043817893088027055),
  np.float64(0.007111436591663506)),
 ('rr_interval_ms_estimated',
  np.float64(0.011542031623048574),
  np.float64(0.0034677813085983956)),
 ('forouzanfar2018',
  np.float64(0.008348224249834597),
  np.float64(0.0011847299978324133)),
 ('lozano2007-quadratic-regression',
  np.float64(0.007232216195851739),
  np.float64(0.000988357210381077)),
 ('arbol2017-second-derivative',
  np.float64(0.005968585654484638),
  np.float64(0.0013720662852054583)),
 ('stern1985',
  np.float64(0.005793030183327687),
  np.float64(0.0003311672667939882)),
 ('miljkovic2022',
  np.float64(0.0029564528885650714),
  np.float64(0.0006128381740283458)),
 ('arbol2017-third-derivative',
  np.float64(0.0025222361517477),
  np.float64(0.00023461517406696346)),

In [10]:
# Tabulate the (feature, mean, std) tuples and add the std relative to the mean in percent.
table_columns = ['feature', 'importance', 'std of importance']
averaged_feature_importances_table = pd.DataFrame(averaged_feature_importances, columns=table_columns)
relative_std = averaged_feature_importances_table["std of importance"] / averaged_feature_importances_table["importance"]
averaged_feature_importances_table["std in percent"] = relative_std * 100
averaged_feature_importances_table

Unnamed: 0,feature,importance,std of importance,std in percent
0,drost2022,0.74884,0.017629,2.354155
1,lozano2007-linear-regression,0.158579,0.014529,9.162185
2,debski1993-second-derivative,0.043818,0.007111,16.229527
3,rr_interval_ms_estimated,0.011542,0.003468,30.044809
4,forouzanfar2018,0.008348,0.001185,14.191401
5,lozano2007-quadratic-regression,0.007232,0.000988,13.666035
6,arbol2017-second-derivative,0.005969,0.001372,22.988131
7,stern1985,0.005793,0.000331,5.71665
8,miljkovic2022,0.002956,0.000613,20.728833
9,arbol2017-third-derivative,0.002522,0.000235,9.301872
