In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp
from sklearn.preprocessing import MinMaxScaler, StandardScaler


#Feature Selection
from sklearn.feature_selection import SelectKBest, RFE

#Classification
from sklearn.svm import SVC

# Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor


# Cross-Validation
from sklearn.model_selection import GroupKFold

from biopsykit.classification.model_selection import SklearnPipelinePermuter


%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
save_results = True

In [3]:
data_path = Path("../../results/data")
data_path

WindowsPath('../../results/data')

In [4]:
models_path = Path("../../results/models")

In [12]:
input_data = pd.read_csv(data_path.joinpath("train_data_no_outlier_correction.csv"), index_col=[0,1,2,3,4])
input_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forounzafar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,sherwood1990,stern1985
Unnamed: 0_level_1,participant,phase,heartbeat_id_reference,b_point_sample_reference,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,GDN0005,HoldingBreath,0,388.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0
1,GDN0005,HoldingBreath,1,404.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,330.0,402.0
2,GDN0005,HoldingBreath,3,376.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,382.0,374.0
3,GDN0005,HoldingBreath,4,390.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,394.0,388.0
4,GDN0005,HoldingBreath,5,386.0,398.0,312.0,388.0,418.0,392.0,390.0,378.0,354.0,400.0,384.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11219,VP_032,Talk,39,310.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,337.0,306.0
11220,VP_032,Talk,40,322.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,287.0
11221,VP_032,Talk,41,340.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,311.0,287.0
11222,VP_032,Talk,42,311.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,368.0,305.0


In [13]:
print(f"Min data value: {input_data.values.min()}\nMax data value: {input_data.values.max()}")

Min data value: -84.0
Max data value: 742.0


In [14]:
columns=input_data.columns
columns

Index(['arbol2017-isoelectric-crossings', 'arbol2017-second-derivative',
       'arbol2017-third-derivative', 'debski1993-second-derivative',
       'drost2022', 'forounzafar2018', 'lozano2007-linear-regression',
       'lozano2007-quadratic-regression', 'sherwood1990', 'stern1985'],
      dtype='object')

In [15]:
negative_data = input_data[input_data.values < 0]
negative_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forounzafar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,sherwood1990,stern1985
Unnamed: 0_level_1,participant,phase,heartbeat_id_reference,b_point_sample_reference,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
790,GDN0007,Valsalva,0,448.0,392.0,394.0,458.0,434.0,458.0,-84.0,440.0,410.0,388.0,448.0
6627,VP_002,Math,22,209.0,219.0,190.0,-9.0,199.0,217.0,168.0,236.0,234.0,216.0,118.0
6664,VP_002,Math,45,209.0,224.0,184.0,-15.0,200.0,216.0,183.0,230.0,228.0,223.0,133.0
6666,VP_002,Math,46,206.0,220.0,183.0,-16.0,197.0,211.0,139.0,229.0,228.0,219.0,139.0
6667,VP_002,Math,47,206.0,221.0,181.0,-18.0,199.0,214.0,183.0,228.0,227.0,224.0,137.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10667,VP_031,Math,34,201.0,175.0,158.0,-38.0,192.0,214.0,197.0,216.0,217.0,203.0,197.0
10669,VP_031,Math,35,199.0,218.0,160.0,-34.0,189.0,208.0,191.0,218.0,218.0,219.0,191.0
10693,VP_031,Math,49,197.0,206.0,154.0,-31.0,182.0,202.0,154.0,212.0,212.0,209.0,107.0
10694,VP_031,Math,50,197.0,220.0,151.0,-30.0,180.0,201.0,192.0,214.0,214.0,217.0,191.0


In [16]:
X, y, groups, group_keys = bp.classification.utils.prepare_df_sklearn(data=input_data, label_col="b_point_sample_reference", subject_col="participant", print_summary=True)

Shape of X: (10385, 10); shape of y: (10385,); number of groups: 39, class prevalence: [ 2  1  1  4  1  1  1  1  1  2  1  1  2  1  1  2  3  2  6  3  9  4  8  7
 10  4  5 11  7 13  9 11 14 11 15  9 12 15 17 21 22 20 28 27 29 28 34 35
 44 37 37 39 30 34 46 40 50 47 41 38 23 21 26 34 34 27 33 37 26 40 20 26
 33 23 23 29 31 18 28 24 19 29 17 20 24 23 23 11 18 20 16 23 26 24 22 10
 15 14 17 14 17 20 16 13 17 17 25 16 12 19 21 18 19 19 22 17 22 19 21 24
 24 18 27 21 17 23 13 14 22 21 25 12 21 18 27 14 26 17 20 23 19  9 22 16
 24 11 18 21 25 16 31 20 23 25 26 24 27 10 36 15 36 12 36 21 29 16 32 23
 55 16 48 14 38 28 31 11 52  8 56 20 54 23 50 14 53 18 63 18 56 18 55 24
 48 21 59 14 57 11 73 19 44 18 56 12 72 17 56 14 53 16 63 18 77 12 84  9
 60 16 69  9 59  4 66 16 64  7 66  9 82  9 83  6 73  8 71 10 81 10 90  9
 69  7 68  4 74  5 71  3 61  7 78  5 77  2 82  7 73  8 64  8 61  1 81  2
 76  3 63 73  2 68  3 64  2 86  6 66  2 84  2 78  1 66  2 65 72 77  2 56
  1 71 69  1 59  1 76 49 67  1 68 63 

In [17]:
group_keys

Index(['GDN0005', 'GDN0006', 'GDN0007', 'GDN0008', 'GDN0009', 'GDN0010',
       'GDN0011', 'GDN0012', 'GDN0013', 'GDN0014', 'GDN0016', 'GDN0017',
       'GDN0018', 'GDN0019', 'GDN0020', 'GDN0021', 'GDN0022', 'GDN0023',
       'GDN0024', 'GDN0025', 'GDN0027', 'GDN0028', 'GDN0029', 'GDN0030',
       'VP_001', 'VP_002', 'VP_003', 'VP_004', 'VP_005', 'VP_020', 'VP_022',
       'VP_023', 'VP_026', 'VP_027', 'VP_028', 'VP_029', 'VP_030', 'VP_031',
       'VP_032'],
      dtype='object')

In [18]:
model_dict = {
    "scaler": {"StandardScaler": StandardScaler(), "MinMaxScaler": MinMaxScaler()},
    "reduce_dim": {"SelectKBest": SelectKBest(), "RFE": RFE(SVC(kernel="linear"))},
    #"reduce_dim": {"SelectKBest": SelectKBest()},
    "clf": {
        "KNeighborsRegressor": KNeighborsRegressor(),
        #"RandomForestRegressor": RandomForestRegressor(n_jobs=10),
        #"HistGradientBoostingRegressor": HistGradientBoostingRegressor(),
    },
}

In [19]:
params_dict = {
    "StandardScaler": None,
    "MinMaxScaler": None,
    "SelectKBest": {"k": [2, 4, 6, 8, 10, "all"]},
    "SVC": {"C": [0.1, 1, 10]},
    "RFE": {
        "n_features_to_select": [0.4, 0.5, 0.6],
        "step": [1,2,3],
    },
    "KNeighborsRegressor": {
        "n_neighbors": [8,9,10,11,12,13,14],
        "weights": ["uniform", "distance"],
        "p": [1,2],
        },
    #"RandomForestRegressor": {
    #    "n_estimators": [10],
    #    "min_samples_split": [2, 5, 10, 20],
    #    "min_samples_leaf": [1, 2, 4, 10],
    #    "max_depth": [None, 10, 20, 30, 40, 50]
    #    #"max_features": ["sqrt", "log2", None],
    #},
    #"HistGradientBoostingRegressor": None,
}

In [20]:
pipeline_permuter = SklearnPipelinePermuter(
    model_dict=model_dict, param_dict=params_dict
)

In [21]:
outer_cv = GroupKFold(n_splits=5)
inner_cv = GroupKFold(n_splits=5)

pipeline_permuter.fit(X=X, y=y, outer_cv=outer_cv, inner_cv=inner_cv, scoring="neg_mean_absolute_error", groups=groups)

Pipeline Combinations:   0%|          | 0/4 [00:00<?, ?it/s]

### Running hyperparameter search for pipeline: (('scaler', 'StandardScaler'), ('reduce_dim', 'SelectKBest'), ('clf', 'KNeighborsRegressor')) with 1 parameter grid(s):
Parameter grid #0 ({'search_method': 'grid'}): {'reduce_dim__k': [2, 4, 6, 8, 10, 'all'], 'clf__n_neighbors': [8, 9, 10, 11, 12, 13, 14], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}


Outer CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 168 candidates, totalling 840 fits




Fitting 5 folds for each of 168 candidates, totalling 840 fits




Fitting 5 folds for each of 168 candidates, totalling 840 fits




Fitting 5 folds for each of 168 candidates, totalling 840 fits




Fitting 5 folds for each of 168 candidates, totalling 840 fits


### Running hyperparameter search for pipeline: (('scaler', 'StandardScaler'), ('reduce_dim', 'RFE'), ('clf', 'KNeighborsRegressor')) with 1 parameter grid(s):
Parameter grid #0 ({'search_method': 'grid'}): {'reduce_dim__n_features_to_select': [0.4, 0.5, 0.6], 'reduce_dim__step': [1, 2, 3], 'clf__n_neighbors': [8, 9, 10, 11, 12, 13, 14], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}




Outer CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits


### Running hyperparameter search for pipeline: (('scaler', 'MinMaxScaler'), ('reduce_dim', 'SelectKBest'), ('clf', 'KNeighborsRegressor')) with 1 parameter grid(s):
Parameter grid #0 ({'search_method': 'grid'}): {'reduce_dim__k': [2, 4, 6, 8, 10, 'all'], 'clf__n_neighbors': [8, 9, 10, 11, 12, 13, 14], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}




Outer CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 168 candidates, totalling 840 fits




Fitting 5 folds for each of 168 candidates, totalling 840 fits




Fitting 5 folds for each of 168 candidates, totalling 840 fits




Fitting 5 folds for each of 168 candidates, totalling 840 fits




Fitting 5 folds for each of 168 candidates, totalling 840 fits


### Running hyperparameter search for pipeline: (('scaler', 'MinMaxScaler'), ('reduce_dim', 'RFE'), ('clf', 'KNeighborsRegressor')) with 1 parameter grid(s):
Parameter grid #0 ({'search_method': 'grid'}): {'reduce_dim__n_features_to_select': [0.4, 0.5, 0.6], 'reduce_dim__step': [1, 2, 3], 'clf__n_neighbors': [8, 9, 10, 11, 12, 13, 14], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}




Outer CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits








In [22]:
pipeline_permuter.metric_summary()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,true_labels,true_labels_folds,predicted_labels,predicted_labels_folds,train_indices,train_indices_folds,test_indices,test_indices_folds,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error,test_neg_mean_absolute_error_fold_0,test_neg_mean_absolute_error_fold_1,test_neg_mean_absolute_error_fold_2,test_neg_mean_absolute_error_fold_3,test_neg_mean_absolute_error_fold_4
pipeline_scaler,pipeline_reduce_dim,pipeline_clf,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
StandardScaler,SelectKBest,KNeighborsRegressor,"[370.0, 346.0, 374.0, 316.0, 336.0, 322.0, 334...","[[370.0, 346.0, 374.0, 316.0, 336.0, 322.0, 33...","[357.26026274767827, 342.3397730034228, 366.11...","[[357.26026274767827, 342.3397730034228, 366.1...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[2257, 2258, 2259, 2260, 2261, 2262, 2263, 226...","[[2257, 2258, 2259, 2260, 2261, 2262, 2263, 22...",9.39549,0.667512,9.998162,9.261511,9.774052,8.151963,9.79176
StandardScaler,RFE,KNeighborsRegressor,"[370.0, 346.0, 374.0, 316.0, 336.0, 322.0, 334...","[[370.0, 346.0, 374.0, 316.0, 336.0, 322.0, 33...","[363.085485931488, 342.61961720647287, 371.428...","[[363.085485931488, 342.61961720647287, 371.42...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[2257, 2258, 2259, 2260, 2261, 2262, 2263, 226...","[[2257, 2258, 2259, 2260, 2261, 2262, 2263, 22...",8.689497,0.825251,9.611074,8.498979,9.256969,7.217285,8.863179
MinMaxScaler,SelectKBest,KNeighborsRegressor,"[370.0, 346.0, 374.0, 316.0, 336.0, 322.0, 334...","[[370.0, 346.0, 374.0, 316.0, 336.0, 322.0, 33...","[359.6414425809928, 341.8356502802455, 366.889...","[[359.6414425809928, 341.8356502802455, 366.88...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[2257, 2258, 2259, 2260, 2261, 2262, 2263, 226...","[[2257, 2258, 2259, 2260, 2261, 2262, 2263, 22...",9.396367,0.723454,10.151527,8.900279,9.887607,8.221913,9.820511
MinMaxScaler,RFE,KNeighborsRegressor,"[370.0, 346.0, 374.0, 316.0, 336.0, 322.0, 334...","[[370.0, 346.0, 374.0, 316.0, 336.0, 322.0, 33...","[371.14289041775334, 346.63798434339765, 377.0...","[[371.14289041775334, 346.63798434339765, 377....","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[2257, 2258, 2259, 2260, 2261, 2262, 2263, 226...","[[2257, 2258, 2259, 2260, 2261, 2262, 2263, 22...",9.646205,0.755236,10.898095,9.074771,10.129938,8.91168,9.216539


In [26]:
pipeline_permuter.best_hyperparameter_pipeline()

Unnamed: 0_level_0,mean_test_neg_mean_absolute_error,param_clf__n_neighbors,param_clf__p,param_clf__weights,param_reduce_dim__n_features_to_select,param_reduce_dim__step,params,rank_test_neg_mean_absolute_error,split0_test_neg_mean_absolute_error,split1_test_neg_mean_absolute_error,split2_test_neg_mean_absolute_error,split3_test_neg_mean_absolute_error,split4_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error
outer_fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,-8.635566,14,1,distance,0.4,1.0,"{'clf__n_neighbors': 14, 'clf__p': 1, 'clf__we...",6,-7.355686,-9.623169,-8.323662,-8.708293,-9.16702,0.774399
1,-8.955911,14,1,distance,0.4,1.0,"{'clf__n_neighbors': 14, 'clf__p': 1, 'clf__we...",1,-8.266636,-10.614622,-8.423419,-8.599175,-8.875704,0.853658
2,-8.639042,14,1,distance,0.4,1.0,"{'clf__n_neighbors': 14, 'clf__p': 1, 'clf__we...",1,-8.238201,-9.115226,-9.022021,-8.86242,-7.957339,0.457977
3,-9.138771,14,1,distance,0.4,1.0,"{'clf__n_neighbors': 14, 'clf__p': 1, 'clf__we...",4,-9.782426,-9.631542,-8.410206,-8.830565,-9.039118,0.508497
4,-8.743441,14,1,distance,0.4,1.0,"{'clf__n_neighbors': 14, 'clf__p': 1, 'clf__we...",8,-7.631981,-8.958766,-10.84819,-8.984664,-7.293606,1.254742


In [24]:
print(f"Save results: {save_results}")

Save results: True


In [25]:
if save_results:
    pipeline_permuter.to_pickle(models_path.joinpath("KNeighbors_No_Outlier_Correction.pkl"))