In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp
from sklearn.preprocessing import MinMaxScaler, StandardScaler


#Feature Selection
from sklearn.feature_selection import SelectKBest, RFE, f_regression, mutual_info_regression

#Classification
from sklearn.svm import SVR

# Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor


# Cross-Validation
from sklearn.model_selection import GroupKFold

from biopsykit.classification.model_selection import SklearnPipelinePermuter


%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
save_results = True

In [3]:
data_path = Path("../../results/data")
data_path

WindowsPath('../../results/data')

In [4]:
models_path = Path("../../results/models")

In [5]:
input_data = pd.read_csv(data_path.joinpath("train_data_q_wave.csv"), index_col=[0,1,2,3,4])
input_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,rr_interval_ms_estimated,forounzafar2018,martinez2004,scipy-findpeaks,vanlien2013-32-ms,vanlien2013-34-ms,vanlien2013-36-ms,vanlien2013-38-ms,vanlien2013-40-ms,vanlien2013-42-ms
Unnamed: 0_level_1,participant,phase,heartbeat_id_reference,q_wave_onset_sample_reference,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,GDN0005,HoldingBreath,0,260.0,850.0,276.0,262.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
1,GDN0005,HoldingBreath,1,260.0,778.0,278.0,262.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
2,GDN0005,HoldingBreath,3,222.0,746.0,236.0,224.0,224.0,230.0,228.0,226.0,224.0,222.0,220.0
3,GDN0005,HoldingBreath,4,220.0,766.0,236.0,222.0,222.0,230.0,228.0,226.0,224.0,222.0,220.0
4,GDN0005,HoldingBreath,5,228.0,790.0,242.0,230.0,230.0,236.0,234.0,232.0,230.0,228.0,226.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11498,VP_032,Talk,38,153.0,596.0,188.0,154.0,154.0,179.0,177.0,175.0,173.0,171.0,169.0
11499,VP_032,Talk,39,172.0,619.0,183.0,173.0,173.0,177.0,175.0,173.0,171.0,169.0,167.0
11500,VP_032,Talk,40,181.0,680.0,193.0,182.0,182.0,185.0,183.0,181.0,179.0,177.0,175.0
11501,VP_032,Talk,41,200.0,719.0,210.0,201.0,201.0,206.0,204.0,202.0,200.0,198.0,196.0


In [6]:
print(f"Min data value: {input_data.values.min()}\nMax data value: {input_data.values.max()}")

Min data value: 23.0
Max data value: 1534.0


In [7]:
columns=input_data.columns
columns

Index(['rr_interval_ms_estimated', 'forounzafar2018', 'martinez2004',
       'scipy-findpeaks', 'vanlien2013-32-ms', 'vanlien2013-34-ms',
       'vanlien2013-36-ms', 'vanlien2013-38-ms', 'vanlien2013-40-ms',
       'vanlien2013-42-ms'],
      dtype='object')

In [8]:
X, y, groups, group_keys = bp.classification.utils.prepare_df_sklearn(data=input_data, label_col="q_wave_onset_sample_reference", subject_col="participant", print_summary=True)

Shape of X: (11349, 10); shape of y: (11349,); number of groups: 39, class prevalence: [  1   2   1   6   2   8   4  15  21  14  26  30  35  50  48  55  57  52
  65  61  79  73  59  50  51  49  43  44  42  25  40  30  38  30  38  49
  33  32  31  31  22  38  29  43  47  34  21  47  48  41  32  31  36  36
  39  47  52  29  29  40  51  21  50  31  43  28  32  27  45  23  39  33
  47  22  45  25  35  29  34  37  53  23  68  24  66  38  59  36  62  31
  72  29  83  30  69  26  64  20  95  27  78  22  67  20  64  28  69  21
  92  18  81  19  91  25  91  19  81  27  69  25  81  15  90  14 101  16
  94  14 116  18 103  25  96  21  78  17  95  12  89  25  91  24  76  13
  82  22  95  11  74  20  82  18  82  11  77  14  83  19  96  12 100  14
 100  11  89  14  87  10  97   9  99   8  98   7 107   7 108   5 106   5
 110   3  89   4  93   4  86  10  82   7  73   2  68   1  75   2  83   3
  84   4  70   4  82  70   5  56   3  75   1  75   3  55   4  59   3  70
   2  69  68   1  66   3  56  59  72 

In [9]:
group_keys

Index(['GDN0005', 'GDN0006', 'GDN0007', 'GDN0008', 'GDN0009', 'GDN0010',
       'GDN0011', 'GDN0012', 'GDN0013', 'GDN0014', 'GDN0016', 'GDN0017',
       'GDN0018', 'GDN0019', 'GDN0020', 'GDN0021', 'GDN0022', 'GDN0023',
       'GDN0024', 'GDN0025', 'GDN0027', 'GDN0028', 'GDN0029', 'GDN0030',
       'VP_001', 'VP_002', 'VP_003', 'VP_004', 'VP_005', 'VP_020', 'VP_022',
       'VP_023', 'VP_026', 'VP_027', 'VP_028', 'VP_029', 'VP_030', 'VP_031',
       'VP_032'],
      dtype='object')

In [10]:
model_dict = {
    "scaler": {"StandardScaler": StandardScaler(), "MinMaxScaler": MinMaxScaler()},
    "reduce_dim": {"SelectKBest": SelectKBest(), "RFE": RFE(SVR(kernel="linear"))},
    #"reduce_dim": {"SelectKBest": SelectKBest()},
    #"reduce_dim": {"RFE": RFE(SVR(kernel="linear"))},
    "clf": {
        "KNeighborsRegressor": KNeighborsRegressor(),
        #"RandomForestRegressor": RandomForestRegressor(n_jobs=10),
    },
}

In [11]:
params_dict = {
    "StandardScaler": None,
    "MinMaxScaler": None,
    "SelectKBest": {
        "score_func": [f_regression, mutual_info_regression],
        "k": [2, 4, 6, 8, 10],
        },
    "SVR": {"C": [0.1, 1, 10]},
    "RFE": {
        "n_features_to_select": [0.4, 0.5, 0.6],
        "step": [1,2,3],
    },
    "KNeighborsRegressor": {
        "n_neighbors": [8,9,10,11,12,13,14],
        "weights": ["uniform", "distance"],
        "p": [1,2],
        },
    #"RandomForestRegressor": {
    #    "n_estimators": [200, 300],
    #    "min_samples_split": [2, 5, 10, 20, 30, 40],
    #    "min_samples_leaf": [1, 2, 4, 10, 20, 30],
    #    "max_depth": [None, 40, 60, 80, 100, 120],
    #    #"max_features": ["sqrt", "log2", None],
    #},
}

In [12]:
pipeline_permuter = SklearnPipelinePermuter(
    model_dict=model_dict, param_dict=params_dict
)

In [13]:
outer_cv = GroupKFold(n_splits=5)
inner_cv = GroupKFold(n_splits=5)

pipeline_permuter.fit(X=X, y=y, outer_cv=outer_cv, inner_cv=inner_cv, scoring="neg_mean_absolute_error", groups=groups)

Pipeline Combinations:   0%|          | 0/4 [00:00<?, ?it/s]

### Running hyperparameter search for pipeline: (('scaler', 'StandardScaler'), ('reduce_dim', 'SelectKBest'), ('clf', 'KNeighborsRegressor')) with 1 parameter grid(s):
Parameter grid #0 ({'search_method': 'grid'}): {'reduce_dim__score_func': [<function f_regression at 0x000001CC199B5760>, <function mutual_info_regression at 0x000001CC199A3600>], 'reduce_dim__k': [2, 4, 6, 8, 10], 'clf__n_neighbors': [8, 9, 10, 11, 12, 13, 14], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}


Outer CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 280 candidates, totalling 1400 fits


### Running hyperparameter search for pipeline: (('scaler', 'StandardScaler'), ('reduce_dim', 'RFE'), ('clf', 'KNeighborsRegressor')) with 1 parameter grid(s):
Parameter grid #0 ({'search_method': 'grid'}): {'reduce_dim__n_features_to_select': [0.4, 0.5, 0.6], 'reduce_dim__step': [1, 2, 3], 'clf__n_neighbors': [8, 9, 10, 11, 12, 13, 14], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}


  _data = np.array(data, dtype=dtype, copy=copy,


Outer CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits


### Running hyperparameter search for pipeline: (('scaler', 'MinMaxScaler'), ('reduce_dim', 'SelectKBest'), ('clf', 'KNeighborsRegressor')) with 1 parameter grid(s):
Parameter grid #0 ({'search_method': 'grid'}): {'reduce_dim__score_func': [<function f_regression at 0x000001CC199B5760>, <function mutual_info_regression at 0x000001CC199A3600>], 'reduce_dim__k': [2, 4, 6, 8, 10], 'clf__n_neighbors': [8, 9, 10, 11, 12, 13, 14], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}




Outer CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 280 candidates, totalling 1400 fits


  _data = np.array(data, dtype=dtype, copy=copy,




### Running hyperparameter search for pipeline: (('scaler', 'MinMaxScaler'), ('reduce_dim', 'RFE'), ('clf', 'KNeighborsRegressor')) with 1 parameter grid(s):
Parameter grid #0 ({'search_method': 'grid'}): {'reduce_dim__n_features_to_select': [0.4, 0.5, 0.6], 'reduce_dim__step': [1, 2, 3], 'clf__n_neighbors': [8, 9, 10, 11, 12, 13, 14], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}




Outer CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits




Fitting 5 folds for each of 252 candidates, totalling 1260 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 252 candidates, totalling 1260 fits


  _data = np.array(data, dtype=dtype, copy=copy,






In [14]:
pipeline_permuter.metric_summary()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,true_labels,true_labels_folds,predicted_labels,predicted_labels_folds,train_indices,train_indices_folds,test_indices,test_indices_folds,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error,test_neg_mean_absolute_error_fold_0,test_neg_mean_absolute_error_fold_1,test_neg_mean_absolute_error_fold_2,test_neg_mean_absolute_error_fold_3,test_neg_mean_absolute_error_fold_4
pipeline_scaler,pipeline_reduce_dim,pipeline_clf,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
StandardScaler,SelectKBest,KNeighborsRegressor,"[260.0, 260.0, 222.0, 220.0, 228.0, 236.0, 226...","[[260.0, 260.0, 222.0, 220.0, 228.0, 236.0, 22...","[262.1054982720167, 262.80401373977526, 223.82...","[[262.1054982720167, 262.80401373977526, 223.8...","[316, 317, 318, 319, 320, 321, 322, 323, 324, ...","[[316, 317, 318, 319, 320, 321, 322, 323, 324,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",2.541666,0.621796,2.890936,3.604033,1.965469,2.15189,2.096005
StandardScaler,RFE,KNeighborsRegressor,"[260.0, 260.0, 222.0, 220.0, 228.0, 236.0, 226...","[[260.0, 260.0, 222.0, 220.0, 228.0, 236.0, 22...","[262.11487266718837, 262.6920534613555, 223.80...","[[262.11487266718837, 262.6920534613555, 223.8...","[316, 317, 318, 319, 320, 321, 322, 323, 324, ...","[[316, 317, 318, 319, 320, 321, 322, 323, 324,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",2.539208,0.620433,2.878641,3.604033,1.965469,2.15189,2.096005
MinMaxScaler,SelectKBest,KNeighborsRegressor,"[260.0, 260.0, 222.0, 220.0, 228.0, 236.0, 226...","[[260.0, 260.0, 222.0, 220.0, 228.0, 236.0, 22...","[262.2516395220607, 262.5756930845154, 223.663...","[[262.2516395220607, 262.5756930845154, 223.66...","[316, 317, 318, 319, 320, 321, 322, 323, 324, ...","[[316, 317, 318, 319, 320, 321, 322, 323, 324,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",2.568405,0.635083,2.934804,3.648165,1.985529,2.182313,2.091217
MinMaxScaler,RFE,KNeighborsRegressor,"[260.0, 260.0, 222.0, 220.0, 228.0, 236.0, 226...","[[260.0, 260.0, 222.0, 220.0, 228.0, 236.0, 22...","[263.11911134482887, 266.1484420778204, 223.83...","[[263.11911134482887, 266.1484420778204, 223.8...","[316, 317, 318, 319, 320, 321, 322, 323, 324, ...","[[316, 317, 318, 319, 320, 321, 322, 323, 324,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",2.696625,0.598102,3.137547,3.632926,2.063266,2.465672,2.183715


In [15]:
pipeline_permuter.best_hyperparameter_pipeline()

  .agg(["mean", "std"])


Unnamed: 0_level_0,mean_test_neg_mean_absolute_error,param_clf__n_neighbors,param_clf__p,param_clf__weights,param_reduce_dim__k,param_reduce_dim__score_func,params,rank_test_neg_mean_absolute_error,split0_test_neg_mean_absolute_error,split1_test_neg_mean_absolute_error,split2_test_neg_mean_absolute_error,split3_test_neg_mean_absolute_error,split4_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error
outer_fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,-2.406468,11,1,distance,4.0,<function mutual_info_regression at 0x000001CC...,"{'clf__n_neighbors': 11, 'clf__p': 1, 'clf__we...",5,-2.275953,-2.023213,-2.625273,-2.918804,-2.189098,0.322965
1,-2.211107,11,1,distance,4.0,<function mutual_info_regression at 0x000001CC...,"{'clf__n_neighbors': 11, 'clf__p': 1, 'clf__we...",1,-2.400573,-1.280076,-2.679754,-1.513481,-3.18165,0.71428
2,-2.937451,11,1,distance,4.0,<function mutual_info_regression at 0x000001CC...,"{'clf__n_neighbors': 11, 'clf__p': 1, 'clf__we...",9,-3.420161,-2.136884,-2.343208,-2.633822,-4.153178,0.74796
3,-2.687628,11,1,distance,4.0,<function mutual_info_regression at 0x000001CC...,"{'clf__n_neighbors': 11, 'clf__p': 1, 'clf__we...",4,-2.153667,-2.327619,-4.300177,-2.228469,-2.428207,0.811562
4,-2.766628,11,1,distance,4.0,<function mutual_info_regression at 0x000001CC...,"{'clf__n_neighbors': 11, 'clf__p': 1, 'clf__we...",1,-1.240604,-2.988792,-3.515118,-2.165044,-3.923583,0.963202


In [16]:
print(f"Save results: {save_results}")

Save results: True


In [17]:
if save_results:
    pipeline_permuter.to_pickle(models_path.joinpath("KNeighbors_q_peak.pkl"))