In [1]:
import json

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp

import matplotlib.pyplot as plt

from pepbench.io import load_challenge_results_from_folder
from pepbench.io import convert_hz_to_ms
from pepbench.io import impute_missing_values
from pepbench.datasets import EmpkinsDataset, GuardianDataset

#%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
# Project root, three directory levels above the folder this notebook lives in.
root_path = Path("..") / ".." / ".."
print(root_path.absolute())

/home/woody/iwso/iwso173h/pepbench/experiments/pep_algorithm_benchmarking/notebooks/ML_analysis/Preprocessing/../../..


In [3]:
# Folder holding the challenge results and the exported training data.
result_path = root_path / "results"
result_path

PosixPath('../../../results')

In [4]:
# Key of the deployment section to read from config.json.
deploy_type = "local"

# Load the configuration inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with root_path.joinpath("config.json").open(encoding="utf-8") as config_file:
    config_dict = json.load(config_file)

# Base folders of the two datasets as configured for the selected deployment.
empkins_base_path = Path(config_dict[deploy_type]["empkins_path"])
guardian_base_path = Path(config_dict[deploy_type]["guardian_path"])
print(empkins_base_path)

/Users/sebas/Development/ResearchInternship/Data/2024_08_PEP_Benchmarking/EmpkinS_Dataset


#### Set flags for further processing

In [5]:
# Flags that steer the remaining cells of this notebook.

# Write the final wide table to a CSV file in the export cell.
save_results = True
# Additionally select the ("rr_interval_ms", "estimated") column and keep it as pivot index.
include_rr_interval = False
# Additionally select ("q_wave_onset_sample", "reference"); this flag is checked in an
# `elif` after include_rr_interval, so it has no effect while include_rr_interval is True.
include_q_wave_reference = False
# Remove rows containing NaN values during the NaN handling step.
drop_nan = False
# Impute NaN values (mode='median'); only evaluated when drop_nan is False.
impute_nan = False

In [6]:
# Both datasets are created with the same options: cached and restricted to labeled data.
dataset_options = dict(use_cache=True, only_labeled=True)

dataset_empkins = EmpkinsDataset(empkins_base_path, **dataset_options)
fs_empkins = dataset_empkins.sampling_rate_icg
print(f"Sampling rate ICG: {fs_empkins}")

dataset_guardian = GuardianDataset(guardian_base_path, **dataset_options)
fs_guardian = dataset_guardian.sampling_rate_icg
print(f"Sampling rate ICG: {fs_guardian}")

Sampling rate ICG: 1000
Sampling rate ICG: 500


In [7]:
# Pretty names for the algorithm index levels; the dict keys double as the level list.
# NOTE(review): the result frames below use an index level called 'q_peak_algorithm',
# not 'q_wave_algorithm' - confirm which name is intended before using this mapping.
algo_level_mapping = {
    "q_wave_algorithm": "Q-Wave Algorithm",
    "b_point_algorithm": "B-Point Algorithm",
    "outlier_correction_algorithm": "Outlier Correction",
}
algo_levels = list(algo_level_mapping)

#### Load data from the challenge results

In [8]:
# Per-sample challenge results of the EmpkinS dataset (indexed by participant/condition/phase).
empkins_result_folder = result_path.joinpath("empkins_dataset_b_point").resolve()
challenge_results_empkins = load_challenge_results_from_folder(
    empkins_result_folder, index_cols_per_sample=["participant", "condition", "phase"]
)
results_empkins = challenge_results_empkins.per_sample

# Per-sample challenge results of the Guardian dataset (no 'condition' level here).
guardian_result_folder = result_path.joinpath("guardian_dataset_b_point").resolve()
challenge_results_guardian = load_challenge_results_from_folder(
    guardian_result_folder, index_cols_per_sample=["participant", "phase"]
)
results_guardian = challenge_results_guardian.per_sample

In [9]:
# Rebuild the row index so that it consists of exactly these six levels.
empkins_index_levels = [
    'q_peak_algorithm',
    'b_point_algorithm',
    'outlier_correction_algorithm',
    'participant',
    'condition',
    'phase',
]
results_empkins = results_empkins.reset_index()
results_empkins = results_empkins.set_index(empkins_index_levels)
results_empkins

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,0,0,0,394,399,984,985,567,567,669,...,101.694915,102,67,102.0,67.0,,,-35.0,35.0,52.238806
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,1,1,1,984,985,1569,1569,1156,1156,1238,...,103.270224,82,50,82.0,50.0,,,-32.0,32.0,64.000000
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,2,2,2,1569,1569,2134,2134,1735,1735,1834,...,107.719928,99,76,99.0,76.0,,,-23.0,23.0,30.263158
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,3,3,3,2134,2134,2684,2684,2291,2291,2394,...,109.890110,103,91,103.0,91.0,,,-12.0,12.0,13.186813
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,4,4,4,2684,2684,3227,3227,2846,2846,2943,...,110.905730,97,80,97.0,80.0,,,-17.0,17.0,21.250000
q-peak-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,none,VP_032,ftsst,Pause_5,7,7,7,5864,5864,6660,6660,6110,6110,6210,...,75.853350,100,146,100.0,146.0,,,46.0,46.0,31.506849
q-peak-reference,stern1985,none,VP_032,ftsst,Pause_5,8,8,8,6660,6660,7458,7458,6900,6900,7052,...,74.812968,152,154,152.0,154.0,,,2.0,2.0,1.298701
q-peak-reference,stern1985,none,VP_032,ftsst,Pause_5,9,9,9,7458,7458,8267,8267,7701,7701,7857,...,73.891626,156,154,156.0,154.0,,,-2.0,2.0,1.298701
q-peak-reference,stern1985,none,VP_032,ftsst,Pause_5,10,10,10,8267,8267,9074,9074,8515,8515,8667,...,74.626866,152,158,152.0,158.0,,,6.0,6.0,3.797468


#### Add a 'condition' column to the Guardian dataset to ensure compatibility with the EmpkinS dataset

In [10]:
# Add a constant placeholder 'condition' and use the same six index levels as the EmpkinS results.
guardian_index_levels = [
    'q_peak_algorithm',
    'b_point_algorithm',
    'outlier_correction_algorithm',
    'participant',
    'condition',
    'phase',
]
results_guardian = (
    results_guardian
    .assign(condition="Dummy")
    .reset_index()
    .set_index(guardian_index_levels)
)
results_guardian

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,0,0,0,352,352,808,808,492,492,558,...,65.789474,66,76,132.0,152.0,,,20.0,20.0,13.157895
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,1,1,1,808,808,1252,1252,948,948,1009,...,68.649886,61,77,122.0,154.0,,,32.0,32.0,20.779221
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,2,2,2,1252,1252,1686,1686,1386,1386,1463,...,69.444444,77,70,154.0,140.0,,,-14.0,14.0,10.000000
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,3,3,3,1686,1686,2116,2115,1817,1817,1900,...,69.930070,83,71,166.0,142.0,,,-24.0,24.0,16.901408
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,4,4,4,2116,2115,2570,2570,2246,2246,2320,...,64.102564,74,47,148.0,94.0,,,-54.0,54.0,57.446809
q-peak-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltDown,51,51,51,27029,27029,27570,27570,27200,27200,27277,...,55.555556,77,78,154.0,156.0,,,2.0,2.0,1.282051
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltDown,52,52,52,27570,27570,28118,28118,27740,27740,27788,...,54.249548,48,75,96.0,150.0,,,54.0,54.0,36.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltDown,53,53,53,28118,28118,28652,28652,28293,28293,28345,...,57.361377,52,52,104.0,104.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltDown,54,54,54,28652,28652,29167,29167,28816,28816,28846,...,58.708415,30,51,60.0,102.0,,,42.0,42.0,41.176471


#### Perform data preprocessing

In [11]:
# Express selected sample positions relative to the estimated start of their heartbeat.
# NOTE(review): ("q_peak_sample", "estimated") is not part of this list and therefore
# stays in absolute samples - confirm that this is intended.
_heartbeat_start = ("heartbeat_start_sample", "estimated")
_columns_to_shift = (
    ("b_point_sample", "estimated"),
    ("b_point_sample", "reference"),
    ("q_peak_sample", "reference"),
)

results_empkins_norm = results_empkins.copy()
results_guardian_norm = results_guardian.copy()
for _raw, _shifted in (
    (results_empkins, results_empkins_norm),
    (results_guardian, results_guardian_norm),
):
    for _column in _columns_to_shift:
        # Always subtract from the untouched input frame, never from the copy being filled.
        _shifted[_column] = _raw[_column] - _raw[_heartbeat_start]

In [12]:
# All sample-based columns (estimated and reference) that get scaled by the conversion factor.
cols = [
    (signal, source)
    for signal in (
        "heartbeat_start_sample",
        "heartbeat_end_sample",
        "q_peak_sample",
        "b_point_sample",
        "pep_sample",
    )
    for source in ("estimated", "reference")
]

# Scale each dataset with the factor derived from its own ICG sampling rate.
results_empkins_cleaned_ms = results_empkins_norm.copy()
results_guardian_cleaned_ms = results_guardian_norm.copy()

empkins_factor = convert_hz_to_ms(sampling_frequency=fs_empkins)
results_empkins_cleaned_ms[cols] = results_empkins_norm[cols] * empkins_factor

guardian_factor = convert_hz_to_ms(sampling_frequency=fs_guardian)
results_guardian_cleaned_ms[cols] = results_guardian_norm[cols] * guardian_factor

#### Concatenate the empkins and guardian dataframes

In [13]:
# Stack the converted EmpkinS rows on top of the converted Guardian rows.
frames_to_stack = [results_empkins_cleaned_ms, results_guardian_cleaned_ms]
preprocessed_results_empkins_guardian = pd.concat(frames_to_stack)
preprocessed_results_empkins_guardian

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,0,0,0,394.0,399.0,984.0,985.0,567.0,173.0,275.0,...,101.694915,102.0,67.0,102.0,67.0,,,-35.0,35.0,52.238806
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,1,1,1,984.0,985.0,1569.0,1569.0,1156.0,172.0,254.0,...,103.270224,82.0,50.0,82.0,50.0,,,-32.0,32.0,64.000000
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,2,2,2,1569.0,1569.0,2134.0,2134.0,1735.0,166.0,265.0,...,107.719928,99.0,76.0,99.0,76.0,,,-23.0,23.0,30.263158
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,3,3,3,2134.0,2134.0,2684.0,2684.0,2291.0,157.0,260.0,...,109.890110,103.0,91.0,103.0,91.0,,,-12.0,12.0,13.186813
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,4,4,4,2684.0,2684.0,3227.0,3227.0,2846.0,162.0,259.0,...,110.905730,97.0,80.0,97.0,80.0,,,-17.0,17.0,21.250000
q-peak-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltDown,51,51,51,54058.0,54058.0,55140.0,55140.0,54400.0,342.0,496.0,...,55.555556,154.0,156.0,154.0,156.0,,,2.0,2.0,1.282051
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltDown,52,52,52,55140.0,55140.0,56236.0,56236.0,55480.0,340.0,436.0,...,54.249548,96.0,150.0,96.0,150.0,,,54.0,54.0,36.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltDown,53,53,53,56236.0,56236.0,57304.0,57304.0,56586.0,350.0,454.0,...,57.361377,104.0,104.0,104.0,104.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltDown,54,54,54,57304.0,57304.0,58334.0,58334.0,57632.0,328.0,388.0,...,58.708415,60.0,102.0,60.0,102.0,,,42.0,42.0,41.176471


In [14]:
# Re-apply the six-level index, then keep only the rows without outlier correction
# ('none'); the 'outlier_correction_algorithm' level is dropped by the selection.
combined_index_levels = [
    'q_peak_algorithm',
    'b_point_algorithm',
    'outlier_correction_algorithm',
    'participant',
    'condition',
    'phase',
]
input_data = preprocessed_results_empkins_guardian.reset_index()
input_data = input_data.set_index(combined_index_levels)
input_data = input_data.xs('none', level='outlier_correction_algorithm')
input_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,participant,condition,phase,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,0,0,0,394.0,399.0,984.0,985.0,567.0,173.0,275.0,...,101.694915,102.0,67.0,102.0,67.0,,,-35.0,35.0,52.238806
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,1,1,1,984.0,985.0,1569.0,1569.0,1156.0,172.0,254.0,...,103.270224,82.0,50.0,82.0,50.0,,,-32.0,32.0,64.000000
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,2,2,2,1569.0,1569.0,2134.0,2134.0,1735.0,166.0,265.0,...,107.719928,99.0,76.0,99.0,76.0,,,-23.0,23.0,30.263158
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,3,3,3,2134.0,2134.0,2684.0,2684.0,2291.0,157.0,260.0,...,109.890110,103.0,91.0,103.0,91.0,,,-12.0,12.0,13.186813
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,4,4,4,2684.0,2684.0,3227.0,3227.0,2846.0,162.0,259.0,...,110.905730,97.0,80.0,97.0,80.0,,,-17.0,17.0,21.250000
q-peak-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,GDN0030,Dummy,TiltDown,51,51,51,54058.0,54058.0,55140.0,55140.0,54400.0,342.0,496.0,...,55.555556,154.0,156.0,154.0,156.0,,,2.0,2.0,1.282051
q-peak-reference,stern1985,GDN0030,Dummy,TiltDown,52,52,52,55140.0,55140.0,56236.0,56236.0,55480.0,340.0,436.0,...,54.249548,96.0,150.0,96.0,150.0,,,54.0,54.0,36.000000
q-peak-reference,stern1985,GDN0030,Dummy,TiltDown,53,53,53,56236.0,56236.0,57304.0,57304.0,56586.0,350.0,454.0,...,57.361377,104.0,104.0,104.0,104.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,GDN0030,Dummy,TiltDown,54,54,54,57304.0,57304.0,58334.0,58334.0,57632.0,328.0,388.0,...,58.708415,60.0,102.0,60.0,102.0,,,42.0,42.0,41.176471


#### Select columns

In [15]:
# Columns that are always needed; the flags only append to this base selection.
selected_columns = [
    ("heartbeat_id", "reference"),
    ("b_point_sample", "estimated"),
    ("b_point_sample", "reference"),
]
if include_rr_interval:
    selected_columns.append(("rr_interval_ms", "estimated"))
elif include_q_wave_reference:
    # Only reached when include_rr_interval is False.
    selected_columns.extend([("rr_interval_ms", "estimated"), ("q_wave_onset_sample", "reference")])

input_data = input_data[selected_columns]
# Flatten the two column levels into single names such as 'b_point_sample_estimated'.
input_data.columns = ["_".join(column) for column in input_data.columns]
input_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,heartbeat_id_reference,b_point_sample_estimated,b_point_sample_reference
q_peak_algorithm,b_point_algorithm,participant,condition,phase,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,0,275.0,240.0
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,1,254.0,222.0
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,2,265.0,242.0
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,3,260.0,248.0
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,4,259.0,242.0
q-peak-reference,...,...,...,...,...,...,...
q-peak-reference,stern1985,GDN0030,Dummy,TiltDown,51,496.0,498.0
q-peak-reference,stern1985,GDN0030,Dummy,TiltDown,52,436.0,490.0
q-peak-reference,stern1985,GDN0030,Dummy,TiltDown,53,454.0,454.0
q-peak-reference,stern1985,GDN0030,Dummy,TiltDown,54,388.0,430.0


In [16]:
# Count the distinct B-point algorithms present in the index.
print(f"Amount of algorithms: {input_data.index.unique(level='b_point_algorithm').size}")

Amount of algorithms: 10


#### Transform dataframe from long to wide format

In [17]:
input_data = input_data.reset_index()

# Columns identifying one heartbeat; each B-point algorithm becomes its own column.
pivot_index = ['participant', 'condition', 'phase', 'heartbeat_id_reference', 'b_point_sample_reference']
if include_rr_interval:
    pivot_index.append('rr_interval_ms_estimated')
elif include_q_wave_reference:
    pivot_index.append('q_wave_onset_sample_reference')

# Pivot the DataFrame from long (one row per algorithm and beat) to wide (one row per beat).
input_data_wide = input_data.pivot(
    index=pivot_index,
    columns='b_point_algorithm',
    values='b_point_sample_estimated',
).reset_index()

input_data_wide

b_point_algorithm,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,sherwood1990,stern1985
0,GDN0005,Dummy,HoldingBreath,0,388.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,404.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,3,376.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,382.0,374.0
3,GDN0005,Dummy,HoldingBreath,4,390.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,394.0,388.0
4,GDN0005,Dummy,HoldingBreath,5,386.0,398.0,312.0,388.0,418.0,392.0,390.0,378.0,354.0,400.0,384.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11606,VP_032,tsst,Talk,39,310.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,337.0,306.0
11607,VP_032,tsst,Talk,40,322.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,287.0
11608,VP_032,tsst,Talk,41,340.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,311.0,287.0
11609,VP_032,tsst,Talk,42,311.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,368.0,305.0


#### Drop nan values

In [18]:
# Show which NaN handling strategy is active for the following cells.
print(f"Drop nan values: {drop_nan}", f"Impute nan values: {impute_nan}", sep="\n")

Drop nan values: False
Impute nan values: False


#### Delete rows where the column 'rr_interval_ms_estimated' contains nan values

In [19]:
# Rows without an RR interval are only removed when the RR interval is part of the
# data and NaN rows are to be dropped.
if include_rr_interval and drop_nan:
    rr_interval_present = input_data_wide['rr_interval_ms_estimated'].notna()
    input_data_wide = input_data_wide[rr_interval_present]
input_data_wide

b_point_algorithm,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,sherwood1990,stern1985
0,GDN0005,Dummy,HoldingBreath,0,388.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,404.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,3,376.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,382.0,374.0
3,GDN0005,Dummy,HoldingBreath,4,390.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,394.0,388.0
4,GDN0005,Dummy,HoldingBreath,5,386.0,398.0,312.0,388.0,418.0,392.0,390.0,378.0,354.0,400.0,384.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11606,VP_032,tsst,Talk,39,310.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,337.0,306.0
11607,VP_032,tsst,Talk,40,322.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,287.0
11608,VP_032,tsst,Talk,41,340.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,311.0,287.0
11609,VP_032,tsst,Talk,42,311.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,368.0,305.0


In [20]:
# Move the identifying columns into the index (appended to the existing row index);
# reset_index() leaves the previous row numbers behind in a helper column named 'index'.
wide_index_columns = ['participant', 'condition', 'phase', 'heartbeat_id_reference', 'b_point_sample_reference']
if include_rr_interval:
    wide_index_columns.append('rr_interval_ms_estimated')
input_data_wide_cleaned = input_data_wide.reset_index().set_index(wide_index_columns, append=True)
input_data_wide_cleaned.index

MultiIndex([(    0, 'GDN0005', 'Dummy', 'HoldingBreath',  0, 388.0),
            (    1, 'GDN0005', 'Dummy', 'HoldingBreath',  1, 404.0),
            (    2, 'GDN0005', 'Dummy', 'HoldingBreath',  3, 376.0),
            (    3, 'GDN0005', 'Dummy', 'HoldingBreath',  4, 390.0),
            (    4, 'GDN0005', 'Dummy', 'HoldingBreath',  5, 386.0),
            (    5, 'GDN0005', 'Dummy', 'HoldingBreath',  6, 390.0),
            (    6, 'GDN0005', 'Dummy', 'HoldingBreath',  7, 388.0),
            (    7, 'GDN0005', 'Dummy', 'HoldingBreath',  9, 416.0),
            (    8, 'GDN0005', 'Dummy', 'HoldingBreath', 10, 378.0),
            (    9, 'GDN0005', 'Dummy', 'HoldingBreath', 11, 358.0),
            ...
            (11601,  'VP_032',  'tsst',          'Talk', 34, 323.0),
            (11602,  'VP_032',  'tsst',          'Talk', 35, 321.0),
            (11603,  'VP_032',  'tsst',          'Talk', 36, 284.0),
            (11604,  'VP_032',  'tsst',          'Talk', 37, 323.0),
            (11605

In [21]:
if drop_nan:
    # Filter the indexed frame, not `input_data_wide`: the latter carries neither the
    # MultiIndex set up before nor the helper 'index' column that is dropped afterwards.
    # A row is kept only if all of its index values and all of its columns are non-NaN,
    # which matches a plain dropna() on the un-indexed wide frame.
    index_values = input_data_wide_cleaned.index.to_frame(index=False)
    rows_complete = (
        index_values.notna().all(axis=1).to_numpy()
        & input_data_wide_cleaned.notna().all(axis=1).to_numpy()
    )
    input_data_wide_cleaned = input_data_wide_cleaned[rows_complete]
elif impute_nan:
    # Remove the helper column before imputing so that it is not treated as a feature;
    # errors='ignore' keeps the cell re-runnable once the column is gone.
    input_data_wide_cleaned = input_data_wide_cleaned.drop(columns=['index'], errors='ignore')
    input_data_wide_cleaned = impute_missing_values(input_data_wide_cleaned, mode='median')
    print(f"Amount of rows after imputation: {input_data_wide_cleaned.shape[0]}")
    input_data_wide_cleaned = input_data_wide_cleaned.dropna()
    print(f"Amount of rows after deleting all nan rows: {input_data_wide_cleaned.shape[0]}")
else:
    # Drop rows in which every remaining column is NaN. The helper 'index' column is
    # always populated, so it has to be excluded - otherwise no row could ever qualify.
    value_columns = [column for column in input_data_wide_cleaned.columns if column != 'index']
    input_data_wide_cleaned = input_data_wide_cleaned.dropna(how='all', subset=value_columns)
input_data_wide_cleaned

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,b_point_algorithm,index,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,sherwood1990,stern1985
Unnamed: 0_level_1,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,GDN0005,Dummy,HoldingBreath,0,388.0,0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,404.0,1,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,3,376.0,2,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,382.0,374.0
3,GDN0005,Dummy,HoldingBreath,4,390.0,3,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,394.0,388.0
4,GDN0005,Dummy,HoldingBreath,5,386.0,4,398.0,312.0,388.0,418.0,392.0,390.0,378.0,354.0,400.0,384.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11606,VP_032,tsst,Talk,39,310.0,11606,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,337.0,306.0
11607,VP_032,tsst,Talk,40,322.0,11607,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,287.0
11608,VP_032,tsst,Talk,41,340.0,11608,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,311.0,287.0
11609,VP_032,tsst,Talk,42,311.0,11609,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,368.0,305.0


In [22]:
# 'index' is only a leftover of an earlier reset_index() call. Depending on the NaN
# handling flags it may already have been removed, so a missing column must not raise.
input_data_wide_cleaned = input_data_wide_cleaned.drop(columns=['index'], errors='ignore')
input_data_wide_cleaned

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,b_point_algorithm,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,sherwood1990,stern1985
Unnamed: 0_level_1,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,GDN0005,Dummy,HoldingBreath,0,388.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,404.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,3,376.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,382.0,374.0
3,GDN0005,Dummy,HoldingBreath,4,390.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,394.0,388.0
4,GDN0005,Dummy,HoldingBreath,5,386.0,398.0,312.0,388.0,418.0,392.0,390.0,378.0,354.0,400.0,384.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11606,VP_032,tsst,Talk,39,310.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,337.0,306.0
11607,VP_032,tsst,Talk,40,322.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,287.0
11608,VP_032,tsst,Talk,41,340.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,311.0,287.0
11609,VP_032,tsst,Talk,42,311.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,368.0,305.0


#### Save the preprocessed data to use it for training

In [23]:
# Summarise the flag configuration that determines what gets exported.
print(
    f"Save results: {save_results}",
    f"Include RR-Interval: {include_rr_interval}",
    f"Drop missing values: {drop_nan}",
    f"Impute missing values: {impute_nan}",
    f"Include Q-Wave Reference: {include_q_wave_reference}",
    sep="\n",
)

Save results: True
Include RR-Interval: False
Drop missing values: False
Impute missing values: False
Include Q-Wave Reference: False


In [24]:
if save_results:
    # Pick the target file from the NaN handling flags.
    # NOTE(review): include_rr_interval / include_q_wave_reference do not influence the
    # folder or file name, so different feature sets can overwrite each other - confirm.
    if drop_nan:
        output_file = "data/b-point/without-rr-interval/train_data_b_point.csv"
    elif impute_nan:
        output_file = "data/b-point/rr-interval/train_data_b_point_rr_interval_median_imputed.csv"
    else:
        output_file = "data/b-point/without-rr-interval/train_data_b_point_include_nan.csv"

    output_path = result_path.joinpath(output_file)
    # to_csv does not create missing folders; make sure the target directory exists.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    input_data_wide_cleaned.to_csv(output_path)
    print("Data saved!")

Data saved!


In [25]:
# Read the exported table back in; the first six CSV columns form the row index.
# NOTE(review): the path is fixed to the 'include_nan' export regardless of the flags above.
b_point_csv_path = result_path / "data/b-point/without-rr-interval/train_data_b_point_include_nan.csv"
input_data_b_point = pd.read_csv(b_point_csv_path, index_col=list(range(6)))
input_data_b_point

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,sherwood1990,stern1985
Unnamed: 0_level_1,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,GDN0005,Dummy,HoldingBreath,0,388.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,404.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,3,376.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,382.0,374.0
3,GDN0005,Dummy,HoldingBreath,4,390.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,394.0,388.0
4,GDN0005,Dummy,HoldingBreath,5,386.0,398.0,312.0,388.0,418.0,392.0,390.0,378.0,354.0,400.0,384.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11606,VP_032,tsst,Talk,39,310.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,337.0,306.0
11607,VP_032,tsst,Talk,40,322.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,287.0
11608,VP_032,tsst,Talk,41,340.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,311.0,287.0
11609,VP_032,tsst,Talk,42,311.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,368.0,305.0


In [46]:
# Check whether any RR interval is missing. The column is not part of the frame shown
# above (presumably it only exists in exports created with include_rr_interval=True),
# so yield None instead of raising a KeyError when it is absent.
rr_interval_column = 'rr_interval_ms_estimated'
if rr_interval_column in input_data_b_point.columns:
    rr_interval_has_nan = input_data_b_point[rr_interval_column].isna().any()
else:
    rr_interval_has_nan = None
rr_interval_has_nan

np.True_