In [1]:
import json

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp

import matplotlib.pyplot as plt

from pepbench.io import load_challenge_results_from_folder
from pepbench.io import convert_hz_to_ms
from pepbench.io import impute_missing_values
from pepbench.datasets import EmpkinsDataset, GuardianDataset

#%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
# Base folder three levels above the working directory; `results/` and `config.json` are resolved against it
root_path = Path("..") / ".." / ".."
print(root_path.absolute())

c:\Users\sebas\Development\ResearchInternship\Code\pepbench\experiments\pep_algorithm_benchmarking\notebooks\ML_analysis\Preprocessing\..\..\..


In [3]:
# Folder from which the challenge results are read and below which the training data is written
result_path = root_path / "results"
result_path

WindowsPath('../../../results')

In [4]:
# Identifier of the rater; selects the sub-folder of the result paths and of the output paths
rater = "rater_02"

In [5]:
# Key of the config section to use (one section per deployment environment)
deploy_type = "local"

# Read the config through a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection
with root_path.joinpath("config.json").open(encoding="utf-8") as config_file:
    config_dict = json.load(config_file)

# Dataset locations are taken from the selected config section
empkins_base_path = Path(config_dict[deploy_type]["empkins_path"])
guardian_base_path = Path(config_dict[deploy_type]["guardian_path"])
print(empkins_base_path)

\Users\sebas\Development\ResearchInternship\Data\2024_08_PEP_Benchmarking\EmpkinS_Dataset


#### Set flags for further processing

In [6]:
# Write the resulting training table to disk in the final save step
save_results = True
# Keep the estimated RR interval next to the B-point estimates
include_rr_interval = True
# Keep the reference Q-wave onset; the later cells only evaluate this flag when include_rr_interval is False
include_q_wave_reference = False
# Drop rows with missing values; takes precedence over impute_nan in the later cells
drop_nan = False
# Median-impute missing values; only evaluated when drop_nan is False
impute_nan = False

In [7]:
# EmpkinS dataset and its ICG sampling rate (used further down to convert sample indices to ms)
dataset_empkins = EmpkinsDataset(empkins_base_path, use_cache=True, only_labeled=True)
fs_empkins = dataset_empkins.sampling_rate_icg
print(f"Sampling rate ICG: {fs_empkins}")
# Guardian dataset, handled the same way; note that both printouts use the same label
dataset_guardian = GuardianDataset(guardian_base_path, use_cache=True, only_labeled=True)
fs_guardian = dataset_guardian.sampling_rate_icg
print(f"Sampling rate ICG: {fs_guardian}")

Sampling rate ICG: 1000
Sampling rate ICG: 500


In [8]:
# Algorithm-level names paired with their human-readable labels.
# NOTE(review): the result frames in this notebook name the first level "q_peak_algorithm",
# not "q_wave_algorithm" - confirm that this key is still the intended one.
algo_level_mapping = {
    "q_wave_algorithm": "Q-Wave Algorithm",
    "b_point_algorithm": "B-Point Algorithm",
    "outlier_correction_algorithm": "Outlier Correction",
}
algo_levels = list(algo_level_mapping)

#### Load data from the challenge results

In [9]:
# Resolve the rater-specific result folders first, then load the per-sample results of each dataset
empkins_results_folder = result_path.joinpath(f"empkins_dataset_b_point/{rater}").resolve()
guardian_results_folder = result_path.joinpath(f"guardian_dataset_b_point/{rater}").resolve()

# EmpkinS samples are identified by participant, condition and phase
results_empkins = load_challenge_results_from_folder(
    empkins_results_folder,
    index_cols_per_sample=["participant", "condition", "phase"],
).per_sample

# Guardian samples are identified by participant and phase only
results_guardian = load_challenge_results_from_folder(
    guardian_results_folder,
    index_cols_per_sample=["participant", "phase"],
).per_sample

In [10]:
# Move all index levels back into the columns and rebuild the index from the
# algorithm and recording identifiers in a fixed level order
empkins_index_levels = [
    'q_peak_algorithm',
    'b_point_algorithm',
    'outlier_correction_algorithm',
    'participant',
    'condition',
    'phase',
]
results_empkins = results_empkins.reset_index().set_index(empkins_index_levels)
results_empkins

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,0,0,0,394,394,984,984,568,568,669,...,101.694915,101,81,101.0,81.0,,,-20.0,20.0,24.691358
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,1,1,1,984,984,1569,1569,1156,1156,1238,...,103.270224,82,51,82.0,51.0,,,-31.0,31.0,60.784314
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,2,2,2,1569,1569,2134,2134,1736,1736,1834,...,107.719928,98,79,98.0,79.0,,,-19.0,19.0,24.050633
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,3,3,3,2134,2134,2684,2684,2292,2292,2394,...,109.890110,102,94,102.0,94.0,,,-8.0,8.0,8.510638
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,4,4,4,2684,2684,3227,3227,2844,2844,2943,...,110.905730,99,83,99.0,83.0,,,-16.0,16.0,19.277108
q-peak-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,none,VP_032,ftsst,Pause_5,7,7,7,5864,5864,6660,6660,6109,6109,6210,...,75.853350,101,153,101.0,153.0,,,52.0,52.0,33.986928
q-peak-reference,stern1985,none,VP_032,ftsst,Pause_5,8,8,8,6660,6660,7458,7458,6901,6901,7052,...,74.812968,151,153,151.0,153.0,,,2.0,2.0,1.307190
q-peak-reference,stern1985,none,VP_032,ftsst,Pause_5,9,9,9,7458,7458,8267,8267,7701,7701,7857,...,73.891626,156,157,156.0,157.0,,,1.0,1.0,0.636943
q-peak-reference,stern1985,none,VP_032,ftsst,Pause_5,10,10,10,8267,8267,9074,9074,8513,8513,8667,...,74.626866,154,155,154.0,155.0,,,1.0,1.0,0.645161


#### Introduce column 'condition' to the guardian dataset to ensure compatibility with the empkins dataset

In [11]:
# The Guardian results were loaded without a 'condition' level, so a constant placeholder
# is added before the index is rebuilt with the same levels as the EmpkinS results
guardian_index_levels = [
    'q_peak_algorithm',
    'b_point_algorithm',
    'outlier_correction_algorithm',
    'participant',
    'condition',
    'phase',
]
results_guardian = (
    results_guardian
    .assign(condition="Dummy")
    .reset_index()
    .set_index(guardian_index_levels)
)
results_guardian

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,0,0,0,352,352,808,808,491,491,558,...,65.789474,67,87,134.0,174.0,,,40.0,40.0,22.988506
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,1,1,1,808,808,1252,1252,948,948,1009,...,68.649886,61,78,122.0,156.0,,,34.0,34.0,21.794872
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,2,2,2,1252,1252,1686,1686,1385,1385,1463,...,69.444444,78,72,156.0,144.0,,,-12.0,12.0,8.333333
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,3,3,3,1686,1686,2116,2115,1819,1819,1900,...,69.930070,81,71,162.0,142.0,,,-20.0,20.0,14.084507
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,GDN0005,Dummy,Pause,4,4,4,2116,2115,2570,2570,2247,2247,2320,...,64.102564,73,43,146.0,86.0,,,-60.0,60.0,69.767442
q-peak-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltLevel,51,51,51,27029,27029,27570,27570,27199,27199,27277,...,55.555556,78,78,156.0,156.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltLevel,52,52,52,27570,27570,28118,28118,27740,27740,27788,...,54.249548,48,48,96.0,96.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltLevel,53,53,53,28118,28118,28652,28652,28292,28292,28345,...,57.361377,53,53,106.0,106.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltLevel,54,54,54,28652,28652,29167,29167,28815,28815,28846,...,58.708415,31,52,62.0,104.0,,,42.0,42.0,40.384615


#### Perform data preprocessing

In [12]:
# Express the selected fiducial points relative to the (estimated) start of their heartbeat
# instead of as absolute sample positions.
# NOTE(review): ("q_peak_sample", "estimated") is not shifted and therefore stays absolute,
# unlike its reference counterpart - confirm that this is intended.
heartbeat_start_col = ("heartbeat_start_sample", "estimated")
heartbeat_relative_cols = [
    ("b_point_sample", "estimated"),
    ("b_point_sample", "reference"),
    ("q_peak_sample", "reference"),
]

results_empkins_norm = results_empkins.copy()
results_guardian_norm = results_guardian.copy()

# Always subtract from the untouched source frame so the order of the columns does not matter
for raw_results, norm_results in (
    (results_empkins, results_empkins_norm),
    (results_guardian, results_guardian_norm),
):
    for relative_col in heartbeat_relative_cols:
        norm_results[relative_col] = raw_results[relative_col] - raw_results[heartbeat_start_col]

In [13]:
# All sample-based columns (estimated and reference) that are rescaled below
sample_quantities = [
    "heartbeat_start_sample",
    "heartbeat_end_sample",
    "q_peak_sample",
    "b_point_sample",
    "pep_sample",
]
cols = [(quantity, source) for quantity in sample_quantities for source in ("estimated", "reference")]

# Dataset-specific scaling factor derived from the ICG sampling rate
# (presumably the duration of one sample in ms - verify against convert_hz_to_ms)
scale_empkins = convert_hz_to_ms(sampling_frequency=fs_empkins)
scale_guardian = convert_hz_to_ms(sampling_frequency=fs_guardian)

results_empkins_cleaned_ms = results_empkins_norm.copy()
results_empkins_cleaned_ms[cols] = results_empkins_norm[cols] * scale_empkins

results_guardian_cleaned_ms = results_guardian_norm.copy()
results_guardian_cleaned_ms[cols] = results_guardian_norm[cols] * scale_guardian

#### Concatenate the empkins and guardian dataframes

In [14]:
# Stack the rescaled EmpkinS rows on top of the rescaled Guardian rows
frames_in_ms = [results_empkins_cleaned_ms, results_guardian_cleaned_ms]
preprocessed_results_empkins_guardian = pd.concat(frames_in_ms)
preprocessed_results_empkins_guardian

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,0,0,0,394.0,394.0,984.0,984.0,568.0,174.0,275.0,...,101.694915,101.0,81.0,101.0,81.0,,,-20.0,20.0,24.691358
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,1,1,1,984.0,984.0,1569.0,1569.0,1156.0,172.0,254.0,...,103.270224,82.0,51.0,82.0,51.0,,,-31.0,31.0,60.784314
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,2,2,2,1569.0,1569.0,2134.0,2134.0,1736.0,167.0,265.0,...,107.719928,98.0,79.0,98.0,79.0,,,-19.0,19.0,24.050633
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,3,3,3,2134.0,2134.0,2684.0,2684.0,2292.0,158.0,260.0,...,109.890110,102.0,94.0,102.0,94.0,,,-8.0,8.0,8.510638
q-peak-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,4,4,4,2684.0,2684.0,3227.0,3227.0,2844.0,160.0,259.0,...,110.905730,99.0,83.0,99.0,83.0,,,-16.0,16.0,19.277108
q-peak-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltLevel,51,51,51,54058.0,54058.0,55140.0,55140.0,54398.0,340.0,496.0,...,55.555556,156.0,156.0,156.0,156.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltLevel,52,52,52,55140.0,55140.0,56236.0,56236.0,55480.0,340.0,436.0,...,54.249548,96.0,96.0,96.0,96.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltLevel,53,53,53,56236.0,56236.0,57304.0,57304.0,56584.0,348.0,454.0,...,57.361377,106.0,106.0,106.0,106.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,none,GDN0030,Dummy,TiltLevel,54,54,54,57304.0,57304.0,58334.0,58334.0,57630.0,326.0,388.0,...,58.708415,62.0,104.0,62.0,104.0,,,42.0,42.0,40.384615


In [15]:
# Rebuild the index with the full set of identifier levels, then keep only the rows that
# were produced without outlier correction; the selected level is removed from the index
combined_index_levels = [
    'q_peak_algorithm',
    'b_point_algorithm',
    'outlier_correction_algorithm',
    'participant',
    'condition',
    'phase',
]
input_data = (
    preprocessed_results_empkins_guardian
    .reset_index()
    .set_index(combined_index_levels)
    .xs('none', level='outlier_correction_algorithm')
)
input_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,participant,condition,phase,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,0,0,0,394.0,394.0,984.0,984.0,568.0,174.0,275.0,...,101.694915,101.0,81.0,101.0,81.0,,,-20.0,20.0,24.691358
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,1,1,1,984.0,984.0,1569.0,1569.0,1156.0,172.0,254.0,...,103.270224,82.0,51.0,82.0,51.0,,,-31.0,31.0,60.784314
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,2,2,2,1569.0,1569.0,2134.0,2134.0,1736.0,167.0,265.0,...,107.719928,98.0,79.0,98.0,79.0,,,-19.0,19.0,24.050633
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,3,3,3,2134.0,2134.0,2684.0,2684.0,2292.0,158.0,260.0,...,109.890110,102.0,94.0,102.0,94.0,,,-8.0,8.0,8.510638
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,4,4,4,2684.0,2684.0,3227.0,3227.0,2844.0,160.0,259.0,...,110.905730,99.0,83.0,99.0,83.0,,,-16.0,16.0,19.277108
q-peak-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,GDN0030,Dummy,TiltLevel,51,51,51,54058.0,54058.0,55140.0,55140.0,54398.0,340.0,496.0,...,55.555556,156.0,156.0,156.0,156.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,GDN0030,Dummy,TiltLevel,52,52,52,55140.0,55140.0,56236.0,56236.0,55480.0,340.0,436.0,...,54.249548,96.0,96.0,96.0,96.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,GDN0030,Dummy,TiltLevel,53,53,53,56236.0,56236.0,57304.0,57304.0,56584.0,348.0,454.0,...,57.361377,106.0,106.0,106.0,106.0,,,0.0,0.0,0.000000
q-peak-reference,stern1985,GDN0030,Dummy,TiltLevel,54,54,54,57304.0,57304.0,58334.0,58334.0,57630.0,326.0,388.0,...,58.708415,62.0,104.0,62.0,104.0,,,42.0,42.0,40.384615


#### Select columns

In [16]:
# Columns that are always kept: heartbeat id plus estimated and reference B-point
selected_columns = [
    ("heartbeat_id", "reference"),
    ("b_point_sample", "estimated"),
    ("b_point_sample", "reference"),
]
# NOTE(review): the two flags are not independent - include_q_wave_reference is only looked
# at when include_rr_interval is False, and that branch still selects the RR interval.
# NOTE(review): the frames displayed above show "q_peak_sample" columns but no
# "q_wave_onset_sample" - confirm that this column exists before enabling the flag.
if include_rr_interval:
    selected_columns.append(("rr_interval_ms", "estimated"))
elif include_q_wave_reference:
    selected_columns.extend([("rr_interval_ms", "estimated"), ("q_wave_onset_sample", "reference")])

input_data = input_data[selected_columns]
# Collapse the two column levels into single names such as "b_point_sample_estimated"
input_data.columns = input_data.columns.to_flat_index().str.join("_")
input_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,heartbeat_id_reference,b_point_sample_estimated,b_point_sample_reference,rr_interval_ms_estimated
q_peak_algorithm,b_point_algorithm,participant,condition,phase,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,0,275.0,255.0,590.0
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,1,254.0,223.0,581.0
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,2,265.0,246.0,557.0
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,3,260.0,252.0,546.0
q-peak-reference,arbol2017-isoelectric-crossings,VP_001,tsst,Prep,4,259.0,243.0,541.0
q-peak-reference,...,...,...,...,...,...,...,...
q-peak-reference,stern1985,GDN0030,Dummy,TiltLevel,51,496.0,496.0,1080.0
q-peak-reference,stern1985,GDN0030,Dummy,TiltLevel,52,436.0,436.0,1106.0
q-peak-reference,stern1985,GDN0030,Dummy,TiltLevel,53,454.0,454.0,1046.0
q-peak-reference,stern1985,GDN0030,Dummy,TiltLevel,54,388.0,430.0,1022.0


In [17]:
# Distinct B-point algorithms present in the selected data
b_point_algorithms = input_data.index.get_level_values('b_point_algorithm').unique()
print(f"Amount of algorithms: {len(b_point_algorithms)}")

Amount of algorithms: 12


#### Transform dataframe from long to wide format

In [18]:
# Turn the index levels into ordinary columns so they can be used as pivot keys.
# Guarded so that the cell can be re-run: resetting an already flat frame would keep
# inserting the old row index as an extra column and eventually fail on a duplicate name.
if input_data.index.nlevels > 1:
    input_data = input_data.reset_index()

# Columns that identify one heartbeat. The optional extra key follows the same precedence
# as the column selection: RR interval first, Q-wave reference only if the RR interval is off.
pivot_index = ['participant', 'condition', 'phase', 'heartbeat_id_reference', 'b_point_sample_reference']
if include_rr_interval:
    pivot_index.append('rr_interval_ms_estimated')
elif include_q_wave_reference:
    pivot_index.append('q_wave_onset_sample_reference')

# Pivot the DataFrame: one row per heartbeat, one column per B-point algorithm
input_data_wide = input_data.pivot(
    index=pivot_index,
    columns='b_point_algorithm',
    values='b_point_sample_estimated'
).reset_index()

input_data_wide

b_point_algorithm,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,rr_interval_ms_estimated,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,miljkovic2022,pale2021,sherwood1990,stern1985
0,GDN0005,Dummy,HoldingBreath,0,388.0,850.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,406.0,778.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,288.0,246.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,2,314.0,750.0,358.0,348.0,356.0,418.0,346.0,300.0,374.0,358.0,358.0,382.0,358.0,268.0
3,GDN0005,Dummy,HoldingBreath,3,374.0,746.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,386.0,374.0,382.0,374.0
4,GDN0005,Dummy,HoldingBreath,4,390.0,766.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,398.0,388.0,394.0,388.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11444,VP_032,tsst,Talk,39,312.0,619.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,308.0,306.0,337.0,306.0
11445,VP_032,tsst,Talk,40,314.0,680.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,85.0,329.0,287.0
11446,VP_032,tsst,Talk,41,333.0,719.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,292.0,14.0,311.0,287.0
11447,VP_032,tsst,Talk,42,312.0,698.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,312.0,261.0,368.0,305.0


#### Handle NaN values (dropped or imputed, depending on the flags)

In [19]:
# Echo the two flags that control how missing values are handled
print(
    f"Drop nan values: {drop_nan}",
    f"Impute nan values: {impute_nan}",
    sep="\n",
)

Drop nan values: False
Impute nan values: False


#### Delete rows where the column 'rr_interval_ms_estimated' contains NaN values (only if the RR interval is included and NaN values are dropped or imputed)
**TODO:** Discuss how to handle this case.

In [20]:
# Rows without an RR interval are only removed when the RR interval is part of the data
# and missing values are either dropped or imputed
remove_missing_rr = include_rr_interval and (drop_nan or impute_nan)
if remove_missing_rr:
    has_rr_interval = input_data_wide['rr_interval_ms_estimated'].notna()
    input_data_wide = input_data_wide[has_rr_interval]
input_data_wide

b_point_algorithm,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,rr_interval_ms_estimated,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,miljkovic2022,pale2021,sherwood1990,stern1985
0,GDN0005,Dummy,HoldingBreath,0,388.0,850.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,406.0,778.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,288.0,246.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,2,314.0,750.0,358.0,348.0,356.0,418.0,346.0,300.0,374.0,358.0,358.0,382.0,358.0,268.0
3,GDN0005,Dummy,HoldingBreath,3,374.0,746.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,386.0,374.0,382.0,374.0
4,GDN0005,Dummy,HoldingBreath,4,390.0,766.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,398.0,388.0,394.0,388.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11444,VP_032,tsst,Talk,39,312.0,619.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,308.0,306.0,337.0,306.0
11445,VP_032,tsst,Talk,40,314.0,680.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,85.0,329.0,287.0
11446,VP_032,tsst,Talk,41,333.0,719.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,292.0,14.0,311.0,287.0
11447,VP_032,tsst,Talk,42,312.0,698.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,312.0,261.0,368.0,305.0


In [21]:
# Heartbeat-identifying columns that become index levels, so that the remaining data
# columns hold the per-algorithm estimates
index_columns = ['participant', 'condition', 'phase', 'heartbeat_id_reference', 'b_point_sample_reference']
if include_rr_interval:
    index_columns.append('rr_interval_ms_estimated')

if drop_nan:
    # Drop incomplete rows while the identifying columns are still ordinary columns
    # (dropna() does not look at index levels), then index the result so that it has the
    # same layout as in the other branches. The original row labels are kept as first level.
    input_data_wide_cleaned = (
        input_data_wide
        .dropna()
        .set_index(index_columns, append=True)
        .astype(np.float64)
    )
else:
    # Renumber the rows and append the identifying columns to that fresh row counter
    input_data_wide_cleaned = input_data_wide.reset_index(drop=True).set_index(index_columns, append=True)
    if impute_nan:
        input_data_wide_cleaned = input_data_wide_cleaned.astype(np.float64)  # Ensure all values are float64 for imputation
        input_data_wide_cleaned = impute_missing_values(input_data_wide_cleaned, mode='median')
        print(f"Amount of rows after imputation: {input_data_wide_cleaned.shape[0]}")
        # Remove whatever the imputation could not fill
        input_data_wide_cleaned = input_data_wide_cleaned.dropna()
        print(f"Amount of rows after deleting all nan rows: {input_data_wide_cleaned.shape[0]}")
    else:
        # Keep NaN values, but discard heartbeats for which no algorithm produced an estimate
        input_data_wide_cleaned = input_data_wide_cleaned.dropna(how='all', axis=0)
        input_data_wide_cleaned = input_data_wide_cleaned.astype(np.float64)  # Ensure all values are float64
input_data_wide_cleaned

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,b_point_algorithm,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,miljkovic2022,pale2021,sherwood1990,stern1985
Unnamed: 0_level_1,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,rr_interval_ms_estimated,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,GDN0005,Dummy,HoldingBreath,0,388.0,850.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,406.0,778.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,288.0,246.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,2,314.0,750.0,358.0,348.0,356.0,418.0,346.0,300.0,374.0,358.0,358.0,382.0,358.0,268.0
3,GDN0005,Dummy,HoldingBreath,3,374.0,746.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,386.0,374.0,382.0,374.0
4,GDN0005,Dummy,HoldingBreath,4,390.0,766.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,398.0,388.0,394.0,388.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11444,VP_032,tsst,Talk,39,312.0,619.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,308.0,306.0,337.0,306.0
11445,VP_032,tsst,Talk,40,314.0,680.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,85.0,329.0,287.0
11446,VP_032,tsst,Talk,41,333.0,719.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,292.0,14.0,311.0,287.0
11447,VP_032,tsst,Talk,42,312.0,698.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,312.0,261.0,368.0,305.0


In [22]:
# Sanity check: list the heartbeats for which every remaining column is NaN
all_nan_rows = input_data_wide_cleaned.isna().all(axis=1)
input_data_wide_cleaned.loc[all_nan_rows]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,b_point_algorithm,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,miljkovic2022,pale2021,sherwood1990,stern1985
Unnamed: 0_level_1,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,rr_interval_ms_estimated,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1


#### Save the preprocessed data to use it for training

In [23]:
# Echo the complete flag configuration right before the export
print(
    f"Save results: {save_results}",
    f"Include RR-Interval: {include_rr_interval}",
    f"Drop missing values: {drop_nan}",
    f"Impute missing values: {impute_nan}",
    f"Include Q-Wave Reference: {include_q_wave_reference}",
    sep="\n",
)

Save results: True
Include RR-Interval: True
Drop missing values: False
Impute missing values: False
Include Q-Wave Reference: False


In [24]:
if save_results:
    # Sub-folder and file stem depend on whether the RR interval is part of the data
    if include_rr_interval:
        output_subfolder, output_stem = "rr-interval", "train_data_b_point_rr_interval"
    else:
        output_subfolder, output_stem = "without-rr-interval", "train_data_b_point"

    # The suffix encodes how missing values were handled (dropping takes precedence over imputing)
    if drop_nan:
        output_suffix = ""
    elif impute_nan:
        output_suffix = "_median_imputed"
    else:
        output_suffix = "_include_nan"

    output_file = result_path.joinpath(f"data/b-point/{output_subfolder}/{rater}/{output_stem}{output_suffix}.csv")
    # to_csv does not create missing folders, so make sure the target folder exists
    output_file.parent.mkdir(parents=True, exist_ok=True)
    input_data_wide_cleaned.to_csv(output_file)
    print("Data saved!")

Data saved!


In [25]:
# Same value as the `rater` set near the top of the notebook; presumably repeated so the
# read-back check that follows can be run on its own - keep both in sync
rater = "rater_02"

In [26]:
# Read a saved training table back in, restoring its first six columns as index levels.
# NOTE(review): this always reads the "without-rr-interval" file, independent of the flags;
# with include_rr_interval = True the table is saved below "rr-interval" instead, and that
# file has one more index column - confirm which file is meant to be checked here.
data_path = Path("../../../results/data")
saved_table_file = data_path / f"b-point/without-rr-interval/{rater}/train_data_b_point_include_nan.csv"
test = pd.read_csv(saved_table_file, index_col=list(range(6)))
test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,arbol2017-isoelectric-crossings,arbol2017-second-derivative,arbol2017-third-derivative,debski1993-second-derivative,drost2022,forouzanfar2018,lozano2007-linear-regression,lozano2007-quadratic-regression,miljkovic2022,pale2021,sherwood1990,stern1985
Unnamed: 0_level_1,participant,condition,phase,heartbeat_id_reference,b_point_sample_reference,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,GDN0005,Dummy,HoldingBreath,0,388.0,438.0,398.0,394.0,452.0,400.0,550.0,412.0,384.0,442.0,388.0,442.0,388.0
1,GDN0005,Dummy,HoldingBreath,1,406.0,340.0,350.0,244.0,388.0,420.0,402.0,404.0,384.0,288.0,246.0,330.0,402.0
2,GDN0005,Dummy,HoldingBreath,2,314.0,358.0,348.0,356.0,418.0,346.0,300.0,374.0,358.0,358.0,382.0,358.0,268.0
3,GDN0005,Dummy,HoldingBreath,3,374.0,382.0,296.0,386.0,366.0,386.0,388.0,366.0,348.0,386.0,374.0,382.0,374.0
4,GDN0005,Dummy,HoldingBreath,4,390.0,394.0,344.0,396.0,376.0,396.0,398.0,372.0,348.0,398.0,388.0,394.0,388.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11444,VP_032,tsst,Talk,39,312.0,335.0,276.0,324.0,300.0,318.0,306.0,305.0,294.0,308.0,306.0,337.0,306.0
11445,VP_032,tsst,Talk,40,314.0,331.0,298.0,330.0,311.0,322.0,311.0,321.0,303.0,329.0,85.0,329.0,287.0
11446,VP_032,tsst,Talk,41,333.0,317.0,300.0,348.0,330.0,343.0,309.0,332.0,322.0,292.0,14.0,311.0,287.0
11447,VP_032,tsst,Talk,42,312.0,365.0,324.0,366.0,347.0,354.0,305.0,351.0,337.0,312.0,261.0,368.0,305.0


In [26]:
# Per column: does the reloaded table contain at least one missing value?
test.isna().any(axis=0)

rr_interval_ms_estimated            True
arbol2017-isoelectric-crossings     True
arbol2017-second-derivative        False
arbol2017-third-derivative          True
debski1993-second-derivative        True
drost2022                          False
forouzanfar2018                     True
lozano2007-linear-regression       False
lozano2007-quadratic-regression    False
miljkovic2022                      False
pale2021                           False
sherwood1990                        True
stern1985                           True
dtype: bool