In [1]:
import json

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp

import matplotlib.pyplot as plt

from pepbench.io import load_challenge_results_from_folder
from pepbench.io import convert_hz_to_ms
from pepbench.io import impute_missing_values
from pepbench.datasets import EmpkinsDataset, GuardianDataset

#%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
root_path = Path("../../../")

In [3]:
result_path = root_path.joinpath("results")
result_path

WindowsPath('../../../results')

In [4]:
deploy_type = "local"

config_dict = json.load(root_path.joinpath("config.json").open(encoding="utf-8"))

empkins_base_path = Path(config_dict[deploy_type]["empkins_path"])
guardian_base_path = Path(config_dict[deploy_type]["guardian_path"])
print(empkins_base_path)

\Users\sebas\Development\ResearchInternship\Data\2024_08_PEP_Benchmarking\EmpkinS_Dataset


#### Set flags for further processing

In [38]:
save_results = False
include_rr_interval = False
include_b_point_reference = False
drop_nan = True
impute_nan = False

In [39]:
dataset_empkins = EmpkinsDataset(empkins_base_path, use_cache=True, only_labeled=True)
fs_empkins = dataset_empkins.sampling_rate_icg
print(f"Sampling rate ICG: {fs_empkins}")
dataset_guardian = GuardianDataset(guardian_base_path, use_cache=True, only_labeled=True)
fs_guardian = dataset_guardian.sampling_rate_icg
print(f"Sampling rate ICG: {fs_guardian}")

Sampling rate ICG: 1000
Sampling rate ICG: 500


In [40]:
algo_levels = ["q_peak_algorithm", "b_point_algorithm", "outlier_correction_algorithm"]
algo_level_mapping = dict(zip(algo_levels, ["Q-Wave Algorithm", "B-Point Algorithm", "Outlier Correction"]))

#### Load data from the challenge results

In [41]:
results_empkins = load_challenge_results_from_folder(
    result_path.joinpath("empkins_dataset_q_peak").resolve(), index_cols_per_sample=["participant", "condition", "phase"]
).per_sample
results_guardian = load_challenge_results_from_folder(
    result_path.joinpath("guardian_dataset_q_peak").resolve(), index_cols_per_sample=["participant", "phase"]
).per_sample

In [42]:
results_empkins = results_empkins.reset_index().set_index(['q_peak_algorithm', 'b_point_algorithm', 'outlier_correction_algorithm', 'participant', 'condition', 'phase'])
#results_empkins = results_empkins.drop(columns=['level_6'])
results_empkins

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,0,0,0,394,399,984,985,581,567,634,...,101.694915,53,67,53.0,67.0,,,14.0,14.0,20.895522
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,1,1,1,984,985,1569,1569,1170,1156,1206,...,103.270224,36,50,36.0,50.0,,,14.0,14.0,28.000000
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,2,2,2,1569,1569,2134,2134,1751,1735,1811,...,107.719928,60,76,60.0,76.0,,,16.0,16.0,21.052632
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,3,3,3,2134,2134,2684,2684,2308,2291,2382,...,109.890110,74,91,74.0,91.0,,,17.0,17.0,18.681319
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,4,4,4,2684,2684,3227,3227,2854,2846,2926,...,110.905730,72,80,72.0,80.0,,,8.0,8.0,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vanlien2013-42-ms,b-point-reference,none,VP_032,ftsst,Pause_5,7,7,7,5864,5864,6660,6660,6104,6110,6256,...,75.853350,152,146,152.0,146.0,,,-6.0,6.0,4.109589
vanlien2013-42-ms,b-point-reference,none,VP_032,ftsst,Pause_5,8,8,8,6660,6660,7458,7458,6895,6900,7054,...,74.812968,159,154,159.0,154.0,,,-5.0,5.0,3.246753
vanlien2013-42-ms,b-point-reference,none,VP_032,ftsst,Pause_5,9,9,9,7458,7458,8267,8267,7697,7701,7855,...,73.891626,158,154,158.0,154.0,,,-4.0,4.0,2.597403
vanlien2013-42-ms,b-point-reference,none,VP_032,ftsst,Pause_5,10,10,10,8267,8267,9074,9074,8509,8515,8673,...,74.626866,164,158,164.0,158.0,,,-6.0,6.0,3.797468


#### Introduce column 'condition' to the guardian dataset to assure compatibility with the empkins dataset

In [43]:
results_guardian = results_guardian.assign(condition="Dummy")
results_guardian = results_guardian.reset_index().set_index(['q_peak_algorithm', 'b_point_algorithm', 'outlier_correction_algorithm', 'participant', 'condition', 'phase'])
results_guardian

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
forouzanfar2018,b-point-reference,none,GDN0005,Dummy,Pause,0,0,0,352,352,808,808,500,492,568,...,65.789474,68,76,136.0,152.0,,,16.0,16.0,10.526316
forouzanfar2018,b-point-reference,none,GDN0005,Dummy,Pause,1,1,1,808,808,1252,1252,956,948,1025,...,68.649886,69,77,138.0,154.0,,,16.0,16.0,10.389610
forouzanfar2018,b-point-reference,none,GDN0005,Dummy,Pause,2,2,2,1252,1252,1686,1686,1394,1386,1456,...,69.444444,62,70,124.0,140.0,,,16.0,16.0,11.428571
forouzanfar2018,b-point-reference,none,GDN0005,Dummy,Pause,3,3,3,1686,1686,2116,2115,1825,1817,1888,...,69.930070,63,71,126.0,142.0,,,16.0,16.0,11.267606
forouzanfar2018,b-point-reference,none,GDN0005,Dummy,Pause,4,4,4,2116,2115,2570,2570,2253,2246,2293,...,64.102564,40,47,80.0,94.0,,,14.0,14.0,14.893617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,51,51,51,27029,27029,27570,27570,27198,27200,27278,...,55.555556,80,78,160.0,156.0,,,-4.0,4.0,2.564103
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,52,52,52,27570,27570,28118,28118,27738,27740,27815,...,54.249548,77,75,154.0,150.0,,,-4.0,4.0,2.666667
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,53,53,53,28118,28118,28652,28652,28291,28293,28345,...,57.361377,54,52,108.0,104.0,,,-4.0,4.0,3.846154
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,54,54,54,28652,28652,29167,29167,28814,28816,28867,...,58.708415,53,51,106.0,102.0,,,-4.0,4.0,3.921569


#### Perform data preprocessing

In [44]:
results_empkins_norm = results_empkins.copy()
results_guardian_norm = results_guardian.copy()
results_empkins_norm[("q_peak_sample", "estimated")] = results_empkins[("q_peak_sample", "estimated")] - results_empkins[("heartbeat_start_sample", "estimated")]
results_empkins_norm[("q_peak_sample", "reference")] = results_empkins[("q_peak_sample", "reference")] - results_empkins[("heartbeat_start_sample", "estimated")]
results_empkins_norm[("b_point_sample", "reference")] = results_empkins[("b_point_sample", "reference")] - results_empkins[("heartbeat_start_sample", "estimated")]
results_guardian_norm[("q_peak_sample", "estimated")] = results_guardian[("q_peak_sample", "estimated")] - results_guardian[("heartbeat_start_sample", "estimated")]
results_guardian_norm[("q_peak_sample", "reference")] = results_guardian[("q_peak_sample", "reference")] - results_guardian[("heartbeat_start_sample", "estimated")]
results_guardian_norm[("b_point_sample", "reference")] = results_guardian[("b_point_sample", "reference")] - results_guardian[("heartbeat_start_sample", "estimated")]

In [45]:
cols = [("heartbeat_start_sample", "estimated"), ("heartbeat_start_sample", "reference"), ("heartbeat_end_sample", "estimated"), ("heartbeat_end_sample", "reference"), ("q_peak_sample", "estimated"), ("q_peak_sample", "reference"), ("b_point_sample", "estimated"), ("b_point_sample", "reference"), ("pep_sample", "estimated"), ("pep_sample", "reference")]
results_empkins_cleaned_ms = results_empkins_norm.copy()
results_guardian_cleaned_ms = results_guardian_norm.copy()
results_empkins_cleaned_ms[cols] = results_empkins_norm[cols] * convert_hz_to_ms(sampling_frequency=fs_empkins)
results_guardian_cleaned_ms[cols] = results_guardian_norm[cols] * convert_hz_to_ms(sampling_frequency=fs_guardian)

#### Concatenate the empkins and guardian dataframes

In [46]:
preprocessed_results_empkins_guardian = pd.concat([results_empkins_cleaned_ms, results_guardian_cleaned_ms])
preprocessed_results_empkins_guardian

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,id,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_peak_sample,q_peak_sample,b_point_sample,...,heart_rate_bpm,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,...,reference,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,0,0,0,394.0,399.0,984.0,985.0,187.0,173.0,634.0,...,101.694915,53.0,67.0,53.0,67.0,,,14.0,14.0,20.895522
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,1,1,1,984.0,985.0,1569.0,1569.0,186.0,172.0,1206.0,...,103.270224,36.0,50.0,36.0,50.0,,,14.0,14.0,28.000000
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,2,2,2,1569.0,1569.0,2134.0,2134.0,182.0,166.0,1811.0,...,107.719928,60.0,76.0,60.0,76.0,,,16.0,16.0,21.052632
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,3,3,3,2134.0,2134.0,2684.0,2684.0,174.0,157.0,2382.0,...,109.890110,74.0,91.0,74.0,91.0,,,17.0,17.0,18.681319
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,4,4,4,2684.0,2684.0,3227.0,3227.0,170.0,162.0,2926.0,...,110.905730,72.0,80.0,72.0,80.0,,,8.0,8.0,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,51,51,51,54058.0,54058.0,55140.0,55140.0,338.0,342.0,54556.0,...,55.555556,160.0,156.0,160.0,156.0,,,-4.0,4.0,2.564103
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,52,52,52,55140.0,55140.0,56236.0,56236.0,336.0,340.0,55630.0,...,54.249548,154.0,150.0,154.0,150.0,,,-4.0,4.0,2.666667
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,53,53,53,56236.0,56236.0,57304.0,57304.0,346.0,350.0,56690.0,...,57.361377,108.0,104.0,108.0,104.0,,,-4.0,4.0,3.846154
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,54,54,54,57304.0,57304.0,58334.0,58334.0,324.0,328.0,57734.0,...,58.708415,106.0,102.0,106.0,102.0,,,-4.0,4.0,3.921569


In [47]:
input_data = preprocessed_results_empkins_guardian.reset_index().set_index(['q_peak_algorithm', 'b_point_algorithm', 'outlier_correction_algorithm', 'participant', 'condition', 'phase'])

#### Select columns

In [48]:
if include_rr_interval:
    input_data = input_data[[("heartbeat_id", "reference"), ("q_peak_sample", "estimated"), ("q_peak_sample", "reference"), ("rr_interval_ms", "estimated")]]
elif include_b_point_reference:
    input_data = input_data[[("heartbeat_id", "reference"), ("q_peak_sample", "estimated"), ("q_peak_sample", "reference"), ("rr_interval_ms", "estimated"), ("b_point_sample", "reference")]]
else:
    input_data = input_data[[("heartbeat_id", "reference"), ("q_peak_sample", "estimated"), ("q_peak_sample", "reference")]]
input_data.columns = input_data.columns.to_flat_index().str.join("_")
input_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,heartbeat_id_reference,q_peak_sample_estimated,q_peak_sample_reference
q_peak_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,0,187.0,173.0
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,1,186.0,172.0
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,2,182.0,166.0
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,3,174.0,157.0
forouzanfar2018,b-point-reference,none,VP_001,tsst,Prep,4,170.0,162.0
...,...,...,...,...,...,...,...,...
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,51,338.0,342.0
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,52,336.0,340.0
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,53,346.0,350.0
vanlien2013-42-ms,b-point-reference,none,GDN0030,Dummy,TiltDown,54,324.0,328.0


In [49]:
print(f"Amount of algorithms: {len(input_data.index.get_level_values('q_peak_algorithm').unique())}")

Amount of algorithms: 8


#### Transform dataframe from long to wide format

In [50]:
input_data = input_data.reset_index()
# Pivot the DataFrame
if include_rr_interval:
    input_data_wide = input_data.pivot(
        index=['participant', 'condition', 'phase', 'heartbeat_id_reference', 'q_peak_sample_reference', 'rr_interval_ms_estimated'],
        columns="q_peak_algorithm",
        values='q_peak_sample_estimated'
    ).reset_index()
elif include_b_point_reference:
    input_data_wide = input_data.pivot(
        index=['participant', 'condition', 'phase', 'heartbeat_id_reference', 'q_peak_sample_reference', 'b_point_sample_reference'],
        columns="q_peak_algorithm",
        values='q_peak_sample_estimated'
    ).reset_index()
else:
    input_data_wide = input_data.pivot(
        index=['participant', 'condition', 'phase', 'heartbeat_id_reference', 'q_peak_sample_reference'],
        columns="q_peak_algorithm",
        values='q_peak_sample_estimated'
    ).reset_index()

input_data_wide

q_peak_algorithm,participant,condition,phase,heartbeat_id_reference,q_peak_sample_reference,forouzanfar2018,martinez2004,vanlien2013-32-ms,vanlien2013-34-ms,vanlien2013-36-ms,vanlien2013-38-ms,vanlien2013-40-ms,vanlien2013-42-ms
0,GDN0005,Dummy,HoldingBreath,0,260.0,276.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
1,GDN0005,Dummy,HoldingBreath,1,260.0,278.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
2,GDN0005,Dummy,HoldingBreath,3,222.0,236.0,224.0,230.0,228.0,226.0,224.0,222.0,220.0
3,GDN0005,Dummy,HoldingBreath,4,220.0,236.0,222.0,230.0,228.0,226.0,224.0,222.0,220.0
4,GDN0005,Dummy,HoldingBreath,5,228.0,242.0,230.0,236.0,234.0,232.0,230.0,228.0,226.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11606,VP_032,tsst,Talk,39,172.0,183.0,173.0,177.0,175.0,173.0,171.0,169.0,167.0
11607,VP_032,tsst,Talk,40,181.0,193.0,182.0,185.0,183.0,181.0,179.0,177.0,175.0
11608,VP_032,tsst,Talk,41,200.0,210.0,201.0,206.0,204.0,202.0,200.0,198.0,196.0
11609,VP_032,tsst,Talk,42,212.0,223.0,213.0,220.0,218.0,216.0,214.0,212.0,210.0


#### Drop nan values and the 'scipy-findpeaks' column

In [51]:
print(f"Drop nan values: {drop_nan}")
print(f"Impute nan values: {impute_nan}")

Drop nan values: True
Impute nan values: False


#### Delete rows where the column 'rr_interval_ms_estimated' contains nan values
Discuss how to handle this case

In [52]:
if include_rr_interval and drop_nan:
    input_data_wide = input_data_wide.dropna(subset=['rr_interval_ms_estimated'])
input_data_wide

q_peak_algorithm,participant,condition,phase,heartbeat_id_reference,q_peak_sample_reference,forouzanfar2018,martinez2004,vanlien2013-32-ms,vanlien2013-34-ms,vanlien2013-36-ms,vanlien2013-38-ms,vanlien2013-40-ms,vanlien2013-42-ms
0,GDN0005,Dummy,HoldingBreath,0,260.0,276.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
1,GDN0005,Dummy,HoldingBreath,1,260.0,278.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
2,GDN0005,Dummy,HoldingBreath,3,222.0,236.0,224.0,230.0,228.0,226.0,224.0,222.0,220.0
3,GDN0005,Dummy,HoldingBreath,4,220.0,236.0,222.0,230.0,228.0,226.0,224.0,222.0,220.0
4,GDN0005,Dummy,HoldingBreath,5,228.0,242.0,230.0,236.0,234.0,232.0,230.0,228.0,226.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11606,VP_032,tsst,Talk,39,172.0,183.0,173.0,177.0,175.0,173.0,171.0,169.0,167.0
11607,VP_032,tsst,Talk,40,181.0,193.0,182.0,185.0,183.0,181.0,179.0,177.0,175.0
11608,VP_032,tsst,Talk,41,200.0,210.0,201.0,206.0,204.0,202.0,200.0,198.0,196.0
11609,VP_032,tsst,Talk,42,212.0,223.0,213.0,220.0,218.0,216.0,214.0,212.0,210.0


In [53]:
if include_rr_interval:
    input_data_wide_cleaned = input_data_wide.reset_index().set_index(['participant', 'condition', 'phase', 'heartbeat_id_reference', 'q_peak_sample_reference', 'rr_interval_ms_estimated'], append=True)
else:
    input_data_wide_cleaned = input_data_wide.reset_index().set_index(['participant', 'condition', 'phase', 'heartbeat_id_reference', 'q_peak_sample_reference'], append=True)

if drop_nan:
    input_data_wide_cleaned = input_data_wide_cleaned.dropna()
else:
    if impute_nan:
        input_data_wide_cleaned = input_data_wide_cleaned.drop(columns=['index'])
        input_data_wide_cleaned = input_data_wide_cleaned.astype(np.float64)  # Ensure all data is float64 for imputation
        input_data_wide_cleaned = impute_missing_values(input_data_wide_cleaned, mode='median')
        print(f"Amount of rows after imputation: {input_data_wide_cleaned.shape[0]}")
        input_data_wide_cleaned = input_data_wide_cleaned.dropna()  # This line of code is responsible for the length mismatcht for the median imputed data 
                                                                    # between rr-interval and without-rr-interval datasets
        print(f"Amount of rows after deleting all nan rows: {input_data_wide_cleaned.shape[0]}")
    else:
        input_data_wide_cleaned = input_data_wide_cleaned.dropna(how='all')
input_data_wide_cleaned

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,q_peak_algorithm,index,forouzanfar2018,martinez2004,vanlien2013-32-ms,vanlien2013-34-ms,vanlien2013-36-ms,vanlien2013-38-ms,vanlien2013-40-ms,vanlien2013-42-ms
Unnamed: 0_level_1,participant,condition,phase,heartbeat_id_reference,q_peak_sample_reference,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,GDN0005,Dummy,HoldingBreath,0,260.0,0,276.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
1,GDN0005,Dummy,HoldingBreath,1,260.0,1,278.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
2,GDN0005,Dummy,HoldingBreath,3,222.0,2,236.0,224.0,230.0,228.0,226.0,224.0,222.0,220.0
3,GDN0005,Dummy,HoldingBreath,4,220.0,3,236.0,222.0,230.0,228.0,226.0,224.0,222.0,220.0
4,GDN0005,Dummy,HoldingBreath,5,228.0,4,242.0,230.0,236.0,234.0,232.0,230.0,228.0,226.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11605,VP_032,tsst,Talk,38,153.0,11605,188.0,154.0,179.0,177.0,175.0,173.0,171.0,169.0
11606,VP_032,tsst,Talk,39,172.0,11606,183.0,173.0,177.0,175.0,173.0,171.0,169.0,167.0
11607,VP_032,tsst,Talk,40,181.0,11607,193.0,182.0,185.0,183.0,181.0,179.0,177.0,175.0
11608,VP_032,tsst,Talk,41,200.0,11608,210.0,201.0,206.0,204.0,202.0,200.0,198.0,196.0


#### Save the preprocessed data to use it for training

In [25]:
print(f"Save results: {save_results}")
print(f"Include RR-Interval: {include_rr_interval}")
print(f"Drop missing values: {drop_nan}")
print(f"Impute missing values: {impute_nan}")
print(f"Include B-Point reference: {include_b_point_reference}")

Save results: True
Include RR-Interval: False
Drop missing values: False
Impute missing values: False
Include B-Point reference: False


In [27]:
if save_results:
    input_data_wide_cleaned.to_csv(result_path.joinpath("data/q-peak/without-rr-interval/train_data_q_peak_include_nan.csv"))
    print("Data saved!")

Data saved!


In [33]:
test_data = pd.read_csv(result_path.joinpath("data/q-peak/without-rr-interval/train_data_q_peak_include_nan.csv"), index_col=[0,1,2,3,4,5])
test_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,forouzanfar2018,martinez2004,vanlien2013-32-ms,vanlien2013-34-ms,vanlien2013-36-ms,vanlien2013-38-ms,vanlien2013-40-ms,vanlien2013-42-ms
Unnamed: 0_level_1,participant,condition,phase,heartbeat_id_reference,q_peak_sample_reference,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,GDN0005,Dummy,HoldingBreath,0,260.0,276.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
1,GDN0005,Dummy,HoldingBreath,1,260.0,278.0,262.0,266.0,264.0,262.0,260.0,258.0,256.0
2,GDN0005,Dummy,HoldingBreath,3,222.0,236.0,224.0,230.0,228.0,226.0,224.0,222.0,220.0
3,GDN0005,Dummy,HoldingBreath,4,220.0,236.0,222.0,230.0,228.0,226.0,224.0,222.0,220.0
4,GDN0005,Dummy,HoldingBreath,5,228.0,242.0,230.0,236.0,234.0,232.0,230.0,228.0,226.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11606,VP_032,tsst,Talk,39,172.0,183.0,173.0,177.0,175.0,173.0,171.0,169.0,167.0
11607,VP_032,tsst,Talk,40,181.0,193.0,182.0,185.0,183.0,181.0,179.0,177.0,175.0
11608,VP_032,tsst,Talk,41,200.0,210.0,201.0,206.0,204.0,202.0,200.0,198.0,196.0
11609,VP_032,tsst,Talk,42,212.0,223.0,213.0,220.0,218.0,216.0,214.0,212.0,210.0
