# Explore the PEP extraction results on the Empkins data

Observations:  
- For each phase the heartbeat ID starts at zero

## Setup and helper functions

In [1]:
import json

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp

import matplotlib.pyplot as plt

from pepbench.io import load_challenge_results_from_folder
from pepbench.io import convert_hz_to_ms
from pepbench.datasets import EmpkinsDataset

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
root_path = Path("../../")

In [3]:
deploy_type = "local"

config_dict = json.load(root_path.joinpath("config.json").open(encoding="utf-8"))

empkins_base_path = Path(config_dict[deploy_type]["empkins_path"])
guardian_base_path = Path(config_dict[deploy_type]["guardian_path"])
print(empkins_base_path)

\Users\sebas\Development\ResearchInternship\Data\2024_08_PEP_Benchmarking\EmpkinS_Dataset


#### Specify whether the results should be saved or not

In [4]:
save_results = True

In [5]:
result_path = root_path.joinpath("results")
result_path

WindowsPath('../../results')

In [6]:
dataset_empkins = EmpkinsDataset(empkins_base_path, use_cache=True, only_labeled=True)
fs_empkins = dataset_empkins.sampling_rate_icg
print(f"Sampling rate ICG: {fs_empkins}")
dataset_empkins

Sampling rate ICG: 1000


Unnamed: 0,participant,condition,phase
0,VP_001,tsst,Prep
1,VP_001,tsst,Pause_1
2,VP_001,tsst,Talk
3,VP_001,tsst,Math
4,VP_001,tsst,Pause_5
...,...,...,...
145,VP_032,ftsst,Prep
146,VP_032,ftsst,Pause_1
147,VP_032,ftsst,Talk
148,VP_032,ftsst,Math


In [7]:
algo_levels = ["q_wave_algorithm", "b_point_algorithm", "outlier_correction_algorithm"]
algo_level_mapping = dict(zip(algo_levels, ["Q-Wave Algorithm", "B-Point Algorithm", "Outlier Correction"]))

In [8]:
results_empkins = load_challenge_results_from_folder(
    result_path.joinpath("empkins_dataset_reference_q_wave").resolve(), index_cols_per_sample=["participant", "condition", "phase"]
)

In [9]:
results_empkins.per_sample

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,heartbeat_id,heartbeat_id,heartbeat_start_sample,heartbeat_start_sample,heartbeat_end_sample,heartbeat_end_sample,q_wave_onset_sample,q_wave_onset_sample,b_point_sample,b_point_sample,rr_interval_ms,pep_sample,pep_sample,pep_ms,pep_ms,nan_reason,nan_reason,error_per_sample_ms,absolute_error_per_sample_ms,absolute_relative_error_per_sample_percent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,reference,estimated,estimated,reference,estimated,reference,estimated,reference,metric,metric,metric
q_wave_algorithm,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
q-wave-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,0,0,0,394,399,984,985,567,567,669.0,634,590.0,102.0,67,102.0,67,,,-35.0,35.0,52.238806
q-wave-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,1,1,1,984,985,1569,1569,1156,1156,1238.0,1206,581.0,82.0,50,82.0,50,,,-32.0,32.0,64.000000
q-wave-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,2,2,2,1569,1569,2134,2134,1735,1735,1834.0,1811,557.0,99.0,76,99.0,76,,,-23.0,23.0,30.263158
q-wave-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,3,3,3,2134,2134,2684,2684,2291,2291,2394.0,2382,546.0,103.0,91,103.0,91,,,-12.0,12.0,13.186813
q-wave-reference,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,4,4,4,2684,2684,3227,3227,2846,2846,2943.0,2926,541.0,97.0,80,97.0,80,,,-17.0,17.0,21.250000
q-wave-reference,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
q-wave-reference,stern1985,none,VP_032,ftsst,Pause_5,7,7,7,5864,5864,6660,6660,6110,6110,6210.0,6256,791.0,100.0,146,100.0,146,,,46.0,46.0,31.506849
q-wave-reference,stern1985,none,VP_032,ftsst,Pause_5,8,8,8,6660,6660,7458,7458,6900,6900,7052.0,7054,802.0,152.0,154,152.0,154,,,2.0,2.0,1.298701
q-wave-reference,stern1985,none,VP_032,ftsst,Pause_5,9,9,9,7458,7458,8267,8267,7701,7701,7857.0,7855,812.0,156.0,154,156.0,154,,,-2.0,2.0,1.298701
q-wave-reference,stern1985,none,VP_032,ftsst,Pause_5,10,10,10,8267,8267,9074,9074,8515,8515,8667.0,8673,804.0,152.0,158,152.0,158,,,6.0,6.0,3.797468


In [10]:
results_empkins_b_point_estimated = results_empkins.per_sample.droplevel("q_wave_algorithm")[[("heartbeat_id", "reference"), ("b_point_sample", "estimated"), ("b_point_sample", "reference")]]
results_empkins_b_point_estimated = results_empkins_b_point_estimated.reset_index()
results_empkins_b_point_estimated

Unnamed: 0_level_0,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,level_5,heartbeat_id,b_point_sample,b_point_sample
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,reference,estimated,reference
0,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,0,0,669.0,634
1,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,1,1,1238.0,1206
2,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,2,2,1834.0,1811
3,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,3,3,2394.0,2382
4,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,4,4,2943.0,2926
...,...,...,...,...,...,...,...,...,...
149995,stern1985,none,VP_032,ftsst,Pause_5,7,7,6210.0,6256
149996,stern1985,none,VP_032,ftsst,Pause_5,8,8,7052.0,7054
149997,stern1985,none,VP_032,ftsst,Pause_5,9,9,7857.0,7855
149998,stern1985,none,VP_032,ftsst,Pause_5,10,10,8667.0,8673


In [11]:
results_empkins_b_point_estimated.columns = results_empkins_b_point_estimated.columns.to_flat_index().str.join("")
results_empkins_b_point_estimated = results_empkins_b_point_estimated.drop(columns=["level_5"])
results_empkins_b_point_estimated

Unnamed: 0,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,heartbeat_idreference,b_point_sampleestimated,b_point_samplereference
0,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,0,669.0,634
1,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,1,1238.0,1206
2,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,2,1834.0,1811
3,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,3,2394.0,2382
4,arbol2017-isoelectric-crossings,forouzanfar2018,VP_001,tsst,Prep,4,2943.0,2926
...,...,...,...,...,...,...,...,...
149995,stern1985,none,VP_032,ftsst,Pause_5,7,6210.0,6256
149996,stern1985,none,VP_032,ftsst,Pause_5,8,7052.0,7054
149997,stern1985,none,VP_032,ftsst,Pause_5,9,7857.0,7855
149998,stern1985,none,VP_032,ftsst,Pause_5,10,8667.0,8673


In [12]:
results_empkins_b_point_estimated["b_point_algorithm_combi"] = results_empkins_b_point_estimated["b_point_algorithm"] + "_" + results_empkins_b_point_estimated["outlier_correction_algorithm"]
results_empkins_b_point_estimated = results_empkins_b_point_estimated.drop(columns=["b_point_algorithm", "outlier_correction_algorithm"])
results_empkins_b_point_estimated.reindex(level=["particpant", "condition", "phase"])

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_sampleestimated,b_point_samplereference,b_point_algorithm_combi
0,VP_001,tsst,Prep,0,669.0,634,arbol2017-isoelectric-crossings_forouzanfar2018
1,VP_001,tsst,Prep,1,1238.0,1206,arbol2017-isoelectric-crossings_forouzanfar2018
2,VP_001,tsst,Prep,2,1834.0,1811,arbol2017-isoelectric-crossings_forouzanfar2018
3,VP_001,tsst,Prep,3,2394.0,2382,arbol2017-isoelectric-crossings_forouzanfar2018
4,VP_001,tsst,Prep,4,2943.0,2926,arbol2017-isoelectric-crossings_forouzanfar2018
...,...,...,...,...,...,...,...
149995,VP_032,ftsst,Pause_5,7,6210.0,6256,stern1985_none
149996,VP_032,ftsst,Pause_5,8,7052.0,7054,stern1985_none
149997,VP_032,ftsst,Pause_5,9,7857.0,7855,stern1985_none
149998,VP_032,ftsst,Pause_5,10,8667.0,8673,stern1985_none


In [13]:
print("Possible B-Point algorithm combinations: ")
algo_list = results_empkins_b_point_estimated["b_point_algorithm_combi"].unique()
print(f"There are {len(algo_list)} possible algorithm combinations:")
algo_list

Possible B-Point algorithm combinations: 
There are 30 possible algorithm combinations:


array(['arbol2017-isoelectric-crossings_forouzanfar2018',
       'arbol2017-isoelectric-crossings_linear-interpolation',
       'arbol2017-isoelectric-crossings_none',
       'arbol2017-second-derivative_forouzanfar2018',
       'arbol2017-second-derivative_linear-interpolation',
       'arbol2017-second-derivative_none',
       'arbol2017-third-derivative_forouzanfar2018',
       'arbol2017-third-derivative_linear-interpolation',
       'arbol2017-third-derivative_none',
       'debski1993-second-derivative_forouzanfar2018',
       'debski1993-second-derivative_linear-interpolation',
       'debski1993-second-derivative_none', 'drost2022_forouzanfar2018',
       'drost2022_linear-interpolation', 'drost2022_none',
       'forounzafar2018_forouzanfar2018',
       'forounzafar2018_linear-interpolation', 'forounzafar2018_none',
       'lozano2007-linear-regression_forouzanfar2018',
       'lozano2007-linear-regression_linear-interpolation',
       'lozano2007-linear-regression_none',
    

In [14]:
print(f"Number of rows in the long dataframe: {results_empkins_b_point_estimated.shape[0]}")
print(f"Number of possible B-Point algorithm combinations: {len(algo_list)}")
print(f"Expected amount of rows in the wide datframe: {results_empkins_b_point_estimated.shape[0]/len(algo_list)}")

Number of rows in the long dataframe: 150000
Number of possible B-Point algorithm combinations: 30
Expected amount of rows in the wide datframe: 5000.0


### Convert the dataframe from the long to the wide format

In [15]:
# Pivot the DataFrame
df_empkins_pivot = results_empkins_b_point_estimated.pivot_table(
    index=['participant', 'condition', 'phase', 'heartbeat_idreference', 'b_point_samplereference'],
    columns='b_point_algorithm_combi',
    values='b_point_sampleestimated'
).reset_index()

# Flatten the columns
df_empkins_pivot.columns.name = None
df_empkins_pivot.columns = [f'{col}' if isinstance(col, str) else f'{col[1]}' for col in df_empkins_pivot.columns]
df_empkins_pivot

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_samplereference,arbol2017-isoelectric-crossings_forouzanfar2018,arbol2017-isoelectric-crossings_linear-interpolation,arbol2017-isoelectric-crossings_none,arbol2017-second-derivative_forouzanfar2018,arbol2017-second-derivative_linear-interpolation,...,lozano2007-linear-regression_none,lozano2007-quadratic-regression_forouzanfar2018,lozano2007-quadratic-regression_linear-interpolation,lozano2007-quadratic-regression_none,sherwood1990_forouzanfar2018,sherwood1990_linear-interpolation,sherwood1990_none,stern1985_forouzanfar2018,stern1985_linear-interpolation,stern1985_none
0,VP_001,ftsst,Math,1,1074,1111.0,1111.0,1111.0,1080.0,1080.0,...,1126.0,1125.0,1125.0,1125.0,1111.0,1111.0,1111.0,1108.0,1079.0,1022.0
1,VP_001,ftsst,Math,2,1849,1844.0,1844.0,1844.0,1810.0,1809.0,...,1860.0,1856.0,1856.0,1856.0,1853.0,1853.0,1853.0,1826.0,1818.0,1765.0
2,VP_001,ftsst,Math,3,2518,2558.0,2558.0,2558.0,2494.0,2494.0,...,2565.0,2565.0,2565.0,2565.0,2551.0,2551.0,2551.0,2516.0,2516.0,2516.0
3,VP_001,ftsst,Math,4,3252,3264.0,3264.0,3264.0,3215.0,3212.0,...,3280.0,3291.0,3294.0,3278.0,3279.0,3279.0,3279.0,3247.0,3247.0,3247.0
4,VP_001,ftsst,Math,5,3933,3942.0,3942.0,3942.0,3881.0,3879.0,...,3941.0,3961.0,3962.0,3937.0,3925.0,3936.0,3872.0,3932.0,3932.0,3932.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,VP_032,tsst,Talk,39,26419,26444.0,26444.0,26444.0,26385.0,26385.0,...,26414.0,26403.0,26403.0,26403.0,26446.0,26446.0,26446.0,26415.0,26415.0,26415.0
4991,VP_032,tsst,Talk,40,27042,27051.0,27051.0,27051.0,27018.0,27018.0,...,27041.0,27036.0,27037.0,27023.0,27049.0,27049.0,27049.0,27007.0,27007.0,27007.0
4992,VP_032,tsst,Talk,41,27719,27696.0,27696.0,27696.0,27679.0,27679.0,...,27711.0,27701.0,27701.0,27701.0,27690.0,27690.0,27690.0,27666.0,27666.0,27666.0
4993,VP_032,tsst,Talk,42,28395,28449.0,28449.0,28449.0,28408.0,28408.0,...,28435.0,28421.0,28421.0,28421.0,28452.0,28452.0,28452.0,28389.0,28389.0,28389.0


In [16]:
print(f"Max heartbeat id long dataframe: {max(results_empkins_b_point_estimated["heartbeat_idreference"])}")
print(f"Max heartbeat id wide dataframe: {max(df_empkins_pivot["heartbeat_idreference"])}")

Max heartbeat id long dataframe: 73
Max heartbeat id wide dataframe: 73


In [17]:
vp_001_tsst_long = results_empkins_b_point_estimated[results_empkins_b_point_estimated["participant"] == "VP_001"][results_empkins_b_point_estimated["condition"] == "tsst"]
vp_01_tsst_wide = df_empkins_pivot[df_empkins_pivot["participant"] == "VP_001"][df_empkins_pivot["condition"] == "tsst"]

  vp_001_tsst_long = results_empkins_b_point_estimated[results_empkins_b_point_estimated["participant"] == "VP_001"][results_empkins_b_point_estimated["condition"] == "tsst"]
  vp_01_tsst_wide = df_empkins_pivot[df_empkins_pivot["participant"] == "VP_001"][df_empkins_pivot["condition"] == "tsst"]


In [18]:
print(f"Lenght of VP01 - tsst long / Number of B-Point algorithm combinations: {vp_001_tsst_long.shape[0]/len(algo_list)} should match length of VP01 - tsst wide: {vp_01_tsst_wide.shape[0]}")

Lenght of VP01 - tsst long / Number of B-Point algorithm combinations: 179.0 should match length of VP01 - tsst wide: 179


### Save the pivot dataframe

In [19]:
if save_results:
    df_empkins_pivot.to_csv(result_path.joinpath("pivot_dataframe_b_point/empkins_pivot_dataframe_b_point_all_algos.csv"))

### Convert samples to ms

In [20]:
df_empkins_pivot_ms = df_empkins_pivot.copy()
exclude_cols = ["participant", "condition", "phase", "heartbeat_idreference"]
df_empkins_pivot_ms.loc[:, ~df_empkins_pivot_ms.columns.isin(exclude_cols)] = df_empkins_pivot_ms.loc[:, ~df_empkins_pivot_ms.columns.isin(exclude_cols)].apply(lambda x: x * convert_hz_to_ms(fs_empkins))
df_empkins_pivot_ms          

  df_empkins_pivot_ms.loc[:, ~df_empkins_pivot_ms.columns.isin(exclude_cols)] = df_empkins_pivot_ms.loc[:, ~df_empkins_pivot_ms.columns.isin(exclude_cols)].apply(lambda x: x * convert_hz_to_ms(fs_empkins))


Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_samplereference,arbol2017-isoelectric-crossings_forouzanfar2018,arbol2017-isoelectric-crossings_linear-interpolation,arbol2017-isoelectric-crossings_none,arbol2017-second-derivative_forouzanfar2018,arbol2017-second-derivative_linear-interpolation,...,lozano2007-linear-regression_none,lozano2007-quadratic-regression_forouzanfar2018,lozano2007-quadratic-regression_linear-interpolation,lozano2007-quadratic-regression_none,sherwood1990_forouzanfar2018,sherwood1990_linear-interpolation,sherwood1990_none,stern1985_forouzanfar2018,stern1985_linear-interpolation,stern1985_none
0,VP_001,ftsst,Math,1,1074.0,1111.0,1111.0,1111.0,1080.0,1080.0,...,1126.0,1125.0,1125.0,1125.0,1111.0,1111.0,1111.0,1108.0,1079.0,1022.0
1,VP_001,ftsst,Math,2,1849.0,1844.0,1844.0,1844.0,1810.0,1809.0,...,1860.0,1856.0,1856.0,1856.0,1853.0,1853.0,1853.0,1826.0,1818.0,1765.0
2,VP_001,ftsst,Math,3,2518.0,2558.0,2558.0,2558.0,2494.0,2494.0,...,2565.0,2565.0,2565.0,2565.0,2551.0,2551.0,2551.0,2516.0,2516.0,2516.0
3,VP_001,ftsst,Math,4,3252.0,3264.0,3264.0,3264.0,3215.0,3212.0,...,3280.0,3291.0,3294.0,3278.0,3279.0,3279.0,3279.0,3247.0,3247.0,3247.0
4,VP_001,ftsst,Math,5,3933.0,3942.0,3942.0,3942.0,3881.0,3879.0,...,3941.0,3961.0,3962.0,3937.0,3925.0,3936.0,3872.0,3932.0,3932.0,3932.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,VP_032,tsst,Talk,39,26419.0,26444.0,26444.0,26444.0,26385.0,26385.0,...,26414.0,26403.0,26403.0,26403.0,26446.0,26446.0,26446.0,26415.0,26415.0,26415.0
4991,VP_032,tsst,Talk,40,27042.0,27051.0,27051.0,27051.0,27018.0,27018.0,...,27041.0,27036.0,27037.0,27023.0,27049.0,27049.0,27049.0,27007.0,27007.0,27007.0
4992,VP_032,tsst,Talk,41,27719.0,27696.0,27696.0,27696.0,27679.0,27679.0,...,27711.0,27701.0,27701.0,27701.0,27690.0,27690.0,27690.0,27666.0,27666.0,27666.0
4993,VP_032,tsst,Talk,42,28395.0,28449.0,28449.0,28449.0,28408.0,28408.0,...,28435.0,28421.0,28421.0,28421.0,28452.0,28452.0,28452.0,28389.0,28389.0,28389.0


### Save the converted dataframe

In [21]:
if save_results:
    df_empkins_pivot_ms.to_csv(result_path.joinpath("pivot_dataframe_b_point/empkins_pivot_dataframe_b_point_all_algos_ms.csv"))