# Explore the PEP extraction results on the Empkins data

Observations:  
- For each phase the heartbeat ID starts at zero

## Setup and helper functions

In [1]:
import json

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp
from biopsykit.stats import StatsPipeline

import matplotlib.pyplot as plt

from pepbench.io import load_challenge_results_from_folder
from pepbench.datasets import EmpkinsDataset

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
root_path = Path("../../")

In [3]:
deploy_type = "local"

config_dict = json.load(root_path.joinpath("config.json").open(encoding="utf-8"))

empkins_base_path = Path(config_dict[deploy_type]["empkins_path"])
guardian_base_path = Path(config_dict[deploy_type]["guardian_path"])
print(empkins_base_path)

\Users\sebas\Development\ResearchInternship\Data\2024_08_PEP_Benchmarking\EmpkinS_Dataset


In [4]:
result_path = root_path.joinpath("results")
result_path

WindowsPath('../../results')

In [5]:
dataset_empkins = EmpkinsDataset(empkins_base_path, use_cache=True, only_labeled=True)
dataset_empkins

Unnamed: 0,participant,condition,phase
0,VP_001,tsst,Prep
1,VP_001,tsst,Pause_1
2,VP_001,tsst,Talk
3,VP_001,tsst,Math
4,VP_001,tsst,Pause_5
...,...,...,...
145,VP_032,ftsst,Prep
146,VP_032,ftsst,Pause_1
147,VP_032,ftsst,Talk
148,VP_032,ftsst,Math


In [6]:
algo_levels = ["q_wave_algorithm", "b_point_algorithm", "outlier_correction_algorithm"]
algo_level_mapping = dict(zip(algo_levels, ["Q-Wave Algorithm", "B-Point Algorithm", "Outlier Correction"]))

In [7]:
results_empkins = load_challenge_results_from_folder(
    result_path.joinpath("empkins_dataset_reference_q_wave").resolve(), index_cols_per_sample=["participant", "condition", "phase"]
)

In [8]:
results_empkins_b_point_estimated = results_empkins.per_sample.droplevel("q_wave_algorithm")[[("heartbeat_id", "reference"), ("b_point_sample", "estimated"), ("b_point_sample", "reference")]]
results_empkins_b_point_estimated = results_empkins_b_point_estimated.reset_index()
results_empkins_b_point_estimated

Unnamed: 0_level_0,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,level_5,heartbeat_id,b_point_sample,b_point_sample
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,reference,estimated,reference
0,multiple-conditions,autoregression,VP_001,tsst,Prep,0,0,619.0,634
1,multiple-conditions,autoregression,VP_001,tsst,Prep,1,1,1198.0,1206
2,multiple-conditions,autoregression,VP_001,tsst,Prep,2,2,1777.0,1811
3,multiple-conditions,autoregression,VP_001,tsst,Prep,3,3,2331.0,2382
4,multiple-conditions,autoregression,VP_001,tsst,Prep,4,4,2924.0,2926
...,...,...,...,...,...,...,...,...,...
59995,third-derivative,none,VP_032,ftsst,Pause_5,7,7,6281.0,6256
59996,third-derivative,none,VP_032,ftsst,Pause_5,8,8,7069.0,7054
59997,third-derivative,none,VP_032,ftsst,Pause_5,9,9,7870.0,7855
59998,third-derivative,none,VP_032,ftsst,Pause_5,10,10,8682.0,8673


In [9]:
results_empkins_b_point_estimated.columns = results_empkins_b_point_estimated.columns.to_flat_index().str.join("")
results_empkins_b_point_estimated = results_empkins_b_point_estimated.drop(columns=["level_5"])
results_empkins_b_point_estimated

Unnamed: 0,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,heartbeat_idreference,b_point_sampleestimated,b_point_samplereference
0,multiple-conditions,autoregression,VP_001,tsst,Prep,0,619.0,634
1,multiple-conditions,autoregression,VP_001,tsst,Prep,1,1198.0,1206
2,multiple-conditions,autoregression,VP_001,tsst,Prep,2,1777.0,1811
3,multiple-conditions,autoregression,VP_001,tsst,Prep,3,2331.0,2382
4,multiple-conditions,autoregression,VP_001,tsst,Prep,4,2924.0,2926
...,...,...,...,...,...,...,...,...
59995,third-derivative,none,VP_032,ftsst,Pause_5,7,6281.0,6256
59996,third-derivative,none,VP_032,ftsst,Pause_5,8,7069.0,7054
59997,third-derivative,none,VP_032,ftsst,Pause_5,9,7870.0,7855
59998,third-derivative,none,VP_032,ftsst,Pause_5,10,8682.0,8673


In [10]:
results_empkins_b_point_estimated["b_point_algorithm_combi"] = results_empkins_b_point_estimated["b_point_algorithm"] + "_" + results_empkins_b_point_estimated["outlier_correction_algorithm"]
results_empkins_b_point_estimated = results_empkins_b_point_estimated.drop(columns=["b_point_algorithm", "outlier_correction_algorithm"])
results_empkins_b_point_estimated.reindex(level=["particpant", "condition", "phase"])

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_sampleestimated,b_point_samplereference,b_point_algorithm_combi
0,VP_001,tsst,Prep,0,619.0,634,multiple-conditions_autoregression
1,VP_001,tsst,Prep,1,1198.0,1206,multiple-conditions_autoregression
2,VP_001,tsst,Prep,2,1777.0,1811,multiple-conditions_autoregression
3,VP_001,tsst,Prep,3,2331.0,2382,multiple-conditions_autoregression
4,VP_001,tsst,Prep,4,2924.0,2926,multiple-conditions_autoregression
...,...,...,...,...,...,...,...
59995,VP_032,ftsst,Pause_5,7,6281.0,6256,third-derivative_none
59996,VP_032,ftsst,Pause_5,8,7069.0,7054,third-derivative_none
59997,VP_032,ftsst,Pause_5,9,7870.0,7855,third-derivative_none
59998,VP_032,ftsst,Pause_5,10,8682.0,8673,third-derivative_none


In [19]:
print("Possible B-Point algorithm combinations: ")
algo_list = results_empkins_b_point_estimated["b_point_algorithm_combi"].unique()
print(f"There are {len(algo_list)} possible algorithm combinations:")
algo_list

Possible B-Point algorithm combinations: 
There are 12 possible algorithm combinations:


array(['multiple-conditions_autoregression',
       'multiple-conditions_linear-interpolation',
       'multiple-conditions_none', 'second-derivative_autoregression',
       'second-derivative_linear-interpolation', 'second-derivative_none',
       'straight-line_autoregression',
       'straight-line_linear-interpolation', 'straight-line_none',
       'third-derivative_autoregression',
       'third-derivative_linear-interpolation', 'third-derivative_none'],
      dtype=object)

In [12]:
print(f"Number of rows in the long dataframe: {results_empkins_b_point_estimated.shape[0]}")
print(f"Number of possible B-Point algorithm combinations: {len(algo_list)}")
print(f"Expected amount of rows in the wide datframe: {results_empkins_b_point_estimated.shape[0]/len(algo_list)}")

Number of rows in the long dataframe: 60000
Number of possible B-Point algorithm combinations: 12
Expected amount of rows in the wide datframe: 5000.0


### Convert the dataframe from the long to the wide format

In [13]:
# Pivot the DataFrame
df_empkins_pivot = results_empkins_b_point_estimated.pivot_table(
    index=['participant', 'condition', 'phase', 'heartbeat_idreference', 'b_point_samplereference'],
    columns='b_point_algorithm_combi',
    values='b_point_sampleestimated'
).reset_index()

# Flatten the columns
df_empkins_pivot.columns.name = None
df_empkins_pivot.columns = [f'{col}' if isinstance(col, str) else f'{col[1]}' for col in df_empkins_pivot.columns]
df_empkins_pivot

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_samplereference,multiple-conditions_autoregression,multiple-conditions_linear-interpolation,multiple-conditions_none,second-derivative_autoregression,second-derivative_linear-interpolation,second-derivative_none,straight-line_autoregression,straight-line_linear-interpolation,straight-line_none,third-derivative_autoregression,third-derivative_linear-interpolation,third-derivative_none
0,VP_001,ftsst,Math,1,1074,1231.0,1238.0,1063.0,1063.0,1063.0,1063.0,1116.0,1116.0,1116.0,1134.0,1134.0,1134.0
1,VP_001,ftsst,Math,2,1849,1888.0,1889.0,1816.0,1829.0,1829.0,1829.0,1853.0,1853.0,1853.0,1872.0,1879.0,1859.0
2,VP_001,ftsst,Math,3,2518,2517.0,2517.0,2517.0,2508.0,2508.0,2508.0,2542.0,2542.0,2542.0,2577.0,2577.0,2577.0
3,VP_001,ftsst,Math,4,3252,3247.0,3247.0,3247.0,3228.0,3228.0,3228.0,3270.0,3270.0,3270.0,3295.0,3295.0,3295.0
4,VP_001,ftsst,Math,5,3933,3932.0,3932.0,3932.0,3917.0,3917.0,3917.0,3940.0,3940.0,3940.0,3949.0,3956.0,3857.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,VP_032,tsst,Talk,39,26419,26415.0,26415.0,26415.0,26418.0,26421.0,26409.0,26427.0,26427.0,26427.0,26433.0,26433.0,26433.0
4991,VP_032,tsst,Talk,40,27042,27031.0,27031.0,27031.0,27038.0,27047.0,27031.0,27042.0,27042.0,27042.0,27062.0,27065.0,27050.0
4992,VP_032,tsst,Talk,41,27719,27688.0,27688.0,27688.0,27709.0,27709.0,27709.0,27722.0,27722.0,27722.0,27727.0,27727.0,27727.0
4993,VP_032,tsst,Talk,42,28395,28419.0,28419.0,28419.0,28431.0,28431.0,28431.0,28438.0,28438.0,28438.0,28450.0,28450.0,28450.0


In [14]:
print(f"Max heartbeat id long dataframe: {max(results_empkins_b_point_estimated["heartbeat_idreference"])}")
print(f"Max heartbeat id wide dataframe: {max(df_empkins_pivot["heartbeat_idreference"])}")

Max heartbeat id long dataframe: 73
Max heartbeat id wide dataframe: 73


In [15]:
vp_001_tsst_long = results_empkins_b_point_estimated[results_empkins_b_point_estimated["participant"] == "VP_001"][results_empkins_b_point_estimated["condition"] == "tsst"]
vp_01_tsst_wide = df_empkins_pivot[df_empkins_pivot["participant"] == "VP_001"][df_empkins_pivot["condition"] == "tsst"]

  vp_001_tsst_long = results_empkins_b_point_estimated[results_empkins_b_point_estimated["participant"] == "VP_001"][results_empkins_b_point_estimated["condition"] == "tsst"]
  vp_01_tsst_wide = df_empkins_pivot[df_empkins_pivot["participant"] == "VP_001"][df_empkins_pivot["condition"] == "tsst"]


In [16]:
print(f"Lenght of VP01 - tsst long / Number of B-Point algorithm combinations: {vp_001_tsst_long.shape[0]/len(algo_list)} should match length of VP01 - tsst wide: {vp_01_tsst_wide.shape[0]}")

Lenght of VP01 - tsst long / Number of B-Point algorithm combinations: 179.0 should match length of VP01 - tsst wide: 179


### Save the pivot dataframe

In [20]:
df_empkins_pivot

RangeIndex(start=0, stop=4995, step=1)

In [17]:
#df_empkins_pivot.to_csv(result_path.joinpath("pivot_dataframe_b_point/empkins_pivot_dataframe_b_point_algos.csv"))

In [21]:
#df_empkins_pivot["ID"] = df_empkins_pivot.index.astype(int)
#df_empkins_pivot

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_samplereference,multiple-conditions_autoregression,multiple-conditions_linear-interpolation,multiple-conditions_none,second-derivative_autoregression,second-derivative_linear-interpolation,second-derivative_none,straight-line_autoregression,straight-line_linear-interpolation,straight-line_none,third-derivative_autoregression,third-derivative_linear-interpolation,third-derivative_none,ID
0,VP_001,ftsst,Math,1,1074,1231.0,1238.0,1063.0,1063.0,1063.0,1063.0,1116.0,1116.0,1116.0,1134.0,1134.0,1134.0,0
1,VP_001,ftsst,Math,2,1849,1888.0,1889.0,1816.0,1829.0,1829.0,1829.0,1853.0,1853.0,1853.0,1872.0,1879.0,1859.0,1
2,VP_001,ftsst,Math,3,2518,2517.0,2517.0,2517.0,2508.0,2508.0,2508.0,2542.0,2542.0,2542.0,2577.0,2577.0,2577.0,2
3,VP_001,ftsst,Math,4,3252,3247.0,3247.0,3247.0,3228.0,3228.0,3228.0,3270.0,3270.0,3270.0,3295.0,3295.0,3295.0,3
4,VP_001,ftsst,Math,5,3933,3932.0,3932.0,3932.0,3917.0,3917.0,3917.0,3940.0,3940.0,3940.0,3949.0,3956.0,3857.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,VP_032,tsst,Talk,39,26419,26415.0,26415.0,26415.0,26418.0,26421.0,26409.0,26427.0,26427.0,26427.0,26433.0,26433.0,26433.0,4990
4991,VP_032,tsst,Talk,40,27042,27031.0,27031.0,27031.0,27038.0,27047.0,27031.0,27042.0,27042.0,27042.0,27062.0,27065.0,27050.0,4991
4992,VP_032,tsst,Talk,41,27719,27688.0,27688.0,27688.0,27709.0,27709.0,27709.0,27722.0,27722.0,27722.0,27727.0,27727.0,27727.0,4992
4993,VP_032,tsst,Talk,42,28395,28419.0,28419.0,28419.0,28431.0,28431.0,28431.0,28438.0,28438.0,28438.0,28450.0,28450.0,28450.0,4993


### Split the data into training and test set 
- Do not split participants

In [20]:
fraction = 0.7  # 50% of the data in each split

# Split the DataFrame randomly into two parts based on the 'participant' column
df_part1 = results_empkins_b_point_estimated.sample(frac=fraction, random_state=1, axis=0)
df_part2 = results_empkins_b_point_estimated.drop(df_part1.index)


In [21]:
df_part1

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_sampleestimated,b_point_samplereference,b_point_algorithm_combi
15281,VP_001,ftsst,Talk,38,25210.0,25182,second-derivative_autoregression
21435,VP_005,tsst,Math,24,13018.0,12994,second-derivative_linear-interpolation
44536,VP_031,tsst,Math,27,12312.0,12304,straight-line_none
13518,VP_028,ftsst,Prep,38,20829.0,20899,multiple-conditions_none
47529,VP_026,tsst,Prep,12,6301.0,6284,third-derivative_autoregression
...,...,...,...,...,...,...,...
20613,VP_002,ftsst,Pause_1,3,1872.0,1881,second-derivative_linear-interpolation
45260,VP_001,ftsst,Talk,17,11505.0,11426,third-derivative_autoregression
25884,VP_003,tsst,Math,41,28660.0,28678,second-derivative_none
47171,VP_022,ftsst,Talk,8,7314.0,7299,third-derivative_autoregression
