# Building the feature table

- Features: B-Point locations extracted by the different B-Point algorithms
- Target variable: Manually labeled reference B-Point location
- Samples: Data on the heartbeat level

## Setup and helper functions

In [1]:
import json

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp
from biopsykit.stats import StatsPipeline

import matplotlib.pyplot as plt

from pepbench.io import load_challenge_results_from_folder
from pepbench.datasets import EmpkinsDataset
from pepbench.data_handling import get_reference_pep, compute_pep_performance_metrics
from pepbench.data_handling.utils import reindex_empkins, reindex_guardian, rename_empkins, rename_guardian

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
# Relative path two directory levels above the current working directory; the config file and the
# results folder are resolved against it in the following cells.
root_path = Path("../../")

In [3]:
# Key of the deployment section to read from config.json.
deploy_type = "local"

# Read the configuration inside a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with root_path.joinpath("config.json").open(encoding="utf-8") as config_file:
    config_dict = json.load(config_file)

# Dataset base paths configured for the selected deployment type.
empkins_base_path = Path(config_dict[deploy_type]["empkins_path"])
guardian_base_path = Path(config_dict[deploy_type]["guardian_path"])
print(empkins_base_path)

\Users\sebas\Development\ResearchInternship\Data\2024_08_PEP_Benchmarking\EmpkinS_Dataset


In [4]:
# Folder below the project root from which the result files are loaded; shown as the cell output.
result_path = root_path / "results"
result_path

WindowsPath('../../results')

In [5]:
# Dataset object for the EmpkinS data, created with `use_cache=True` and `only_labeled=True`;
# the trailing expression renders its index table.
dataset_empkins = EmpkinsDataset(
    empkins_base_path,
    use_cache=True,
    only_labeled=True,
)
dataset_empkins

Unnamed: 0,participant,condition,phase
0,VP_001,tsst,Prep
1,VP_001,tsst,Pause_1
2,VP_001,tsst,Talk
3,VP_001,tsst,Math
4,VP_001,tsst,Pause_5
...,...,...,...
145,VP_032,ftsst,Prep
146,VP_032,ftsst,Pause_1
147,VP_032,ftsst,Talk
148,VP_032,ftsst,Math


In [6]:
# Mapping from the algorithm index-level names to human-readable labels.
algo_level_mapping = {
    "q_wave_algorithm": "Q-Wave Algorithm",
    "b_point_algorithm": "B-Point Algorithm",
    "outlier_correction_algorithm": "Outlier Correction",
}
# Level names in the same order as the mapping keys.
algo_levels = list(algo_level_mapping)

In [7]:
# Load the challenge results stored in the "empkins_dataset_reference_q_wave" result folder,
# using participant / condition / phase as the per-sample index columns.
empkins_result_folder = result_path.joinpath("empkins_dataset_reference_q_wave").resolve()
results_empkins = load_challenge_results_from_folder(
    empkins_result_folder,
    index_cols_per_sample=["participant", "condition", "phase"],
)

### Extract the reference B-Point samples

In [8]:
# Remove the "q_wave_algorithm", "condition", and "phase" index levels, then keep only the
# reference heartbeat id and the reference B-Point sample columns.
per_sample_without_context = results_empkins.per_sample.droplevel(["q_wave_algorithm", "condition", "phase"])
results_empkins_b_point_reference = per_sample_without_context[
    [("heartbeat_id", "reference"), ("b_point_sample", "reference")]
]
results_empkins_b_point_reference

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,heartbeat_id,b_point_sample
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,reference,reference
b_point_algorithm,outlier_correction_algorithm,participant,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
multiple-conditions,autoregression,VP_001,0,0,634
multiple-conditions,autoregression,VP_001,1,1,1206
multiple-conditions,autoregression,VP_001,2,2,1811
multiple-conditions,autoregression,VP_001,3,3,2382
multiple-conditions,autoregression,VP_001,4,4,2926
...,...,...,...,...,...
third-derivative,none,VP_032,7,7,6256
third-derivative,none,VP_032,8,8,7054
third-derivative,none,VP_032,9,9,7855
third-derivative,none,VP_032,10,10,8673


In [9]:
# Remove the "q_wave_algorithm" index level, then keep the reference heartbeat id together with
# the estimated B-Point sample of every algorithm combination.
per_sample_without_q_wave = results_empkins.per_sample.droplevel("q_wave_algorithm")
results_empkins_b_point_estimated = per_sample_without_q_wave[
    [("heartbeat_id", "reference"), ("b_point_sample", "estimated")]
]
results_empkins_b_point_estimated

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,heartbeat_id,b_point_sample
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,reference,estimated
b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
multiple-conditions,autoregression,VP_001,tsst,Prep,0,0,619.0
multiple-conditions,autoregression,VP_001,tsst,Prep,1,1,1198.0
multiple-conditions,autoregression,VP_001,tsst,Prep,2,2,1777.0
multiple-conditions,autoregression,VP_001,tsst,Prep,3,3,2331.0
multiple-conditions,autoregression,VP_001,tsst,Prep,4,4,2924.0
...,...,...,...,...,...,...,...
third-derivative,none,VP_032,ftsst,Pause_5,7,7,6281.0
third-derivative,none,VP_032,ftsst,Pause_5,8,8,7069.0
third-derivative,none,VP_032,ftsst,Pause_5,9,9,7870.0
third-derivative,none,VP_032,ftsst,Pause_5,10,10,8682.0


In [10]:
# Move all index levels into regular columns; the unnamed innermost level becomes "level_5".
results_empkins_b_point_estimated = results_empkins_b_point_estimated.reset_index(drop=False)
results_empkins_b_point_estimated

Unnamed: 0_level_0,b_point_algorithm,outlier_correction_algorithm,participant,condition,phase,level_5,heartbeat_id,b_point_sample
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,reference,estimated
0,multiple-conditions,autoregression,VP_001,tsst,Prep,0,0,619.0
1,multiple-conditions,autoregression,VP_001,tsst,Prep,1,1,1198.0
2,multiple-conditions,autoregression,VP_001,tsst,Prep,2,2,1777.0
3,multiple-conditions,autoregression,VP_001,tsst,Prep,3,3,2331.0
4,multiple-conditions,autoregression,VP_001,tsst,Prep,4,4,2924.0
...,...,...,...,...,...,...,...,...
59995,third-derivative,none,VP_032,ftsst,Pause_5,7,7,6281.0
59996,third-derivative,none,VP_032,ftsst,Pause_5,8,8,7069.0
59997,third-derivative,none,VP_032,ftsst,Pause_5,9,9,7870.0
59998,third-derivative,none,VP_032,ftsst,Pause_5,10,10,8682.0


In [11]:
# Collapse the two-level column index into plain strings by concatenating both level values
# without a separator (e.g. ("heartbeat_id", "reference") -> "heartbeat_idreference").
results_empkins_b_point_estimated.columns = [
    "".join(column_levels) for column_levels in results_empkins_b_point_estimated.columns.to_flat_index()
]

In [12]:
# Discard the positional helper column and the condition / phase columns.
# NOTE(review): the displayed heartbeat ids appear to restart within each phase, so after this
# drop (participant, heartbeat id) presumably no longer identifies a heartbeat uniquely - confirm.
results_empkins_b_point_estimated = results_empkins_b_point_estimated.drop(
    ["level_5", "condition", "phase"], axis="columns"
)
results_empkins_b_point_estimated

Unnamed: 0,b_point_algorithm,outlier_correction_algorithm,participant,heartbeat_idreference,b_point_sampleestimated
0,multiple-conditions,autoregression,VP_001,0,619.0
1,multiple-conditions,autoregression,VP_001,1,1198.0
2,multiple-conditions,autoregression,VP_001,2,1777.0
3,multiple-conditions,autoregression,VP_001,3,2331.0
4,multiple-conditions,autoregression,VP_001,4,2924.0
...,...,...,...,...,...
59995,third-derivative,none,VP_032,7,6281.0
59996,third-derivative,none,VP_032,8,7069.0
59997,third-derivative,none,VP_032,9,7870.0
59998,third-derivative,none,VP_032,10,8682.0


In [13]:
# Build one identifier per algorithm combination ("<b_point_algorithm>_<outlier_correction_algorithm>")
# and replace the two separate algorithm columns with it.
b_point_algorithm_names = results_empkins_b_point_estimated["b_point_algorithm"]
outlier_correction_names = results_empkins_b_point_estimated["outlier_correction_algorithm"]
results_empkins_b_point_estimated = results_empkins_b_point_estimated.assign(
    b_point_algorithm_combi=b_point_algorithm_names + "_" + outlier_correction_names
).drop(columns=["b_point_algorithm", "outlier_correction_algorithm"])
results_empkins_b_point_estimated

Unnamed: 0,participant,heartbeat_idreference,b_point_sampleestimated,b_point_algorithm_combi
0,VP_001,0,619.0,multiple-conditions_autoregression
1,VP_001,1,1198.0,multiple-conditions_autoregression
2,VP_001,2,1777.0,multiple-conditions_autoregression
3,VP_001,3,2331.0,multiple-conditions_autoregression
4,VP_001,4,2924.0,multiple-conditions_autoregression
...,...,...,...,...
59995,VP_032,7,6281.0,third-derivative_none
59996,VP_032,8,7069.0,third-derivative_none
59997,VP_032,9,7870.0,third-derivative_none
59998,VP_032,10,8682.0,third-derivative_none


In [14]:
# Display the current state of the table. A `reindex(level=[...])` call was dropped here: it passed
# no new labels, misspelled "participant", and referred to the already removed "condition" and
# "phase" columns on a frame with a flat index, so it returned the table unchanged.
results_empkins_b_point_estimated

Unnamed: 0,participant,heartbeat_idreference,b_point_sampleestimated,b_point_algorithm_combi
0,VP_001,0,619.0,multiple-conditions_autoregression
1,VP_001,1,1198.0,multiple-conditions_autoregression
2,VP_001,2,1777.0,multiple-conditions_autoregression
3,VP_001,3,2331.0,multiple-conditions_autoregression
4,VP_001,4,2924.0,multiple-conditions_autoregression
...,...,...,...,...
59995,VP_032,7,6281.0,third-derivative_none
59996,VP_032,8,7069.0,third-derivative_none
59997,VP_032,9,7870.0,third-derivative_none
59998,VP_032,10,8682.0,third-derivative_none


In [15]:
fraction = 0.7  # share of rows assigned to the first part; the remaining 30% form the second part

# Split the DataFrame randomly into two disjoint parts on the row level (fixed seed for reproducibility).
# NOTE(review): rows are sampled individually and are NOT grouped by the 'participant' column, so
# rows of the same participant can end up in both parts - confirm whether a participant-wise
# split is intended.
df_part1 = results_empkins_b_point_estimated.sample(frac=fraction, random_state=1, axis=0)
df_part2 = results_empkins_b_point_estimated.drop(df_part1.index)

# Sanity checks: both parts are disjoint and together cover the complete table.
assert df_part1.index.intersection(df_part2.index).empty
assert len(df_part1) + len(df_part2) == len(results_empkins_b_point_estimated)


In [16]:
# Show the first (randomly sampled) part of the split; row labels are the original row positions.
df_part1

Unnamed: 0,participant,heartbeat_idreference,b_point_sampleestimated,b_point_algorithm_combi
15281,VP_001,38,25210.0,second-derivative_autoregression
21435,VP_005,24,13018.0,second-derivative_linear-interpolation
44536,VP_031,27,12312.0,straight-line_none
13518,VP_028,38,20829.0,multiple-conditions_none
47529,VP_026,12,6301.0,third-derivative_autoregression
...,...,...,...,...
20613,VP_002,3,1872.0,second-derivative_linear-interpolation
45260,VP_001,17,11505.0,third-derivative_autoregression
25884,VP_003,41,28660.0,second-derivative_none
47171,VP_022,8,7314.0,third-derivative_autoregression
