# Explore the PEP extraction results on the Guardian Data

## Setup and helper functions

In [1]:
import json

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp
from biopsykit.stats import StatsPipeline

import matplotlib.pyplot as plt

from pepbench.io import load_challenge_results_from_folder
from pepbench.datasets import GuardianDataset
from pepbench.data_handling import get_reference_pep, compute_pep_performance_metrics
from pepbench.data_handling.utils import reindex_empkins, reindex_guardian, rename_empkins, rename_guardian

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
root_path = Path("../../")

In [3]:
deploy_type = "local"

config_dict = json.load(root_path.joinpath("config.json").open(encoding="utf-8"))

empkins_base_path = Path(config_dict[deploy_type]["empkins_path"])
guardian_base_path = Path(config_dict[deploy_type]["guardian_path"])
print(guardian_base_path)

\Users\sebas\Development\ResearchInternship\Data\2024_08_PEP_Benchmarking\Guardian_Dataset


In [4]:
result_path = root_path.joinpath("results")
result_path

WindowsPath('../../results')

In [5]:
dataset_guardian = GuardianDataset(guardian_base_path, use_cache=True, only_labeled=True)
dataset_guardian

Unnamed: 0,participant,phase
0,GDN0005,Pause
1,GDN0005,Valsalva
2,GDN0005,HoldingBreath
3,GDN0005,TiltUp
4,GDN0005,TiltDown
...,...,...
101,GDN0029,TiltUp
102,GDN0029,TiltDown
103,GDN0030,Valsalva
104,GDN0030,HoldingBreath


In [6]:
algo_levels = ["q_wave_algorithm", "b_point_algorithm", "outlier_correction_algorithm"]
algo_level_mapping = dict(zip(algo_levels, ["Q-Wave Algorithm", "B-Point Algorithm", "Outlier Correction"]))

In [16]:
results_guardian = load_challenge_results_from_folder(
    result_path.joinpath("guardian_dataset_reference_q_wave").resolve(), index_cols_per_sample=["participant", "phase"]
)

In [17]:
results_guardian_b_point = results_guardian.per_sample.droplevel("q_wave_algorithm")[[("heartbeat_id", "reference"), ("b_point_sample", "estimated"), ("b_point_sample", "reference")]]
results_guardian_b_point = results_guardian_b_point.reset_index()
results_guardian_b_point

Unnamed: 0_level_0,b_point_algorithm,outlier_correction_algorithm,participant,phase,level_4,heartbeat_id,b_point_sample,b_point_sample
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,reference,estimated,reference
0,multiple-conditions,autoregression,GDN0005,Pause,0,0,556.0,568
1,multiple-conditions,autoregression,GDN0005,Pause,1,1,1019.0,1025
2,multiple-conditions,autoregression,GDN0005,Pause,2,2,1453.0,1456
3,multiple-conditions,autoregression,GDN0005,Pause,3,3,1887.0,1888
4,multiple-conditions,autoregression,GDN0005,Pause,4,4,2314.0,2293
...,...,...,...,...,...,...,...,...
79327,third-derivative,none,GDN0030,TiltDown,51,51,27283.0,27278
79328,third-derivative,none,GDN0030,TiltDown,52,52,27824.0,27815
79329,third-derivative,none,GDN0030,TiltDown,53,53,28371.0,28345
79330,third-derivative,none,GDN0030,TiltDown,54,54,28903.0,28867


In [18]:
results_guardian_b_point.columns = results_guardian_b_point.columns.to_flat_index().str.join("")
results_guardian_b_point = results_guardian_b_point.drop(columns=["level_4"])
results_guardian_b_point

Unnamed: 0,b_point_algorithm,outlier_correction_algorithm,participant,phase,heartbeat_idreference,b_point_sampleestimated,b_point_samplereference
0,multiple-conditions,autoregression,GDN0005,Pause,0,556.0,568
1,multiple-conditions,autoregression,GDN0005,Pause,1,1019.0,1025
2,multiple-conditions,autoregression,GDN0005,Pause,2,1453.0,1456
3,multiple-conditions,autoregression,GDN0005,Pause,3,1887.0,1888
4,multiple-conditions,autoregression,GDN0005,Pause,4,2314.0,2293
...,...,...,...,...,...,...,...
79327,third-derivative,none,GDN0030,TiltDown,51,27283.0,27278
79328,third-derivative,none,GDN0030,TiltDown,52,27824.0,27815
79329,third-derivative,none,GDN0030,TiltDown,53,28371.0,28345
79330,third-derivative,none,GDN0030,TiltDown,54,28903.0,28867


In [19]:
results_guardian_b_point["b_point_algorithm_combi"] = results_guardian_b_point["b_point_algorithm"] + "_" + results_guardian_b_point["outlier_correction_algorithm"]
results_guardian_b_point = results_guardian_b_point.drop(columns=["b_point_algorithm", "outlier_correction_algorithm"])
results_guardian_b_point

Unnamed: 0,participant,phase,heartbeat_idreference,b_point_sampleestimated,b_point_samplereference,b_point_algorithm_combi
0,GDN0005,Pause,0,556.0,568,multiple-conditions_autoregression
1,GDN0005,Pause,1,1019.0,1025,multiple-conditions_autoregression
2,GDN0005,Pause,2,1453.0,1456,multiple-conditions_autoregression
3,GDN0005,Pause,3,1887.0,1888,multiple-conditions_autoregression
4,GDN0005,Pause,4,2314.0,2293,multiple-conditions_autoregression
...,...,...,...,...,...,...
79327,GDN0030,TiltDown,51,27283.0,27278,third-derivative_none
79328,GDN0030,TiltDown,52,27824.0,27815,third-derivative_none
79329,GDN0030,TiltDown,53,28371.0,28345,third-derivative_none
79330,GDN0030,TiltDown,54,28903.0,28867,third-derivative_none


In [20]:
print("Possible B-Point algorithm combinations: ")
algo_list = results_guardian_b_point["b_point_algorithm_combi"].unique()
algo_list

Possible B-Point algorithm combinations: 


array(['multiple-conditions_autoregression',
       'multiple-conditions_linear-interpolation',
       'multiple-conditions_none', 'second-derivative_autoregression',
       'second-derivative_linear-interpolation', 'second-derivative_none',
       'straight-line_autoregression',
       'straight-line_linear-interpolation', 'straight-line_none',
       'third-derivative_autoregression',
       'third-derivative_linear-interpolation', 'third-derivative_none'],
      dtype=object)

In [21]:
print(f"Number of rows in the long dataframe: {results_guardian_b_point.shape[0]}")
print(f"Number of possible B-Point algorithm combinations: {len(algo_list)}")
print(f"Expected amount of rows in the wide datframe: {results_guardian_b_point.shape[0]/len(algo_list)}")

Number of rows in the long dataframe: 79332
Number of possible B-Point algorithm combinations: 12
Expected amount of rows in the wide datframe: 6611.0


### Convert the dataframe from the long to the wide format

In [32]:
# Pivot the DataFrame
df_guardian_pivot = results_guardian_b_point.pivot_table(
    index=['participant', 'phase', 'heartbeat_idreference', 'b_point_samplereference'],
    columns='b_point_algorithm_combi',
    values='b_point_sampleestimated'
).reset_index()

# Flatten the columns
df_guardian_pivot.columns.name = None
df_guardian_pivot.columns = [f'{col}' if isinstance(col, str) else f'{col[1]}' for col in df_guardian_pivot.columns]
df_guardian_pivot

Unnamed: 0,participant,phase,heartbeat_idreference,b_point_samplereference,multiple-conditions_autoregression,multiple-conditions_linear-interpolation,multiple-conditions_none,second-derivative_autoregression,second-derivative_linear-interpolation,second-derivative_none,straight-line_autoregression,straight-line_linear-interpolation,straight-line_none,third-derivative_autoregression,third-derivative_linear-interpolation,third-derivative_none
0,GDN0005,HoldingBreath,0,540,705.0,705.0,705.0,572.0,572.0,572.0,546.0,546.0,546.0,543.0,543.0,543.0
1,GDN0005,HoldingBreath,1,973,1068.0,1072.0,1014.0,984.0,993.0,965.0,981.0,981.0,981.0,964.0,964.0,939.0
2,GDN0005,HoldingBreath,3,1741,1757.0,1760.0,1740.0,1736.0,1736.0,1736.0,1746.0,1746.0,1746.0,1746.0,1746.0,1746.0
3,GDN0005,HoldingBreath,4,2121,2120.0,2120.0,2120.0,2114.0,2114.0,2114.0,2124.0,2124.0,2124.0,2124.0,2124.0,2124.0
4,GDN0005,HoldingBreath,5,2499,2498.0,2498.0,2498.0,2515.0,2515.0,2515.0,2502.0,2507.0,2502.0,2500.0,2500.0,2500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6239,GDN0030,Valsalva,51,27597,27582.0,27582.0,27582.0,27596.0,27596.0,27596.0,27603.0,27603.0,27603.0,27606.0,27606.0,27606.0
6240,GDN0030,Valsalva,52,28086,28074.0,28074.0,28074.0,28085.0,28085.0,28085.0,28091.0,28091.0,28091.0,28092.0,28092.0,28092.0
6241,GDN0030,Valsalva,53,28581,28595.0,28595.0,28595.0,28602.0,28602.0,28602.0,28592.0,28592.0,28592.0,28612.0,28612.0,28612.0
6242,GDN0030,Valsalva,54,29061,29058.0,29058.0,29058.0,29089.0,29089.0,29089.0,29091.0,29091.0,29091.0,29097.0,29097.0,29097.0


### Check whether differences in the B-Point samples occur between the autoregression and linear interpolation algorithm combinations

In [33]:
df_guardian_pivot[(df_guardian_pivot["third-derivative_autoregression"] == df_guardian_pivot["third-derivative_linear-interpolation"]) == False]

Unnamed: 0,participant,phase,heartbeat_idreference,b_point_samplereference,multiple-conditions_autoregression,multiple-conditions_linear-interpolation,multiple-conditions_none,second-derivative_autoregression,second-derivative_linear-interpolation,second-derivative_none,straight-line_autoregression,straight-line_linear-interpolation,straight-line_none,third-derivative_autoregression,third-derivative_linear-interpolation,third-derivative_none
5,GDN0005,HoldingBreath,6,2892,2880.0,2890.0,2854.0,2878.0,2878.0,2878.0,2899.0,2899.0,2899.0,2891.0,2895.0,2857.0
8,GDN0005,HoldingBreath,10,4521,4520.0,4520.0,4520.0,4516.0,4516.0,4516.0,4533.0,4537.0,4525.0,4535.0,4539.0,4524.0
18,GDN0005,HoldingBreath,21,9305,9303.0,9303.0,9303.0,9322.0,9319.0,9303.0,9320.0,9326.0,9320.0,9326.0,9328.0,9283.0
20,GDN0005,HoldingBreath,23,10143,10140.0,10140.0,10140.0,10166.0,10166.0,10166.0,10148.0,10153.0,10148.0,10149.0,10159.0,10149.0
25,GDN0005,HoldingBreath,28,12336,12335.0,12335.0,12335.0,12329.0,12329.0,12329.0,12343.0,12350.0,12343.0,12353.0,12351.0,12336.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6216,GDN0030,Valsalva,28,15691,15677.0,15677.0,15677.0,15714.0,15714.0,15714.0,15670.0,15670.0,15670.0,15669.0,15686.0,15669.0
6221,GDN0030,Valsalva,33,18176,18165.0,18165.0,18165.0,18175.0,18175.0,18175.0,18184.0,18184.0,18184.0,18159.0,18178.0,18159.0
6222,GDN0030,Valsalva,34,18689,18685.0,18685.0,18685.0,18685.0,18690.0,18685.0,18696.0,18696.0,18696.0,18680.0,18687.0,18655.0
6223,GDN0030,Valsalva,35,19189,19181.0,19181.0,19181.0,19204.0,19207.0,19189.0,19200.0,19200.0,19200.0,19197.0,19214.0,19197.0


In [34]:
print(f"Max heartbeat id long dataframe: {max(results_guardian_b_point["heartbeat_idreference"])}")
print(f"Max heartbeat id wide dataframe: {max(df_guardian_pivot["heartbeat_idreference"])}")

Max heartbeat id long dataframe: 97
Max heartbeat id wide dataframe: 97


### Save the pivot dataframe

In [36]:
df_guardian_pivot.to_csv(result_path.joinpath("pivot_dataframe_b_point/guardian_pivot_dataframe_b_point_algos.csv"))

In [64]:
vp_05_HoldingBreath = df_guardian_pivot[df_guardian_pivot["participant"] == "GDN0005"][df_guardian_pivot["phase"] == "HoldingBreath"]
vp_05_Valsalva = df_guardian_pivot[df_guardian_pivot["participant"] == "GDN0005"][df_guardian_pivot["phase"] == "Valsalva"]
print(f"Heartbeat IDs in the phase HoldingBreath: {vp_05_HoldingBreath["heartbeat_idreference"]}")
print(f"Heartbeat IDs in the phase Valsalva: {vp_05_Valsalva["heartbeat_idreference"]}")

Heartbeat IDs in the phase HoldingBreath: 0      0
1      1
2      3
3      4
4      5
      ..
60    63
61    64
62    65
63    66
64    67
Name: heartbeat_idreference, Length: 65, dtype: int64
Heartbeat IDs in the phase Valsalva: 258     0
259     1
260     2
261     3
262     4
       ..
316    58
317    59
318    60
319    61
320    62
Name: heartbeat_idreference, Length: 63, dtype: int64


  vp_05_HoldingBreath = df_guardian_pivot[df_guardian_pivot["participant"] == "GDN0005"][df_guardian_pivot["phase"] == "HoldingBreath"]
  vp_05_Valsalva = df_guardian_pivot[df_guardian_pivot["participant"] == "GDN0005"][df_guardian_pivot["phase"] == "Valsalva"]
