In [1]:
import json

from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp

import matplotlib.pyplot as plt

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [2]:
root_path = Path("../../")

#### Specify whether the results should be saved or not

In [3]:
save_results = True

In [4]:
result_path = root_path.joinpath("results")
result_path

WindowsPath('../../results')

In [5]:
empkins_pivot_df = pd.read_csv(result_path.joinpath("pivot_dataframe_b_point/empkins_pivot_dataframe_b_point_all_algos_ms.csv")).drop(columns="Unnamed: 0")
guardian_pivot_df = pd.read_csv(result_path.joinpath("pivot_dataframe_b_point/guardian_pivot_dataframe_b_point_all_algos_ms.csv")).drop(columns="Unnamed: 0")

#### Combine the empkins and guardian dataframes

In [6]:
combined_df = pd.concat([empkins_pivot_df, guardian_pivot_df])
combined_df

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_samplereference,arbol2017-isoelectric-crossings_forouzanfar2018,arbol2017-isoelectric-crossings_linear-interpolation,arbol2017-isoelectric-crossings_none,arbol2017-second-derivative_forouzanfar2018,arbol2017-second-derivative_linear-interpolation,...,lozano2007-linear-regression_none,lozano2007-quadratic-regression_forouzanfar2018,lozano2007-quadratic-regression_linear-interpolation,lozano2007-quadratic-regression_none,sherwood1990_forouzanfar2018,sherwood1990_linear-interpolation,sherwood1990_none,stern1985_forouzanfar2018,stern1985_linear-interpolation,stern1985_none
0,VP_001,ftsst,Math,1,1074.0,1111.0,1111.0,1111.0,1080.0,1080.0,...,1126.0,1125.0,1125.0,1125.0,1111.0,1111.0,1111.0,1108.0,1079.0,1022.0
1,VP_001,ftsst,Math,2,1849.0,1844.0,1844.0,1844.0,1810.0,1809.0,...,1860.0,1856.0,1856.0,1856.0,1853.0,1853.0,1853.0,1826.0,1818.0,1765.0
2,VP_001,ftsst,Math,3,2518.0,2558.0,2558.0,2558.0,2494.0,2494.0,...,2565.0,2565.0,2565.0,2565.0,2551.0,2551.0,2551.0,2516.0,2516.0,2516.0
3,VP_001,ftsst,Math,4,3252.0,3264.0,3264.0,3264.0,3215.0,3212.0,...,3280.0,3291.0,3294.0,3278.0,3279.0,3279.0,3279.0,3247.0,3247.0,3247.0
4,VP_001,ftsst,Math,5,3933.0,3942.0,3942.0,3942.0,3881.0,3879.0,...,3941.0,3961.0,3962.0,3937.0,3925.0,3936.0,3872.0,3932.0,3932.0,3932.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6239,GDN0030,,Valsalva,51,55194.0,55162.0,55162.0,55162.0,55108.0,55108.0,...,55176.0,55160.0,55160.0,55160.0,55158.0,55158.0,55158.0,55142.0,55142.0,55142.0
6240,GDN0030,,Valsalva,52,56172.0,56146.0,56146.0,56146.0,56086.0,56086.0,...,56156.0,56138.0,56138.0,56138.0,56144.0,56144.0,56144.0,56124.0,56124.0,56124.0
6241,GDN0030,,Valsalva,53,57162.0,57182.0,57182.0,57182.0,57114.0,57114.0,...,57186.0,57172.0,57172.0,57172.0,57188.0,57188.0,57188.0,57160.0,57160.0,57160.0
6242,GDN0030,,Valsalva,54,58122.0,58148.0,58148.0,58148.0,58090.0,58090.0,...,58162.0,58144.0,58144.0,58144.0,58138.0,58162.0,58138.0,58124.0,58126.0,58066.0


In [7]:
print(f"Length of empkins dataframe: {empkins_pivot_df.shape[0]} and length of guardian dataframe: {guardian_pivot_df.shape[0]} should add up: {empkins_pivot_df.shape[0] + guardian_pivot_df.shape[0]} to the length of the combined dataframe: {combined_df.shape[0]}")

Length of empkins dataframe: 4995 and length of guardian dataframe: 6244 should add up: 11239 to the length of the combined dataframe: 11239


#### Replace the NaN values in the condition column with another meaningful expression

In [8]:
combined_df["condition"] = combined_df["condition"].replace(np.nan, "default")
combined_df

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_samplereference,arbol2017-isoelectric-crossings_forouzanfar2018,arbol2017-isoelectric-crossings_linear-interpolation,arbol2017-isoelectric-crossings_none,arbol2017-second-derivative_forouzanfar2018,arbol2017-second-derivative_linear-interpolation,...,lozano2007-linear-regression_none,lozano2007-quadratic-regression_forouzanfar2018,lozano2007-quadratic-regression_linear-interpolation,lozano2007-quadratic-regression_none,sherwood1990_forouzanfar2018,sherwood1990_linear-interpolation,sherwood1990_none,stern1985_forouzanfar2018,stern1985_linear-interpolation,stern1985_none
0,VP_001,ftsst,Math,1,1074.0,1111.0,1111.0,1111.0,1080.0,1080.0,...,1126.0,1125.0,1125.0,1125.0,1111.0,1111.0,1111.0,1108.0,1079.0,1022.0
1,VP_001,ftsst,Math,2,1849.0,1844.0,1844.0,1844.0,1810.0,1809.0,...,1860.0,1856.0,1856.0,1856.0,1853.0,1853.0,1853.0,1826.0,1818.0,1765.0
2,VP_001,ftsst,Math,3,2518.0,2558.0,2558.0,2558.0,2494.0,2494.0,...,2565.0,2565.0,2565.0,2565.0,2551.0,2551.0,2551.0,2516.0,2516.0,2516.0
3,VP_001,ftsst,Math,4,3252.0,3264.0,3264.0,3264.0,3215.0,3212.0,...,3280.0,3291.0,3294.0,3278.0,3279.0,3279.0,3279.0,3247.0,3247.0,3247.0
4,VP_001,ftsst,Math,5,3933.0,3942.0,3942.0,3942.0,3881.0,3879.0,...,3941.0,3961.0,3962.0,3937.0,3925.0,3936.0,3872.0,3932.0,3932.0,3932.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6239,GDN0030,default,Valsalva,51,55194.0,55162.0,55162.0,55162.0,55108.0,55108.0,...,55176.0,55160.0,55160.0,55160.0,55158.0,55158.0,55158.0,55142.0,55142.0,55142.0
6240,GDN0030,default,Valsalva,52,56172.0,56146.0,56146.0,56146.0,56086.0,56086.0,...,56156.0,56138.0,56138.0,56138.0,56144.0,56144.0,56144.0,56124.0,56124.0,56124.0
6241,GDN0030,default,Valsalva,53,57162.0,57182.0,57182.0,57182.0,57114.0,57114.0,...,57186.0,57172.0,57172.0,57172.0,57188.0,57188.0,57188.0,57160.0,57160.0,57160.0
6242,GDN0030,default,Valsalva,54,58122.0,58148.0,58148.0,58148.0,58090.0,58090.0,...,58162.0,58144.0,58144.0,58144.0,58138.0,58162.0,58138.0,58124.0,58126.0,58066.0


#### Check how many nan values are present in the data

In [9]:
null_df = combined_df[combined_df.isnull().any(axis=1)]
print(f"A total of {null_df.shape[0]} rows contain nan values.")

A total of 842 rows contain nan values.


#### Drop rows containing nan

In [10]:
clean_combined_df = combined_df.dropna()
clean_combined_df

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_samplereference,arbol2017-isoelectric-crossings_forouzanfar2018,arbol2017-isoelectric-crossings_linear-interpolation,arbol2017-isoelectric-crossings_none,arbol2017-second-derivative_forouzanfar2018,arbol2017-second-derivative_linear-interpolation,...,lozano2007-linear-regression_none,lozano2007-quadratic-regression_forouzanfar2018,lozano2007-quadratic-regression_linear-interpolation,lozano2007-quadratic-regression_none,sherwood1990_forouzanfar2018,sherwood1990_linear-interpolation,sherwood1990_none,stern1985_forouzanfar2018,stern1985_linear-interpolation,stern1985_none
0,VP_001,ftsst,Math,1,1074.0,1111.0,1111.0,1111.0,1080.0,1080.0,...,1126.0,1125.0,1125.0,1125.0,1111.0,1111.0,1111.0,1108.0,1079.0,1022.0
1,VP_001,ftsst,Math,2,1849.0,1844.0,1844.0,1844.0,1810.0,1809.0,...,1860.0,1856.0,1856.0,1856.0,1853.0,1853.0,1853.0,1826.0,1818.0,1765.0
2,VP_001,ftsst,Math,3,2518.0,2558.0,2558.0,2558.0,2494.0,2494.0,...,2565.0,2565.0,2565.0,2565.0,2551.0,2551.0,2551.0,2516.0,2516.0,2516.0
3,VP_001,ftsst,Math,4,3252.0,3264.0,3264.0,3264.0,3215.0,3212.0,...,3280.0,3291.0,3294.0,3278.0,3279.0,3279.0,3279.0,3247.0,3247.0,3247.0
4,VP_001,ftsst,Math,5,3933.0,3942.0,3942.0,3942.0,3881.0,3879.0,...,3941.0,3961.0,3962.0,3937.0,3925.0,3936.0,3872.0,3932.0,3932.0,3932.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6239,GDN0030,default,Valsalva,51,55194.0,55162.0,55162.0,55162.0,55108.0,55108.0,...,55176.0,55160.0,55160.0,55160.0,55158.0,55158.0,55158.0,55142.0,55142.0,55142.0
6240,GDN0030,default,Valsalva,52,56172.0,56146.0,56146.0,56146.0,56086.0,56086.0,...,56156.0,56138.0,56138.0,56138.0,56144.0,56144.0,56144.0,56124.0,56124.0,56124.0
6241,GDN0030,default,Valsalva,53,57162.0,57182.0,57182.0,57182.0,57114.0,57114.0,...,57186.0,57172.0,57172.0,57172.0,57188.0,57188.0,57188.0,57160.0,57160.0,57160.0
6242,GDN0030,default,Valsalva,54,58122.0,58148.0,58148.0,58148.0,58090.0,58090.0,...,58162.0,58144.0,58144.0,58144.0,58138.0,58162.0,58138.0,58124.0,58126.0,58066.0


In [11]:
print(f"The length of the combined dataframe: {combined_df.shape[0]} minus the amount of nan values: {null_df.shape[0]} equals: {combined_df.shape[0] - null_df.shape[0]} and should match the length of the clean_combined dataframe: {clean_combined_df.shape[0]}.")

The length of the combined dataframe: 11239 minus the amount of nan values: 842 equals: 10397 and should match the length of the clean_combined dataframe: 10397.


In [12]:
null_df = clean_combined_df[clean_combined_df.isnull().any(axis=1)]
print(f"A total of {null_df.shape[0]} rows contain nan values.")

A total of 0 rows contain nan values.


In [13]:
if save_results:
    clean_combined_df.to_csv(result_path.joinpath("train_test_data/combined_data.csv"))

#### Split the dataset into train and target

In [17]:
train_data = clean_combined_df.drop(columns=["b_point_samplereference"])
train_data

Unnamed: 0,participant,condition,phase,heartbeat_idreference,arbol2017-isoelectric-crossings_forouzanfar2018,arbol2017-isoelectric-crossings_linear-interpolation,arbol2017-isoelectric-crossings_none,arbol2017-second-derivative_forouzanfar2018,arbol2017-second-derivative_linear-interpolation,arbol2017-second-derivative_none,...,lozano2007-linear-regression_none,lozano2007-quadratic-regression_forouzanfar2018,lozano2007-quadratic-regression_linear-interpolation,lozano2007-quadratic-regression_none,sherwood1990_forouzanfar2018,sherwood1990_linear-interpolation,sherwood1990_none,stern1985_forouzanfar2018,stern1985_linear-interpolation,stern1985_none
0,VP_001,ftsst,Math,1,1111.0,1111.0,1111.0,1080.0,1080.0,1080.0,...,1126.0,1125.0,1125.0,1125.0,1111.0,1111.0,1111.0,1108.0,1079.0,1022.0
1,VP_001,ftsst,Math,2,1844.0,1844.0,1844.0,1810.0,1809.0,1788.0,...,1860.0,1856.0,1856.0,1856.0,1853.0,1853.0,1853.0,1826.0,1818.0,1765.0
2,VP_001,ftsst,Math,3,2558.0,2558.0,2558.0,2494.0,2494.0,2490.0,...,2565.0,2565.0,2565.0,2565.0,2551.0,2551.0,2551.0,2516.0,2516.0,2516.0
3,VP_001,ftsst,Math,4,3264.0,3264.0,3264.0,3215.0,3212.0,3185.0,...,3280.0,3291.0,3294.0,3278.0,3279.0,3279.0,3279.0,3247.0,3247.0,3247.0
4,VP_001,ftsst,Math,5,3942.0,3942.0,3942.0,3881.0,3879.0,3879.0,...,3941.0,3961.0,3962.0,3937.0,3925.0,3936.0,3872.0,3932.0,3932.0,3932.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6239,GDN0030,default,Valsalva,51,55162.0,55162.0,55162.0,55108.0,55108.0,55108.0,...,55176.0,55160.0,55160.0,55160.0,55158.0,55158.0,55158.0,55142.0,55142.0,55142.0
6240,GDN0030,default,Valsalva,52,56146.0,56146.0,56146.0,56086.0,56086.0,56086.0,...,56156.0,56138.0,56138.0,56138.0,56144.0,56144.0,56144.0,56124.0,56124.0,56124.0
6241,GDN0030,default,Valsalva,53,57182.0,57182.0,57182.0,57114.0,57114.0,57114.0,...,57186.0,57172.0,57172.0,57172.0,57188.0,57188.0,57188.0,57160.0,57160.0,57160.0
6242,GDN0030,default,Valsalva,54,58148.0,58148.0,58148.0,58090.0,58090.0,58090.0,...,58162.0,58144.0,58144.0,58144.0,58138.0,58162.0,58138.0,58124.0,58126.0,58066.0


In [18]:
target_data = clean_combined_df[["participant", "condition", "phase", "heartbeat_idreference", "b_point_samplereference"]]
target_data

Unnamed: 0,participant,condition,phase,heartbeat_idreference,b_point_samplereference
0,VP_001,ftsst,Math,1,1074.0
1,VP_001,ftsst,Math,2,1849.0
2,VP_001,ftsst,Math,3,2518.0
3,VP_001,ftsst,Math,4,3252.0
4,VP_001,ftsst,Math,5,3933.0
...,...,...,...,...,...
6239,GDN0030,default,Valsalva,51,55194.0
6240,GDN0030,default,Valsalva,52,56172.0
6241,GDN0030,default,Valsalva,53,57162.0
6242,GDN0030,default,Valsalva,54,58122.0


#### Save the train and target data

In [19]:
if save_results:
    train_data.to_csv(result_path.joinpath("train_test_data/train_data_all_algos.csv"))
    target_data.to_csv(result_path.joinpath("train_test_data/target_data_all_algos.csv"))