# Dataset Demographics

## Setup and Helper Functions

In [None]:
import json
from pathlib import Path

import biopsykit as bp
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from fau_colors import cmaps, register_fausans_font

from pepbench.datasets import EmpkinsDataset, GuardianDataset

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [None]:
# Register the FAU Sans font (helper from fau_colors) and discard figures left over from earlier runs.
register_fausans_font()
plt.close("all")

# Use the light FAU faculty colors as the default seaborn palette.
palette = sns.color_palette(cmaps.faculties_light)
sns.set_theme(context="notebook", style="ticks", font="sans-serif", palette=palette)

# Figure defaults, applied after `set_theme` so they are not overwritten by it.
plt.rcParams.update(
    {
        "figure.figsize": (10, 5),
        # Font type 42 makes the PDF backend embed TrueType fonts.
        "pdf.fonttype": 42,
        "mathtext.default": "regular",
        "font.family": "sans-serif",
        "font.sans-serif": "FAUSans Office",
    }
)

# Trailing expression: show the palette as the cell output.
palette

In [None]:
# Project root, given relative to the current working directory (two levels up).
# `config.json` and the `exports` folder are looked up below this path.
root_path = Path("../../")

In [None]:
# Key of the deployment section to read from `config.json`.
deploy_type = "local"

# Open the config inside a context manager so the file handle is closed as soon as
# the JSON is parsed (previously the handle was left open until garbage collection).
with root_path.joinpath("config.json").open(encoding="utf-8") as config_file:
    config_dict = json.load(config_file)

# Base folders of the two datasets, as configured for the selected deployment.
empkins_base_path = Path(config_dict[deploy_type]["empkins_path"])
guardian_base_path = Path(config_dict[deploy_type]["guardian_path"])
print(empkins_base_path)

In [None]:
# Display names for the EmpkinS study conditions (used to relabel table columns).
condition_mapping_empkins = dict(tsst="TSST", ftsst="f-TSST")

# Display names for the EmpkinS phases (used to relabel table rows).
phase_mapping_empkins = dict(
    Prep="Preparation",
    Pause_1="Pause 1",
    Pause_5="Pause 5",
    Talk="Interview",
    Math="Mental Arithmetic",
)

# Row order for per-phase tables; note that it differs from the mapping's key order.
phase_order_empkins = [
    "Prep",
    "Pause_1",
    "Talk",
    "Math",
    "Pause_5",
]

In [None]:
# Folder that receives the CSV exports; created on first use, kept if it already exists.
export_path = root_path / "exports"
export_path.mkdir(exist_ok=True)

In [None]:
# Display names for the Guardian phases (used to relabel table rows).
phase_mapping_guardian = {
    "Pause": "Resting",
    "Valsalva": "Valsalva",
    "HoldingBreath": "Holding Breath",
    "TiltUp": "Tilt Table Up",
    "TiltDown": "Tilt Table Down",
}

# Row order for per-phase tables: identical to the insertion order of the mapping keys.
phase_order_guardian = list(phase_mapping_guardian)

In [None]:
# Names of the index levels that identify an algorithm combination.
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
algo_levels = [
    "q_peak_algorithm",
    "b_point_algorithm",
    "outlier_correction_algorithm",
]

In [None]:
# Instantiate the EmpkinS dataset from the configured base path.
# The trailing expression shows the dataset as the cell output.
empkins_dataset = EmpkinsDataset(empkins_base_path)
empkins_dataset

In [None]:
# Instantiate the Guardian dataset from the configured base path.
# The trailing expression shows the dataset as the cell output.
guardian_dataset = GuardianDataset(guardian_base_path)
guardian_dataset

## EmpkinS

In [None]:
# Demographic table of the EmpkinS dataset; the cells below read its
# "Gender", "Age" and "BMI" columns, and it is exported to CSV at the end.
empkins_demographics = empkins_dataset.base_demographics
empkins_demographics

### Base Demographics

In [None]:
# Gender distribution of the EmpkinS participants, computed by BioPsyKit from the "Gender" column.
bp.metadata.gender_counts(empkins_demographics[["Gender"]], gender_col="Gender")

In [None]:
# Mean and standard deviation of the EmpkinS participants' age, with the statistics as columns.
empkins_age_stats = empkins_demographics[["Age"]].agg(["mean", "std"])
empkins_age_stats.transpose()

In [None]:
# Mean and standard deviation of the EmpkinS participants' BMI, with the statistics as columns.
empkins_bmi_stats = empkins_demographics[["BMI"]].agg(["mean", "std"])
empkins_bmi_stats.transpose()

### Number of Labeled Heartbeats

In [None]:
# Count the reference heartbeats of every subset yielded by `groupby(None)`,
# keyed by the subset's group label (a named tuple).
heartbeat_counts_empkins = {}
for subset in empkins_dataset.groupby(None):
    group_label = subset.group_label
    heartbeat_counts_empkins[group_label] = len(subset.reference_heartbeats)

# The tuple keys become columns first; transposing turns them into one row per group label.
num_heartbeats_empkins = pd.DataFrame(heartbeat_counts_empkins, index=["num_heartbeats"]).transpose()
# Name the index levels after the fields of the (last seen) group label.
num_heartbeats_empkins.index.names = group_label._fields
num_heartbeats_empkins.head()

#### Total

In [None]:
# Total number of labeled heartbeats across the whole EmpkinS dataset.
num_heartbeats_empkins.sum()

#### Per Participant

In [None]:
# Sum the heartbeats per participant, then report mean and standard deviation across participants.
heartbeats_per_participant_empkins = num_heartbeats_empkins.groupby(["participant"]).sum()
heartbeats_per_participant_empkins.agg(["mean", "std"]).transpose()

#### Per Condition and Phase

In [None]:
# Heartbeat totals per (condition, phase) combination.
heartbeats_per_condition_phase = num_heartbeats_empkins.groupby(["condition", "phase"]).sum()

# Phases as rows (in study order), conditions as columns, both relabeled with their display names.
num_heartbeats_empkins_per_phase = (
    heartbeats_per_condition_phase.unstack("condition")
    .reindex(phase_order_empkins, level="phase")
    .rename(index=phase_mapping_empkins, columns=condition_mapping_empkins)
)

# Print the LaTeX source of the table, then show the table itself as the cell output.
print(num_heartbeats_empkins_per_phase.style.to_latex())
num_heartbeats_empkins_per_phase

### Comparison between Annotators

In [None]:
# Load the EmpkinS dataset twice, once per annotator, selected through `label_type`.
# NOTE(review): presumably `label_type` decides whose annotations `reference_heartbeats`
# returns — confirm against EmpkinsDataset.
empkins_dataset_01 = EmpkinsDataset(empkins_base_path, label_type="rater_01")
empkins_dataset_02 = EmpkinsDataset(empkins_base_path, label_type="rater_02")

In [None]:
def _count_reference_heartbeats(dataset):
    """Count the reference heartbeats of every subset of ``dataset``.

    Parameters
    ----------
    dataset
        Dataset whose ``groupby(None)`` yields subsets that provide ``group_label``
        (a named tuple) and ``reference_heartbeats`` (a sized collection).

    Returns
    -------
    pd.DataFrame
        One row per group label with a single ``num_heartbeats`` column; the index
        levels are named after the fields of the group label.

    Raises
    ------
    ValueError
        If the dataset yields no subsets, because the index level names cannot be
        derived in that case.

    """
    counts = {}
    group_label = None
    for subset in dataset.groupby(None):
        group_label = subset.group_label
        counts[group_label] = len(subset.reference_heartbeats)

    if group_label is None:
        raise ValueError("Cannot count heartbeats: the dataset does not contain any data points.")

    # The tuple keys become columns first; transposing turns them into one row per group label.
    num_heartbeats = pd.DataFrame(counts, index=["num_heartbeats"]).T
    num_heartbeats.index.names = group_label._fields
    return num_heartbeats


# Each rater's dataset is counted on its own. Iterating both in lockstep with
# `zip(..., strict=False)` would silently stop at the shorter dataset and undercount
# the other rater if the two ever differ in size.
num_heartbeats_empkins_01 = _count_reference_heartbeats(empkins_dataset_01)
num_heartbeats_empkins_02 = _count_reference_heartbeats(empkins_dataset_02)

In [None]:
# Total number of labeled heartbeats per rater, shown one after the other for comparison.
display(num_heartbeats_empkins_01.sum())
display(num_heartbeats_empkins_02.sum())

## Guardian

In [None]:
# Demographic table of the Guardian dataset; the cells below read its
# "Gender", "Age" and "BMI" columns, and it is exported to CSV at the end.
guardian_demographics = guardian_dataset.base_demographics
guardian_demographics

### Base Demographics

In [None]:
# Gender distribution of the Guardian participants, computed by BioPsyKit from the "Gender" column.
bp.metadata.gender_counts(guardian_demographics[["Gender"]], gender_col="Gender")

In [None]:
# Mean and standard deviation of the Guardian participants' age, with the statistics as columns.
guardian_age_stats = guardian_demographics[["Age"]].agg(["mean", "std"])
guardian_age_stats.transpose()

In [None]:
# Mean and standard deviation of the Guardian participants' BMI, with the statistics as columns.
guardian_bmi_stats = guardian_demographics[["BMI"]].agg(["mean", "std"])
guardian_bmi_stats.transpose()

### Number of Labeled Heartbeats

In [None]:
# Count the reference heartbeats of every subset yielded by `groupby(None)`,
# keyed by the subset's group label (a named tuple).
heartbeat_counts_guardian = {}
for subset in guardian_dataset.groupby(None):
    group_label = subset.group_label
    heartbeat_counts_guardian[group_label] = len(subset.reference_heartbeats)

# The tuple keys become columns first; transposing turns them into one row per group label.
num_heartbeats_guardian = pd.DataFrame(heartbeat_counts_guardian, index=["num_heartbeats"]).transpose()
# Name the index levels after the fields of the (last seen) group label.
num_heartbeats_guardian.index.names = group_label._fields
num_heartbeats_guardian.head()

#### Total

In [None]:
# Total number of labeled heartbeats across the whole Guardian dataset.
num_heartbeats_guardian.sum()


#### Per Participant

In [None]:
# Sum the heartbeats per participant, then report mean and standard deviation across participants.
heartbeats_per_participant_guardian = num_heartbeats_guardian.groupby(["participant"]).sum()
heartbeats_per_participant_guardian.agg(["mean", "std"]).transpose()

#### Per Phase

In [None]:
# Heartbeat totals per phase.
heartbeats_per_phase_guardian = num_heartbeats_guardian.groupby(["phase"]).sum()

# Put the phases in protocol order and relabel them with their display names.
num_heartbeats_guardian_per_phase = heartbeats_per_phase_guardian.reindex(phase_order_guardian)
num_heartbeats_guardian_per_phase = num_heartbeats_guardian_per_phase.rename(index=phase_mapping_guardian)

# Print the LaTeX source of the table, then show the table itself.
print(num_heartbeats_guardian_per_phase.style.to_latex())
display(num_heartbeats_guardian_per_phase)

## Export

In [None]:
# Write the demographic tables of both datasets to the export folder.
empkins_demographics.to_csv(export_path / "demographics_empkins.csv")
guardian_demographics.to_csv(export_path / "demographics_guardian.csv")

In [None]:
# Write the per-recording heartbeat counts of both datasets to the export folder.
num_heartbeats_empkins.to_csv(export_path / "num_heartbeats_empkins.csv")
num_heartbeats_guardian.to_csv(export_path / "num_heartbeats_guardian.csv")