In [None]:
import sys
from pathlib import Path
import pandas as pd

# Make the repository root (the parent of the current working directory)
# importable so the `src.*` packages below resolve.
project_root = str(Path.cwd().parent)
sys.path.append(project_root)

from src.data.unified import UnifiedDataset
from src.data.dataset import DatasetModality

# Open the "ptbxl" ECG dataset stored under <project_root>/data.
data_root = Path(project_root, "data")
arrhythmia_data = UnifiedDataset(
    data_root,
    modality=DatasetModality.ECG,
    dataset_key="ptbxl",
)

records = arrhythmia_data.get_all_record_ids()
metadata_store = arrhythmia_data.metadata_store

# First pass: one row per record, holding the stored metadata plus the record id.
metadata_rows = []
for record_id in records:
    row = dict(metadata_store.get(record_id))
    row["record_id"] = record_id
    metadata_rows.append(row)
df = pd.DataFrame(metadata_rows)

# Second pass: attach the target labels of each preprocessed record.
target_labels = []
for record_id in records:
    target_labels.append(arrhythmia_data[record_id].preprocessed_record.target_labels)
df["labels"] = target_labels

# Release the dataset handle and its metadata store; the cells below work from `df`.
del arrhythmia_data
del metadata_store
df.head()

In [None]:
# Cap ages at 90 so that everything above 89 falls into a single "90" bucket.
# Series.clip leaves missing ages (NaN) untouched.
df["age"] = df["age"].clip(upper=90)

# Decode the numeric sex code into a label; rows with no sex value get "Missing".
# NOTE(review): the original author had no information on which code is male and
# which is female. The public PTB-XL release appears to use male=0 / female=1,
# i.e. the opposite of this mapping — TODO confirm against the metadata store.
sex_is_missing = df["sex"].isna()
df["sex_category"] = df["sex"].map({1: "Male", 0: "Female"})
df.loc[sex_is_missing, "sex_category"] = "Missing"


# Show the age summary statistics next to the count per sex category.
age_summary = df["age"].describe().round(2)
sex_tally = df["sex_category"].value_counts()
age_summary, sex_tally

In [None]:
# Relative frequency of each sex category, expressed as a fraction (0-1) of the
# non-null values and rounded to 2 decimals (not multiplied by 100).

df["sex_category"].value_counts(normalize=True).round(2)

In [None]:
# Number of rows in df
len(df)

In [None]:
# Names of the columns available in df
df.columns

In [None]:
# Get a sample `scp_statements` entry (first row) to inspect its structure
df["scp_statements"].iloc[0]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# high retina
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set_theme(style="whitegrid")

# set default size
plt.rcParams["figure.figsize"] = [12, 5]

In [None]:
# Demographics overview: age distribution per sex category (violin plot) next to
# the number of records per sex category (count plot).

# Recompute the readable sex label so this cell does not depend on an earlier one.
# Rows without a sex value get an explicit "Missing" category.
df["sex_category"] = df["sex"].map(
    {1: "Male", 0: "Female"}
)  # no info if 1 or 0 is male or female
df.loc[df["sex"].isna(), "sex_category"] = "Missing"

missing_age = df["age"].isna().sum()
missing_sex = df["sex"].isna().sum()

print(f"Total missing age values: {missing_age}")
print(f"Total missing sex values: {missing_sex}")


# and all ages over 89 and all elements of dates (including year) indicative of such age, except that such ages and elements may be aggregated into a single category of age 90 or older
# Use clip() rather than a `x if x < 90 else 90` lambda: `NaN < 90` is False, so
# the lambda silently turned every missing age into 90 and skewed the violin
# plot. clip() caps values at 90 and leaves missing ages as NaN.
df["age"] = df["age"].clip(upper=90)

sex_counts = df["sex_category"].value_counts()
total_records = len(df)
percentages = (sex_counts / total_records * 100).round(2)  # Round to 2 decimal places

# Set up the figure with two subplots
fig, axes = plt.subplots(1, 2)

# ---- Violin Plot: Age Distribution by Sex ----
# Rows with a missing age are simply not part of the age distribution.
sns.violinplot(
    x="sex_category", y="age", data=df, palette="muted", inner="quartile", ax=axes[0]
)

axes[0].set_xlabel("Sex Category")
axes[0].set_ylabel("Age")

# ---- Bar Plot: Distribution of Sex Categories ----
barplot = sns.countplot(x="sex_category", data=df, palette="muted", ax=axes[1])

# Add percentage labels on top of bars
for p in barplot.patches:
    height = p.get_height()
    barplot.annotate(
        f"{(height / total_records * 100):.2f}% (n={int(height)})",  # Convert count to percentage
        (p.get_x() + p.get_width() / 2, height),  # Position: centred above the bar
        ha="center",
        va="bottom",
        fontsize=10,
        color="black",
    )

axes[1].set_xlabel("Sex Category")
axes[1].set_ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of SCP statements per diagnostic subclass, coloured by diagnostic class.
#
# Each record's `scp_statements` mapping is unpacked into three parallel lists
# (statement key, diagnostic class, diagnostic subclass) so that a single
# multi-column explode produces one row per statement.
df_exploded = df.copy()
statements = df_exploded["scp_statements"]
df_exploded["scp_labels"] = statements.map(lambda stmts: list(stmts.keys()))
df_exploded["diagnostic_class"] = statements.map(
    lambda stmts: [stmts[code]["diagnostic_class"] for code in stmts.keys()]
)
df_exploded["diagnostic_subclass"] = statements.map(
    lambda stmts: [stmts[code]["diagnostic_subclass"] for code in stmts.keys()]
)
df_exploded = df_exploded.explode(
    ["scp_labels", "diagnostic_class", "diagnostic_subclass"]
)

# Drop leading/trailing underscores from the subclass names
df_exploded["diagnostic_subclass"] = df_exploded["diagnostic_subclass"].str.strip("_")

# Number of statements per (subclass, class) pair, most frequent first
df_counts = (
    df_exploded.groupby(["diagnostic_subclass", "diagnostic_class"])
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

# Share of each pair among all counted statements (not among records)
total_count = df_counts["count"].sum()
df_counts["percentage"] = df_counts["count"] / total_count * 100

# Plot
plt.figure(figsize=(14, 6))
ax = sns.barplot(
    x="diagnostic_subclass",
    y="count",
    hue="diagnostic_class",
    data=df_counts,
    dodge=False,
)

# Label every bar of non-zero height with its share of all statements
for bar in ax.patches:
    bar_height = bar.get_height()
    if bar_height != 0:
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            bar_height + 3,
            f"{bar_height / total_count * 100:.2f}%",
            ha="center",
            va="bottom",
            fontsize=10,
        )

plt.xticks(rotation=30, ha="right")
ax.set_xlabel("Diagnostic Subclass")
ax.set_ylabel("Count")
ax.legend(title="Diagnostic Class")
plt.tight_layout()
plt.show()

In [None]:
# Summary table of the label metadata: one row per integration name with its
# group, SNOMED-CT code(s), occurrence count and share of the dataset.

# Collect (group, integration name, SNOMED code) for every label entry that
# carries both an integration name and a SNOMED code.
all_diagnoses = []
for row in df["labels_metadata"]:
    for entry in row:
        if "integration_name" in entry and "snomed_code" in entry:
            all_diagnoses.append(
                (entry["group"], entry["integration_name"], entry["snomed_code"])
            )

# Occurrences of each distinct triple, most frequent first
df_counts = (
    pd.DataFrame(
        all_diagnoses, columns=["Group", "Integration Name", "SNOMED-CT Code"]
    )
    .value_counts()
    .reset_index(name="Count")
)

# Percentages are relative to the number of rows in df, not to the number of
# label entries.
total_records = len(df)
df_counts["Percentage"] = (df_counts["Count"] / total_records * 100).round(2)

# Collapse to one row per integration name. This does not by itself limit the
# number of names; the cut to 20 rows happens at the end.
df_unique_counts = df_counts.groupby("Integration Name", as_index=False).agg(
    {
        # Take the first group seen (expected to be unique per integration name)
        "Group": "first",
        # Join the distinct SNOMED-CT codes that share this integration name
        "SNOMED-CT Code": lambda codes: ", ".join(codes.unique()),
        # Add up the counts of all codes mapped to this integration name
        "Count": "sum",
    }
)

# Recompute the share after merging.
# NOTE(review): if a single record can carry several codes with the same
# integration name, it is counted once per code here — confirm.
df_unique_counts["Percentage"] = (
    df_unique_counts["Count"] / total_records * 100
).round(2)

# Keep the first 20 rows when ordered by group, then integration name
df_final_20 = df_unique_counts.sort_values(by=["Group", "Integration Name"]).iloc[:20]
df_final_20