# EDA 01 — Dataset overview & sanity checks (consistent column roles)

This notebook provides a first-pass EDA overview while enforcing explicit column roles:
- Label column (target): excluded from descriptor summaries
- Metadata columns: excluded from descriptor summaries
- Descriptor (feature) columns: the numeric columns used for EDA/modeling

It produces:
- Dataset shapes, dtype overview, duplicates, constant columns
- Missingness report for descriptor features only
- Numeric descriptive summary for descriptor features only
- Exported tables and figures for downstream reporting

Outputs:
- Tables: `results/eda/stats/*.csv`
- Figures: `results/eda/figures/overview/*.png`


In [1]:
from __future__ import annotations

from pathlib import Path
from typing import List, Tuple, Optional, Dict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# --- Plot appearance ---------------------------------------------------------
sns.set_context("notebook")
sns.set_style("whitegrid")

# figure.dpi applies to figures rendered on screen, savefig.dpi to files
# written with savefig.
plt.rcParams.update({"figure.dpi": 140, "savefig.dpi": 300})

# --- Input locations ---------------------------------------------------------
# Resolved against the current working directory (two levels up).
PROJECT_ROOT = Path("../../").resolve()

DATA_PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed_variants")
STRICT_CSV = DATA_PROCESSED_DIR.joinpath("hcs_strict_complete_case_20260129_114259.csv")
IMPUTED_CSV = DATA_PROCESSED_DIR.joinpath("hcs_imputed_exploratory_20260129_114259.csv")

# --- Output locations --------------------------------------------------------
EDA_ROOT = PROJECT_ROOT.joinpath("results", "eda")
FIG_DIR = EDA_ROOT.joinpath("figures", "overview")
STATS_DIR = EDA_ROOT.joinpath("stats")

# Create the output folders up front; re-running the cell is harmless.
FIG_DIR.mkdir(exist_ok=True, parents=True)
STATS_DIR.mkdir(exist_ok=True, parents=True)

print("DATA_PROCESSED_DIR:", DATA_PROCESSED_DIR)
print("FIG_DIR:", FIG_DIR)
print("STATS_DIR:", STATS_DIR)


DATA_PROCESSED_DIR: /home/david/Desktop/colabs/serra_ramon/hypercholesterolemia_classifiers/data/processed_variants
FIG_DIR: /home/david/Desktop/colabs/serra_ramon/hypercholesterolemia_classifiers/results/eda/figures/overview
STATS_DIR: /home/david/Desktop/colabs/serra_ramon/hypercholesterolemia_classifiers/results/eda/stats


In [4]:
def load_dataset(path: Path) -> pd.DataFrame:
    """
    Load a dataset from disk (CSV).

    Parameters
    ----------
    path : Path
        Path to the CSV file. A plain string path is accepted as well; it is
        converted to a ``Path`` before use.

    Returns
    -------
    pd.DataFrame
        Loaded dataframe.

    Raises
    ------
    FileNotFoundError
        If the path does not exist or does not point to a regular file
        (for example, when it is a directory).
    """
    csv_path = Path(path)
    # is_file() (rather than exists()) also rejects directories, so the caller
    # gets one clear error instead of a lower-level failure from the CSV reader.
    if not csv_path.is_file():
        raise FileNotFoundError(f"Dataset not found: {csv_path}")
    return pd.read_csv(csv_path)

# Read the strict (complete-case) and imputed variants, in that order.
df_strict, df_imputed = map(load_dataset, (STRICT_CSV, IMPUTED_CSV))

# Quick size check of both frames.
print("STRICT:", df_strict.shape)
print("IMPUTED:", df_imputed.shape)

# Peek at the first three rows of the strict variant.
display(df_strict.iloc[:3])


STRICT: (148, 15)
IMPUTED: (160, 15)


Unnamed: 0,row_id,_sheet,LocalID,Age,Weight,Height,BMI_final,Glycemia,SBP_1T,DBP_1T,TC_1T,TG_1T,HDL_1T,LDL_1T,MSPH
0,cohorte_and_000000,cohorte_and,1,33,68.0,169.0,23.8,76.2,101.0,60.0,239,85.1,69.0,153.0,1
1,cohorte_and_000001,cohorte_and,2,37,58.8,162.0,22.4,71.9,101.0,58.0,222,97.0,65.6,137.0,1
2,cohorte_and_000002,cohorte_and,3,30,63.0,160.0,24.6,79.8,113.0,58.0,197,100.0,56.3,120.7,1


In [5]:
# ============================================================
# COLUMN ROLE CONFIGURATION
# ============================================================

# Target (label) column; kept out of the descriptor feature set.
LABEL_COL = "MSPH"

# Identifier / grouping columns; names that are absent from a dataframe
# are simply skipped when roles are resolved.
META_COLS = ["row_id", "_sheet", "LocalID"]

# Descriptor selection mode:
#   True  -> every numeric column except label/metadata (recommended)
#   False -> the explicit list in MANUAL_FEATURE_COLS
USE_AUTO_FEATURES = True

# Only consulted when USE_AUTO_FEATURES is False.
MANUAL_FEATURE_COLS: List[str] = []

# ============================================================

def resolve_columns(df: pd.DataFrame) -> tuple[str, List[str], List[str]]:
    """
    Split a dataframe's columns into label, metadata and descriptor features.

    The roles come from the module-level settings ``LABEL_COL``, ``META_COLS``,
    ``USE_AUTO_FEATURES`` and ``MANUAL_FEATURE_COLS``.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.

    Returns
    -------
    tuple[str, List[str], List[str]]
        (label_col, meta_cols_present, feature_cols)

    Raises
    ------
    ValueError
        If the label column is missing, if manual mode is selected with an
        empty manual list, or if no feature column remains after filtering.
    """
    available = df.columns
    if LABEL_COL not in available:
        raise ValueError(f"LABEL_COL='{LABEL_COL}' not found in dataframe columns.")

    # Metadata names that do not exist in this dataframe are dropped silently.
    meta_present = [name for name in META_COLS if name in available]
    excluded = {LABEL_COL, *meta_present}

    if not USE_AUTO_FEATURES:
        if not MANUAL_FEATURE_COLS:
            raise ValueError("USE_AUTO_FEATURES=False but MANUAL_FEATURE_COLS is empty.")
        # Manual mode: keep the listed columns that actually exist.
        candidates = [name for name in MANUAL_FEATURE_COLS if name in available]
    else:
        # Auto mode: every numeric column is a candidate.
        candidates = df.select_dtypes(include=[np.number]).columns.tolist()

    # Label and metadata never count as descriptor features.
    feature_cols = [name for name in candidates if name not in excluded]
    if not feature_cols:
        raise ValueError("No feature columns resolved. Check LABEL_COL/META_COLS or descriptor dtypes.")

    return LABEL_COL, meta_present, feature_cols


# Resolve column roles independently for each variant (strict first).
(label_s, meta_s, feat_s), (label_i, meta_i, feat_i) = (
    resolve_columns(frame) for frame in (df_strict, df_imputed)
)

# Report what was resolved for the strict variant ...
print("Resolved (strict):")
print("  label:", label_s)
print("  meta:", meta_s)
print("  #features:", len(feat_s))

# ... and for the imputed variant.
print("Resolved (imputed):")
print("  label:", label_i)
print("  meta:", meta_i)
print("  #features:", len(feat_i))


Resolved (strict):
  label: MSPH
  meta: ['row_id', '_sheet', 'LocalID']
  #features: 11
Resolved (imputed):
  label: MSPH
  meta: ['row_id', '_sheet', 'LocalID']
  #features: 11


In [6]:
def build_schema_manifest(
    df: pd.DataFrame,
    dataset_name: str,
    label_col: str,
    meta_cols: List[str],
    feat_cols: List[str],
) -> pd.DataFrame:
    """
    Describe every column of a dataset by dtype and assigned role.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset.
    dataset_name : str
        Identifier (e.g., 'strict', 'imputed').
    label_col : str
        Label column.
    meta_cols : List[str]
        Metadata columns.
    feat_cols : List[str]
        Descriptor feature columns.

    Returns
    -------
    pd.DataFrame
        Manifest with columns: dataset, col, dtype, role. Rows are ordered by
        role, then by column name. Columns without an assigned role are
        reported as 'other'.
    """
    # Later updates win when a name appears in several groups:
    # label < metadata < feature.
    role_by_col: Dict[str, str] = {label_col: "label"}
    role_by_col.update(dict.fromkeys(meta_cols, "metadata"))
    role_by_col.update(dict.fromkeys(feat_cols, "feature"))

    column_roles = [role_by_col.get(name, "other") for name in df.columns]

    unsorted = pd.DataFrame({
        "dataset": dataset_name,
        "col": df.columns,
        "dtype": df.dtypes.astype(str).values,
        "role": column_roles,
    })
    ordered = unsorted.sort_values(["role", "col"])
    return ordered.reset_index(drop=True)


# One manifest per variant.
manifest_strict = build_schema_manifest(
    df_strict, "strict", label_col=label_s, meta_cols=meta_s, feat_cols=feat_s
)
manifest_imputed = build_schema_manifest(
    df_imputed, "imputed", label_col=label_i, meta_cols=meta_i, feat_cols=feat_i
)

# Stack both manifests into one table and persist it.
schema_manifest = pd.concat((manifest_strict, manifest_imputed), ignore_index=True)
schema_path = STATS_DIR.joinpath("schema_manifest.csv")
schema_manifest.to_csv(schema_path, index=False)

print("Saved:", schema_path)
display(schema_manifest.iloc[:25])


Saved: /home/david/Desktop/colabs/serra_ramon/hypercholesterolemia_classifiers/results/eda/stats/schema_manifest.csv


Unnamed: 0,dataset,col,dtype,role
0,strict,Age,int64,feature
1,strict,BMI_final,float64,feature
2,strict,DBP_1T,float64,feature
3,strict,Glycemia,float64,feature
4,strict,HDL_1T,float64,feature
5,strict,Height,float64,feature
6,strict,LDL_1T,float64,feature
7,strict,SBP_1T,float64,feature
8,strict,TC_1T,int64,feature
9,strict,TG_1T,float64,feature


In [7]:
def dataset_overview(
    df: pd.DataFrame,
    dataset_name: str,
    subset: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Compute dataset-level overview metrics.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    dataset_name : str
        Dataset identifier.
    subset : Optional[List[str]]
        Columns to compare when looking for duplicate rows. The default
        (``None``) compares all columns. Note that if the dataframe carries a
        per-row unique identifier, the all-column comparison can never report
        a duplicate; pass the non-identifier columns here to make the check
        meaningful.

    Returns
    -------
    pd.DataFrame
        One-row overview table with columns: dataset, n_rows, n_cols,
        duplicate_rows, duplicate_pct.
    """
    n_rows, n_cols = df.shape

    # duplicated() marks every repeat after the first occurrence.
    dup = int(df.duplicated(subset=subset).sum())
    # Guard the percentage against an empty dataframe.
    dup_pct = float((dup / n_rows) * 100.0) if n_rows else 0.0

    return pd.DataFrame([{
        "dataset": dataset_name,
        "n_rows": n_rows,
        "n_cols": n_cols,
        "duplicate_rows": dup,
        "duplicate_pct": dup_pct,
    }])


# One overview row per variant, stacked into a single table.
# NOTE(review): duplicates are counted over all columns, identifier columns
# included, so a unique row identifier keeps the duplicate count at zero.
overview = pd.concat(
    [
        dataset_overview(frame, tag)
        for frame, tag in ((df_strict, "strict"), (df_imputed, "imputed"))
    ],
    ignore_index=True,
)

display(overview)

# Persist the table for downstream reporting.
overview_path = STATS_DIR.joinpath("dataset_overview.csv")
overview.to_csv(overview_path, index=False)
print("Saved:", overview_path)


Unnamed: 0,dataset,n_rows,n_cols,duplicate_rows,duplicate_pct
0,strict,148,15,0,0.0
1,imputed,160,15,0,0.0


Saved: /home/david/Desktop/colabs/serra_ramon/hypercholesterolemia_classifiers/results/eda/stats/dataset_overview.csv


In [8]:
def feature_missingness(df: pd.DataFrame, feature_cols: List[str], dataset_name: str) -> pd.DataFrame:
    """
    Count missing values per descriptor feature.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    feature_cols : List[str]
        Descriptor feature columns.
    dataset_name : str
        Dataset identifier.

    Returns
    -------
    pd.DataFrame
        Table with columns: dataset, feature, missing, missing_pct, ordered by
        missing_pct in descending order.
    """
    features = df[feature_cols]
    n_missing = features.isna().sum()

    # Use at least 1 as denominator so an empty dataframe yields 0% everywhere.
    denominator = max(len(features), 1)
    pct_missing = (n_missing / denominator) * 100.0

    table = pd.DataFrame({
        "dataset": dataset_name,
        "feature": feature_cols,
        "missing": n_missing.values,
        "missing_pct": pct_missing.values,
    })
    ranked = table.sort_values("missing_pct", ascending=False)
    return ranked.reset_index(drop=True)


# Missingness tables, restricted to the resolved descriptor features.
miss_strict = feature_missingness(df_strict, feature_cols=feat_s, dataset_name="strict")
miss_imputed = feature_missingness(df_imputed, feature_cols=feat_i, dataset_name="imputed")

# Show the 20 features with the highest missingness for each variant.
display(miss_strict.iloc[:20], miss_imputed.iloc[:20])

# Persist the full tables.
miss_strict.to_csv(STATS_DIR.joinpath("missingness_features_strict.csv"), index=False)
miss_imputed.to_csv(STATS_DIR.joinpath("missingness_features_imputed.csv"), index=False)
print("Saved missingness tables.")

def plot_missingness_top(df_miss: pd.DataFrame, dataset_name: str, top_n: int = 40) -> Optional[Path]:
    """
    Plot a missingness bar chart for the top-N features with missing values.

    The figure is written to the module-level ``FIG_DIR`` directory.

    Parameters
    ----------
    df_miss : pd.DataFrame
        Missingness table from `feature_missingness` (needs the columns
        'feature' and 'missing_pct'). It does not have to be pre-sorted.
    dataset_name : str
        Dataset identifier.
    top_n : int
        Maximum number of features to plot.

    Returns
    -------
    Optional[Path]
        Saved figure path, or None when no feature has missingness > 0
        (in which case nothing is plotted or written).
    """
    # Rank here rather than trusting the caller's order; the stable sort keeps
    # the incoming order among ties, so a pre-sorted table is left unchanged.
    ranked = df_miss.sort_values("missing_pct", ascending=False, kind="stable")
    # Filter before truncating so zero-missingness rows never use up top-N slots.
    tmp = ranked[ranked["missing_pct"] > 0].head(top_n)
    if tmp.empty:
        print(f"[{dataset_name}] No missingness > 0.")
        return None

    # Figure height grows with the number of bars, with a 3-inch minimum.
    fig, ax = plt.subplots(figsize=(10, max(3, 0.25 * len(tmp))))
    sns.barplot(data=tmp, y="feature", x="missing_pct", ax=ax)
    ax.set_xlabel("Missingness (%)")
    ax.set_ylabel("Feature")
    ax.set_title(f"Top missingness (features only) — {dataset_name} (top {top_n})")
    fig.tight_layout()

    out = FIG_DIR / f"missingness_features_top{top_n}_{dataset_name}.png"
    fig.savefig(out)
    plt.show()
    # Release the figure explicitly so repeated calls do not accumulate figures.
    plt.close(fig)
    print("Saved:", out)
    return out

# Bar charts of the (up to) 50 features with the most missing values.
plot_missingness_top(df_miss=miss_strict, dataset_name="strict", top_n=50)
plot_missingness_top(df_miss=miss_imputed, dataset_name="imputed", top_n=50)


Unnamed: 0,dataset,feature,missing,missing_pct
0,strict,Age,0,0.0
1,strict,Weight,0,0.0
2,strict,Height,0,0.0
3,strict,BMI_final,0,0.0
4,strict,Glycemia,0,0.0
5,strict,SBP_1T,0,0.0
6,strict,DBP_1T,0,0.0
7,strict,TC_1T,0,0.0
8,strict,TG_1T,0,0.0
9,strict,HDL_1T,0,0.0


Unnamed: 0,dataset,feature,missing,missing_pct
0,imputed,Age,0,0.0
1,imputed,Weight,0,0.0
2,imputed,Height,0,0.0
3,imputed,BMI_final,0,0.0
4,imputed,Glycemia,0,0.0
5,imputed,SBP_1T,0,0.0
6,imputed,DBP_1T,0,0.0
7,imputed,TC_1T,0,0.0
8,imputed,TG_1T,0,0.0
9,imputed,HDL_1T,0,0.0


Saved missingness tables.
[strict] No missingness > 0.
[imputed] No missingness > 0.


In [9]:
def constant_features(df: pd.DataFrame, feature_cols: List[str], dataset_name: str) -> pd.DataFrame:
    """
    List descriptor features that hold at most one distinct non-missing value.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    feature_cols : List[str]
        Descriptor features.
    dataset_name : str
        Dataset identifier.

    Returns
    -------
    pd.DataFrame
        Table with columns: dataset, feature (one row per constant feature).
    """
    # NaN is not counted as a value, so an all-missing column is also reported.
    distinct_counts = df[feature_cols].nunique(dropna=True)
    constant_names = [name for name, n_distinct in distinct_counts.items() if n_distinct <= 1]

    return pd.DataFrame({
        "dataset": dataset_name,
        "feature": constant_names,
    })


# Constant-feature tables for each variant.
const_strict = constant_features(df_strict, feature_cols=feat_s, dataset_name="strict")
const_imputed = constant_features(df_imputed, feature_cols=feat_i, dataset_name="imputed")

# Show at most 20 rows of each table.
display(const_strict.iloc[:20], const_imputed.iloc[:20])

# Persist the full tables.
const_strict.to_csv(STATS_DIR.joinpath("constant_features_strict.csv"), index=False)
const_imputed.to_csv(STATS_DIR.joinpath("constant_features_imputed.csv"), index=False)
print("Saved constant feature tables.")


Unnamed: 0,dataset,feature


Unnamed: 0,dataset,feature


Saved constant feature tables.


In [10]:
def numeric_feature_summary(df: pd.DataFrame, feature_cols: List[str], dataset_name: str) -> pd.DataFrame:
    """
    Summarise the numeric descriptor features with descriptive statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    feature_cols : List[str]
        Descriptor features.
    dataset_name : str
        Dataset identifier.

    Returns
    -------
    pd.DataFrame
        One row per numeric feature with the `describe` statistics at the
        1/5/25/50/75/95/99 percentiles plus the derived columns iqr, range and
        max_abs. An empty, column-less dataframe is returned when the numeric
        selection is empty.
    """
    # Non-numeric columns in the request are dropped, not rejected.
    numeric = df[feature_cols].select_dtypes(include=[np.number])
    if numeric.empty:
        return pd.DataFrame()

    quantiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
    # Transposed so that each row describes one feature.
    stats = numeric.describe(percentiles=quantiles).T

    summary = stats.reset_index(drop=True)
    summary.insert(0, "dataset", dataset_name)
    summary.insert(1, "feature", stats.index)

    # Derived spread / magnitude indicators.
    return summary.assign(
        iqr=lambda t: t["75%"] - t["25%"],
        range=lambda t: t["max"] - t["min"],
        max_abs=lambda t: t[["min", "max"]].abs().max(axis=1),
    )


# Descriptive statistics of the descriptor features, per variant.
sum_strict = numeric_feature_summary(df_strict, feature_cols=feat_s, dataset_name="strict")
sum_imputed = numeric_feature_summary(df_imputed, feature_cols=feat_i, dataset_name="imputed")

# Show at most 20 features of each summary.
display(sum_strict.iloc[:20], sum_imputed.iloc[:20])

# Persist the full summaries.
sum_strict.to_csv(STATS_DIR.joinpath("numeric_feature_summary_strict.csv"), index=False)
sum_imputed.to_csv(STATS_DIR.joinpath("numeric_feature_summary_imputed.csv"), index=False)
print("Saved numeric feature summaries.")


Unnamed: 0,dataset,feature,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max,iqr,range,max_abs
0,strict,Age,148.0,29.027027,5.703497,16.0,16.47,18.35,24.75,30.0,33.0,37.0,38.0,39.0,8.25,23.0,39.0
1,strict,Weight,148.0,64.615541,12.633816,42.0,46.0,49.0,57.0,60.5,71.75,88.325,104.01,115.0,14.75,73.0,115.0
2,strict,Height,148.0,161.351351,6.647453,145.0,147.94,151.0,156.75,161.0,167.0,171.0,176.0,178.0,10.25,33.0,178.0
3,strict,BMI_final,148.0,24.838455,4.823313,17.03,18.282,18.835,21.5,23.8,27.6,32.76,39.391,46.6,6.1,29.57,46.6
4,strict,Glycemia,148.0,77.791216,9.197137,52.0,57.41,63.0,72.875,78.25,82.0,90.0,95.65,134.0,9.125,82.0,134.0
5,strict,SBP_1T,148.0,107.641892,10.116337,80.0,85.29,90.0,100.0,109.0,112.25,121.3,134.53,140.0,12.25,60.0,140.0
6,strict,DBP_1T,148.0,64.77027,9.103408,27.0,44.41,51.7,60.0,64.0,70.0,80.0,85.65,99.0,10.0,72.0,99.0
7,strict,TC_1T,148.0,183.885135,28.666395,121.0,124.41,144.35,163.0,183.0,203.0,233.3,252.59,259.0,40.0,138.0,259.0
8,strict,TG_1T,148.0,108.635811,44.337773,36.0,43.29,55.0,74.75,100.0,130.25,194.3,237.65,246.0,55.5,210.0,246.0
9,strict,HDL_1T,148.0,56.939865,13.227886,18.0,27.64,36.35,47.0,56.9,65.05,80.295,87.495,100.2,18.05,82.2,100.2


Unnamed: 0,dataset,feature,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max,iqr,range,max_abs
0,imputed,Age,160.0,29.2625,5.88088,13.0,16.0,18.0,25.75,30.0,33.0,37.0,38.41,41.0,7.25,28.0,41.0
1,imputed,Weight,160.0,64.62875,12.387666,42.0,46.0,49.0,57.0,61.0,71.0,88.025,101.97,115.0,14.0,73.0,115.0
2,imputed,Height,160.0,161.21875,6.659435,145.0,148.18,150.95,156.0,161.0,166.25,171.0,176.0,178.0,10.25,33.0,178.0
3,imputed,BMI_final,160.0,24.885571,4.717931,17.03,18.354,18.895,21.7,23.85,27.525,32.9,38.827,46.6,5.825,29.57,46.6
4,imputed,Glycemia,160.0,77.718125,8.943272,52.0,57.77,63.0,73.0,78.1,81.85,90.0,95.05,134.0,8.85,82.0,134.0
5,imputed,SBP_1T,160.0,107.3875,10.035464,80.0,86.13,90.0,100.0,108.5,112.0,120.1,134.41,140.0,12.0,60.0,140.0
6,imputed,DBP_1T,160.0,64.575,8.960114,27.0,44.77,50.95,60.0,64.0,70.0,80.0,85.05,99.0,10.0,72.0,99.0
7,imputed,TC_1T,160.0,183.3625,29.069314,121.0,124.77,143.8,161.75,181.0,203.0,238.05,252.23,259.0,41.25,138.0,259.0
8,imputed,TG_1T,160.0,108.956875,44.007899,36.0,38.77,55.0,75.75,102.5,130.25,195.05,237.05,246.0,54.5,210.0,246.0
9,imputed,HDL_1T,160.0,56.800625,13.285162,18.0,24.95,36.0,47.0,56.9,65.05,80.115,87.315,100.2,18.05,82.2,100.2


Saved numeric feature summaries.


In [11]:
def flag_crazy_ranges(
    summary_table: pd.DataFrame,
    max_abs_threshold: float = 1e6,
    huge_range_threshold: float = 1e6,
) -> pd.DataFrame:
    """
    Select features whose magnitude or spread reaches a suspicious size.

    Parameters
    ----------
    summary_table : pd.DataFrame
        Output from `numeric_feature_summary` (needs the columns 'max_abs'
        and 'range').
    max_abs_threshold : float
        A feature is flagged when its max(|min|, |max|) is at or above this.
    huge_range_threshold : float
        A feature is flagged when its range (max - min) is at or above this.

    Returns
    -------
    pd.DataFrame
        Flagged rows ordered by max_abs, then range, both descending. An
        empty, column-less dataframe is returned for an empty input.
    """
    if summary_table.empty:
        return pd.DataFrame()

    too_large = summary_table["max_abs"] >= max_abs_threshold
    too_wide = summary_table["range"] >= huge_range_threshold

    # Either criterion is enough to flag a feature.
    suspicious = summary_table.loc[too_large | too_wide].copy()
    ranked = suspicious.sort_values(["max_abs", "range"], ascending=False)
    return ranked.reset_index(drop=True)


# Flag suspicious magnitudes/ranges using the default thresholds.
crazy_strict = flag_crazy_ranges(summary_table=sum_strict)
crazy_imputed = flag_crazy_ranges(summary_table=sum_imputed)

# Show at most 30 flagged features per variant.
display(crazy_strict.iloc[:30], crazy_imputed.iloc[:30])

# Persist the flagged tables.
crazy_strict.to_csv(STATS_DIR.joinpath("crazy_ranges_features_strict.csv"), index=False)
crazy_imputed.to_csv(STATS_DIR.joinpath("crazy_ranges_features_imputed.csv"), index=False)
print("Saved crazy range tables.")


Unnamed: 0,dataset,feature,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max,iqr,range,max_abs


Unnamed: 0,dataset,feature,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max,iqr,range,max_abs


Saved crazy range tables.


In [12]:
# Features resolved in both variants, in alphabetical order.
feat_intersection = sorted(set(feat_s) & set(feat_i))
pd.DataFrame(feat_intersection, columns=["feature"]).to_csv(
    STATS_DIR.joinpath("feature_set_intersection_from_notebook01.csv"),
    index=False,
)

print("Intersection features:", len(feat_intersection))
print("Saved:", STATS_DIR.joinpath("feature_set_intersection_from_notebook01.csv"))


Intersection features: 11
Saved: /home/david/Desktop/colabs/serra_ramon/hypercholesterolemia_classifiers/results/eda/stats/feature_set_intersection_from_notebook01.csv


## Notes for Notebook 02

Notebook 02 will use the same explicit column roles:
- `LABEL_COL`: used for conditioning plots (hue/x)
- `META_COLS`: ignored for descriptor summaries
- Descriptor features: numeric columns excluding label/meta (or manual list)

This prevents contamination of descriptor statistics with the response label.