# First Models

## Preprocessing

First, we collect the preprocessing steps derived from the `notebooks/01_data_exploration.ipynb` analysis in one place, so that new modeling notebooks can import and reuse the same, consistent logic.

Key capabilities:
- Load PTBDB (normal/abnormal) and MITBIH (train/test) CSV datasets
- Drop duplicates in PTBDB partitions
- Provide features/targets split with column 187 as target
- Optional float32 downcasting to reduce memory
- Compute class weights for imbalanced classification
- Provide stratified train/val split helpers
- Compute the zero-padding start index per row (for diagnostics and feature engineering)

Signals are already normalized to [0, 1] per the source datasets.
Everything developed here is also placed in `src/utils/preprocessing` so it can be reused later.

In [2]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight


In [3]:
# Positional index of the label column in the raw CSV rows.
TARGET_COLUMN_INDEX: int = 187
# Positional slice selecting every signal column that precedes the label.
FEATURE_COLUMN_RANGE: slice = slice(0, TARGET_COLUMN_INDEX)  # 0..186
# Seed constant. NOTE(review): the functions visible in this section do not
# reference it; they hard-code 42 as their own `random_state` default.
RANDOM_STATE: int = 42

# MITBIH label mapping for reporting/plots (keep numeric labels for modeling)
MITBIH_LABELS_MAP: Dict[int, str] = {0: "N", 1: "S", 2: "V", 3: "F", 4: "Q"}
# Human-readable description for each short MITBIH class code above.
MITBIH_LABELS_TO_DESC: Dict[str, str] = {
    "N": "Normal",
    "S": "Supraventricular premature beat",
    "V": "Premature ventricular contraction",
    "F": "Fusion of V+N",
    "Q": "Unclassified",
}


@dataclass
class DatasetSplit:
    """Bundle of train/validation/test partitions plus training class weights.

    Instances are produced by `prepare_mitbih` and `prepare_ptbdb`. The
    validation and test members, as well as `class_weight`, are typed as
    optional, so consumers should be prepared for `None`.
    """

    # Feature matrices (signal columns only) for each partition.
    X_train: pd.DataFrame
    X_val: Optional[pd.DataFrame]
    X_test: Optional[pd.DataFrame]
    # Integer class labels matching the feature matrices above.
    y_train: pd.Series
    y_val: Optional[pd.Series]
    y_test: Optional[pd.Series]
    # Mapping of class label -> weight, derived from `y_train`.
    class_weight: Optional[Dict[int, float]]

    # Note: Outlier removal (if enabled) is applied after splitting. Class
    # weights are computed on the final training labels so training loop
    # can pass them directly to supported estimators.

In [4]:
def _load_csv(path: Union[str, Path]) -> pd.DataFrame:
    """Load one header-less dataset CSV into a DataFrame.

    Because the file is read with ``header=None``, the resulting columns are
    labelled with consecutive integers. Each row is expected to hold a 1D
    signal of length 188 whose column 187 carries the target label.
    """
    csv_location = str(path)
    frame = pd.read_csv(csv_location, header=None)
    return frame


def load_ptbdb(
    data_dir: Union[str, Path] = "../data/original",
    drop_duplicates: bool = True,
) -> pd.DataFrame:
    """Read both PTBDB partitions and stack them into one DataFrame.

    Args:
        data_dir: Directory holding ``ptbdb_normal.csv`` and
            ``ptbdb_abnormal.csv``.
        drop_duplicates: When true, duplicated rows are removed from each
            partition separately before the two are combined (duplicates
            were observed during data exploration).

    Returns:
        The abnormal rows followed by the normal rows, re-indexed from 0,
        with features in columns 0..186 and the target in column 187.
    """
    root = Path(data_dir)
    normal_part = _load_csv(root / "ptbdb_normal.csv")
    abnormal_part = _load_csv(root / "ptbdb_abnormal.csv")

    # Abnormal rows come first in the combined frame.
    partitions = [abnormal_part, normal_part]
    if drop_duplicates:
        partitions = [part.drop_duplicates() for part in partitions]

    return pd.concat(partitions, axis=0, ignore_index=True)


def load_mitbih(
    data_dir: Union[str, Path] = "../data/original",
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the MITBIH train and test CSVs without modifying them.

    Args:
        data_dir: Directory holding ``mitbih_train.csv`` and
            ``mitbih_test.csv``.

    Returns:
        ``(train, test)`` DataFrames exactly as loaded from disk.
    """
    root = Path(data_dir)
    train_frame, test_frame = (
        _load_csv(root / filename)
        for filename in ("mitbih_train.csv", "mitbih_test.csv")
    )
    return train_frame, test_frame


def split_features_target(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Separate a frame into its feature matrix and integer label vector.

    Selection is positional: columns 0..186 become ``X`` unchanged, and
    column 187 is cast to ``int`` and returned as ``y``.
    """
    features = df.iloc[:, FEATURE_COLUMN_RANGE]
    labels = df.iloc[:, TARGET_COLUMN_INDEX]
    return features, labels.astype(int)


def compute_balanced_class_weight(y: Union[pd.Series, np.ndarray]) -> Dict[int, float]:
    """Derive per-class weights that offset class imbalance in ``y``.

    The weights come from scikit-learn's ``compute_class_weight`` with
    ``class_weight="balanced"`` over the distinct labels found in ``y``.

    Returns:
        A plain ``{label: weight}`` dict with built-in ``int`` keys and
        ``float`` values.
    """
    distinct_labels = np.unique(y)
    balanced = compute_class_weight(
        class_weight="balanced", classes=distinct_labels, y=y
    )

    weight_map: Dict[int, float] = {}
    for label, weight in zip(distinct_labels, balanced):
        weight_map[int(label)] = float(weight)
    return weight_map


def find_zero_padding_start(sequence_row: Union[pd.Series, np.ndarray]) -> int:
    """Estimate where the right-side zero-padding of one signal row begins.

    The row is searched for its last non-zero entry. With ``k`` being that
    entry's position, the function returns ``int((k + 1) / 1.2)``. A row that
    is empty or holds only zeros yields ``0``.

    NOTE(review): because of the division by 1.2 the result is a scaled-down,
    truncated value rather than the literal position following the last
    non-zero sample. The original inline remark only states that the constant
    is "pre-defined from dataset" -- confirm the intent before treating the
    return value as an exact index.
    """
    samples = sequence_row.values if isinstance(sequence_row, pd.Series) else sequence_row

    nonzero_positions = np.flatnonzero(np.asarray(samples) != 0)
    if nonzero_positions.size == 0:
        return 0

    last_nonzero = int(nonzero_positions[-1])
    return int((last_nonzero + 1) / 1.2)  # pre-defined from dataset


def compute_zero_padding_feature(df: pd.DataFrame) -> pd.Series:
    """Evaluate `find_zero_padding_start` on every row of ``df``.

    Only the signal columns (positions 0..186) are passed to the row-wise
    function; the result is its return value for each row, one entry per row.
    """
    signal_columns = df.iloc[:, FEATURE_COLUMN_RANGE]
    pad_starts = signal_columns.apply(find_zero_padding_start, axis=1)
    return pad_starts


def stratified_train_val_split(
    X: pd.DataFrame,
    y: pd.Series,
    val_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split ``X``/``y`` into train and validation parts, stratified on ``y``.

    Args:
        X: Feature matrix.
        y: Labels; also used as the stratification key.
        val_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_val, y_train, y_val)``.
    """
    pieces = train_test_split(
        X, y, test_size=val_size, random_state=random_state, stratify=y
    )
    features_train, features_val, labels_train, labels_val = pieces
    return features_train, features_val, labels_train, labels_val


def drop_zero_pad_outliers_with_bounds(
    df: pd.DataFrame,
    bounds: pd.DataFrame,
    zero_pad_start: Optional[pd.Series] = None,
) -> pd.DataFrame:
    """Keep only rows whose `zero_pad_start` lies within its class's bounds.

    Args:
        df: Frame with the target label in positional column 187.
        bounds: Frame indexed by class with ``lower`` and ``upper`` columns.
        zero_pad_start: Precomputed per-row feature; computed from ``df``
            via `compute_zero_padding_feature` when omitted.

    Returns:
        The rows of ``df`` whose value is inside ``[lower, upper]`` (both
        ends inclusive). Rows whose class has no entry in ``bounds`` are
        always kept.
    """
    if zero_pad_start is None:
        zero_pad_start = compute_zero_padding_feature(df)
    labels = df.iloc[:, TARGET_COLUMN_INDEX].astype(int)

    lookup = pd.DataFrame(
        {"zero_pad_start": zero_pad_start, "target": labels.values}, index=df.index
    )
    # Attach each row's class-specific limits; unknown classes get NaN.
    lookup = lookup.join(bounds, on="target", how="left")

    class_unseen = lookup["lower"].isna()
    inside_limits = lookup["zero_pad_start"].between(lookup["lower"], lookup["upper"])
    return df.loc[class_unseen | inside_limits]


def fit_zero_pad_whisker_bounds(
    df: pd.DataFrame,
    zero_pad_start: Optional[pd.Series] = None,
    whisker_k: float = 1.5,
) -> pd.DataFrame:
    """Estimate per-class Tukey whisker limits for `zero_pad_start`.

    For every class (positional column 187 of ``df``) the limits are
    ``q1 - whisker_k * iqr`` and ``q3 + whisker_k * iqr``, where ``q1`` and
    ``q3`` are the 0.25 and 0.75 quantiles and ``iqr = q3 - q1``.

    Args:
        df: Frame with the target label in positional column 187.
        zero_pad_start: Precomputed per-row feature; computed from ``df``
            via `compute_zero_padding_feature` when omitted.
        whisker_k: Multiplier applied to the inter-quartile range.

    Returns:
        A DataFrame indexed by class with columns ``lower`` and ``upper``.
    """
    if zero_pad_start is None:
        zero_pad_start = compute_zero_padding_feature(df)
    labels = df.iloc[:, TARGET_COLUMN_INDEX].astype(int)

    per_row = pd.DataFrame({"zero_pad_start": zero_pad_start, "target": labels.values})
    quartiles = (
        per_row.groupby("target")["zero_pad_start"].quantile([0.25, 0.75]).unstack()
    )

    first_quartile = quartiles[0.25]
    third_quartile = quartiles[0.75]
    spread = third_quartile - first_quartile
    return pd.DataFrame(
        {
            "lower": first_quartile - whisker_k * spread,
            "upper": third_quartile + whisker_k * spread,
        }
    )


def _remove_zero_pad_outliers_from_splits(
    splits: List[Tuple[pd.DataFrame, pd.Series]],
    whisker_k: float,
) -> List[Tuple[pd.DataFrame, pd.Series]]:
    """Filter zero-padding outliers from all splits using train-fitted bounds.

    Args:
        splits: ``(X, y)`` pairs; the FIRST pair must be the training split,
            because the whisker bounds are fitted on it alone.
        whisker_k: IQR multiplier forwarded to `fit_zero_pad_whisker_bounds`.

    Returns:
        The filtered ``(X, y)`` pairs, in the same order as ``splits``.
    """
    # Reassemble each split into one frame with the label as the last column,
    # which is the layout the zero-padding helpers expect.
    frames = [pd.concat([X, y.rename("target")], axis=1) for X, y in splits]
    train_frame = frames[0]

    zp_train = compute_zero_padding_feature(train_frame)
    bounds = fit_zero_pad_whisker_bounds(train_frame, zp_train, whisker_k=whisker_k)

    # Reuse the already computed feature for the training split; the other
    # splits have it computed inside the drop helper.
    filtered = [drop_zero_pad_outliers_with_bounds(train_frame, bounds, zp_train)]
    filtered.extend(
        drop_zero_pad_outliers_with_bounds(frame, bounds) for frame in frames[1:]
    )
    return [split_features_target(frame) for frame in filtered]


def prepare_mitbih(
    data_dir: Union[str, Path] = "../data/original",
    val_size: float = 0.1,
    random_state: int = 42,
    remove_outliers: bool = False,
    whisker_k: float = 1.5,
) -> DatasetSplit:
    """Load MITBIH train/test, produce train/val split and class weights.

    A validation set is carved out of the provided training file using
    stratification; the provided test file serves as the test partition.

    Args:
        data_dir: Directory containing the MITBIH CSV files.
        val_size: Fraction of the provided training file used for validation.
        random_state: Seed for the stratified split.
        remove_outliers: When true, rows whose zero-padding start falls
            outside per-class whisker bounds (fitted on the training split)
            are dropped from the train, validation AND test partitions.
        whisker_k: IQR multiplier for the whisker bounds.

    Returns:
        A `DatasetSplit` whose class weights are computed from the final
        training labels (i.e. after optional outlier removal).
    """
    train_df, test_df = load_mitbih(data_dir=data_dir)
    X_train_full, y_train_full = split_features_target(train_df)
    X_test, y_test = split_features_target(test_df)

    X_train, X_val, y_train, y_val = stratified_train_val_split(
        X_train_full, y_train_full, val_size=val_size, random_state=random_state
    )

    if remove_outliers:
        (X_train, y_train), (X_val, y_val), (X_test, y_test) = (
            _remove_zero_pad_outliers_from_splits(
                [(X_train, y_train), (X_val, y_val), (X_test, y_test)], whisker_k
            )
        )

    weight_map = compute_balanced_class_weight(y_train)

    return DatasetSplit(
        X_train=X_train,
        X_val=X_val,
        X_test=X_test,
        y_train=y_train,
        y_val=y_val,
        y_test=y_test,
        class_weight=weight_map,
    )


def prepare_ptbdb(
    data_dir: Union[str, Path] = "../data/original",
    test_size: float = 0.2,
    val_size: float = 0.1,
    random_state: int = 42,
    remove_outliers: bool = False,
    whisker_k: float = 1.5,
) -> DatasetSplit:
    """Load PTBDB and produce stratified train/val/test splits and class weights.

    Args:
        data_dir: Directory containing the PTBDB CSV files.
        test_size: Fraction of the full (de-duplicated) data held out as test.
        val_size: Fraction of the REMAINING train+val portion (not of the
            full data) used for validation.
        random_state: Seed used for both stratified splits.
        remove_outliers: When true, rows whose zero-padding start falls
            outside per-class whisker bounds (fitted on the training split)
            are dropped from the train, validation AND test partitions.
        whisker_k: IQR multiplier for the whisker bounds.

    Returns:
        A `DatasetSplit` whose class weights are computed from the final
        training labels (i.e. after optional outlier removal).
    """
    df = load_ptbdb(data_dir=data_dir, drop_duplicates=True)
    X, y = split_features_target(df)

    # First split: train+val vs test.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Second split: train vs val, via the same helper `prepare_mitbih` uses.
    X_train, X_val, y_train, y_val = stratified_train_val_split(
        X_train_val, y_train_val, val_size=val_size, random_state=random_state
    )

    if remove_outliers:
        (X_train, y_train), (X_val, y_val), (X_test, y_test) = (
            _remove_zero_pad_outliers_from_splits(
                [(X_train, y_train), (X_val, y_val), (X_test, y_test)], whisker_k
            )
        )

    weight_map = compute_balanced_class_weight(y_train)

    return DatasetSplit(
        X_train=X_train,
        X_val=X_val,
        X_test=X_test,
        y_train=y_train,
        y_val=y_val,
        y_test=y_test,
        class_weight=weight_map,
    )




# Load Models

In [5]:
# Build the PTBDB and MITBIH `DatasetSplit` objects with default settings
# (CSV files are read from "../data/original"; outlier removal is off).
ptb = prepare_ptbdb()  # optional: remove_outliers=True
mit = prepare_mitbih() # optional: remove_outliers=True

In [8]:
# Preview the first rows of the PTBDB test feature matrix.
ptb.X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,177,178,179,180,181,182,183,184,185,186
6222,1.0,0.782719,0.403109,0.263557,0.172813,0.105206,0.06833,0.076283,0.056038,0.04013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2240,1.0,0.633437,0.325834,0.081071,0.124515,0.141195,0.12917,0.125291,0.140807,0.120248,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14461,1.0,0.761387,0.214027,0.088271,0.138251,0.182588,0.165256,0.158001,0.154776,0.157598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7915,0.992216,0.762874,0.368263,0.298802,0.265868,0.149102,0.180838,0.157485,0.192216,0.15988,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2972,1.0,0.846457,0.707537,0.425197,0.232283,0.179415,0.170979,0.143982,0.136108,0.14117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
