# setup

In [18]:
import sys
import os
import pandas as pd
import numpy as np
import h5py
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
# Extend the import search path with the parent directory and its `src/`
# subdirectory; presumably that is where the project-local `hparams` and
# `dataset` modules imported below live (confirm against the repo layout).
for rel_dir in ('..', '../src'):
    sys.path.append(os.path.abspath(rel_dir))

from hparams import DATA_ROOT
from dataset import DynamicsDataset

In [3]:
# Paths of the waveform `.h5` file, the ICD label `.h5` file and the metadata
# CSV, each located directly under DATA_ROOT.
WAVE_PATH, LABEL_PATH, META_PATH = (
    os.path.join(DATA_ROOT, file_name)
    for file_name in (
        'mimic_iv_ecg_waveforms.h5',
        'mimic_iv_ecg_icd.h5',
        'metadata.csv',
    )
)

In [4]:
# Load the metadata table; later cells filter it on its `fold` and
# `ecg_no_within_stay` columns to cross-check the dataset split sizes.
df = pd.read_csv(META_PATH)

# Train split

In [5]:
# Build the train split with return_pairs=False to confirm it initialises.
# The next cell rebinds `ds_train` with return_pairs=True, so this instance is
# only kept until then.
ds_train = DynamicsDataset(split='train', return_pairs=False)

Initializing Dataset from /home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_waveforms.h5...
   Loading Subject IDs for integrity check...
   > Integrity Check Passed: Waveform and Label files are perfectly aligned.
   Loading Study IDs for integrity check...


In [6]:
# Rebuild the train split with return_pairs=True; this is the `ds_train`
# instance the following cells work with.
ds_train = DynamicsDataset(split='train', return_pairs=True)

Initializing Dataset from /home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_waveforms.h5...
   Loading Subject IDs for integrity check...
   > Integrity Check Passed: Waveform and Label files are perfectly aligned.
   Loading Study IDs for integrity check...
   Scanning for action types...
   > Total Pairs: 575784
   > Stable Pairs: 420018
   > Changed Pairs: 155766


In [7]:
# Sanity-check the number of train pairs against the metadata rows in folds 0-17.
target_folds = list(range(18))
df_train_raw = df.loc[df['fold'].isin(target_folds)]
expected_max = len(df_train_raw)

print(f"Rows in CSV (Folds 0-17): {len(df_train_raw)}")
print(f"Valid Pairs in Dataset:   {len(ds_train)}")

# The pair count must stay strictly below the row count, yet above half of it.
assert expected_max > len(ds_train), "Error: Dataset has more pairs than rows!"
assert expected_max * 0.5 < len(ds_train), "Error: Too many pairs lost! Check alignment."

Rows in CSV (Folds 0-17): 721002
Valid Pairs in Dataset:   575784


# Validation split

In [8]:
# Build the validation split with return_pairs=True to confirm it initialises.
# The next cell rebinds `ds_val` with return_pairs=False.
ds_val = DynamicsDataset(split='val', return_pairs=True)

Initializing Dataset from /home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_waveforms.h5...
   Loading Subject IDs for integrity check...
   > Integrity Check Passed: Waveform and Label files are perfectly aligned.
   Loading Study IDs for integrity check...
   Scanning for action types...
   > Total Pairs: 31397
   > Stable Pairs: 22753
   > Changed Pairs: 8644


In [9]:
# Rebuild the validation split with return_pairs=False. Per the dataset's log
# output, this mode keeps only the first ECG per stay (ecg_no_within_stay == 0).
ds_val = DynamicsDataset(split='val', return_pairs=False)

   > Applying Baseline Filter: Keeping only first ECG per stay (ecg_no_within_stay == 0)
   > Reduced 39464 -> 14456 records.
Initializing Dataset from /home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_waveforms.h5...
   Loading Subject IDs for integrity check...
   > Integrity Check Passed: Waveform and Label files are perfectly aligned.
   Loading Study IDs for integrity check...


In [10]:
# Cross-check the validation dataset size against a manual count from the
# metadata: rows in fold 18 that are the first ECG of their stay.
df_val_filtered = df[(df['fold'] == 18) & (df['ecg_no_within_stay'] == 0)]

print(f"Manual Count (Fold 18 & First ECG): {len(df_val_filtered)}")
print(f"Dataset Count:                      {len(ds_val)}")

# Fail loudly (as the test-split check does) so a count mismatch cannot slip
# past a full "Run All" as a mere printed warning.
assert len(ds_val) == len(df_val_filtered), (
    "❌ Validation Count Mismatch! Check the __init__ filtering logic."
)
print("✅ Validation Filter (First ECG) is working perfectly.")

Manual Count (Fold 18 & First ECG): 14456
Dataset Count:                      14456
✅ Validation Filter (First ECG) is working perfectly.


# Test split

In [11]:
# Build the test split with return_pairs=True to confirm it initialises.
# The next cell rebinds `ds_test` with return_pairs=False.
ds_test = DynamicsDataset(split='test', return_pairs=True)

Initializing Dataset from /home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_waveforms.h5...
   Loading Subject IDs for integrity check...
   > Integrity Check Passed: Waveform and Label files are perfectly aligned.
   Loading Study IDs for integrity check...
   Scanning for action types...
   > Total Pairs: 31502
   > Stable Pairs: 23071
   > Changed Pairs: 8431


In [12]:
# Rebuild the test split with return_pairs=False. Per the dataset's log
# output, this mode keeps only the first ECG per stay (ecg_no_within_stay == 0).
ds_test = DynamicsDataset(split='test', return_pairs=False)

   > Applying Baseline Filter: Keeping only first ECG per stay (ecg_no_within_stay == 0)
   > Reduced 39569 -> 14237 records.
Initializing Dataset from /home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_waveforms.h5...
   Loading Subject IDs for integrity check...
   > Integrity Check Passed: Waveform and Label files are perfectly aligned.
   Loading Study IDs for integrity check...


In [13]:
# Manual reference count: metadata rows in fold 19 that are the first ECG of
# their stay.
is_test_fold = df['fold'] == 19
is_first_ecg = df['ecg_no_within_stay'] == 0
df_test_filtered = df[is_test_fold & is_first_ecg]

print(f"Manual Count (Fold 19 & First ECG): {len(df_test_filtered)}")
print(f"Dataset Count:                      {len(ds_test)}")

# The dataset must expose exactly as many records as the manual filter found.
assert len(df_test_filtered) == len(ds_test)
print("✅ Test Split working perfectly.")

Manual Count (Fold 19 & First ECG): 14237
Dataset Count:                      14237
✅ Test Split working perfectly.


# Patient leakage check

In [14]:
def get_subjects(dataset):
    """Return the set of subject IDs selected by ``dataset.valid_indices``.

    The complete ``subject_id`` dataset is read from the HDF5 file at
    ``WAVE_PATH`` into memory and then indexed with the dataset's
    ``valid_indices``; duplicates collapse because the result is a ``set``.

    Args:
        dataset: Object exposing a ``valid_indices`` attribute usable as an
            index into the in-memory ``subject_id`` array.

    Returns:
        set: The distinct subject IDs found at those indices.
    """
    row_idx = dataset.valid_indices
    # Load the full column in one read and index it in memory afterwards,
    # instead of indexing the on-disk dataset directly; this relies on the
    # whole column fitting in RAM.
    with h5py.File(WAVE_PATH, 'r') as h5_file:
        subject_column = h5_file['subject_id'][:]
    selected_subjects = subject_column[row_idx]
    return set(selected_subjects)

In [15]:
# Verify that no subject appears in more than one of the train/val/test splits.
# NOTE(review): at this point `ds_train` was built with return_pairs=True while
# `ds_val` and `ds_test` were built with return_pairs=False — confirm that
# `valid_indices` covers the intended records in both modes.
print("Extracting Subject IDs from Datasets...")
train_subjs = get_subjects(ds_train)
val_subjs = get_subjects(ds_val)
test_subjs = get_subjects(ds_test)

print(f"Unique Patients - Train: {len(train_subjs)}")
print(f"Unique Patients - Val:   {len(val_subjs)}")
print(f"Unique Patients - Test:  {len(test_subjs)}")

# Pairwise intersections between the three splits
train_val_leak = train_subjs.intersection(val_subjs)
train_test_leak = train_subjs.intersection(test_subjs)
val_test_leak = val_subjs.intersection(test_subjs)

if len(train_val_leak) == 0 and len(train_test_leak) == 0 and len(val_test_leak) == 0:
    print("✅ PASS: Zero Patient Leakage detected.")
else:
    print("❌ FAIL: Leakage detected!")
    print(f"Train/Val Overlap: {len(train_val_leak)}")
    print(f"Train/Test Overlap: {len(train_test_leak)}")
    # Report the third pair too: it is part of the pass/fail condition, so a
    # Val/Test-only leak would otherwise show "FAIL" with two zero counts.
    print(f"Val/Test Overlap: {len(val_test_leak)}")

Extracting Subject IDs from Datasets...
Unique Patients - Train: 93627
Unique Patients - Val:   6212
Unique Patients - Test:  6175
✅ PASS: Zero Patient Leakage detected.


# DataLoader smoke test

In [30]:
# Smoke-test target: the test split with return_pairs=False, in batches of 4.
# To exercise another dataset instead, wrap `ds_train` (optionally with
# shuffle=True) or a 'val' split built with return_pairs=True in the DataLoader.
ds_test = DynamicsDataset(split='test', return_pairs=False)
dl = DataLoader(ds_test, batch_size=4)

   > Applying Baseline Filter: Keeping only first ECG per stay (ecg_no_within_stay == 0)
   > Reduced 39569 -> 14237 records.
Initializing Dataset from /home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_waveforms.h5...
   Loading Subject IDs for integrity check...
   > Integrity Check Passed: Waveform and Label files are perfectly aligned.
   Loading Study IDs for integrity check...


In [None]:
# Pull every batch from the loader once and discard it, with a tqdm progress
# bar; the loop body does nothing, so this only checks that iteration completes.
for batch in tqdm(dl):
    pass