# setup

In [1]:
import pandas as pd
import h5py
import numpy as np
import os
import argparse
from tqdm import tqdm
import sys

In [2]:
# Put the parent directory on the import path so the `hparams` module
# (presumably located one level above this notebook — confirm) can be imported.
sys.path.append(os.path.abspath(os.pardir))
from hparams import DATA_ROOT, MIMIC_ECG_ROOT, MIMIC_ICD_ROOT

In [3]:
# Read-only input: the PhysioNet record table that carries the fold columns.
original_csv = os.path.join(MIMIC_ICD_ROOT, 'records_w_diag_icd10.csv')

# Files under DATA_ROOT: the waveform HDF5 that is indexed below, and the
# metadata CSV this notebook writes at the end.
waveform_h5 = os.path.join(DATA_ROOT, 'mimic_iv_ecg_waveforms.h5')
output_csv = os.path.join(DATA_ROOT, 'metadata.csv')

# h5: read record identifiers from the waveform file

In [4]:
with h5py.File(waveform_h5, 'r') as f:
    # Both identifier datasets are required; without them the rows of the
    # HDF5 file cannot be matched back to the CSV records.
    if not all(key in f for key in ('subject_id', 'study_id')):
        raise ValueError("HDF5 file must contain 'subject_id' and 'study_id' datasets to reconstruct metadata.")

    # `[:]` copies each dataset fully into memory as a NumPy array, so the
    # values stay usable after the file is closed.
    h5_subjects, h5_studies = f['subject_id'][:], f['study_id'][:]

# One row per HDF5 record. `h5_index` is the positional row number inside the
# HDF5 datasets (the original author flags it as the column used by
# Dataset.__getitem__).
df_h5 = pd.DataFrame({
    'h5_index': np.arange(h5_subjects.shape[0]),
    'subject_id': h5_subjects,
    'study_id': h5_studies,
})

print(f"   > HDF5 contains {len(df_h5)} records.")

   > HDF5 contains 800035 records.


# icd splits: load fold assignments from the ICD label table

In [5]:
print(f"2. Reading Original PhysioNet CSV from: {original_csv}...")
# Only the join keys (subject_id, study_id) and the two columns that are
# attached to the HDF5 rows are loaded; the rest of the CSV is skipped.
cols_to_use = ['subject_id', 'study_id', 'fold', 'ecg_no_within_stay']

try:
    df_origin = pd.read_csv(original_csv, usecols=cols_to_use)
except ValueError as e:
    # pandas raises ValueError when a name in `usecols` is missing from the file.
    # Print a hint, then re-raise: swallowing the error would leave `df_origin`
    # undefined and make the next cell fail with an unrelated NameError.
    print(f"Error reading CSV columns: {e}")
    print("Please check if your CSV has 'fold' and 'ecg_no_within_stay' columns.")
    raise

2. Reading Original PhysioNet CSV from: /home/remote/Documents/datasets/lesaude/mimic-iv-ecg-ext-icd-diagnostic-labels-for-mimic-iv-ecg-1.0.1/records_w_diag_icd10.csv...


In [6]:
# Normalise both join keys to plain ints so they compare equal to the HDF5 IDs.
# Rows without a study_id cannot be joined, so they are dropped before the cast
# (casting NaN to int would raise).
df_origin = (
    df_origin
    .astype({'subject_id': int})
    .dropna(subset=['study_id'])
    .astype({'study_id': int})
)

print(f"   > Original CSV contains {len(df_origin)} records.")

   > Original CSV contains 800035 records.


# metadata: join fold info onto the HDF5 rows and save

In [7]:
# We left join because the HDF5 determines the training index. 
# We want to attach info TO the HDF5 rows.
# Left join: every HDF5 row is kept (in HDF5 order) and the CSV columns are
# attached to it. `validate='many_to_one'` makes pandas raise a MergeError if
# the CSV has duplicate (subject_id, study_id) keys, which would otherwise
# silently duplicate HDF5 rows and break the one-row-per-h5_index invariant.
df_merged = pd.merge(
    df_h5,
    df_origin,
    on=['subject_id', 'study_id'],
    how='left',
    validate='many_to_one',
)

# A NaN fold after the left join means the HDF5 record has no matching CSV row
# (e.g. a waveform that is not part of the matched ICD subset).
missing_count = df_merged['fold'].isna().sum()
if missing_count > 0:
    print(f"   WARNING: {missing_count} records in HDF5 do not have matching metadata in the CSV.")
    print("   These records will have fold = -1 and will be ignored by the Dataset split logic.")

# Use -1 as the 'unknown' sentinel, then cast back to int (the join upcasts to
# float whenever NaNs are present). Filling unconditionally also covers rows
# that matched but whose ecg_no_within_stay is itself missing, which would
# otherwise make the int cast raise.
df_merged = (
    df_merged
    .fillna({'fold': -1, 'ecg_no_within_stay': -1})
    .astype({'fold': int, 'ecg_no_within_stay': int})
)

In [8]:
# Persist the joined table; the DataFrame's positional index is not written
# because `h5_index` is already stored as an explicit column.
df_merged.to_csv(path_or_buf=output_csv, index=False)