# Setup: imports, file paths, and configuration

In [1]:
import pandas as pd
import numpy as np
import h5py
import ast
import os
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable data directory so the notebook runs elsewhere.

# Input: CSV of per-record ICD-10 diagnosis labels (loaded with pandas below).
LABELS_CSV = '/home/remote/Documents/datasets/lesaude/mimic-iv-ecg-ext-icd-diagnostic-labels-for-mimic-iv-ecg-1.0.1/records_w_diag_icd10.csv'
# Input: HDF5 file whose 'subject_id' / 'study_id' datasets define the record order.
WAVEFORM_H5 = '/home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_waveforms.h5'
# Output: HDF5 file of multi-hot ICD labels; opened in 'w' mode, so it is overwritten on every run.
LABEL_H5 = '/home/remote/Documents/datasets/lesaude/mimic-iv-ecg-monolith/mimic_iv_ecg_icd.h5'

In [3]:
# Prefix filter: only codes that start with this string are kept by preprocess_codes.
TARGET_CHAPTER = 'I'  # Circulatory System
# Kept codes are truncated to this many leading characters before the vocabulary is built.
CODE_LENGTH = 3       # "I" + 2 digits
# Number of records accumulated in memory before each write to the label HDF5 file.
BATCH_SIZE = 1000     # Buffer size for writing

# Build the ICD label monolith (row-aligned with the waveform HDF5)

In [4]:
def preprocess_codes(code_list_str, target_chapter=None, code_length=None):
    """Parse a stringified list of ICD codes into a set of truncated chapter codes.

    Parameters
    ----------
    code_list_str : str, list-like, NaN or None
        Normally the string repr of a Python list, e.g. ``"['I10', 'E119']"``.
        An already-parsed list/tuple/set is accepted as-is. Anything else
        (NaN, None, numbers, ...) is treated as "no codes".
    target_chapter : str, optional
        Prefix a code must start with to be kept. Defaults to the
        notebook-level ``TARGET_CHAPTER``.
    code_length : int, optional
        Number of leading characters kept from each matching code. Defaults
        to the notebook-level ``CODE_LENGTH``.

    Returns
    -------
    set of str
        Truncated codes belonging to the target chapter; empty when the input
        is missing, unparsable, or contains no matching code.
    """
    chapter = TARGET_CHAPTER if target_chapter is None else target_chapter
    length = CODE_LENGTH if code_length is None else code_length

    if isinstance(code_list_str, str):
        try:
            raw_codes = ast.literal_eval(code_list_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Only the errors literal_eval documents for malformed input are
            # swallowed; KeyboardInterrupt and friends still propagate.
            # The empty string lands here too (SyntaxError).
            return set()
    elif isinstance(code_list_str, (list, tuple, set, frozenset)):
        raw_codes = code_list_str
    else:
        # NaN / None / any other scalar: nothing to parse.
        return set()

    if isinstance(raw_codes, str):
        # A lone quoted code must not be iterated character by character.
        raw_codes = [raw_codes]
    elif not isinstance(raw_codes, (list, tuple, set, frozenset)):
        # The literal parsed to something that is not a collection of codes.
        return set()

    # Non-string entries (None, numbers, nested containers) are skipped
    # instead of crashing on .startswith.
    return {
        code[:length]
        for code in raw_codes
        if isinstance(code, str) and code.startswith(chapter)
    }

In [None]:
# with h5py.File(WAVEFORM_H5, 'r') as f:
#     # We read the IDs and Timestamps that actually exist in the compiled dataset
#     h5_subjects = f['subject_id'][:]
#     h5_studies = f['study_id'][:]
#     h5_timestamps = f['ecg_time'][:]

In [None]:
# Load the ICD-10 label table. Later cells read its 'all_diag_all',
# 'subject_id' and 'study_id' columns.
df_labels = pd.read_csv(LABELS_CSV)
# Bare last expression so the notebook renders the frame for inspection.
df_labels

In [6]:
# Parse each row's raw diagnosis string into a set of truncated chapter codes.
df_labels['parsed_set'] = df_labels['all_diag_all'].apply(preprocess_codes)

# The vocabulary is every distinct code seen in any row, in sorted order so
# that column indices are stable from run to run.
all_codes = {code for row_codes in df_labels['parsed_set'] for code in row_codes}
vocab_list = sorted(all_codes)
vocab_size = len(vocab_list)

# code -> column index in the multi-hot label vector
vocab_map = dict(zip(vocab_list, range(vocab_size)))

In [7]:
# Build (subject_id, study_id) -> multi-hot ICD vector.
# If the same key occurs more than once, the last row wins.
label_lookup = {}
id_and_codes = zip(
    df_labels['subject_id'],
    df_labels['study_id'],
    df_labels['parsed_set'],
)
for raw_subj, raw_study, code_set in tqdm(id_and_codes, total=len(df_labels), desc="Hashing"):
    subj_id = int(raw_subj)
    # Rows with a missing study_id are all keyed under study 0, so several
    # such rows for one subject share a single key.
    study_id = 0 if pd.isna(raw_study) else int(raw_study)

    vector = np.zeros(vocab_size, dtype='int8')
    for code in code_set:
        slot = vocab_map.get(code)
        if slot is not None:
            vector[slot] = 1
    label_lookup[(subj_id, study_id)] = vector

Hashing: 100%|██████████| 800035/800035 [00:27<00:00, 29551.62it/s]


In [8]:
# Write the label HDF5 file row-aligned with the waveform file: row i of every
# dataset created here describes record i of WAVEFORM_H5.
with h5py.File(WAVEFORM_H5, 'r') as f_src, \
        h5py.File(LABEL_H5, 'w') as f_dst:

    # Read both ID columns once instead of issuing one HDF5 read per record.
    src_subjs = f_src['subject_id'][:]
    src_studies = f_src['study_id'][:]

    total_records = src_subjs.shape[0]
    if src_studies.shape[0] != total_records:
        raise ValueError(
            f"subject_id has {total_records} entries but study_id has "
            f"{src_studies.shape[0]}; cannot align records"
        )

    # Create Datasets
    dset_labels = f_dst.create_dataset('icd', (total_records, vocab_size), dtype='i1')
    dset_subj = f_dst.create_dataset('subject_id', (total_records,), dtype='i4')
    dset_study = f_dst.create_dataset('study_id', (total_records,), dtype='i4')

    # Save Vocabulary for reference (column j of 'icd' corresponds to vocabulary[j])
    dt_str = h5py.special_dtype(vlen=str)
    dset_vocab = f_dst.create_dataset('vocabulary', (vocab_size,), dtype=dt_str)
    dset_vocab[:] = vocab_list

    match_count = 0

    # Process in BATCH_SIZE slices, preserving source order. The final slice
    # may be shorter; slicing handles that without a separate flush step.
    with tqdm(total=total_records, desc="Processing") as progress:
        for start in range(0, total_records, BATCH_SIZE):
            stop = min(start + BATCH_SIZE, total_records)
            chunk_subj = src_subjs[start:stop]
            chunk_study = src_studies[start:stop]

            # Records without a label entry keep an all-zero vector.
            buf_labels = np.zeros((stop - start, vocab_size), dtype='int8')

            # .tolist() yields plain Python scalars, matching the (int, int)
            # keys stored in label_lookup.
            keys = zip(chunk_subj.tolist(), chunk_study.tolist())
            for row, key in enumerate(keys):
                vector = label_lookup.get(key)
                if vector is not None:
                    buf_labels[row] = vector
                    match_count += 1

            dset_labels[start:stop] = buf_labels
            dset_subj[start:stop] = chunk_subj
            dset_study[start:stop] = chunk_study

            progress.update(stop - start)

# Surface the match rate: a count near zero means the ID keys did not line up
# and the label file is effectively all zeros.
print(f"Matched {match_count} of {total_records} records to a label row")

Processing: 100%|██████████| 800035/800035 [00:04<00:00, 173569.65it/s]
