# HAIM test

# Prep data

In [None]:
import os
import pandas as pd
from tqdm import tqdm

In [None]:
# Load filter data: the label table that defines which subjects/admissions
# to keep, plus the fold file stored next to it (df_folds is only loaded
# here; it is not used in the visible part of this notebook).
df_filter = pd.read_parquet(
    "/data/wolf6245/src/mm_study/data/f_modelling/03_model_input/data-2024-12-19-01-23-23/(3) Chronic ischaemic heart disease/y_fusion_label_not_gt.parquet"
)
df_folds = pd.read_pickle(
    "/data/wolf6245/src/mm_study/data/f_modelling/03_model_input/data-2024-12-19-01-23-23/(3) Chronic ischaemic heart disease/train_test_vali_folds_fusion_label.pkl"
)

# Distinct ids as plain Python ints, used below to filter the HAIM tables.
hadm_ids_to_use = list(map(int, df_filter['hadm_id'].unique()))
subject_ids_to_use = list(map(int, df_filter['subject_id'].unique()))

# Load data: the HAIM-MIMIC-IV tables that are filtered further down.
haim_mimiciv_key_ids = pd.read_csv(
    "/data/wolf6245/src/HAIM/data/haim_mimiciv/haim_mimiciv_key_ids.csv"
)
mimic_cxr_metadata = pd.read_csv(
    "/data/wolf6245/src/HAIM/data/haim_mimiciv/mimic-cxr-2.0.0-metadata.csv"
)
core = pd.read_csv(
    "/data/wolf6245/src/HAIM/data/haim_mimiciv/core/core.csv"
)
# One sample pickle (kept for inspection) and the listing of every entry in
# the pickle directory.
pickle_aux = pd.read_pickle(
    "/data/wolf6245/src/HAIM/data/haim_mimiciv/pickle/00000000.pkl"
)
pickle_files = os.listdir(
    "/data/wolf6245/src/HAIM/data/haim_mimiciv/pickle/"
)

# Shapes before filtering
print(f"Shape haim_mimiciv_key_ids: {haim_mimiciv_key_ids.shape}")
print(f"Shape mimic_cxr_metadata: {mimic_cxr_metadata.shape}")
print(f"Shape core: {core.shape}")
print(f"pickle files: {len(pickle_files)}")

# Filter: keep key-id and core rows whose hadm_id is in the selection, and
# CXR metadata rows whose subject_id is in the selection. Ids are cast to
# int before the membership test so they compare against the int lists.
key_ids_in_selection = haim_mimiciv_key_ids['hadm_id'].astype(int).isin(hadm_ids_to_use)
haim_mimiciv_key_ids_filtered = haim_mimiciv_key_ids.loc[key_ids_in_selection]
print(f"Shape haim_mimiciv_key_ids_filtered: {haim_mimiciv_key_ids_filtered.shape}")

core_in_selection = core['hadm_id'].astype(int).isin(hadm_ids_to_use)
core_filtered = core.loc[core_in_selection]
print(f"Shape core_filtered: {core_filtered.shape}")

cxr_in_selection = mimic_cxr_metadata['subject_id'].astype(int).isin(subject_ids_to_use)
mimic_cxr_metadata_filtered = mimic_cxr_metadata.loc[cxr_in_selection]
print(f"Shape mimic_cxr_metadata_filtered: {mimic_cxr_metadata_filtered.shape}")

# Filter cxr: keep only CXR rows recorded no later than the subject's latest
# discharge time among the retained admissions.

# Parse the timestamp columns with assign(), which returns new frames.
# Writing the column directly into a frame that came out of a boolean filter
# triggers pandas' SettingWithCopyWarning.
core_filtered = core_filtered.assign(
    dischtime=pd.to_datetime(core_filtered['dischtime'])
)
mimic_cxr_metadata_filtered = mimic_cxr_metadata_filtered.assign(
    cxrtime=pd.to_datetime(mimic_cxr_metadata_filtered['cxrtime'])
)

# Latest discharge per subject, as a two-column frame
# (subject_id, max_dischtime).
max_dischtime = (
    core_filtered.groupby('subject_id')['dischtime']
    .max()
    .rename('max_dischtime')
    .reset_index()
)

# Left merge keeps every CXR row; subjects without a discharge time get NaT,
# and a comparison against NaT is False, so those rows are dropped below.
merged = mimic_cxr_metadata_filtered.merge(max_dischtime, on='subject_id', how='left')
mimic_cxr_metadata_filtered_filtered = (
    merged.loc[merged['cxrtime'] <= merged['max_dischtime']]
    .drop(columns=['max_dischtime'])
)
print(f"Shape mimic_cxr_metadata_filtered_filtered: {mimic_cxr_metadata_filtered_filtered.shape}")

# Get all hadm_ids: flag every pickle file that contains at least one
# admission outside the selected hadm_ids.
pickle_dir = "/data/wolf6245/src/HAIM/data/haim_mimiciv/pickle/"

# hadm_ids_to_use is a list; build a set once so each membership test inside
# the loop is O(1) instead of a scan of the whole list.
allowed_hadm_ids = set(hadm_ids_to_use)

files_to_remove = []
for pickle_file in tqdm(pickle_files):
    pickle_file_path = os.path.join(pickle_dir, pickle_file)
    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # because the files are expected to be locally generated and trusted.
    pickle_df = pd.read_pickle(pickle_file_path)
    hadm_ids_aux = [int(i) for i in pickle_df.admissions.hadm_id.unique()]
    # Generator form lets any() stop at the first admission outside the set.
    if any(hadm_id not in allowed_hadm_ids for hadm_id in hadm_ids_aux):
        files_to_remove.append(pickle_file)
print(f"Number of pickle files with hadm_ids not in filter: {len(files_to_remove)} of {len(pickle_files)}")

if False:
    # Deliberately disabled. When enabled, this branch OVERWRITES the source
    # CSVs with their filtered versions and DELETES the flagged pickle files.

    # Save files back
    for filtered_frame, csv_target in (
        (haim_mimiciv_key_ids_filtered, "/data/wolf6245/src/HAIM/data/haim_mimiciv/haim_mimiciv_key_ids.csv"),
        (mimic_cxr_metadata_filtered_filtered, "/data/wolf6245/src/HAIM/data/haim_mimiciv/mimic-cxr-2.0.0-metadata.csv"),
        (core_filtered, "/data/wolf6245/src/HAIM/data/haim_mimiciv/core/core.csv"),
    ):
        filtered_frame.to_csv(csv_target, index=False)

    # Delete files in files_to_remove
    for stale_pickle in tqdm(files_to_remove, desc="Deleting files"):
        os.remove(os.path.join("/data/wolf6245/src/HAIM/data/haim_mimiciv/pickle/", stale_pickle))

## Check features

In [None]:
import pickle
import pandas as pd

In [None]:
# Embedding-column prefix -> source labels behind that prefix.
# NOTE(review): the "x 11" factors below are taken from the original notes;
# presumably 11 derived features per time-series label -- confirm upstream.
mapping = {
    # Demographics: exactly 6 entries.
    "de_": [
        "anchor_age",
        "gender_int",
        "ethnicity_int",
        "marital_status_int",
        "language_int",
        "insurance_int",
    ],
    # Chart events: 9 labels, times 11.
    "ts_ce_": [
        "Heart Rate",
        "Non Invasive Blood Pressure systolic",
        "Non Invasive Blood Pressure diastolic",
        "Non Invasive Blood Pressure mean",
        "Respiratory Rate",
        "O2 saturation pulseoxymetry",
        "GCS - Verbal Response",
        "GCS - Eye Opening",
        "GCS - Motor Response",
    ],
    # Lab events: 22 labels listed here, i.e. 22 * 11 = 242 rather than
    # 11 * 23 = 253.
    "ts_le_": [
        "Glucose",
        "Potassium",
        "Sodium",
        "Chloride",
        "Creatinine",
        "Urea Nitrogen",
        "Bicarbonate",
        "Anion Gap",
        "Hemoglobin",
        "Hematocrit",
        "Magnesium",
        "Platelet Count",
        "Phosphate",
        "White Blood Cells",
        "Calcium, Total",
        "MCH",
        "Red Blood Cells",
        "MCHC",
        "MCV",
        "RDW",
        "Neutrophils",
        "Vancomycin",
    ],
    # Procedure events: 10 labels, times 11.
    "ts_pe_": [
        "Foley Catheter",
        "PICC Line",
        "Intubation",
        "Peritoneal Dialysis",
        "Bronchoscopy",
        "EEG",
        "Dialysis - CRRT",
        "Dialysis Catheter",
        "Chest Tube Removed",
        "Hemodialysis",
    ],
    # Vision embeddings: one source name per prefix.
    "vd_": ["vision_dense"],
    "vp_": ["vision_predictions"],
    "vmd_": ["vision_multi_dense"],
    "vmp_": ["vision_multi_predictions"],
}

In [None]:
# Optional (disabled): full chartevents table, with value columns read as strings.
# df_chartevents = pd.read_csv("/data/wolf6245/data/physionet.org/files/mimiciv/3.0/icu/chartevents.csv.gz", dtype={'value': 'object', 'valueuom': 'object'})

# Pickle and embedding for the same id (00000002).
df_pickle = pd.read_pickle(
    "/data/wolf6245/src/HAIM/data/haim_mimiciv/pickle/00000002.pkl"
)
# NOTE(review): this path ends in ".pkl" but is parsed with read_csv --
# confirm whether the file really holds CSV text or the extension/reader
# is a mistake.
df_embedding = pd.read_csv(
    "/data/wolf6245/src/HAIM/data/haim_mimiciv/embedding/00000002.pkl"
)

# MIMIC-IV dictionary tables (icu items, hosp lab items, hosp HCPCS codes).
df_d_items = pd.read_csv(
    "/data/wolf6245/src/mm_study/data/a_raw/MIMIC/MIMIC-IV/icu/d_items.csv.gz"
)
df_d_labitems = pd.read_csv(
    "/data/wolf6245/src/mm_study/data/a_raw/MIMIC/MIMIC-IV/hosp/d_labitems.csv.gz"
)
df_d_hcpcs = pd.read_csv(
    "/data/wolf6245/src/mm_study/data/a_raw/MIMIC/MIMIC-IV/hosp/d_hcpcs.csv.gz"
)

In [None]:
# For each event table of the loaded pickle, collect the distinct itemids of
# the rows whose label is listed in `mapping`, then print how many were found
# next to the number of labels listed for that prefix.
chart_label_hit = df_pickle.chartevents["label"].isin(mapping["ts_ce_"])
chartevent_ids = df_pickle.chartevents.loc[chart_label_hit, "itemid"].unique()

lab_label_hit = df_pickle.labevents["label"].isin(mapping["ts_le_"])
labevent_ids = df_pickle.labevents.loc[lab_label_hit, "itemid"].unique()

procedure_label_hit = df_pickle.procedureevents["label"].isin(mapping["ts_pe_"])
proceedureevent_ids = df_pickle.procedureevents.loc[procedure_label_hit, "itemid"].unique()

print(f"Number of chartevent_ids: {len(chartevent_ids)}, compared to {len(mapping['ts_ce_'])}")
print(f"Number of labevent_ids: {len(labevent_ids)}, compared to {len(mapping['ts_le_'])}")
print(f"Number of proceedureevent_ids: {len(proceedureevent_ids)}, compared to {len(mapping['ts_pe_'])}")