# Run for our experiments

In [1]:
import pandas as pd

## Prep data

In [None]:
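# Run the commands in this section from inside the mimic4extract directory
# (they use paths relative to it, e.g. data/root/):
# cd mimic4extract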

In [None]:
# Generate one directory per `SUBJECT_ID` under data/root/ from the raw MIMIC-IV files.
# --filter_subject_id_file points at the study's label parquet; presumably only the
# subjects listed there are extracted — TODO confirm against extract_subjects_iv.
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
python -m mimic3benchmark.scripts.extract_subjects_iv /data/wolf6245/src/mm_study/data/a_raw/MIMIC/MIMIC-IV data/root/ --filter_subject_id_file "/data/wolf6245/src/mm_study/data/f_modelling/03_model_input/data-2024-12-19-01-23-23/(3) Chronic ischaemic heart disease/y_fusion_label_not_gt.parquet"

In [None]:
# Validate the extracted events under data/root/ and fix some issues in them
# (what exactly is repaired is defined in mimic3benchmark.scripts.validate_events).
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
python -m mimic3benchmark.scripts.validate_events data/root/

In [None]:
# Break up the per-subject data under data/root/ into separate episodes.
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
python -m mimic3benchmark.scripts.extract_episodes_from_subjects data/root/

In [None]:
# Split the extracted data under data/root/ into a train set and a test set.
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
python -m mimic3benchmark.scripts.split_train_and_test data/root/

In [None]:
# Generate the task-specific (phenotyping) dataset; presumably reads data/root/ and
# writes data/phenotyping/, whose listfiles are relabelled in the "Update labels" section.
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
python -m mimic3benchmark.scripts.create_phenotyping data/root/ data/phenotyping/

In [None]:
# Split the phenotyping data again to carve out a validation set
# (presumably this is what produces the train/val/test *_listfile.csv files in data/phenotyping — TODO confirm).
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
python -m mimic3models.split_train_val data/phenotyping

## Update labels

In [8]:
import os
import pandas as pd

In [None]:
# Replace the labels in every phenotyping listfile with the labels from the
# study's label table, matched per ICU stay (`stay_id`).
#
# Steps: read the five listfiles (only the identifier columns in `key_columns`;
# any other column already in the files is not read and so not written back),
# attach a `stay_id` to every label row via icustays, left-merge the labels
# onto each listfile, drop rows that contain NaN (episodes without a label),
# cast the label columns to int and overwrite the listfiles in place.
#
# NOTE: the listfiles are overwritten, so the dropped episodes are gone from
# them afterwards; regenerate data/phenotyping to get the original files back.

# Base directories, so the locations only have to be changed in one place.
phenotyping_dir = "/data/wolf6245/src/MedFuse/mimic4extract/data/phenotyping"
mm_study_data_dir = "/data/wolf6245/src/mm_study/data"

# Load all dataframes
train_folder_listfile_path = f"{phenotyping_dir}/train/listfile.csv"
test_folder_listfile_path = f"{phenotyping_dir}/test/listfile.csv"
train_listfile_path = f"{phenotyping_dir}/train_listfile.csv"
test_listfile_path = f"{phenotyping_dir}/test_listfile.csv"
val_listfile_path = f"{phenotyping_dir}/val_listfile.csv"
y_file_path = f"{mm_study_data_dir}/f_modelling/03_model_input/data-2024-12-19-01-23-23/(3) Chronic ischaemic heart disease/y_fusion_label_not_gt.parquet"
icu_stay_path = f"{mm_study_data_dir}/a_raw/MIMIC/MIMIC-IV/icu/icustays.csv.gz"

# Name -> path of every listfile that has to be relabelled. The names are the
# ones used in the progress messages below.
listfile_paths = {
    "train_folder_listfile": train_folder_listfile_path,
    "test_folder_listfile": test_folder_listfile_path,
    "train_listfile": train_listfile_path,
    "test_listfile": test_listfile_path,
    "val_listfile": val_listfile_path,
}

icu_stay_df = pd.read_csv(icu_stay_path)
key_columns = ["stay", "period_length", "stay_id"]
listfiles = {
    name: pd.read_csv(path, usecols=key_columns)
    for name, path in listfile_paths.items()
}
df_label = pd.read_parquet(y_file_path)

# All listfiles must expose the same columns in the same order.
reference_columns = list(listfiles["train_folder_listfile"].columns)
for name, listfile in listfiles.items():
    assert list(listfile.columns) == reference_columns, (
        f"{name} columns {list(listfile.columns)} differ from {reference_columns}"
    )

# Merge stay_ids to df_label from icu_stay_df
# (the row count can grow here: one admission may have several ICU stays)
df_label_shape_old = df_label.shape
df_label = df_label.merge(
    icu_stay_df[["subject_id", "hadm_id", "stay_id"]],
    on=["subject_id", "hadm_id"],
    how="left",
)
df_label_columns_to_keep = [c for c in df_label.columns if c not in ["hadm_id", "subject_id"]]
label_columns = [c for c in df_label_columns_to_keep if c != "stay_id"]
print(f"df_label shape before merge: {df_label_shape_old}, after merge: {df_label.shape}")

# Label rows without an ICU stay can never match a listfile row; dropping them
# lets the merge key go back to an integer dtype (the left merge above turns it
# into float as soon as a NaN is introduced).
has_stay = df_label["stay_id"].notna()
labels_by_stay = df_label.loc[has_stay, df_label_columns_to_keep].astype({"stay_id": "int64"})

# Merge labels to all dataframes, drop nans and convert label columns to int
for name, listfile in listfiles.items():
    # validate="many_to_one" raises a MergeError if a stay_id occurs more than
    # once in the labels, instead of silently duplicating episodes.
    merged = listfile.merge(labels_by_stay, on=["stay_id"], how="left", validate="many_to_one")
    labelled = merged.dropna()
    print(f"{name} shape before dropna: {merged.shape}, after dropna: {labelled.shape}")
    listfiles[name] = labelled.astype({col: int for col in label_columns})

# Save dataframes
# (separate loop on purpose: nothing is overwritten unless every listfile was
# processed successfully)
for name, path in listfile_paths.items():
    listfiles[name].to_csv(path, index=False)

# Keep the individual frames available under their usual names.
(
    train_folder_listfile,
    test_folder_listfile,
    train_listfile,
    test_listfile,
    val_listfile,
) = (listfiles[name] for name in listfile_paths)

df_label shape before merge: (5280, 12), after merge: (6148, 13)
train_folder_listfile shape before dropna: (5266, 13), after dropna: (3704, 13)
test_folder_listfile shape before dropna: (1267, 13), after dropna: (842, 13)
train_listfile shape before dropna: (4780, 13), after dropna: (3394, 13)
test_listfile shape before dropna: (1267, 13), after dropna: (842, 13)
val_listfile shape before dropna: (486, 13), after dropna: (310, 13)


## Prep images

In [None]:
# Resize the images.
# Set the ehr_data_dir and cxr_data_dir directory paths before running the script.
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
python resize.py

In [None]:
# Create the image splits.
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
python create_split.py

## Train

In [None]:
# NOTE(review): bare shell commands — run them in a terminal, or prefix them with `!` in a Python kernel.
# Pre-train the imaging model with 14 radiology labels.
sh ./scripts/radiology/uni_cxr.sh
# Pre-train the LSTM model on the extracted time-series EHR data for the phenotyping task.
sh ./scripts/phenotyping/train/uni_all.sh

In [None]:
# Train MedFuse for the phenotyping task.
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
sh ./scripts/phenotyping/train/medFuse.sh

## Evaluate

In [None]:
# Evaluate MedFuse on the phenotyping task.
# NOTE(review): bare shell command — run it in a terminal, or prefix it with `!` in a Python kernel.
sh ./scripts/phenotyping/eval/medFuse.sh