# Run for our experiments

## Prep data

In [None]:
# cd mimic4extract 

In [None]:
# Generate one directory per `SUBJECT_ID` under data/root/ from the raw MIMIC-IV folder.
# The --filter_subject_id_file path is quoted because it contains spaces and parentheses;
# presumably it restricts extraction to the subjects listed in that label file (confirm in the script).
python -m mimic3benchmark.scripts.extract_subjects_iv /data/wolf6245/src/mm_study/data/a_raw/MIMIC/MIMIC-IV data/root/ --filter_subject_id_file "/data/wolf6245/src/mm_study/data/f_modelling/03_model_input/data-2024-12-19-01-23-23/(3) Chronic ischaemic heart disease/y_fusion_label_not_gt.parquet"

In [None]:
# Run the event validation script over data/root/.
# NOTE(review): the original note only says this "fixes some issues"; which issues is not documented here.
python -m mimic3benchmark.scripts.validate_events data/root/

In [None]:
# Break up the per-subject data under data/root/ into separate episodes.
python -m mimic3benchmark.scripts.extract_episodes_from_subjects data/root/

## Split

In [None]:
# Split the extracted data under data/root/ into train and test partitions.
python -m mimic3benchmark.scripts.split_train_and_test data/root/

In [None]:
# Generate the task-specific (phenotyping) dataset from data/root/ into data/phenotyping/.
python -m mimic3benchmark.scripts.create_phenotyping data/root/ data/phenotyping/

In [None]:
# Split data/phenotyping again to carve a validation set out of the training portion
# (produces the train/val listfiles that the label-update step below rewrites).
python -m mimic3models.split_train_val data/phenotyping

## Update labels

In [1]:
import os
import shutil

import pandas as pd

In [None]:
# All listfiles live under one phenotyping directory: the two per-folder listfiles
# sit inside train/ and test/, the three split listfiles sit next to those folders.
phenotyping_dir = "/data/wolf6245/src/MedFuse/mimic4extract/data/phenotyping"

train_folder_listfile_path = f"{phenotyping_dir}/train/listfile.csv"
test_folder_listfile_path = f"{phenotyping_dir}/test/listfile.csv"
train_listfile_path = f"{phenotyping_dir}/train_listfile.csv"
test_listfile_path = f"{phenotyping_dir}/test_listfile.csv"
val_listfile_path = f"{phenotyping_dir}/val_listfile.csv"

In [None]:
# Replace the label columns of all five listfiles with the six cardiovascular labels
# from the study's label table, joined through the ICU stay id, and write the
# listfiles back to the paths they were read from.
y_file_path = "/data/wolf6245/src/mm_study/data/f_modelling/03_model_input/data-2024-12-19-01-23-23/(3) Chronic ischaemic heart disease/y_fusion_label_not_gt.parquet"
icu_stay_df = pd.read_csv("/data/wolf6245/src/mm_study/data/a_raw/MIMIC/MIMIC-IV/icu/icustays.csv.gz")
key_columns = ["stay", "period_length", "stay_id"]

# Name -> path, in the order in which the listfiles are read, reported and saved.
listfile_paths = {
    "train_folder_listfile": train_folder_listfile_path,
    "test_folder_listfile": test_folder_listfile_path,
    "train_listfile": train_listfile_path,
    "test_listfile": test_listfile_path,
    "val_listfile": val_listfile_path,
}

# Only the key columns are read, so any label columns already in the files are discarded.
listfiles = {
    name: pd.read_csv(path, usecols=key_columns)
    for name, path in listfile_paths.items()
}
df_label = pd.read_parquet(y_file_path)

# Every listfile must expose the same columns as the train-folder listfile.
reference_columns = list(listfiles["train_folder_listfile"].columns)
for name in ("test_folder_listfile", "train_listfile", "test_listfile", "val_listfile"):
    assert reference_columns == list(listfiles[name].columns)

# Attach stay_id to the label table via (subject_id, hadm_id).
# The row count can change here because this is a left join on a non-stay key.
label_shape_before = df_label.shape
stay_lookup = icu_stay_df[["subject_id", "hadm_id", "stay_id"]]
df_label = df_label.merge(stay_lookup, on=["subject_id", "hadm_id"], how="left")
print(f"df_label shape before merge: {label_shape_before}, after merge: {df_label.shape}")

label_columns = [
    "(1) Hypertensive diseases",
    "(2) Ischaemic heart diseases",
    "(3) Chronic ischaemic heart disease",
    "(4) Cardiomyopathies diseases",
    "(5) Dysrhythmias diseases",
    "(6) Heart failure",
]
df_label_columns_to_keep = ["stay_id", *label_columns]

# Left-join the labels onto every listfile by stay_id.
labels_by_stay = df_label[df_label_columns_to_keep]
listfiles = {
    name: frame.merge(labels_by_stay, on=["stay_id"], how="left")
    for name, frame in listfiles.items()
}

# Drop rows with any missing value (e.g. stays without a label) and report the effect.
for name in listfile_paths:
    shape_before = listfiles[name].shape
    listfiles[name] = listfiles[name].dropna()
    print(f"{name} shape before dropna: {shape_before}, after dropna: {listfiles[name].shape}")

# Cast the label columns (everything kept except stay_id) to int.
label_dtypes = {column: int for column in df_label_columns_to_keep if column != "stay_id"}
listfiles = {name: frame.astype(label_dtypes) for name, frame in listfiles.items()}

# Expose the results under the same notebook-level names as before.
train_folder_listfile = listfiles["train_folder_listfile"]
test_folder_listfile = listfiles["test_folder_listfile"]
train_listfile = listfiles["train_listfile"]
test_listfile = listfiles["test_listfile"]
val_listfile = listfiles["val_listfile"]

# Overwrite the listfiles in place.
for name, path in listfile_paths.items():
    listfiles[name].to_csv(path, index=False)

df_label shape before merge: (5280, 12), after merge: (6148, 13)
train_folder_listfile shape before dropna: (3704, 9), after dropna: (3704, 9)
test_folder_listfile shape before dropna: (842, 9), after dropna: (842, 9)
train_listfile shape before dropna: (3394, 9), after dropna: (3394, 9)
test_listfile shape before dropna: (842, 9), after dropna: (842, 9)
val_listfile shape before dropna: (310, 9), after dropna: (310, 9)


## Build new splits

In [None]:
import random

In [36]:
# Source of the prepared phenotyping data and parent folder for the per-fold copies
input_folder = "/data/wolf6245/src/MedFuse/mimic4extract/data/phenotyping"
output_folder = "/data/wolf6245/src/MedFuse/mimic4extract/data"

# Re-read the relabelled listfiles from disk
train_folder_listfile, test_folder_listfile, train_listfile, test_listfile, val_listfile = (
    pd.read_csv(path)
    for path in (
        train_folder_listfile_path,
        test_folder_listfile_path,
        train_listfile_path,
        test_listfile_path,
        val_listfile_path,
    )
)

# The folder-level listfiles and the split listfiles must describe the same rows:
# train folder = train + val, and train folder + test folder = train + test + val.
n_folder_rows = len(train_folder_listfile) + len(test_folder_listfile)
assert n_folder_rows == len(train_listfile) + len(test_listfile) + len(val_listfile)
assert len(train_folder_listfile) == len(train_listfile) + len(val_listfile)

# Pool the rows so that every fold can be cut from the full set
train_test_folder_listfile = pd.concat([train_folder_listfile, test_folder_listfile], ignore_index=True)
train_test_val_listfile = pd.concat([train_listfile, test_listfile, val_listfile], ignore_index=True)
assert train_test_folder_listfile.shape[0] == train_test_val_listfile.shape[0], "train_test_folder_listfile and train_test_val_listfile have different number of rows"

# Names of all data files in the original train/ and test/ folders (listfiles excluded)
# NOTE(review): the original carried a disabled check that len(all_files) equals the
# number of listfile rows; confirm whether the folders hold files without a listfile row.
all_files = [
    file_name
    for split in ("train", "test")
    for file_name in os.listdir(f"{input_folder}/{split}/")
    if "listfile" not in file_name
]

# Every stay, in train -> test -> val order; each must occur exactly once
all_stays = [
    stay
    for frame in (train_listfile, test_listfile, val_listfile)
    for stay in frame.stay
]
assert len(all_stays) == len(set(all_stays)), "Stays are not unique across train, test and val sets"
assert n_folder_rows == len(all_stays), "Stays are not unique across train_folder_listfile and test_folder_listfile"

# Deterministic shuffle, then cut into n_folds contiguous test blocks.
# NOTE(review): stays are shuffled individually; if one subject can own several stays,
# they may land in different splits of the same fold. Confirm this is intended.
random.seed(42)
random.shuffle(all_stays)
n_folds = 5
fold_size = len(all_stays) // n_folds
folds = {}
for fold_idx in range(n_folds):
    lower = fold_idx * fold_size
    # The last fold absorbs the remainder of the integer division.
    upper = len(all_stays) if fold_idx == n_folds - 1 else lower + fold_size
    held_out = all_stays[lower:upper]
    remaining = all_stays[:lower] + all_stays[upper:]

    # The first 20% of the remaining stays become validation, the rest training.
    n_val = int(0.2 * len(remaining))
    folds[fold_idx] = {
        "train": remaining[n_val:],
        "test": held_out,
        "val": remaining[:n_val],
    }

In [38]:
# Create new dataframes for each fold and copy the matching data files.
def _copy_stay_files(stay_files, source_folders, target_folder):
    """Copy each stay file into ``target_folder`` from whichever source folder holds it.

    A fold's train/test membership is independent of the original split, so a file
    needed in the fold's ``train`` folder may live in the original ``test`` folder
    (and vice versa). Each file is therefore looked up in every source folder.

    Args:
        stay_files: File names to copy (values of the listfile ``stay`` column).
        source_folders: Folders searched, in order, for each file name.
        target_folder: Existing destination folder.

    Returns:
        The number of files copied.

    Raises:
        FileNotFoundError: If a file is present in none of the source folders.
    """
    copied = 0
    for file_name in stay_files:
        candidates = (os.path.join(folder, file_name) for folder in source_folders)
        source_path = next((path for path in candidates if os.path.isfile(path)), None)
        if source_path is None:
            raise FileNotFoundError(f"{file_name} not found in any of {source_folders}")
        shutil.copy(source_path, os.path.join(target_folder, file_name))
        copied += 1
    return copied


# Original locations of the data files, regardless of the fold they end up in.
source_folders = [os.path.join(input_folder, "train"), os.path.join(input_folder, "test")]

for i in range(n_folds):
    train_stays = folds[i]["train"]
    test_stays = folds[i]["test"]
    val_stays = folds[i]["val"]
    # The train folder holds both the training and the validation stays.
    train_val_stays = train_stays + val_stays

    # Filter dataframes
    train_folder_listfile_fold = train_test_folder_listfile[train_test_folder_listfile.stay.isin(train_val_stays)].copy()
    test_folder_listfile_fold = train_test_folder_listfile[train_test_folder_listfile.stay.isin(test_stays)].copy()
    train_listfile_fold = train_test_val_listfile[train_test_val_listfile.stay.isin(train_stays)].copy()
    test_listfile_fold = train_test_val_listfile[train_test_val_listfile.stay.isin(test_stays)].copy()
    val_listfile_fold = train_test_val_listfile[train_test_val_listfile.stay.isin(val_stays)].copy()

    # Save dataframes
    output_folder_fold = os.path.join(output_folder, f"fold_{i}/phenotyping")
    train_folder = os.path.join(output_folder_fold, "train")
    test_folder = os.path.join(output_folder_fold, "test")
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)
    train_folder_listfile_fold.to_csv(os.path.join(train_folder, "listfile.csv"), index=False)
    test_folder_listfile_fold.to_csv(os.path.join(test_folder, "listfile.csv"), index=False)
    train_listfile_fold.to_csv(os.path.join(output_folder_fold, "train_listfile.csv"), index=False)
    test_listfile_fold.to_csv(os.path.join(output_folder_fold, "test_listfile.csv"), index=False)
    val_listfile_fold.to_csv(os.path.join(output_folder_fold, "val_listfile.csv"), index=False)
    print(f"Fold {i} saved to {output_folder_fold}")

    # Copy files. A missing source file raises instead of being silently skipped,
    # so reaching the print below means every listed file was copied.
    train_files = train_folder_listfile_fold.stay.tolist()
    test_files = test_folder_listfile_fold.stay.tolist()
    count_train = _copy_stay_files(train_files, source_folders, train_folder)
    count_test = _copy_stay_files(test_files, source_folders, test_folder)
    print(f"Fold {i} copied {count_train} train files and {count_test} test files")
    

Fold 0 saved to /data/wolf6245/src/MedFuse/mimic4extract/data/fold_0/phenotyping
Fold 0 copied 3637 train files and 909 test files
Fold 1 saved to /data/wolf6245/src/MedFuse/mimic4extract/data/fold_1/phenotyping
Fold 1 copied 3637 train files and 909 test files
Fold 2 saved to /data/wolf6245/src/MedFuse/mimic4extract/data/fold_2/phenotyping
Fold 2 copied 3637 train files and 909 test files
Fold 3 saved to /data/wolf6245/src/MedFuse/mimic4extract/data/fold_3/phenotyping
Fold 3 copied 3637 train files and 909 test files
Fold 4 saved to /data/wolf6245/src/MedFuse/mimic4extract/data/fold_4/phenotyping
Fold 4 copied 3636 train files and 910 test files


## Prep images

In [None]:
# Resize the images.
# Specify the ehr_data_dir and cxr_data_dir directory paths before running the scripts.
python resize.py

In [None]:
# Create the image splits.
python ehr_utils/create_split.py

## Train

In [None]:
# Adjust the following files before training.
# NOTE(review): the lines below read as a checklist of files to edit, not commands to run;
# what has to change in each file is not recorded here.
src/MedFuse/datasets/fusion.py
scripts/radiology/uni_cxr.sh
scripts/phenotyping/train/uni_all.sh
scripts/phenotyping/train/medFuse.sh
scripts/phenotyping/eval/medFuse.sh

In [None]:
# Pre-train the imaging model with 14 radiology labels.
sh ./scripts/radiology/uni_cxr.sh
# Pre-train the LSTM model on the extracted time-series EHR data for the phenotype task.
sh ./scripts/phenotyping/train/uni_all.sh

In [None]:
# Train the MedFuse fusion model for the phenotype task.
sh ./scripts/phenotyping/train/medFuse.sh

## Evaluate

In [None]:
# Evaluate the MedFuse fusion model for the phenotype task.
sh ./scripts/phenotyping/eval/medFuse.sh