In [1]:
import numpy as np
import random
import pandas as pd
from tqdm import tqdm 
import time

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, roc_curve
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import pickle

from aeon.regression.convolution_based import MultiRocketHydraRegressor

In [2]:
def tnr_score(y_test, y_pred):
    y_t = np.array(y_test)
    y_p = np.array(y_pred)
    tn = np.sum((1-y_t)*(1-y_p))
    fp = np.sum(y_p*(1-y_t))
    if (tn + fp) == 0:
        return 0
    else:
        return tn / (tn + fp)

In [51]:
def prepareData(dataGroup, id_list, window_time):
    if dataGroup == "dataMimic":
        dataPath = "../Mimic/dataMimic/"
    elif dataGroup == "dataECMO":
        dataPath = "../dataECMO/"
    else:
        dataPath = "../dataRea/"

    finalDataPath = dataPath + "finalData/"

    data = []

    for encounterId in tqdm(id_list, total=len(id_list)):
        
        df_mask = pd.read_parquet(finalDataPath + encounterId + "/mask.parquet")
        df_dynamic = pd.read_parquet(finalDataPath + encounterId + "/dynamic.parquet")
        df_static = pd.read_parquet(finalDataPath + encounterId + "/static.parquet")

        idx_variables_kept = [0,1,2,3,4,5,6,7,9,10,11]
    
        # Ajout des variables dynamiques
        data_patient = df_dynamic.iloc[:(window_time*24), idx_variables_kept].to_numpy()
    
        # Ajout des variables statiques
        for value in df_static.to_numpy()[0]:
            new_column = np.ones(shape=(window_time*24,1)) * value
            data_patient = np.append(data_patient, new_column, axis=1)

        
        data.append(data_patient)
    
    return np.array(data)


def prepareDeathList(dataGroup, window_time):
    target = []
    id_list = []
    
    if dataGroup == "dataMimic":
        dataPath = "../Mimic/dataMimic/"
        
        patients_df = pd.read_csv(dataPath + "ventiles.csv")
        # df_ventilation_start = pd.read_csv(dataPath + "ventiles1.csv")

        nb_patients = len(patients_df)
    
        for index, row in tqdm(patients_df.iterrows(), total=nb_patients):
            stay_id = str(row["stay_id"])

            df_mask = pd.read_parquet(dataPath + "finalData/" + stay_id + "/mask.parquet")
            total_true_values = df_mask.values.sum()
            total_values = df_mask.values.size
            percentageMissingValues = (total_values-total_true_values)/total_values * 100

            if percentageMissingValues < 40:
                id_list.append(stay_id)
                
                label_death = int(row["label"])
                if label_death:
                    target.append(1)
                else:
                    target.append(0)
            
        
        return target, id_list
    
    if dataGroup == "dataECMO":
        dataPath = "../dataECMO/"
    elif dataGroup == "dataRangueil":
        dataPath = "../dataRea/"

    
    patients_df = pd.read_parquet(dataPath + "patients.parquet")

    df_death = pd.read_csv(dataPath + "delais_deces.csv")
    
    nb_patients = len(patients_df)

    for _, row in tqdm(patients_df.iterrows(), total=nb_patients):
        encounterId = str(row["encounterId"])
        
        df_mask = pd.read_parquet(dataPath + "finalData/" + encounterId + "/mask.parquet")
        total_true_values = df_mask.values.sum()
        total_values = df_mask.values.size
        percentageMissingValues = (total_values-total_true_values)/total_values * 100
        
        withdrawal_date = pd.Timestamp(row["withdrawal_date"])
        installation_date = pd.Timestamp(row["installation_date"])
        total_time_hour = (withdrawal_date - installation_date).total_seconds() / 3600 + 4

        if total_time_hour >= window_time * 24 and percentageMissingValues < 40:
            id_list.append(encounterId)
            
            delai_sortie_deces = df_death.loc[df_death["encounterId"] == int(encounterId), "delai_sortie_deces"].to_numpy()[0]
            delai_installation_deces = df_death.loc[df_death["encounterId"] == int(encounterId), "delai_installation_deces"].to_numpy()[0]
            if delai_sortie_deces <= 1: #and delai_installation_deces <= 35:
                target.append(1)
            else:
                target.append(0)
    
    return target, id_list

In [52]:
target_ECMO, id_list_ECMO = prepareDeathList("dataECMO", window_time=5)
data_ECMO = prepareData("dataECMO", id_list_ECMO, window_time=5)

print(f"ECMO dataset size: {len(target_ECMO)} , num_deceased: {np.sum(target_ECMO)}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 189/189 [00:00<00:00, 360.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 153/153 [00:00<00:00, 282.18it/s]

ECMO dataset size: 153 , num_deceased: 55





In [84]:
target_Mimic, id_list_Mimic = prepareDeathList("dataMimic", window_time=5)
data_Mimic = prepareData("dataMimic", id_list_Mimic, window_time=5)

print(f"Mimic dataset size: {len(target_Mimic)} , num_deceased: {np.sum(target_Mimic)}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4245/4245 [00:05<00:00, 789.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4140/4140 [00:14<00:00, 281.86it/s]

Mimic dataset size: 4140 , num_deceased: 1054





In [11]:
from utils import augmentation as aug

def augment(X_train, Y_train):
    indices = np.where(np.array(Y_train) == 1)
    X_deceased = X_train[indices]
    
    # x_aug_1 = aug.magnitude_warp(X_deceased, sigma=0.2, knot=4)
    # x_aug_2 = aug.window_warp(X_deceased)
    x_aug_3 = aug.spawner(X_deceased, np.ones(np.shape(X_deceased)[0]))
    # x_aug_3 = aug.wdba(X_deceased, np.ones(np.shape(X_deceased)[0]))

    x_train = np.concatenate((X_train, x_aug_3))

    y_train = np.concatenate((Y_train, np.ones(np.shape(x_train)[0]-np.shape(X_train)[0])))
    
    return x_train, y_train

In [4]:
# MIMIC without imputation
saveDataPath = "../final_datasets/Mimic/without_imputation/"

test_data_Mimic = np.load(saveDataPath + "test_data.npy")
test_target_Mimic = np.load(saveDataPath + "test_target.npy")
train_data_Mimic = np.load(saveDataPath + "train_data.npy")
train_target_Mimic = np.load(saveDataPath + "train_target.npy")


# MIMIC with imputation
saveDataPath = "../final_datasets/Mimic/with_imputation/"

test_data_Mimic_Imputed = np.load(saveDataPath + "test_data.npy")
test_target_Mimic_Imputed = np.load(saveDataPath + "test_target.npy")
train_data_Mimic_Imputed = np.load(saveDataPath + "train_data.npy")
train_target_Mimic_Imputed = np.load(saveDataPath + "train_target.npy")


# VENTILES without imputation
saveDataPath = "../final_datasets/Ventiles/without_imputation/"

test_data_Ventiles = np.load(saveDataPath + "test_data.npy")
test_target_Ventiles = np.load(saveDataPath + "test_target.npy")
train_data_Ventiles = np.load(saveDataPath + "train_data.npy")
train_target_Ventiles = np.load(saveDataPath + "train_target.npy")


# VENTILES with imputation
saveDataPath = "../final_datasets/Ventiles/with_imputation/"

test_data_Ventiles_Imputed = np.load(saveDataPath + "test_data.npy")
test_target_Ventiles_Imputed = np.load(saveDataPath + "test_target.npy")
train_data_Ventiles_Imputed = np.load(saveDataPath + "train_data.npy")
train_target_Ventiles_Imputed = np.load(saveDataPath + "train_target.npy")


# ECMO without imputation
saveDataPath = "../final_datasets/ECMO/"

data_ECMO = np.load(saveDataPath + "data_ECMO.npy")
target_ECMO = np.load(saveDataPath + "target_ECMO.npy")


# ECMO with imputation
saveDataPath = "../final_datasets/ECMO/"

data_ECMO_Imputed = np.load(saveDataPath + "data_ECMO_Imputed.npy")
target_ECMO_Imputed = np.load(saveDataPath + "target_ECMO_Imputed.npy")


# ECMO_M without imputation
saveDataPath = "../final_datasets/ECMO_M/"

data_ECMO_M = np.load(saveDataPath + "data_ECMO_M.npy")
target_ECMO_M = np.load(saveDataPath + "target_ECMO_M.npy")


# ECMO_M with imputation
saveDataPath = "../final_datasets/ECMO_M/"

data_ECMO_M_Imputed = np.load(saveDataPath + "data_ECMO_M_Imputed.npy")
target_ECMO_M_Imputed = np.load(saveDataPath + "target_ECMO_M_Imputed.npy")


# ECMO train/test without imputation
saveDataPath = "../final_datasets/ECMO/without_imputation2/"

test_data_ECMO = np.load(saveDataPath + "test_data.npy")
test_target_ECMO = np.load(saveDataPath + "test_target.npy")
train_data_ECMO = np.load(saveDataPath + "train_data.npy")
train_target_ECMO = np.load(saveDataPath + "train_target.npy")


# ECMO train/test with imputation
saveDataPath = "../final_datasets/ECMO/with_imputation2/"

test_data_ECMO_Imputed = np.load(saveDataPath + "test_data.npy")
test_target_ECMO_Imputed = np.load(saveDataPath + "test_target.npy")
train_data_ECMO_Imputed = np.load(saveDataPath + "train_data.npy")
train_target_ECMO_Imputed = np.load(saveDataPath + "train_target.npy")

In [79]:
np.shape(train_data_Ventiles_Imputed)
print(type(train_data_Ventiles_Imputed))
print(np.shape(train_data_Ventiles_Imputed))
print(np.min(train_data_Ventiles_Imputed))

<class 'numpy.ndarray'>
(4214, 120, 15)
-14.549816


In [55]:
data_ECMO.shape

(153, 120, 15)

In [13]:
########### HYDRA-MR ############

train_data = train_data_Mimic
train_target = train_target_Mimic
test_data = test_data_Mimic
test_target = test_target_Mimic

# train_data = train_data_Mimic_Imputed
# train_target = train_target_Mimic_Imputed
# test_data = test_data_Mimic_Imputed
# test_target = test_target_Mimic_Imputed


# train_data = train_data_Ventiles
# train_target = train_target_Ventiles
# test_data = test_data_Ventiles
# test_target = test_target_Ventiles

# train_data = train_data_Ventiles_Imputed
# train_target = train_target_Ventiles_Imputed
# test_data = test_data_Ventiles_Imputed
# test_target = test_target_Ventiles_Imputed


# train_data = np.concatenate((train_data_Mimic, test_data_Mimic))
# train_target = np.concatenate((train_target_Mimic, test_target_Mimic))
# test_data = data_ECMO
# test_target = target_ECMO

# train_data = np.concatenate((train_data_Ventiles, test_data_Ventiles))
# train_target = np.concatenate((train_target_Ventiles, test_target_Ventiles))
# test_data = data_ECMO
# test_target = target_ECMO


# train_data = np.concatenate((train_data_Mimic_Imputed, test_data_Mimic_Imputed))
# train_target = np.concatenate((train_target_Mimic_Imputed, test_target_Mimic_Imputed))
# test_data = data_ECMO_Imputed
# test_target = target_ECMO_Imputed

# train_data = np.concatenate((train_data_Ventiles_Imputed, test_data_Ventiles_Imputed))
# train_target = np.concatenate((train_target_Ventiles_Imputed, test_target_Ventiles_Imputed))
# test_data = data_ECMO_Imputed
# test_target = target_ECMO_Imputed


# train_data = train_data_ECMO
# train_target = train_target_ECMO
# test_data = test_data_ECMO
# test_target = test_target_ECMO

# train_data = train_data_ECMO_Imputed
# train_target = train_target_ECMO_Imputed
# test_data = test_data_ECMO_Imputed
# test_target = test_target_ECMO_Imputed


# train_data = data_ECMO
# train_target = target_ECMO
# test_data = []
# test_target = []

# train_data = data_ECMO_Imputed
# train_target = target_ECMO_Imputed
# test_data = []
# test_target = []


K = 3

aurocs = []
auroc_ecmo = 0
mean_fpr = np.linspace(0, 1, 100)
tprs = []
predictions_list = []
labels_list = []

for j in range(K):
    
    clf = MultiRocketHydraRegressor(n_kernels=8, n_groups=64, n_jobs=1, random_state=None)

    # print(np.shape(train_data))
    # print(np.shape(train_target))
    
    # train_data, train_target = data_Mimic, np.array(target_Mimic)
    clf.fit(train_data, train_target)
    
    y_pred_proba = clf.predict(test_data)
    
    auroc = roc_auc_score(test_target, y_pred_proba)
    aurocs.append(auroc)
    print(auroc)
    
    fpr, tpr, _ = roc_curve(test_target, y_pred_proba)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0  # Ensure the curve starts at (0,0)

    predictions_list = predictions_list + list(y_pred_proba)
    labels_list = labels_list + list(test_target)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # Ensure the curve ends at (1,1)

list_to_save = [aurocs, mean_tpr, predictions_list, labels_list]

print(f"Total Mean AUROC: {np.mean(aurocs):.3f} ± {np.std(aurocs):.3f}")

saved_results_folder = "./saved_results/"
saved_file_name = "Hydra_MR"

# with open(saved_results_folder + saved_file_name + ".pkl", 'wb') as file:
#     pickle.dump(list_to_save, file)

0.6163971154519359
0.6316366776354174
0.641205162314173
Total Mean AUROC: 0.630 ± 0.010
