In [171]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import pandas as pd
import numpy as np
import random

In [None]:
def split_train_test(data, target, test_size):
    nb_samples = len(target)
    nb_test = int(test_size * nb_samples)

    shuffle = list(range(nb_samples))
    random.shuffle(shuffle)

    X_train, X_test, y_train, y_test = [], [], [], []
    
    nb_0 = 0
    nb_1 = 0

    for idx in shuffle:
        if nb_0 < (nb_test//2) and target[idx]==0:
            y_test.append(0)
            X_test.append(data[idx])
            nb_0 += 1
        elif nb_1 < (nb_test//2) and target[idx]==1:
            y_test.append(1)
            X_test.append(data[idx])
            nb_1 += 1
        else:
            y_train.append(target[idx])
            X_train.append(data[idx])
    
    return X_train, X_test, y_train, y_test

# X_train, X_test, y_train, y_test = split_train_test(data, target, test_size=0.2)

In [268]:
def aggregateData(dataGroup, id_list, window_time):
    
    if dataGroup == "dataECMO":
        dataPath = "../data/"
        patients_df = pd.read_parquet(dataPath + "patients.parquet")
    else:
        dataPath = "../dataRea/"
        patients_df = pd.read_parquet(dataPath + "patientsRea.parquet")

    finalDataPath = dataPath + "finalData/"

    nb_patients = len(patients_df)

    data = []

    for encounterId in tqdm(id_list, total=len(id_list)):
        
        df_mask = pd.read_parquet(finalDataPath + encounterId + "/mask.parquet")
        df_dynamic = pd.read_parquet(finalDataPath + encounterId + "/dynamic.parquet")
        df_static = pd.read_parquet(finalDataPath + encounterId + "/static.parquet")
        
        data_patient = []

        df_dynamic_masked = df_dynamic.iloc[:(window_time*24)].mask(df_mask.iloc[:(window_time*24)] == 0)

        idx_variables_kept = [0,1,3,4,6,7]
        df_dynamic_masked = df_dynamic_masked.iloc[:,idx_variables_kept]
        df_dynamic = df_dynamic.iloc[:,idx_variables_kept]

        statics = list(df_static.to_numpy()[0])
        # CAS OU TOUTE UNE COLONNE EST MASQUEE ?
        mean = df_dynamic_masked.mean().tolist()
        median = df_dynamic_masked.median().tolist()
        maxi = df_dynamic_masked.max().tolist()
        mini = df_dynamic_masked.min().tolist()
        first = list(df_dynamic.to_numpy()[0,:])
        last = list(df_dynamic.to_numpy()[window_time*24-1,:])

        data_patient.extend(mean)
        # data_patient.extend(median)
        data_patient.extend(maxi)
        data_patient.extend(mini)
        # data_patient.extend(first)
        # data_patient.extend(last)
        data_patient.extend(statics)

        data.append(data_patient)
    
    return data

def prepareDeathList(dataGroup, window_time):
    if dataGroup == "dataECMO":
        dataPath = "../data/"
        patients_df = pd.read_parquet(dataPath + "patients.parquet")
    else:
        dataPath = "../dataRea/"
        patients_df = pd.read_parquet(dataPath + "patientsRea.parquet")

    df_death = pd.read_csv(dataPath + "delais_deces.csv")
    
    nb_patients = len(patients_df)

    target = []
    id_list = []

    for _, row in tqdm(patients_df.iterrows(), total=nb_patients):
        encounterId = str(row["encounterId"])
        
        df_mask = pd.read_parquet(dataPath + "finalData/" + encounterId + "/mask.parquet")
        total_true_values = df_mask.values.sum()
        total_values = df_mask.values.size
        percentageMissingValues = (total_values-total_true_values)/total_values * 100
        
        withdrawal_date = pd.Timestamp(row["withdrawal_date"])
        installation_date = pd.Timestamp(row["installation_date"])
        total_time_hour = (withdrawal_date - installation_date).total_seconds() / 3600 + 4

        if total_time_hour >= window_time * 24 and percentageMissingValues < 40:
            id_list.append(encounterId)
            
            delai_sortie_deces = df_death.loc[df_death["encounterId"] == int(encounterId), "delai_sortie_deces"].to_numpy()[0]
            if delai_sortie_deces <= 3:
                target.append(1)
            else:
                target.append(0)
    
    return target, id_list

In [269]:
dataGroup = "dataECMO"
# dataGroup = "dataRangueil"

window_time_days = 5
target, id_list = prepareDeathList(dataGroup, window_time_days)
data = aggregateData(dataGroup, id_list, window_time_days)

100%|██████████| 392/392 [00:11<00:00, 34.19it/s]
100%|██████████| 287/287 [00:24<00:00, 11.69it/s]


In [271]:
aurocs = []
for i in range(1000):
    
    X_train, X_test, y_train, y_test = split_train_test(data, target, test_size=0.2)

    # Set parameters for LightGBM
    params = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 5,
        'learning_rate': 0.05,
        'feature_fraction': 0.75,
        'verbose': -1
    }
    # params = {
    #     'boosting_type': 'gbdt',
    #     'num_leaves': 31,
    #     'max_depth': -1,
    #     'learning_rate': 0.1,
    #     'n_estimators': 100,
    #     'subsample_for_bin': 200000,
    #     'min_child_samples': 20,
    #     'subsample': 1.0,
    #     'subsample_freq': 0,
    #     'colsample_bytree': 1.0,
    #     'reg_alpha': 0.0,
    #     'reg_lambda': 0.0,
    #     'n_jobs': -1,
    #     'importance_type': 'split',
    #     'verbose': -1
    # }

    clf = lgb.LGBMClassifier(**params)
    clf.fit(X_train, y_train)

    auroc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    aurocs.append(auroc)


print(f"AUROC: {np.mean(aurocs)}")

AUROC: 0.6659910714285714


In [179]:
aurocs = []
for i in range(100):
    
    X_train, X_test, y_train, y_test = split_train_test(data, target, test_size=0.2)

    clf = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    objective='binary:logistic'
    )

    clf.fit(X_train, y_train)

    auroc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    aurocs.append(auroc)


print(f"AUROC: {np.mean(aurocs)}")

AUROC: 0.6533293697978598


In [178]:
aurocs = []
for i in range(20):
    
    X_train, X_test, y_train, y_test = split_train_test(data, target, test_size=0.2)

    base_classifier = DecisionTreeClassifier()

    clf = BaggingClassifier(estimator=base_classifier, n_estimators=100, random_state=42)

    clf.fit(X_train, y_train)

    auroc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    aurocs.append(auroc)


print(f"AUROC: {np.mean(aurocs)}")

AUROC: 0.6331747919143875


In [272]:
aurocs = []

for i in range(300):
    
    X_train, X_test, y_train, y_test = split_train_test(data, target, test_size=0.2)

    clf = LogisticRegression(solver='sag', penalty='l2', verbose=0, max_iter=1000)

    clf.fit(X_train, y_train)

    auroc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    aurocs.append(auroc)

print(f"AUROC: {np.mean(aurocs)}")

AUROC: 0.6561522108843537
