In [4]:
import numpy as np
import random
import pandas as pd
from tqdm import tqdm 

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary
import torchinfo

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [2]:
def tnr_score(y_test, y_pred):
    y_t = np.array(y_test)
    y_p = np.array(y_pred)
    tn = np.sum((1-y_t)*(1-y_p))
    fp = np.sum(y_p*(1-y_t))
    if (tn + fp) == 0:
        return 0
    else:
        return tn / (tn + fp)

In [9]:
def split_train_test_val(data, target, test_size, val_size):
    nb_samples = len(target)
    nb_test = int(test_size * nb_samples)
    nb_val = int(val_size * nb_samples)

    shuffle = list(range(nb_samples))
    random.shuffle(shuffle)

    x_train, x_test, x_val, y_train, y_test, y_val = [], [], [], [], [], []
    
    nb_0_test = 0
    nb_1_test = 0
    nb_0_val = 0
    nb_1_val = 0

    for idx in shuffle:
        if nb_0_test < (nb_test//2) and target[idx]==0:
            y_test.append(0)
            x_test.append(data[idx])
            nb_0_test += 1
        elif nb_1_test < (nb_test//2) and target[idx]==1:
            y_test.append(1)
            x_test.append(data[idx])
            nb_1_test += 1
        elif nb_0_val < (nb_val//2) and target[idx]==0:
            y_val.append(0)
            x_val.append(data[idx])
            nb_0_val += 1
        elif nb_1_val < (nb_val//2) and target[idx]==1:
            y_val.append(1)
            x_val.append(data[idx])
            nb_1_val += 1
        else:
            y_train.append(target[idx])
            x_train.append(data[idx])
    
    return x_train, x_test, x_val, y_train, y_test, y_val


def prepareData(dataGroup, id_list, window_time):
    
    if dataGroup == "dataECMO":
        dataPath = "../dataECMO/"
        patients_df = pd.read_parquet(dataPath + "patients.parquet")
    else:
        dataPath = "../dataRea/"
        patients_df = pd.read_parquet(dataPath + "patientsRea.parquet")

    finalDataPath = dataPath + "finalData/"

    data = []

    for encounterId in tqdm(id_list, total=len(id_list)):
        
        df_mask = pd.read_parquet(finalDataPath + encounterId + "/mask.parquet")
        df_dynamic = pd.read_parquet(finalDataPath + encounterId + "/dynamic.parquet")
        df_static = pd.read_parquet(finalDataPath + encounterId + "/static.parquet")
        
        # idx_variables_kept = [0,1,3,4,6,7]
        idx_variables_kept = [0,1,2,3,4,5,6,7,8,9]

        data_patient = df_dynamic.iloc[:(window_time*24), idx_variables_kept].to_numpy()
        
        for value in df_static.to_numpy()[0]:
            new_column = np.ones(shape=(window_time*24,1)) * value
            data_patient = np.append(data_patient, new_column, axis=1)
        
        # df_dynamic_masked = df_dynamic.iloc[:(window_time*24)].mask(df_mask.iloc[:(window_time*24)] == 0)

        # # idx_variables_kept = [0,1,3,4,6,7]
        # idx_variables_kept = list(range(0,10))
        # df_dynamic_masked = df_dynamic_masked.iloc[:,idx_variables_kept]
        # df_dynamic = df_dynamic.iloc[:,idx_variables_kept]


        data.append(data_patient)
    
    return np.array(data)


def prepareDeathList(dataGroup, window_time):
    if dataGroup == "dataECMO":
        dataPath = "../dataECMO/"
    else:
        dataPath = "../dataRea/"
    
    patients_df = pd.read_parquet(dataPath + "patients.parquet")

    df_death = pd.read_csv(dataPath + "delais_deces.csv")
    
    nb_patients = len(patients_df)

    target = []
    id_list = []

    for _, row in tqdm(patients_df.iterrows(), total=nb_patients):
        encounterId = str(row["encounterId"])
        
        df_mask = pd.read_parquet(dataPath + "finalData/" + encounterId + "/mask.parquet")
        total_true_values = df_mask.values.sum()
        total_values = df_mask.values.size
        percentageMissingValues = (total_values-total_true_values)/total_values * 100
        
        withdrawal_date = pd.Timestamp(row["withdrawal_date"])
        installation_date = pd.Timestamp(row["installation_date"])
        total_time_hour = (withdrawal_date - installation_date).total_seconds() / 3600 + 4

        if total_time_hour >= window_time * 24 and percentageMissingValues < 40:
            id_list.append(encounterId)
            
            delai_sortie_deces = df_death.loc[df_death["encounterId"] == int(encounterId), "delai_sortie_deces"].to_numpy()[0]
            if delai_sortie_deces <= 1:
                target.append(1)
            else:
                target.append(0)
    
    return target, id_list

In [10]:
dataGroup = "dataECMO"
# dataGroup = "dataRangueil"

window_time_days = 5
target, id_list = prepareDeathList(dataGroup, window_time_days)
data = prepareData(dataGroup, id_list, window_time_days)

100%|██████████| 189/189 [00:03<00:00, 51.18it/s]
100%|██████████| 153/153 [00:10<00:00, 14.26it/s]


In [62]:
def train_model(num_epochs, model_name, test_size, val_size, verbose, save_path, save_model):
    
    # x_train, x_test, x_val, y_train, y_test, y_val = split_train_test_val(data, target, test_size=test_size, val_size=val_size)
   
    x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=test_size)
    while np.sum(y_test) < 2:
        x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=test_size)
    
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=val_size)
    while np.sum(y_val) == 0:
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=val_size)

    num_samples = len(target)
    num_timesteps = 24 * window_time_days
    num_features = np.size(x_train,2)
    num_static_features = 3
    
    batch_size = 32

    proportion_1 = np.sum(y_train)/np.size(y_train)
    proportion_0 = 1 - proportion_1

    class_weights = torch.tensor([1/proportion_0, 1/proportion_1], dtype=torch.float32)


    # Convert data to PyTorch tensors
    x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
    x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

    # Create DataLoader for training and testing sets
    train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = TensorDataset(x_val_tensor, y_val_tensor)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Define the CNN model
    class CNN(nn.Module):
        def __init__(self):
            super(CNN, self).__init__()
            self.conv1 = nn.Conv1d(in_channels=num_features-num_static_features, out_channels=8, kernel_size=1)
            self.pool = nn.MaxPool1d(kernel_size=2)
            self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=1)
            self.fc1 = nn.Linear(480 , 16)  # Adjust input size based on your data dimensions
            self.fc2 = nn.Linear(16+num_static_features, 1)  # For binary classification

        def forward(self, x):
            lstm_input = x[:, :-num_static_features, :]
            static_input = x[:, -num_static_features:, 0]

            out = self.pool(nn.functional.relu(self.conv1(lstm_input)))
            out = self.pool(nn.functional.relu(self.conv2(out)))
            out = torch.flatten(out, 1)
            out = nn.functional.relu(self.fc1(out))
            out = torch.cat((out, static_input), dim=1)
            out = self.fc2(out)
            # out = torch.sigmoid(self.fc2(out))  
            return out

    class CNN2(nn.Module):
        def __init__(self):
            super(CNN2, self).__init__()
            self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=(1, 3), padding=1)
            self.pool = nn.MaxPool2d(kernel_size=(2, 1))
            self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=(1, 3), padding=1)
            self.fc1 = nn.Linear(5760, 4)  # Adjust input size based on your data dimensions
            self.fc2 = nn.Linear(4, 1)

        def forward(self, x):
            x = x.unsqueeze(1)  # Add a channel dimension
            x = self.pool(nn.functional.relu(self.conv1(x)))
            x = self.pool(nn.functional.relu(self.conv2(x)))
            # x = self.pool(nn.functional.relu(self.conv3(x)))
            x = torch.flatten(x, 1)
            x = nn.functional.relu(self.fc1(x))
            # x = nn.functional.sigmoid(self.fc2(x))
            x = self.fc2(x)
            return x

    class LSTMModel(nn.Module):
        def __init__(self, input_size, hidden_size, num_layers, output_size, num_static_features):
            super(LSTMModel, self).__init__()
            self.hidden_size = hidden_size
            self.num_layers = num_layers
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
            # self.fc1 = nn.Linear(hidden_size + num_static_features, 20)
            # self.fc2 = nn.Linear(20, output_size)
            self.fc2 = nn.Linear(hidden_size + num_static_features, 1)
        
        def forward(self, x):
            lstm_input = x[:, :, :-num_static_features]
            static_input = x[:, 0, -num_static_features:]

            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
            c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
            
            out, _ = self.lstm(lstm_input, (h0, c0))
            out = out[:, -1, :]  # Take the output of the last time step

            out = torch.cat((out, static_input), dim=1)
            # out = torch.relu(self.fc1(out))

            # out = nn.functional.sigmoid(self.fc2(out))
            out = self.fc2(out)
            return out
    
    # Instantiate the model
    if model_name == "CNN":
        model = CNN()

        # if verbose:
        #     print(torchinfo.summary(model, input_size=(batch_size, num_features, num_timesteps)))
    
    if model_name == "CNN2":
            model = CNN2()

            if verbose:
                print(torchinfo.summary(model, input_size=(batch_size, num_timesteps, num_features)))

    elif model_name == "LSTM":
        input_size = num_features-num_static_features
        hidden_size = 32
        num_layers = 2
        output_size = 1

        model = LSTMModel(input_size, hidden_size, num_layers, output_size, num_static_features)

        if verbose:
            print(torchinfo.summary(model, input_size=(batch_size, num_timesteps, num_features)))


    class FocalLoss(nn.Module):
        def __init__(self, alpha=1, gamma=2, reduction='mean'):
            super(FocalLoss, self).__init__()
            self.alpha = alpha
            self.gamma = gamma
            self.reduction = reduction

        def forward(self, inputs, targets):
            # Apply sigmoid to inputs if not using BCEWithLogitsLoss
            inputs = torch.sigmoid(inputs)
            
            # Flatten the inputs and targets
            inputs = inputs.view(-1)
            targets = targets.view(-1)
            
            # Compute the binary cross entropy loss
            BCE_loss = nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
            
            # Compute the focal loss component
            pt = torch.where(targets == 1, inputs, 1 - inputs)
            F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
            
            if self.reduction == 'mean':
                return F_loss.mean()
            elif self.reduction == 'sum':
                return F_loss.sum()
            else:
                return F_loss
    # Define loss function and optimizer
    # criterion = nn.BCELoss()  # Binary cross-entropy loss
    # Define weighted binary cross-entropy loss function
    criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights[1])

    # criterion = FocalLoss(alpha=1, gamma=2, reduction='mean')
    
    optimizer = optim.AdamW(model.parameters())

    best_val_accuracy = 0
    # Train the model
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            # inputs = inputs.permute(0, 2, 1)
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            
            predicted = torch.round(nn.functional.sigmoid(outputs))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            train_accuracy = 100 * correct / total

            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        if verbose:
            print(f"Epoch {epoch+1}/{num_epochs}, Train loss: {running_loss}, Train Accuracy: {train_accuracy}%")

        # Validation
        if np.size(y_val) > 0:
            model.eval()
            val_loss = 0.0
            correct = 0
            total = 0
            with torch.no_grad():
                for inputs, labels in val_loader:
                    # inputs = inputs.permute(0, 2, 1)
                    outputs = model(inputs).squeeze()
                    val_loss += criterion(outputs, labels).item()
                    predicted = torch.round(nn.functional.sigmoid(outputs))
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
                    val_accuracy = 100 * correct / total
            if verbose:
                print(f"Validation Loss: {val_loss}, Accuracy on validation set: {val_accuracy}%")

            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                torch.save(model.state_dict(), save_path)
            


    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict)

    model.eval()

    correct = 0
    total = 0

    true_labels = []
    predictions = []
    predictions_binary = []

    treshold = 0.5

    with torch.no_grad():
        for inputs, labels in test_loader:
            # inputs = inputs.permute(0, 2, 1)
            true_labels.extend(labels.numpy())

            outputs = nn.functional.sigmoid(model(inputs))
            predictions.extend(outputs.numpy())
            predictions_binary.extend((outputs.numpy() > treshold).astype(int))
            
            print(np.round(np.array([p[0] for p in predictions]), 1))

    # Calculate AUROC score
    auroc = roc_auc_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions_binary, zero_division=0)
    recall = recall_score(true_labels, predictions_binary, zero_division=0)
    tnr = tnr_score(true_labels, predictions_binary)
    f1 = f1_score(true_labels, predictions_binary, zero_division=0)
    accuracy = accuracy_score(true_labels, predictions_binary)
    # if verbose:
    # print("Test AUROC score:", auroc)
    return auroc, precision, recall, tnr, f1, accuracy

In [63]:
num_train = 10

save_path = "LSTMs/lstm0.pth"

aurocs = []
precisions = []
recalls = []
tnrs = []
accuracies = []
f1s = []

for i in tqdm(range(num_train), total=num_train):
    
    # auroc = train_model(num_epochs=15, model_name="LSTM", test_size=0.2, val_size=0.0, verbose=False)
    auroc, precision, recall, tnr, f1, accuracy = train_model(num_epochs=50, model_name="LSTM", test_size=0.1, val_size=0.1, verbose=False, save_path=save_path, save_model=True)
    
    aurocs.append(auroc)
    precisions.append(precision)
    recalls.append(recall)
    tnrs.append(tnr)
    f1s.append(f1)
    accuracies.append(accuracy)
    
    print(f"AUROC: {np.mean(aurocs)}")

print(f"AUROC: {np.mean(aurocs):.4f}")
print(f"Precision: {np.mean(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f}")
print(f"Specificity: {np.mean(tnrs):.4f}")
print(f"Accuracy: {np.mean(accuracies):.4f}")
print(f"F1 Score: {np.mean(f1s):.4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:06<00:57,  6.34s/it]

[0.7 0.5 0.5 0.5 0.7 0.6 0.5 0.9 0.5 0.4 0.8 0.8 0.8 0.5 0.6 0.5 0.5 0.4
 0.5 0.8 0.7 0.4 0.9 0.6 0.3 0.4 0.5 0.7 0.5 0.4 0.4]
AUROC: 0.6974789915966387


 20%|██        | 2/10 [00:11<00:46,  5.75s/it]

[0.6 0.6 0.5 0.7 0.5 0.5 0.6 0.5 0.5 0.6 0.5 0.8 0.5 0.5 0.6 0.6 0.5 0.5
 0.5 0.9 0.5 0.8 0.5 0.8 0.6 0.5 0.6 0.7 0.7 0.5 0.7]
AUROC: 0.6667219519386702


 30%|███       | 3/10 [00:17<00:39,  5.61s/it]

[0.6 0.8 0.7 0.6 0.6 0.6 0.6 0.7 0.6 0.7 0.9 0.5 0.7 0.5 0.6 0.8 0.7 0.8
 0.9 0.6 0.5 0.6 0.6 0.6 0.5 0.7 0.5 0.7 0.8 0.6 0.8]
AUROC: 0.6265800667245456


 40%|████      | 4/10 [00:22<00:32,  5.49s/it]

[0.4 0.4 0.3 0.7 0.3 0.3 0.5 0.8 0.3 0.8 0.5 0.4 0.5 0.4 0.6 0.9 0.3 0.7
 0.4 0.4 0.7 0.4 0.4 0.3 0.9 0.8 0.3 0.5 0.6 0.5 0.5]
AUROC: 0.5817771553065672


 40%|████      | 4/10 [00:25<00:38,  6.37s/it]


KeyboardInterrupt: 

NameError: name 'tensor' is not defined