In [2]:
import numpy as np
import random
import pandas as pd
from tqdm import tqdm 

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import roc_auc_score

In [13]:
def split_train_test_val(data, target, test_size, val_size):
    nb_samples = len(target)
    nb_test = int(test_size * nb_samples)
    nb_val = int(val_size * nb_samples)

    shuffle = list(range(nb_samples))
    random.shuffle(shuffle)

    x_train, x_test, x_val, y_train, y_test, y_val = [], [], [], [], [], []
    
    nb_0_test = 0
    nb_1_test = 0
    nb_0_val = 0
    nb_1_val = 0

    for idx in shuffle:
        if nb_0_test < (nb_test//2) and target[idx]==0:
            y_test.append(0)
            x_test.append(data[idx])
            nb_0_test += 1
        elif nb_1_test < (nb_test//2) and target[idx]==1:
            y_test.append(1)
            x_test.append(data[idx])
            nb_1_test += 1
        elif nb_0_val < (nb_val//2) and target[idx]==0:
            y_val.append(0)
            x_val.append(data[idx])
            nb_0_val += 1
        elif nb_1_val < (nb_val//2) and target[idx]==1:
            y_val.append(1)
            x_val.append(data[idx])
            nb_1_val += 1
        else:
            y_train.append(target[idx])
            x_train.append(data[idx])
    
    return x_train, x_test, x_val, y_train, y_test, y_val


def prepareData(dataGroup, id_list, window_time):
    
    if dataGroup == "dataECMO":
        dataPath = "../data/"
        patients_df = pd.read_parquet(dataPath + "patients.parquet")
    else:
        dataPath = "../dataRea/"
        patients_df = pd.read_parquet(dataPath + "patientsRea.parquet")

    finalDataPath = dataPath + "finalData/"

    data = []

    for encounterId in tqdm(id_list, total=len(id_list)):
        
        df_mask = pd.read_parquet(finalDataPath + encounterId + "/mask.parquet")
        df_dynamic = pd.read_parquet(finalDataPath + encounterId + "/dynamic.parquet")
        df_static = pd.read_parquet(finalDataPath + encounterId + "/static.parquet")
        
        data_patient = df_dynamic.iloc[:(window_time*24)].to_numpy()

        # df_dynamic_masked = df_dynamic.iloc[:(window_time*24)].mask(df_mask.iloc[:(window_time*24)] == 0)

        # # idx_variables_kept = [0,1,3,4,6,7]
        # idx_variables_kept = list(range(0,10))
        # df_dynamic_masked = df_dynamic_masked.iloc[:,idx_variables_kept]
        # df_dynamic = df_dynamic.iloc[:,idx_variables_kept]


        data.append(data_patient)
    
    return np.array(data)


def prepareDeathList(dataGroup, window_time):
    if dataGroup == "dataECMO":
        dataPath = "../data/"
        patients_df = pd.read_parquet(dataPath + "patients.parquet")
    else:
        dataPath = "../dataRea/"
        patients_df = pd.read_parquet(dataPath + "patientsRea.parquet")

    df_death = pd.read_csv(dataPath + "delais_deces.csv")
    
    nb_patients = len(patients_df)

    target = []
    id_list = []

    for _, row in tqdm(patients_df.iterrows(), total=nb_patients):
        encounterId = str(row["encounterId"])
        
        df_mask = pd.read_parquet(dataPath + "finalData/" + encounterId + "/mask.parquet")
        total_true_values = df_mask.values.sum()
        total_values = df_mask.values.size
        percentageMissingValues = (total_values-total_true_values)/total_values * 100
        
        withdrawal_date = pd.Timestamp(row["withdrawal_date"])
        installation_date = pd.Timestamp(row["installation_date"])
        total_time_hour = (withdrawal_date - installation_date).total_seconds() / 3600 + 4

        if total_time_hour >= window_time * 24 and percentageMissingValues < 40:
            id_list.append(encounterId)
            
            delai_sortie_deces = df_death.loc[df_death["encounterId"] == int(encounterId), "delai_sortie_deces"].to_numpy()[0]
            if delai_sortie_deces <= 3:
                target.append(1)
            else:
                target.append(0)
    
    return target, id_list

In [4]:
dataGroup = "dataECMO"
# dataGroup = "dataRangueil"

window_time_days = 5
target, id_list = prepareDeathList(dataGroup, window_time_days)
data = prepareData(dataGroup, id_list, window_time_days)

100%|██████████| 392/392 [00:13<00:00, 29.74it/s]
100%|██████████| 287/287 [00:27<00:00, 10.53it/s]


In [110]:
num_samples = len(target)
num_timesteps = 24 * window_time_days
num_features = 10

# Split data into training and testing sets
x_train, x_test, x_val, y_train, y_test, y_val = split_train_test_val(data, target, test_size=0.0, val_size=0.15)

proportion_1 = np.sum(y_train)/np.size(y_train)
proportion_0 = 1 - proportion_1

class_weights = torch.tensor([1/proportion_0, 1/proportion_1])

# Convert data to PyTorch tensors
x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader for training and testing sets
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the CNN model
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=10, out_channels=4, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(in_channels=4, out_channels=8, kernel_size=3)
        self.fc1 = nn.Linear(224, 16)  # Adjust input size based on your data dimensions
        self.fc2 = nn.Linear(16, 1)  # For binary classification

    def forward(self, x):
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = nn.functional.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))  # For binary classification
        return x

class CNN2(nn.Module):
    def __init__(self):
        super(CNN2, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(1, 3), padding=1)
        self.pool = nn.MaxPool2d(kernel_size=(2, 1))
        self.conv2 = nn.Conv2d(in_channels=2, out_channels=4, kernel_size=(1, 3), padding=1)
        self.conv3 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=(1, 3), padding=1)
        self.conv4 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=(1, 3), padding=1)
        self.fc1 = nn.Linear(360, 4)  # Adjust input size based on your data dimensions
        self.fc2 = nn.Linear(4, 1)  # For binary classification

    def forward(self, x):
        x = x.unsqueeze(1)  # Add a channel dimension
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        x = self.pool(nn.functional.relu(self.conv3(x)))
        x = self.pool(nn.functional.relu(self.conv4(x)))
        x = torch.flatten(x, 1)
        x = nn.functional.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))  # For binary classification
        return x

# Instantiate the model
model = CNN2()

# Define loss function and optimizer
# criterion = nn.BCELoss()  # Binary cross-entropy loss
# Define weighted binary cross-entropy loss function
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights)

optimizer = optim.Adam(model.parameters())


from torchsummary import summary

summary(model, input_size=(num_timesteps, num_features))

# Train the model
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        print(outputs.squeeze().detach())
        print(labels)
        loss = criterion(outputs.squeeze().detach(), labels)
        
        predicted = torch.round(outputs)
        total += labels.size(0)
        correct += (predicted.squeeze() == labels).sum().item()
        
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Train loss: {running_loss}, Train Accuracy: {100 * correct / total}%")

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs.squeeze(), labels).item()
            predicted = torch.round(outputs)
            total += labels.size(0)
            correct += (predicted.squeeze() == labels).sum().item()
    print(f"Validation Loss: {val_loss}, Accuracy on validation set: {100 * correct / total}%")

# Evaluate the model
model.eval()
correct = 0
total = 0

true_labels = []
predictions = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        true_labels.extend(labels.numpy())
        predictions.extend(outputs.numpy())

# Calculate AUROC score
auroc = roc_auc_score(true_labels, predictions)
print("AUROC score:", auroc)

#     for inputs, labels in test_loader:
#         outputs = model(inputs.permute(0, 2, 1))
#         predicted = torch.round(outputs)
#         total += labels.size(0)
#         correct += (predicted.squeeze() == labels).sum().item()

# print(f"Accuracy on test set: {100 * correct / total}%")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 2, 122, 10]               8
         MaxPool2d-2            [-1, 2, 61, 10]               0
            Conv2d-3            [-1, 4, 63, 10]              28
         MaxPool2d-4            [-1, 4, 31, 10]               0
            Conv2d-5            [-1, 4, 33, 10]              52
         MaxPool2d-6            [-1, 4, 16, 10]               0
            Conv2d-7            [-1, 4, 18, 10]              52
         MaxPool2d-8             [-1, 4, 9, 10]               0
            Linear-9                    [-1, 4]           1,444
           Linear-10                    [-1, 1]               5
Total params: 1,589
Trainable params: 1,589
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.08
Params size (MB): 0.01
Estimated Total

RuntimeError: The size of tensor a (2) must match the size of tensor b (32) at non-singleton dimension 0