In [1]:
"""
Some gerneral information:
We have data of time series with highly variable lengths, i.e., some patients might have dozens or even hundreds of CBC samples while their hospitilation. However, others might only have a few or even only 1 sample.
Therefore, we its a bit more difficult to implement time series deep learning approaches. 

Here, we will use masked padding, i.e., we will pad all sequences to the largest length and then will mask the padded results from the loss and auroc calculations using a bollean mask.
In my honest opinion this might be the best approach, since other approaches will have too high redundancies (creating time series for each sample with a label), less information for the prediction (truncation), wrong information (padding), or will have highly varying batch sizes (batching based on time series length)
"""

'\nSome gerneral information:\nWe have data of time series with highly variable lengths, i.e., some patients might have dozens or even hundreds of CBC samples while their hospitilation. However, others might only have a few or even only 1 sample.\nTherefore, we its a bit more difficult to implement time series deep learning approaches. \n\nHere, we will use masked padding, i.e., we will pad all sequences to the largest length and then will mask the padded results from the loss and auroc calculations using a bollean mask.\nIn my honest opinion this might be the best approach, since other approaches will have too high redundancies (creating time series for each sample with a label), less information for the prediction (truncation), wrong information (padding), or will have highly varying batch sizes (batching based on time series length)\n'

In [2]:
import sys
from dataAnalysis.DataAnalysis import DataAnalysis
import pandas as pd
import torch
from dataAnalysis.Constants import SEX_CATEGORY_COLUMN_NAME, SEX_COLUMN_NAME, FEATURES, LABEL_COLUMN_NAME
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.autograd import Variable 

In [3]:
data = pd.read_csv(r"sbcdata.csv", header=0)
data_analysis = DataAnalysis(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data['Label'].mask(control_filter, "Control", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data['Label'].mask(sepsis_filter, "Sepsis", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obj

In [4]:
data = pd.concat((data_analysis.get_training_data(), data_analysis.get_testing_data()))
max_Id = data["Id"].unique().max()
gw_data = data_analysis.get_gw_testing_data().copy(deep=True)
gw_data = gw_data.assign(Id=lambda x: x.Id + max_Id)
data = pd.concat((data, gw_data))
data = data.sort_values(["Id", "Time"])
data = data.reset_index(drop=True)
popped_index = data.pop("index")

data[SEX_CATEGORY_COLUMN_NAME] = data.loc[:, SEX_COLUMN_NAME] =="W"
data[SEX_CATEGORY_COLUMN_NAME] = data[SEX_CATEGORY_COLUMN_NAME].astype("int8")
data["Label"] = data["Label"] == "Sepsis"
data["Label"] = data["Label"].astype("int8")


scaler = StandardScaler()
scaler.fit(data.loc[data["Set"] == "Training", FEATURES].values)

In [5]:
id_grouped_data = data.groupby("Id")


max_len = 0
for id, data_group in tqdm(id_grouped_data):
    max_len = max(max_len, data_group.shape[0])
unique_train_ids = data.loc[data["Set"] == "Training", "Id"].unique()
max_train_idx = int(len(unique_train_ids)*.8)
val_ids = unique_train_ids[max_train_idx:]

  0%|          | 0/866517 [00:00<?, ?it/s]

In [6]:
import numpy as np

pad_value = -10.0
TRAIN = "train"
GW = "gw_test"
TEST = "test"
VAL = "val"

features = dict({})
features[TRAIN] = []
features[VAL] = []
features[TEST] = []
features[GW] = []

labels = dict({})
labels[TRAIN] = []
labels[VAL] = []
labels[TEST] = []
labels[GW] = []

for id, data_group in tqdm(id_grouped_data):
    assert data_group["Set"].unique().shape[0] == 1
    assert data_group["Center"].unique().shape[0] == 1

    features_scaled = scaler.transform(data_group[FEATURES].values)

    padded_features = np.pad(features_scaled, ((0, max_len - data_group.shape[0]), (0,0)), mode='constant', constant_values=pad_value)
    padded_labels = np.pad(data_group[LABEL_COLUMN_NAME].values, ((0, max_len - data_group.shape[0])), mode='constant', constant_values=pad_value)
    
    first_el = data_group.iloc[0, :]
    if first_el["Set"] == "Training":
        if first_el["Id"] in val_ids:
            features[VAL].append(padded_features)
            labels[VAL].append(padded_labels)
            continue
        if first_el["Id"] not in val_ids:
            features[TRAIN].append(padded_features)
            labels[TRAIN].append(padded_labels)
            continue
    if first_el["Set"] == "Validation":
        if first_el["Center"] == "Greifswald":
            features[GW].append(padded_features)
            labels[GW].append(padded_labels)
            continue
        if first_el["Center"] == "Leipzig":
            features[TEST].append(padded_features)
            labels[TEST].append(padded_labels)
            continue

  0%|          | 0/866517 [00:00<?, ?it/s]

In [7]:
device = torch.device("cuda:0")

In [8]:
X_train = torch.from_numpy(np.stack(features[TRAIN])).type(torch.float32) #.to(device)
X_val = torch.from_numpy(np.stack(features[VAL])).type(torch.float32) # .to(device)
X_test = torch.from_numpy(np.stack(features[TEST])).type(torch.float32) #.to(device)
X_test_gw = torch.from_numpy(np.stack(features[GW])).type(torch.float32) #.to(device)

In [9]:
y_train = torch.from_numpy(np.stack(labels[TRAIN])).type(torch.float32) #.to(device)
y_val = torch.from_numpy(np.stack(labels[VAL])).type(torch.float32) #.to(device)
y_test = torch.from_numpy(np.stack(labels[TEST])).type(torch.float32) #.to(device)
y_test_gw = torch.from_numpy(np.stack(labels[GW])).type(torch.float32) #.to(device)

In [10]:
train_mask = y_train != pad_value
val_mask = y_val != pad_value
test_mask = y_test != pad_value
test_gw_mask = y_test_gw != pad_value

In [11]:
y_test.shape

torch.Size([180494, 416])

In [12]:
class LSTM1(nn.Module):
    def __init__(self, num_classes, input_size, hidden_dim, num_layers):
        super(LSTM1, self).__init__()
        self.num_classes = num_classes #number of classes
        
        self.input_size = input_size #input size
        self.hidden_dim = hidden_dim 
        
        self.num_layers = num_layers #number of layers

        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_dim,
                          num_layers=num_layers, batch_first=True, bidirectional=True) #lstm
        self.linear =  nn.Linear(hidden_dim*2, num_classes) #fully connected 1

        self.relu = nn.ReLU()
    
    def forward(self, x):
        hidden_state_0 = Variable(torch.zeros(self.num_layers*2, x.shape[0], self.hidden_dim)).to(device) 
        cell_state_0 = Variable(torch.zeros(self.num_layers*2, x.shape[0], self.hidden_dim)).to(device)
        output, (hidden_state, cell_state) = self.lstm(x, (hidden_state_0, cell_state_0)) 
        output = self.relu(output)
        output = self.linear(output) 
        return output

In [13]:
model = LSTM1(1, X_train.shape[-1], 8, 2).to(device)
lr = 1e-2
wd = 1e-5
epochs = 100
eval_iter = 1
auroc_iter = 10
batch_size = 4096*2
EARLY_STOP = 2
shuffle = False
weight = y_train[train_mask].numel() / y_train[train_mask].sum()

In [14]:
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight= torch.tensor(weight), reduction="mean")

  loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight= torch.tensor(weight), reduction="mean")


In [15]:
optim = torch.optim.Adam(params=model.parameters(), lr = lr, weight_decay=wd)

  _torch_pytree._register_pytree_node(


In [16]:
from sklearn.metrics import roc_auc_score
def test_auroc(dataloader, model):
    acc_proba = []
    acc_labels = []
    
    with torch.inference_mode():
        model.eval()
        for data, mask, labels in dataloader:
            data, mask, labels = data.to(device), mask.to(device), labels.to(device)
            out = model(data).squeeze(-1)[mask]
            out = torch.sigmoid(out)
    
            acc_proba.append(out.cpu().numpy())
            acc_labels.append(labels[mask].cpu().numpy())

    np_labels = np.concatenate(acc_labels, axis = 0)
    np_proba = np.concatenate(acc_proba, axis = 0)
    auroc = roc_auc_score(np_labels, np_proba)
    return auroc 

In [17]:
def test(dataloader, model):
    acc_loss = 0
    batch_size = 0
    
    with torch.inference_mode():
        model.eval()
        for data, mask, labels in dataloader:
            data, mask, labels = data.to(device), mask.to(device), labels.to(device)
            out = model(data).squeeze(-1)[mask]
            loss = loss_fn(out, labels[mask])

            acc_loss += loss.item()
            batch_size += labels[mask].shape[0]
            del data
            del mask
            del labels
    return acc_loss / batch_size

In [18]:
def train(dataloader, model, optim):
    model.train()
    acc_loss = 0
    batch_size = 0
    
    for data, mask, labels in dataloader:
        data, mask, labels = data.to(device), mask.to(device), labels.to(device)
        loss = loss_fn(model(data).squeeze(-1)[mask], labels[mask])
        optim.zero_grad()
        loss.backward()
        optim.step()
        
        acc_loss += loss.item()
        batch_size += labels[mask].shape[0]
        del loss 
        del data
        del mask
        del labels
    return acc_loss / batch_size

In [19]:
class Data(Dataset):
    def __init__(self, X, mask, labels):
        self.X = X
        self.mask = mask
        self.y = labels

    def __len__(self):
        return len(self.X)


    def __getitem__(self, idx):
        return self.X[idx], self.mask[idx], self.y[idx]

In [20]:
store = dict()
store[TRAIN] = DataLoader(Data(X_train, train_mask, y_train), batch_size= batch_size, shuffle = shuffle)
store[VAL] = DataLoader(Data(X_val, val_mask, y_val), batch_size= batch_size, shuffle = shuffle) 
store[TEST] = DataLoader(Data(X_test, test_mask, y_test), batch_size= batch_size, shuffle = shuffle)
store[GW] = DataLoader(Data(X_test_gw, test_gw_mask, y_test_gw), batch_size= batch_size, shuffle = shuffle) 

In [21]:
sets = (TRAIN, VAL, TEST, GW)

In [22]:
losses = dict()
for set in sets:
    losses[set] = []

In [25]:
for epoch in tqdm(range(epochs)):
    loss = train(store[TRAIN], model, optim)
    if epoch % eval_iter == 0:
        for set in sets:
            loss = test(store[set], model)
            losses[set].append(loss)
        if len(losses[VAL]) >= EARLY_STOP and np.mean(losses[VAL][-EARLY_STOP:]) < losses[VAL][-1]:
            print(f"Breaked at {epoch}")
            break

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 4


In [26]:
for set in sets:
    print(f"{set}: f{test_auroc(store[set], model)}")

train: f0.9604924420590564
val: f0.9548608408910193
test: f0.9535285221870724
gw_test: f0.9557218286942658


In [25]:
class HyperParam:
    def __init__(self, hidden_dim, lr, wd, num_layers):
        self.hidden_dim = hidden_dim
        self.lr = lr
        self.wd = wd
        self.num_layers = num_layers

In [29]:
hyper_params = []

hidden_dims = [8, 16, 32, 64]
lrs = [1e-2, 1e-3, 1e-4]
wds = [1e-4, 1e-5, 0]
num_layers_list = [1, 2]
linear_layers_list = [0, 1]

for hidden_dim in hidden_dims:
    for lr in lrs:
        for wd in wds:
            for num_layers in num_layers_list:
                hyper_param = HyperParam(hidden_dim, lr, wd, num_layers)
                hyper_params.append(hyper_param)

In [30]:
eval_iter = 1
batch_size = 4096*6
EARLY_STOP = 2
epochs = 100

def train_loop(model, optim, tracked_losses = [VAL]):
    losses = dict()
    for set in tracked_losses:
        losses[set] = []
    for epoch in tqdm(range(epochs)):
        loss = train(store[TRAIN], model, optim)
        if epoch % eval_iter == 0:
            for set in tracked_losses:
                loss = test(store[set], model)
                losses[set].append(loss)
            if len(losses[VAL]) >= EARLY_STOP and np.mean(losses[VAL][-EARLY_STOP:]) < losses[VAL][-1]:
                print(f"Breaked at {epoch}")
                break
    return losses[VAL]

In [32]:
import copy

best_val_loss = float("inf")
best_model = None
best_hyperparams = None
for hyper_param in tqdm(hyper_params):
    model = LSTM1(1, X_train.shape[-1], hyper_param.hidden_dim, hyper_param.num_layers).to(device)
    optim = torch.optim.Adam(params=model.parameters(), lr = hyper_param.lr, weight_decay=hyper_param.wd)
    val_losses = train_loop(model, optim)
    if best_val_loss >= val_losses[-1]:
        best_val_loss = val_losses[-1]
        best_model = copy.deepcopy(model.cpu())
        best_hyperparams = hyper_param

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 11


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 6


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 10


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 6


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 7


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 2


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 74


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 35


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 65


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 38


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 42


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 32


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 23


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 3


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 4


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 4


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 3


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 4


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 5


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 29


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 20


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 38


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 24


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 30


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 9


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 11


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 5


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 2


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 3


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 2


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 3


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 2


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 23


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 16


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 25


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 8


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 24


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 5


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 56


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 13


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 29


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 6


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 3


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 4


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 2


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 4


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 2


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 21


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 10


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 15


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 11


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 15


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 8


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 39


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 81


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 6


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 96


  0%|          | 0/100 [00:00<?, ?it/s]

Breaked at 41


In [34]:
for set in sets:
    print(f"{set}: f{test_auroc(store[set], best_model.to(device))}")

train: f0.9637446938034622
val: f0.9572334610708221
test: f0.9558546697354918
gw_test: f0.9568966277045495


In [35]:
best_hyperparams

<__main__.HyperParam at 0x7f5d21cd3e80>

In [36]:
best_hyperparams.hidden_dim, best_hyperparams.lr ,        best_hyperparams.wd ,        best_hyperparams.num_layers ,

(8, 0.01, 1e-05, 2)

In [37]:
best_model

LSTM1(
  (lstm): LSTM(7, 8, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=16, out_features=1, bias=True)
  (relu): ReLU()
)