## Dataset loader and preparation

In [5]:
import sys
sys.path.insert(0, "../")
from dataAnalysis.DataAnalysis import DataAnalysis
import pandas as pd
import torch
 
data = pd.read_csv(r"../sbcdata.csv", header=0)
data_analysis = DataAnalysis(data)

In [6]:
data = pd.concat((data_analysis.get_training_data(), data_analysis.get_testing_data()))
max_Id = data["Id"].unique().max()
gw_data = data_analysis.get_gw_testing_data().copy(deep=True)
gw_data = gw_data.assign(Id=lambda x: x.Id + max_Id)
data = pd.concat((data, gw_data))
data = data.sort_values(["Id", "Time"])
data = data.reset_index(drop=True)
popped_index = data.pop("index")

In [7]:
from dataAnalysis.Constants import SEX_CATEGORY_COLUMN_NAME, SEX_COLUMN_NAME, FEATURES, LABEL_COLUMN_NAME

data[SEX_CATEGORY_COLUMN_NAME] = data.loc[:, SEX_COLUMN_NAME] =="W"

data[SEX_CATEGORY_COLUMN_NAME] = data[SEX_CATEGORY_COLUMN_NAME].astype("int8")
data["Label"] = data["Label"] == "Sepsis"
data["Label"] = data["Label"].astype("int8")

## Normalizer

In [8]:
data.loc[data["Set"] == "Training", FEATURES].values

array([[ 79.  ,   0.  ,   8.1 , ...,   4.36,  86.  , 167.  ],
       [ 35.  ,   0.  ,  10.6 , ...,   6.02,  79.9 , 199.  ],
       [ 47.  ,   1.  ,   8.7 , ...,   4.37,  89.9 , 298.  ],
       ...,
       [ 32.  ,   1.  ,   7.2 , ...,   3.87,  87.9 , 221.  ],
       [ 47.  ,   0.  ,   8.3 , ...,   4.08,  91.9 , 148.  ],
       [ 47.  ,   0.  ,   8.9 , ...,   4.39,  92.3 , 150.  ]])

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data.loc[data["Set"] == "Training", FEATURES].values)

## Padding to max length

In [10]:
id_grouped_data = data.groupby("Id")

In [11]:
from tqdm.notebook import tqdm

max_len = 0
for id, data_group in tqdm(id_grouped_data):
    max_len = max(max_len, data_group.shape[0])

  0%|          | 0/866517 [00:00<?, ?it/s]

In [12]:
unique_train_ids = data.loc[data["Set"] == "Training", "Id"].unique()
max_train_idx = int(len(unique_train_ids)*.8)
val_ids = unique_train_ids[max_train_idx:]

In [15]:
import numpy as np

pad_value = 0
TRAIN = "train"
GW = "gw_test"
TEST = "test"
VAL = "val"

features = dict({})
features[TRAIN] = []
features[VAL] = []
features[TEST] = []
features[GW] = []

labels = dict({})
labels[TRAIN] = []
labels[VAL] = []
labels[TEST] = []
labels[GW] = []

for id, data_group in tqdm(id_grouped_data):
    assert data_group["Set"].unique().shape[0] == 1
    assert data_group["Center"].unique().shape[0] == 1

    features_scaled = scaler.transform(data_group[FEATURES].values)

    padded_features = np.pad(features_scaled, ((0, max_len - data_group.shape[0]), (0,0)), mode='constant', constant_values=pad_value)
    padded_labels = np.pad(data_group[LABEL_COLUMN_NAME].values, ((0, max_len - data_group.shape[0])), mode='constant', constant_values=pad_value)
    
    first_el = data_group.iloc[0, :]
    if first_el["Set"] == "Training":
        if first_el["Id"] in val_ids:
            features[VAL].append(padded_features)
            labels[VAL].append(padded_labels)
            continue
        if first_el["Id"] not in val_ids:
            features[TRAIN].append(padded_features)
            labels[TRAIN].append(padded_labels)
            continue
    if first_el["Set"] == "Validation":
        if first_el["Center"] == "Greifswald":
            features[GW].append(padded_features)
            labels[GW].append(padded_labels)
            continue
        if first_el["Center"] == "Leipzig":
            features[TEST].append(padded_features)
            labels[TEST].append(padded_labels)
            continue

  0%|          | 0/866517 [00:00<?, ?it/s]

## Seed and hyperparams

In [27]:
torch.manual_seed(42)
n_embd = len(FEATURES)
head_size = 16
dropout = 0
out_dim = 1 #binary 
device = torch.device("cuda:0") #torch.device("cuda:2") #torch.device("cuda:2")
WEIGHT = 664
lr = 1e-2
wd = 0
n_blocks = 2
n_heads = 4

## Batching 

In [28]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

class Data(Dataset):
    # Constructor
    def __init__(self,X, y):
        self.x = X
        self.y = y
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return self.len

In [29]:
batch_loader = dict({})

In [30]:
def add_loader(set_name):
    global bitch_loader, sets, labels
    dataset = Data(torch.from_numpy(np.array(features[set_name])).type(torch.float).to(device), torch.from_numpy(np.array(labels[set_name])).type(torch.float).to(device))
    loader = DataLoader(dataset=dataset, batch_size=500) #max is 100_000
    batch_loader[set_name] = loader

In [31]:
add_loader(TRAIN)
add_loader(VAL)
add_loader(GW)
add_loader(TEST)

## Model

In [279]:
from torch import nn
from torch.nn import functional as F

torch.manual_seed(42)
class ConvModel(nn.Module):

    def __init__(self, input_dim, fiter_size, fiter_size_2,fiter_size_3):
        super(ConvModel, self).__init__()

        self.pos_embedding_table = nn.Embedding(max_len, input_dim)
        
        self.conv = nn.Conv1d(input_dim, 1, fiter_size)
        self.conv2 = nn.Conv1d(input_dim, 1, fiter_size_2)
        self.conv3 = nn.Conv1d(input_dim, 1, fiter_size_3)
        self.lin_heads = nn.Linear(3, 1)

    def forward(self, x, targets = None):
        B, T, C = x.shape
        pad_mask = (x != pad_value).type(torch.float)
        ignore_mask = torch.bmm(pad_mask, pad_mask.transpose(-2,-1))
        ignore_mask = ignore_mask == 0
        
        # pos_emb = self.pos_embedding_table(torch.arange(T, device = device)) ##destroys zero pad
        # x = x + pos_emb
        origin_x = torch.clone(x)
        
        conv_x = torch.nn.functional.pad(x, (0,0,0,self.conv.kernel_size[0]-1,0,0), mode='constant', value=pad_value)
        conv2_x = torch.nn.functional.pad(x, (0,0,0,self.conv2.kernel_size[0]-1,0,0), mode='constant', value=pad_value)
        conv3_x = torch.nn.functional.pad(x, (0,0,0,self.conv3.kernel_size[0]-1,0,0), mode='constant', value=pad_value)
        
        conv_x = conv_x.permute(0, 2, 1)
        conv2_x = conv2_x.permute(0, 2, 1)
        conv3_x = conv3_x.permute(0, 2, 1)
        
        logits_conv1 = self.conv(conv_x)
        logits_conv2 = self.conv2(conv2_x)
        logits_conv3 = self.conv3(conv3_x)

        logits_conv1 = torch.nn.functional.elu(logits_conv1)
        logits_conv2 = torch.nn.functional.elu(logits_conv2)
        logits_conv3 = torch.nn.functional.elu(logits_conv3)
        
        conv_heads = torch.cat((logits_conv1, logits_conv2,logits_conv3), dim = -2)
        logits = self.lin_heads(conv_heads.permute(0,2,1))
                
        logits_mask = torch.logical_not(torch.all(ignore_mask, dim =-1))
        return logits.squeeze(-1), logits_mask

In [350]:
from torch import nn
from torch.nn import functional as F

torch.manual_seed(42)
class ConvModel(nn.Module):

    def __init__(self, input_dim, filters):
        super(ConvModel, self).__init__()

        self.pos_embedding_table = nn.Embedding(max_len, input_dim)

        self.convs = nn.ModuleList()
        self.filters = filters
        for filter in filters:
            self.convs.append(nn.Conv1d(input_dim, 1, filter).to(device))
        self.lin_heads =nn.Linear(len(filters), 1)

    def forward(self, x, targets = None):
        B, T, C = x.shape
        pad_mask = (x != pad_value).type(torch.float)
        ignore_mask = torch.bmm(pad_mask, pad_mask.transpose(-2,-1))
        ignore_mask = ignore_mask == 0
        
        # pos_emb = self.pos_embedding_table(torch.arange(T, device = device)) ##destroys zero pad
        # x = x + pos_emb
        origin_x = torch.clone(x)

        conv_logits = []
        for i, filter in enumerate(self.filters):
            conv_x = torch.nn.functional.pad(x, (0,0,0,self.convs[i].kernel_size[0]-1,0,0), mode='constant', value=pad_value)
            
            conv_x = conv_x.permute(0, 2, 1)

            conv = self.convs[i]
            logits_conv1 = conv(conv_x)
    
            logits_conv1 = torch.nn.functional.elu(logits_conv1)
            conv_logits.append(logits_conv1)

        conv_heads = torch.cat(conv_logits, dim = -2)
        logits = self.lin_heads(conv_heads.permute(0,2,1))
                
        logits_mask = torch.logical_not(torch.all(ignore_mask, dim =-1))
        return logits.squeeze(-1), logits_mask

In [349]:
model = ConvModel(input_dim, [1,2,3]).to(device)
evaluate_loss_sets(model)

ModuleList(
  (0): Conv1d(7, 1, kernel_size=(1,), stride=(1,))
  (1): Conv1d(7, 1, kernel_size=(2,), stride=(1,))
  (2): Conv1d(7, 1, kernel_size=(3,), stride=(1,))
)


KeyboardInterrupt: 

## Evaluate

In [335]:
def evaluate_loss(model, set_name):
    with torch.inference_mode():
        model.eval()
        acc_loss = 0
        batch_size = 0

        for i, (x,y) in enumerate(batch_loader[set_name]):
            B,_,_ = x.shape
            logits, logits_mask = model(x)
        
            loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[logits_mask].squeeze(-1), y[logits_mask], pos_weight=torch.tensor(WEIGHT))
            acc_loss += loss.item()
            batch_size += B
    return acc_loss / batch_size        

In [351]:
def evaluate_loss_sets(model):
    losses = dict({})
    for set_name in features.keys():
        # if set_name != VAL:
        #     continue
        loss = evaluate_loss(model, set_name)
        losses[set_name] = loss
        print(f"Loss of {set_name}: {loss:.5f}") 
    return losses
        

In [352]:
from sklearn.metrics import roc_auc_score, accuracy_score

def evaluate_auroc(model, set_name):
    with torch.inference_mode():
        model.eval()
        
        logits_list = []
        label_list = []

        batch_size = 0
        for i, (x,y) in enumerate(batch_loader[set_name]):
            B,_,_ = x.shape
            logits, logits_mask = model(x)

            logits_list.extend(logits[logits_mask].squeeze(-1).tolist())
            label_list.extend(y[logits_mask].squeeze(-1).tolist())
            batch_size += B
    auroc = roc_auc_score(np.array(label_list), torch.sigmoid(torch.tensor(logits_list)).numpy())
    return auroc        

In [353]:
def evaluate_auroc_sets(model):
    for set_name in features.keys():
        if set_name == TRAIN:
            continue
        auroc = evaluate_auroc(model, set_name)
        print(f"AUROC of {set_name}: {auroc:.5f}")

## Train

In [357]:
torch.manual_seed(42)
input_dim, n_embd, n_heads, dropout, n_blocks = len(FEATURES), len(FEATURES), 7, 0.0, 1
model = ConvModel(input_dim, [2,3,4,5]).to(device)
optim= torch.optim.Adam(model.parameters(), lr = 1e-2, weight_decay=wd)

In [358]:
torch.manual_seed(42)

evaluate_loss_sets(model)
last_val_loss = None
for epoch in range(5):#5
    for i, (x,y) in tqdm(enumerate(batch_loader[TRAIN])):
        model.train()
        optim.zero_grad()
        
        logits, logits_mask = model(x)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[logits_mask].squeeze(-1), y[logits_mask], pos_weight=torch.tensor(WEIGHT))
            
        loss.backward()
        optim.step()
    losses = evaluate_loss_sets(model)
    # if last_val_loss and last_val_loss <= losses[VAL]:
    #     print(epoch)
    #     break
    last_val_loss = losses[VAL]

Loss of train: 0.00288
Loss of val: 0.00296
Loss of test: 0.00274
Loss of gw_test: 0.00246


0it [00:00, ?it/s]

Loss of train: 0.00141
Loss of val: 0.00147
Loss of test: 0.00135
Loss of gw_test: 0.00116


0it [00:00, ?it/s]

Loss of train: 0.00135
Loss of val: 0.00142
Loss of test: 0.00131
Loss of gw_test: 0.00109


0it [00:00, ?it/s]

Loss of train: 0.00131
Loss of val: 0.00139
Loss of test: 0.00127
Loss of gw_test: 0.00107


0it [00:00, ?it/s]

Loss of train: 0.00128
Loss of val: 0.00136
Loss of test: 0.00123
Loss of gw_test: 0.00105


0it [00:00, ?it/s]

Loss of train: 0.00127
Loss of val: 0.00134
Loss of test: 0.00122
Loss of gw_test: 0.00104


In [359]:
torch.manual_seed(42)
evaluate_auroc_sets(model)#0876,88187,88417,0.88449,0.88569,,0.88623,,0.88692

AUROC of val: 0.94450
AUROC of test: 0.94269
AUROC of gw_test: 0.94449


In [287]:
from sklearn.metrics import confusion_matrix

def evaluate_confusion(model, set_name):
    with torch.inference_mode():
        model.eval()
        
        logits_list = []
        label_list = []

        batch_size = 0
        for i, (x,y) in enumerate(batch_loader[set_name]):
            B,_,_ = x.shape
            logits, logits_mask = model(x)

            logits_list.extend(logits[logits_mask].squeeze(-1).tolist())
            label_list.extend(y[logits_mask].squeeze(-1).tolist())
            batch_size += B
    conf = confusion_matrix(np.array(label_list), torch.round(torch.sigmoid(torch.tensor(logits_list))).numpy())
    return conf

In [288]:
evaluate_confusion(model, TEST)

array([[331490,  34304],
       [   109,    381]])

In [364]:
from sklearn.model_selection import ParameterGrid

space = {
    'f1': [i for i in range(2,6)],
    'f2': [i for i in range(2,6)],
    'f3': [i for i in range(2,6)],
    'f4': [i for i in range(2,6)],
    'f5': [i for i in range(2,6)],
}
param_grid = ParameterGrid(space)

best_hyper_params = None
best_val_loss = float("inf")
patience = 2
for params in tqdm(param_grid.__iter__()):
    
    
    torch.manual_seed(42)
    model = ConvModel(input_dim, [params[f"f{i}"] for i in range(1,6)]).to(device)
    optim= torch.optim.Adam(model.parameters(), lr = 1e-2)
    models = []

    last_val_loss = None
    val_losses = []
    never_breaked = True
    for epoch in range(100):
        for i, (x,y) in tqdm(enumerate(batch_loader[TRAIN])):
            model.train()
            optim.zero_grad()
            
            logits, logits_mask = model(x)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[logits_mask].squeeze(-1), y[logits_mask], pos_weight=torch.tensor(WEIGHT))
                
            loss.backward()
            optim.step()
        losses = evaluate_loss_sets(model)
        val_losses.append(losses[VAL])
        models.append(copy.deepcopy(model))
        print(val_losses)
        if len(val_losses) >= patience and all(list(map(lambda l: losses[VAL] >= l, val_losses[-patience:]))):
            print(f"Break at {epoch}")
            never_breaked = False
            model = models[-2]
            break
    if never_breaked:
        print("Never breaked with params")
    print(params)
    evaluate_auroc_sets(model)
    if val_losses[-1]<= best_val_loss:
        best_val_loss = min(val_losses[-1], best_val_loss)
        best_hyper_params = params
        print("Currently best params: ")
        print(best_hyper_params)

0it [00:00, ?it/s]

{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 2, 'f5': 2}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 2, 'f5': 3}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 2, 'f5': 4}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 2, 'f5': 5}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 3, 'f5': 2}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 3, 'f5': 3}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 3, 'f5': 4}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 3, 'f5': 5}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 4, 'f5': 2}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 4, 'f5': 3}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 4, 'f5': 4}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 4, 'f5': 5}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 5, 'f5': 2}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 5, 'f5': 3}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 5, 'f5': 4}
{'f1': 2, 'f2': 2, 'f3': 2, 'f4': 5, 'f5': 5}
{'f1': 2, 'f2': 2, 'f3': 3, 'f4': 2, 'f5': 2}
{'f1': 2, 'f2': 2, 'f3': 3, 'f4': 2, 'f5': 3}
{'f1': 2, 'f2': 2, 'f3': 3, 'f4': 2, 'f5': 4}
{'f1': 2, 'f2': 2, 'f3': 3, 'f4': 2, 'f5': 5}
{'f1': 2, 'f2': 2, 'f3': 3, 'f4': 3, 'f5': 2}
{'f1': 2, 'f2': 2, 'f3': 3, 'f4': 