#### We re-start our transformer analysis here
#### First, some assumptions or goals for this script:
 - training usinng SGD -> later batching
 - pad time series to maximum length with value -1 - first backward padding -> NOTE: forward padding will most probably cancel out the feature-induces bias
 - mask values which are -1
 - custom transformer
 - first information exchange over all measurements in a sequence (easier and should give the best scores)
 - Set manual seed for reproducibility and improvements over diverse hyperparams
 - first only one block, one head, without dropout (check validation loss and training loss diff after pre-def. intervals), without layernorm without ffd?
 - Now 1 block with ffd, 2 heads, batching, layer norm and pos encoding 

## Dataset loader and preparation

In [1]:
import sys
sys.path.insert(0, "../")
from dataAnalysis.DataAnalysis import DataAnalysis
import pandas as pd
import torch
 
data = pd.read_csv(r"../sbcdata.csv", header=0)
data_analysis = DataAnalysis(data)

In [2]:
data = pd.concat((data_analysis.get_training_data(), data_analysis.get_testing_data()))
max_Id = data["Id"].unique().max()
gw_data = data_analysis.get_gw_testing_data().copy(deep=True)
gw_data = gw_data.assign(Id=lambda x: x.Id + max_Id)
data = pd.concat((data, gw_data))
data = data.sort_values(["Id", "Time"])
data = data.reset_index(drop=True)
popped_index = data.pop("index")

In [3]:
from dataAnalysis.Constants import SEX_CATEGORY_COLUMN_NAME, SEX_COLUMN_NAME, FEATURES, LABEL_COLUMN_NAME

data[SEX_CATEGORY_COLUMN_NAME] = data.loc[:, SEX_COLUMN_NAME] =="W"

data[SEX_CATEGORY_COLUMN_NAME] = data[SEX_CATEGORY_COLUMN_NAME].astype("int8")
data["Label"] = data["Label"] == "Sepsis"
data["Label"] = data["Label"].astype("int8")

## Normalizer

In [4]:
data.loc[data["Set"] == "Training", FEATURES].values

array([[ 79.  ,   0.  ,   8.1 , ...,   4.36,  86.  , 167.  ],
       [ 35.  ,   0.  ,  10.6 , ...,   6.02,  79.9 , 199.  ],
       [ 47.  ,   1.  ,   8.7 , ...,   4.37,  89.9 , 298.  ],
       ...,
       [ 32.  ,   1.  ,   7.2 , ...,   3.87,  87.9 , 221.  ],
       [ 47.  ,   0.  ,   8.3 , ...,   4.08,  91.9 , 148.  ],
       [ 47.  ,   0.  ,   8.9 , ...,   4.39,  92.3 , 150.  ]])

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data.loc[data["Set"] == "Training", FEATURES].values)

## Padding to max length

In [6]:
id_grouped_data = data.groupby("Id")

In [7]:
from tqdm.notebook import tqdm

max_len = 0
for id, data_group in tqdm(id_grouped_data):
    max_len = max(max_len, data_group.shape[0])

  0%|          | 0/866517 [00:00<?, ?it/s]

In [8]:
unique_train_ids = data.loc[data["Set"] == "Training", "Id"].unique()
max_train_idx = int(len(unique_train_ids)*.8)
val_ids = unique_train_ids[max_train_idx:]

In [9]:
import numpy as np

pad_value = -10.0
TRAIN = "train"
GW = "gw_test"
TEST = "test"
VAL = "val"

features = dict({})
features[TRAIN] = []
features[VAL] = []
features[TEST] = []
features[GW] = []

labels = dict({})
labels[TRAIN] = []
labels[VAL] = []
labels[TEST] = []
labels[GW] = []

for id, data_group in tqdm(id_grouped_data):
    assert data_group["Set"].unique().shape[0] == 1
    assert data_group["Center"].unique().shape[0] == 1

    features_scaled = scaler.transform(data_group[FEATURES].values)

    padded_features = np.pad(features_scaled, ((0, max_len - data_group.shape[0]), (0,0)), mode='constant', constant_values=pad_value)
    padded_labels = np.pad(data_group[LABEL_COLUMN_NAME].values, ((0, max_len - data_group.shape[0])), mode='constant', constant_values=pad_value)
    
    first_el = data_group.iloc[0, :]
    if first_el["Set"] == "Training":
        if first_el["Id"] in val_ids:
            features[VAL].append(padded_features)
            labels[VAL].append(padded_labels)
            continue
        if first_el["Id"] not in val_ids:
            features[TRAIN].append(padded_features)
            labels[TRAIN].append(padded_labels)
            continue
    if first_el["Set"] == "Validation":
        if first_el["Center"] == "Greifswald":
            features[GW].append(padded_features)
            labels[GW].append(padded_labels)
            continue
        if first_el["Center"] == "Leipzig":
            features[TEST].append(padded_features)
            labels[TEST].append(padded_labels)
            continue

  0%|          | 0/866517 [00:00<?, ?it/s]

## Seed and hyperparams

In [10]:
torch.manual_seed(42)
n_embd = len(FEATURES)
head_size = 16
dropout = 0
out_dim = 1 #binary 
device = torch.device("cuda:0") #torch.device("cuda:2") #torch.device("cuda:2")
WEIGHT = 664
lr = 1e-2
wd = 0
n_blocks = 2
n_heads = 4

## Batching 

In [11]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

class Data(Dataset):
    # Constructor
    def __init__(self,X, y):
        self.x = X
        self.y = y
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return self.len

In [12]:
batch_loader = dict({})

In [13]:
def add_loader(set_name):
    global bitch_loader, sets, labels
    dataset = Data(torch.from_numpy(np.array(features[set_name])).type(torch.float).to(device), torch.from_numpy(np.array(labels[set_name])).type(torch.float).to(device))
    loader = DataLoader(dataset=dataset, batch_size=500) #max is 100_000
    batch_loader[set_name] = loader

In [14]:
add_loader(TRAIN)
add_loader(VAL)
add_loader(GW)
add_loader(TEST)

## Model

In [15]:
import torch 

torch.manual_seed(42)
class Head(torch.nn.Module):

    def __init__(self, n_embd, head_size, dropout):
        super(Head, self).__init__()

        self.query = torch.nn.Linear(n_embd, head_size, bias=False)
        self.key = torch.nn.Linear(n_embd, head_size, bias = False)
        self.value = torch.nn.Linear(n_embd, head_size, bias = False)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, ignore_mask):
        B,T,C = x.shape
        
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        w = q@k.transpose(-2,-1)
        w = w* C**-.5
        w[ignore_mask] = float("-inf")
        w = torch.nan_to_num(torch.softmax(w, dim = -1))
        
        w = self.dropout(w)    
        out = w@v
        return out
        

In [16]:
import torch 
class MultiHeadAttention(torch.nn.Module):

    def __init__(self, n_embd, num_heads, head_size, dropout):
        super(MultiHeadAttention, self).__init__()
        head_size = n_embd // n_heads
        self.heads = torch.nn.ModuleList([Head(n_embd, head_size, dropout) for _ in range(num_heads)])
        self.proj = torch.nn.Linear(head_size*num_heads, n_embd)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, ignore_mask):
        out = torch.cat([h(x, ignore_mask) for h in self.heads], dim = -1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

In [17]:
class FeedForward(torch.nn.Module):

    def __init__(self, n_embd, dropout):
        super(FeedForward, self).__init__()
        self.lin = torch.nn.Linear(n_embd, 4*n_embd)
        self.proj = torch.nn.Linear(4*n_embd, n_embd)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        x = self.lin(x)
        x = torch.nn.functional.relu(x)
        x = self.proj(x)
        x = self.dropout(x)
        return x

In [18]:
class Block(torch.nn.Module):
    def __init__(self, n_embd, n_heads, dropout):
        super(Block, self).__init__()
        head_size = n_embd // n_heads

        self.sa_heads = MultiHeadAttention(n_embd, n_heads, head_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.layer_norm_1 = torch.nn.LayerNorm(n_embd)
        self.layer_norm_2 = torch.nn.LayerNorm(n_embd)

    def forward(self, x, ignore_mask):
        x = x + self.sa_heads(self.layer_norm_1(x), ignore_mask)
        x = x + self.ffwd(self.layer_norm_2(x))
        return x

In [19]:
from torch import nn
from torch.nn import functional as F

torch.manual_seed(42)
class TransformerModel(nn.Module):

    def __init__(self, input_dim, n_embd, n_heads, dropout, n_blocks):
        super(TransformerModel, self).__init__()

        self.lin_input = nn.Linear(input_dim, n_embd)
        self.pos_embedding_table = nn.Embedding(max_len, n_embd)
        
        self.blocks = []
        for _ in range(n_blocks):
            self.blocks.append(Block(n_embd, n_heads, dropout).to(device)) #nn.Sequential(*[Block(n_embd, n_heads, dropout) for _ in range(n_blocks)], nn.LayerNorm(n_embd))
        # self.lm_head = nn.Linear(n_embd, vocab_size)
        # self.sa = Head(head_size)
        self.lin = nn.Linear(n_embd, out_dim)

    def forward(self, x, targets = None):
        B, T, C = x.shape
        pad_mask = (x != pad_value).type(torch.float)
        ignore_mask = torch.bmm(pad_mask, pad_mask.transpose(-2,-1))
        ignore_mask = ignore_mask == 0
        
        x = self.lin_input(x)
        pos_emb = self.pos_embedding_table(torch.arange(T, device = device))
        x = x + pos_emb

        for block in self.blocks:
            x = block(x, ignore_mask)
        logits = self.lin(x)
        logits_mask = ignore_mask.sum(-1) != max_len
        
        return logits, logits_mask

## Evaluate

In [20]:
def evaluate_loss(model, set_name):
    with torch.inference_mode():
        model.eval()
        acc_loss = 0
        batch_size = 0

        for i, (x,y) in enumerate(batch_loader[set_name]):
            B,_,_ = x.shape
            logits, logits_mask = model(x)
        
            loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[logits_mask].squeeze(-1), y[logits_mask], pos_weight=torch.tensor(WEIGHT))
            acc_loss += loss.item()
            batch_size += B
    return acc_loss / batch_size        

In [22]:
def evaluate_loss_sets(model):
    losses = dict({})
    for set_name in features.keys():
        if set_name != VAL:
            continue
        loss = evaluate_loss(model, set_name)
        losses[set_name] = loss
        print(f"Loss of {set_name}: {loss:.5f}") 
    return losses
        

In [23]:
from sklearn.metrics import roc_auc_score, accuracy_score

def evaluate_auroc(model, set_name):
    with torch.inference_mode():
        model.eval()
        
        logits_list = []
        label_list = []

        batch_size = 0
        for i, (x,y) in enumerate(batch_loader[set_name]):
            B,_,_ = x.shape
            logits, logits_mask = model(x)

            logits_list.extend(logits[logits_mask].squeeze(-1).tolist())
            label_list.extend(y[logits_mask].squeeze(-1).tolist())
            batch_size += B
    auroc = roc_auc_score(np.array(label_list), torch.sigmoid(torch.tensor(logits_list)).numpy())
    return auroc        

In [25]:
def evaluate_auroc_sets(model):
    for set_name in features.keys():
        if set_name == TRAIN:
            continue
        auroc = evaluate_auroc(model, set_name)
        print(f"AUROC of {set_name}: {auroc:.5f}")

## Train

In [26]:
torch.manual_seed(42)
input_dim, n_embd, n_heads, dropout, n_blocks = len(FEATURES), 8, 4, 0.0, 1
model = TransformerModel(input_dim, n_embd, n_heads, dropout, n_blocks).to(device)
optim= torch.optim.Adam(model.parameters(), lr = lr, weight_decay=wd)

In [27]:
# torch.manual_seed(42)
# evaluate_auroc_sets(TransformerModel(input_dim, n_embd, n_heads, dropout, n_blocks).to(device))

In [581]:
torch.manual_seed(42)

evaluate_loss_sets(model)
last_val_loss = None
for epoch in range(5):#5
    for i, (x,y) in tqdm(enumerate(batch_loader[TRAIN])):
        model.train()
        optim.zero_grad()
        
        logits, logits_mask = model(x)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[logits_mask].squeeze(-1), y[logits_mask], pos_weight=torch.tensor(WEIGHT))
            
        loss.backward()
        optim.step()
    losses = evaluate_loss_sets(model)
    if last_val_loss and last_val_loss <= losses[VAL]:
        print(epoch)
        break
    last_val_loss = losses[VAL]

Loss of train: 0.00393
Loss of val: 0.00416
Loss of test: 0.00368
Loss of gw_test: 0.00305


0it [00:00, ?it/s]

Loss of train: 0.00204
Loss of val: 0.00213
Loss of test: 0.00201
Loss of gw_test: 0.00209


0it [00:00, ?it/s]

Loss of train: 0.00191
Loss of val: 0.00203
Loss of test: 0.00189
Loss of gw_test: 0.00198


In [582]:
torch.manual_seed(42)
evaluate_auroc_sets(model)

AUROC of train: 0.86154
AUROC of val: 0.85416
AUROC of test: 0.84546
AUROC of gw_test: 0.78312


In [541]:
 np.linspace(1e-4, 1e-2, 3, endpoint=True)

array([0.0001 , 0.00505, 0.01   ])

In [None]:
from sklearn.model_selection import ParameterGrid
import copy

torch.manual_seed(42)
space = {
    'lr': [1e-2], #np.linspace(1e-4, 1e-2, 4, endpoint=True),
    'dropout':[0, 0.2,.4],
    'block_size':[2, 4, 6],
    'n_embd':[16, 32, 64],
    'num_heads':[2, 4, 8],
}
param_grid = ParameterGrid(space)

best_hyper_params = None
best_val_loss = float("inf")
patience = 2
i = 0
for params in tqdm(param_grid.__iter__()):
    lr = params["lr"]
    dropout = params["dropout"]
    block_size = params["block_size"]
    n_embd = params["n_embd"]
    num_heads = params["num_heads"]
    
    model = TransformerModel(len(FEATURES), n_embd, num_heads, dropout, block_size).to(device)
    models = []
    optim= torch.optim.Adam(model.parameters(), lr = lr, weight_decay=wd)

    torch.manual_seed(42)

    last_val_loss = None
    val_losses = []
    never_breaked = True
    for epoch in range(100):#5
        for i, (x,y) in tqdm(enumerate(batch_loader[TRAIN])):
            model.train()
            optim.zero_grad()
            
            logits, logits_mask = model(x)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[logits_mask].squeeze(-1), y[logits_mask], pos_weight=torch.tensor(WEIGHT))
                
            loss.backward()
            optim.step()
        losses = evaluate_loss_sets(model)
        val_losses.append(losses[VAL])
        models.append(copy.deepcopy(model))
        print(val_losses)
        if len(val_losses) >= patience and all(list(map(lambda l: losses[VAL] >= l, val_losses[-patience:]))):
            print(f"Break at {epoch}")
            never_breaked = False
            model = models[-2]
            break
    if never_breaked:
        print("Never breaked with params")
        print(params)
    evaluate_auroc_sets(model)
    if val_losses[-1]<= best_val_loss:
        best_val_loss = min(val_losses[-1], best_val_loss)
        best_hyper_params = params
        print("Currently best params: ")
        print(best_hyper_params)
    i+=1
    print(f"{i/324*100} %")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Loss of val: 0.00201
[0.002008010834815812]


0it [00:00, ?it/s]

Loss of val: 0.00197
[0.002008010834815812, 0.001970542478204296]


0it [00:00, ?it/s]

Loss of val: 0.00196
[0.002008010834815812, 0.001970542478204296, 0.0019594985531473505]


0it [00:00, ?it/s]

Loss of val: 0.00197
[0.002008010834815812, 0.001970542478204296, 0.0019594985531473505, 0.0019670081706773955]
Break at 3
AUROC of val: 0.86929
AUROC of test: 0.86974
AUROC of gw_test: 0.83587
Currently best params: 
{'block_size': 2, 'dropout': 0, 'lr': 0.01, 'n_embd': 16, 'num_heads': 2}
260.8024691358024 %


0it [00:00, ?it/s]

Loss of val: 0.00192
[0.0019234837257271016]


0it [00:00, ?it/s]

Loss of val: 0.00188
[0.0019234837257271016, 0.0018762511772997068]


0it [00:00, ?it/s]

Loss of val: 0.00193
[0.0019234837257271016, 0.0018762511772997068, 0.0019346802461463444]
Break at 2
AUROC of val: 0.87549
AUROC of test: 0.87140
AUROC of gw_test: 0.83051
Currently best params: 
{'block_size': 2, 'dropout': 0, 'lr': 0.01, 'n_embd': 16, 'num_heads': 4}
260.8024691358024 %


0it [00:00, ?it/s]

Loss of val: 0.00189
[0.0018897349890694124]


0it [00:00, ?it/s]

Loss of val: 0.00192
[0.0018897349890694124, 0.0019172469936279148]
Break at 1
AUROC of val: 0.87371
AUROC of test: 0.86989
AUROC of gw_test: 0.80500
Currently best params: 
{'block_size': 2, 'dropout': 0, 'lr': 0.01, 'n_embd': 16, 'num_heads': 8}
260.8024691358024 %


0it [00:00, ?it/s]

Loss of val: 0.00189
[0.0018881039238593672]


0it [00:00, ?it/s]

Loss of val: 0.00195
[0.0018881039238593672, 0.0019514727350485401]
Break at 1
AUROC of val: 0.87353
AUROC of test: 0.86566
AUROC of gw_test: 0.81821
260.8024691358024 %


0it [00:00, ?it/s]

Loss of val: 0.00182
[0.0018247517275474582]


0it [00:00, ?it/s]

Loss of val: 0.00190
[0.0018247517275474582, 0.0018990722938877855]
Break at 1
AUROC of val: 0.88264
AUROC of test: 0.87080
AUROC of gw_test: 0.82770
Currently best params: 
{'block_size': 2, 'dropout': 0, 'lr': 0.01, 'n_embd': 32, 'num_heads': 4}
260.8024691358024 %


0it [00:00, ?it/s]

Loss of val: 0.00179
[0.0017926676808823318]


0it [00:00, ?it/s]

Loss of val: 0.00190
[0.0017926676808823318, 0.0018993113098056407]
Break at 1
AUROC of val: 0.88471
AUROC of test: 0.86929
AUROC of gw_test: 0.83221
260.8024691358024 %


0it [00:00, ?it/s]

Loss of val: 0.00180
[0.0018036911257131303]


0it [00:00, ?it/s]

Loss of val: 0.00194
[0.0018036911257131303, 0.001936771149331527]
Break at 1
AUROC of val: 0.88261
AUROC of test: 0.86949
AUROC of gw_test: 0.83583
260.8024691358024 %


0it [00:00, ?it/s]

In [None]:
## TODO add layer norm after Blocks