# Demo Notebook:
## DeSurv

In [2]:
import os
from pathlib import Path
import sys
node_type = os.getenv('BB_CPU')
venv_dir = f'/rds/homes/g/gaddcz/Projects/CPRD/virtual-envTorch2.0-{node_type}'
venv_site_pkgs = Path(venv_dir) / 'lib' / f'python{sys.version_info.major}.{sys.version_info.minor}' / 'site-packages'
if venv_site_pkgs.exists():
    sys.path.insert(0, str(venv_site_pkgs))
    print(f"Added path '{venv_site_pkgs}' at start of search paths.")
else:
    print(f"Path '{venv_site_pkgs}' not found. Check that it exists and/or that it exists for node-type '{node_type}'.")

%load_ext autoreload
%autoreload 2

Added path '/rds/homes/g/gaddcz/Projects/CPRD/virtual-envTorch2.0-icelake/lib/python3.10/site-packages' at start of search paths.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pytorch_lightning
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import sqlite3
from dataclasses import dataclass
import logging
from CPRD.data.foundational_loader import FoundationalDataModule
import pickle 
from tqdm import tqdm

from pycox.datasets import support
from pycox.evaluation import EvalSurv
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper
from torch.utils.data import TensorDataset, DataLoader

from CPRD.src.modules.head_layers.survival.desurv import ODESurvSingle as ODESurvSingleNatalia
from CPRD.src.modules.head_layers.survival.desurv import ODESurvMultiple as ODESurvMultipleNatalia

from CPRD.src.modules.head_layers.survival.desurv_original import ODESurvSingle as ODESurvSingleOriginal1
from CPRD.src.modules.head_layers.survival.desurv_original import ODESurvMultiple as ODESurvMultipleOriginal1

from DeSurv.src.classes import ODESurvSingle as ODESurvSingleOriginal2
from DeSurv.src.classes import ODESurvMultiple as ODESurvMultipleOriginal2


torch.manual_seed(1337)
logging.basicConfig(level=logging.INFO)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = "cpu"    # if more informative debugging statements are needed
print(f"Using device: {device}.")

Using device: cuda.


# Load data

In [4]:
dataset = "CVD"
competing_risk = True

# the time grid which we generate over
t_eval = np.linspace(0, 1, 1000) 
# the time grid which we calculate scores over
time_grid = np.linspace(start=0, stop=1 , num=300)


match dataset.lower():
    case "pycox":
        df_train = support.read_df()
        df_test = df_train.sample(frac=0.2)
        df_train = df_train.drop(df_test.index)
        df_val = df_train.sample(frac=0.2)
        df_train = df_train.drop(df_val.index)
        
        cols_standardize = ['x0', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
        cols_leave = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']
        
        standardize = [([col], StandardScaler()) for col in cols_standardize]
        leave = [(col, None) for col in cols_leave]
        
        x_mapper = DataFrameMapper(standardize + leave)
        
        x_train = x_mapper.fit_transform(df_train).astype('float32')
        x_val = x_mapper.transform(df_val).astype('float32')
        x_test = x_mapper.transform(df_test).astype('float32')
        
        get_target = lambda df: (df['duration'].values, df['event'].values)
        y_train = get_target(df_train)
        y_val = get_target(df_val)
        y_test = get_target(df_test)
        
        t_train, e_train = y_train
        t_val, e_val = y_val
        t_test, e_test = y_test
        
        t_train_max = np.amax(t_train)
        t_train = t_train / t_train_max
        t_val = t_val / t_train_max
        t_test = t_test / t_train_max
        

    case "hypertension" | "cvd":
        
        with open(f'/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/FoundationalModel/FineTune_{dataset}/CrossSectionalData.pickle', "rb") as handle:
            data = pickle.load(handle)
        
        # display(data["X_train"].head())
        # display(data["y_train"])
        print(data.keys())
        
        subset = True
        if subset:
            data["X_train"] = data["X_train"][:6000]
            data["y_train"] = data["y_train"][:6000]
            data["X_val"] = data["X_val"][:10000]
            data["y_val"] = data["y_val"][:10000]
            data["X_test"] = data["X_test"][:50000]
            data["y_test"] = data["y_test"][:50000]


        x_train = data["X_train"].to_numpy(dtype=np.float32)
        x_val = data["X_val"].to_numpy(dtype=np.float32)
        x_test = data["X_test"].to_numpy(dtype=np.float32)
        
        t_train = np.asarray([i[1] for i in data["y_train"]])
        t_val = np.asarray([i[1] for i in data["y_val"]])        
        t_test = np.asarray([i[1] for i in data["y_test"]])

        if competing_risk is False:
            e_train = np.asarray([0 if i[0] == 0 else 1 for i in data["y_train"]])
            e_val = np.asarray([0 if i[0] == 0 else 1 for i in data["y_val"]])
            e_test = np.asarray([0 if i[0] == 0 else 1 for i in data["y_test"]])
        else:
            e_train = np.asarray([i[0] for i in data["y_train"]])
            e_val = np.asarray([i[0] for i in data["y_val"]])
            e_test = np.asarray([i[0] for i in data["y_test"]])

batch_size = 32
dataset_train = TensorDataset(*[torch.tensor(u,dtype=dtype_) for u, dtype_ in [(x_train,torch.float32),
                                                                               (t_train,torch.float32),
                                                                               (e_train,torch.long)]])
data_loader_train = DataLoader(dataset_train, batch_size=batch_size, pin_memory=True, shuffle=True, drop_last=True)

dataset_val = TensorDataset(*[torch.tensor(u,dtype=dtype_) for u, dtype_ in [(x_val,torch.float32),
                                                                               (t_val,torch.float32),
                                                                               (e_val,torch.long)]])
data_loader_val = DataLoader(dataset_val, batch_size=batch_size, pin_memory=True, shuffle=True)

dataset_test = TensorDataset(*[torch.tensor(u,dtype=dtype_) for u, dtype_ in [(x_test,torch.float32),
                                                                               (t_test,torch.float32),
                                                                               (e_test,torch.long)]])
data_loader_test = DataLoader(dataset_test, batch_size=batch_size, pin_memory=True, shuffle=True)

display(x_train.shape)
display(type(x_train))
display(type(x_train[0,0]))
display(e_train.shape)
display(type(e_train))
display(type(e_train[0]))
display(t_train.shape)
display(type(t_train))
display(type(t_train[0]))

print(np.mean(e_train))
print(np.mean(t_train))
print(np.std(t_train))
print(np.mean(x_train))
print(t_train.min())
print(t_train.max())

print(np.unique(e_test, return_counts=True))

data.keys()


dict_keys(['X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test'])


(6000, 279)

numpy.ndarray

numpy.float32

(6000,)

numpy.ndarray

numpy.int64

(6000,)

numpy.ndarray

numpy.float64

0.2921666666666667
1.5458627448876698
1.080586215953429
0.1262559
0.0005474090576171875
4.402191162109375
(array([0, 1, 2, 3, 4, 5]), array([31472,  1956,   373,   895,   942,   120]))


dict_keys(['X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test'])

In [5]:
# def divide_chunks(array, n):
#     for i in range(0, array.shape[0], n): 
#         yield array[i:i + n, :]

# Train model

In [6]:
hidden_dim = 32
training = True
lr = 1e-3
xdim = x_train.shape[1]

models, model_names = [], []
if competing_risk is False:
    models.append(ODESurvSingleNatalia(xdim, 32, device=device))
    model_names.append(f"{dataset}_sr_natalia_")
    models.append(ODESurvSingleOriginal1(xdim, hidden_dim, device=device))
    model_names.append(f"{dataset}_sr_original1_")
    models.append(ODESurvSingleOriginal2(lr, xdim, hidden_dim, device=device))
    model_names.append(f"{dataset}_sr_original2_")
else:
    models.append(ODESurvMultipleNatalia(xdim, [32, 32], num_risks=6))
    model_names.append(f"{dataset}_cr_natalia_")
    models.append(ODESurvMultipleOriginal1(xdim, hidden_dim, num_risks=6))
    model_names.append(f"{dataset}_cr_original1_")
    models.append(ODESurvMultipleOriginal2(lr, xdim, hidden_dim, num_risks=6))
    model_names.append(f"{dataset}_cr_original2_")

for model_name, model in zip(model_names, models):

    print(f"\n\n{model_name} with {sum(p.numel() for p in model.parameters() if p.requires_grad)} parameters")
    
    if training:
        print(f"Training")
        model.optimize(data_loader_train, n_epochs=20, logging_freq=1, data_loader_val=data_loader_val, max_wait=2)
        print("finished training")
        torch.save(model.state_dict(), model_name + "tst_model")
        model.eval()

    print(f"Testing")    
    state_dict = torch.load(model_name + "tst_model")
    model.load_state_dict(state_dict)
    model.eval()
    
    argsortttest = np.argsort(t_test)
    t_test = t_test[argsortttest]
    e_test = e_test[argsortttest]
    x_test = x_test[argsortttest,:]
    
    with torch.no_grad():

        # The normalised grid over which to predict
        t_test_grid = torch.tensor(np.concatenate([t_eval] * x_test.shape[0], 0), dtype=torch.float32)
        x_test_grid = torch.tensor(x_test, dtype=torch.float32).repeat_interleave(t_eval.size, 0)
        
        pred_bsz = 51200
        pred = []
        for x_test_batched, t_test_batched in tqdm(zip(torch.split(x_test_grid, pred_bsz), torch.split(t_test_grid, pred_bsz)),
                                                   total=(x_test_grid.shape[0]//pred_bsz)+1 ):
            
            if competing_risk is False:
                pred_ = model.predict(x_test_batched, t_test_batched)          # shape: (x_test.batched.shape[0],)
            else:
                pred_, pi_  = model.predict(x_test_batched, t_test_batched)    # shape: (x_test.batched.shape[0], num_outcomes)
                # pred_ = pred_ #* pi_                  # \sum_k={1,...,K} not \emptyset F_k
            pred.append(pred_)
                
        pred = torch.concat(pred)
    
        pred = pred.reshape((x_test.shape[0], t_eval.size, -1)).cpu().detach().numpy()
        preds = [pred[:, :, _i] for _i in range(pred.shape[-1])]
        # print([_.shape for _ in preds])

        # Merge (additively) each outcome risk curve into a single CDF, and update label for if outcome occurred or not
        cdf = np.zeros_like(preds[0])
        lbls = np.zeros_like(e_test)     
        for _outcome_token in np.unique(e_test)[1:]:
            print(f"{_outcome_token} of {np.unique(e_test)[1:]} included from {len(preds)} surv CDFs")
            print(_outcome_token)
            cdf += preds[_outcome_token - 1] 
            lbls += (e_test == _outcome_token)
        
        surv = pd.DataFrame(np.transpose((1 - cdf.reshape((x_test.shape[0],t_eval.size)))), index=t_eval)

    # Evaluate surv curve with unscaled index with unscaled test times to event 
    print("Evaluating survival metrics")
    ev = EvalSurv(surv, t_test, lbls, censor_surv='km')
    
    print(ev.concordance_td())
    print(ev.integrated_brier_score(time_grid))
    print(ev.integrated_nbll(time_grid))

INFO:root:original


CondODENet: cpu specified, cpu used
FCNet: cpu specified, cpu used


CVD_cr_natalia_ with 20460 parameters
Training
	Epoch:  0. Total loss:     4626.13
best_epoch: 0
	Epoch:  0. Total val loss:     5049.69
	Epoch:  1. Total loss:     3767.31
best_epoch: 1
	Epoch:  1. Total val loss:     4937.90
	Epoch:  2. Total loss:     3674.22
	Epoch:  2. Total val loss:     4944.79
	Epoch:  3. Total loss:     3583.28
best_epoch: 3
	Epoch:  3. Total val loss:     4906.40
	Epoch:  4. Total loss:     3522.92
	Epoch:  4. Total val loss:     4930.55
	Epoch:  5. Total loss:     3447.98
	Epoch:  5. Total val loss:     5018.83
	Epoch:  6. Total loss:     3377.67
finished training
Testing


100%|██████████| 699/699 [08:11<00:00,  1.42it/s]


1 of [1 2 3 4 5] included from 6 surv CDFs
1
2 of [1 2 3 4 5] included from 6 surv CDFs
2
3 of [1 2 3 4 5] included from 6 surv CDFs
3
4 of [1 2 3 4 5] included from 6 surv CDFs
4
5 of [1 2 3 4 5] included from 6 surv CDFs
5
Evaluating survival metrics
0.6092955801705897
0.0339398728091127
0.14779968027386553


CVD_cr_original1_ with 20460 parameters
Training
	Epoch:  0. Total loss:     5169.06
best_epoch: 0
	Epoch:  0. Total val loss:     5072.99
	Epoch:  1. Total loss:     3753.11
best_epoch: 1
	Epoch:  1. Total val loss:     4954.16
	Epoch:  2. Total loss:     3665.37
	Epoch:  2. Total val loss:     4996.64
	Epoch:  3. Total loss:     3604.69
	Epoch:  3. Total val loss:     4975.74
	Epoch:  4. Total loss:     3523.51
finished training
Testing


100%|██████████| 699/699 [05:17<00:00,  2.20it/s]


1 of [1 2 3 4 5] included from 6 surv CDFs
1
2 of [1 2 3 4 5] included from 6 surv CDFs
2
3 of [1 2 3 4 5] included from 6 surv CDFs
3
4 of [1 2 3 4 5] included from 6 surv CDFs
4
5 of [1 2 3 4 5] included from 6 surv CDFs
5
Evaluating survival metrics
0.5891815191970715
0.034083698717162145
0.14782213495611307


CVD_cr_original2_ with 20460 parameters
Training


  input = module(input)


	Epoch:  0. Total loss:     5313.00
best_epoch: 0
	Epoch:  0. Total val loss:     5152.35
	Epoch:  1. Total loss:     3790.26
best_epoch: 1
	Epoch:  1. Total val loss:     4961.07
	Epoch:  2. Total loss:     3667.32
	Epoch:  2. Total val loss:     5001.15
	Epoch:  3. Total loss:     3589.61
best_epoch: 3
	Epoch:  3. Total val loss:     4932.00
	Epoch:  4. Total loss:     3525.06
	Epoch:  4. Total val loss:     4961.14
	Epoch:  5. Total loss:     3442.06
	Epoch:  5. Total val loss:     5127.22
	Epoch:  6. Total loss:     3400.03
finished training
Testing


100%|██████████| 699/699 [00:47<00:00, 14.60it/s]


1 of [1 2 3 4 5] included from 6 surv CDFs
1
2 of [1 2 3 4 5] included from 6 surv CDFs
2
3 of [1 2 3 4 5] included from 6 surv CDFs
3
4 of [1 2 3 4 5] included from 6 surv CDFs
4
5 of [1 2 3 4 5] included from 6 surv CDFs
5
Evaluating survival metrics
0.6057716222956506
0.034037456172092855
0.14743363247370506


In [6]:
# 35758000 / (1000 * 1)
# print(pred.reshape(x_test.shape[0], 1000,-1).shape)
# print(x_test.shape)
# pred = np.concatenate(preds, 0)             # n_test * t_eval.size(), num_risks
# pred = pred.reshape(x_test.shape[0], 1000,-1)
# print(pred.shape)
# print([_p.shape for _p in preds])

In [7]:
display(t_eval.max())

display(t_test.max()*5*365)

print(ev.concordance_td())
print(ev.integrated_brier_score(t_eval))
print(ev.integrated_nbll(t_eval))



1.0

8034.000611305237

0.6057716222956506
0.034038439349878824
0.1474335340266184
