# Random Survival Forest benchmark

In [1]:
import os
from pathlib import Path
import sys
node_type = os.getenv('BB_CPU')
venv_dir = f'/rds/homes/g/gaddcz/Projects/CPRD/virtual-envTorch2.0-{node_type}'
venv_site_pkgs = Path(venv_dir) / 'lib' / f'python{sys.version_info.major}.{sys.version_info.minor}' / 'site-packages'
if venv_site_pkgs.exists():
    sys.path.insert(0, str(venv_site_pkgs))
    print(f"Added path '{venv_site_pkgs}' at start of search paths.")
else:
    print(f"Path '{venv_site_pkgs}' not found. Check that it exists and/or that it exists for node-type '{node_type}'.")

%load_ext autoreload
%autoreload 2

Added path '/rds/homes/g/gaddcz/Projects/CPRD/virtual-envTorch2.0-icelake/lib/python3.10/site-packages' at start of search paths.


In [2]:
import pytorch_lightning
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import sqlite3
from dataclasses import dataclass
import logging
from CPRD.data.foundational_loader import FoundationalDataModule
import pickle 
from tqdm import tqdm

from pycox.datasets import support
from pycox.evaluation import EvalSurv
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper
from torch.utils.data import TensorDataset, DataLoader

from sksurv.ensemble import RandomSurvivalForest

torch.manual_seed(1337)
logging.basicConfig(level=logging.INFO)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = "cpu"    # if more informative debugging statements are needed
print(f"Using device: {device}.")

Using device: cuda.


# Load data

In [3]:
dataset = "Hypertension"

# the time grid which we generate over
t_eval = np.linspace(0, 1, 1000) 
# the time grid which we calculate scores over
time_grid = np.linspace(start=0, stop=1 , num=300)

competing_risk = False

match dataset.lower():
    case "pycox":
        df_train = support.read_df()
        df_test = df_train.sample(frac=0.2)
        df_train = df_train.drop(df_test.index)
        df_val = df_train.sample(frac=0.2)
        df_train = df_train.drop(df_val.index)
        
        cols_standardize = ['x0', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
        cols_leave = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']
        
        standardize = [([col], StandardScaler()) for col in cols_standardize]
        leave = [(col, None) for col in cols_leave]
        
        x_mapper = DataFrameMapper(standardize + leave)
        
        x_train = x_mapper.fit_transform(df_train).astype('float32')
        x_val = x_mapper.transform(df_val).astype('float32')
        x_test = x_mapper.transform(df_test).astype('float32')
        
        get_target = lambda df: (df['duration'].values, df['event'].values)
        y_train = get_target(df_train)
        y_val = get_target(df_val)
        y_test = get_target(df_test)
        
        t_train, e_train = y_train
        t_val, e_val = y_val
        t_test, e_test = y_test
        
        t_train_max = np.amax(t_train)
        t_train = t_train / t_train_max
        t_val = t_val / t_train_max
        t_test = t_test / t_train_max
        

    case "hypertension" | "cvd":
        
        with open(f'/rds/projects/g/gokhalkm-optimal/OPTIMAL_MASTER_DATASET/data/FoundationalModel/FineTune_{dataset}/CrossSectionalData.pickle', "rb") as handle:
            data = pickle.load(handle)
        
        # display(data["X_train"].head())
        # display(data["y_train"])
        print(data.keys())
        
        subset = True
        if subset:
            data["X_train"] = data["X_train"]#[:10000]
            data["y_train"] = data["y_train"]#[:10000]
            data["X_val"] = data["X_val"][:10000]
            data["y_val"] = data["y_val"][:10000]
            data["X_test"] = data["X_test"][:50000]
            data["y_test"] = data["y_test"][:50000]


        x_train = data["X_train"].to_numpy(dtype=np.float32)
        x_val = data["X_val"].to_numpy(dtype=np.float32)
        x_test = data["X_test"].to_numpy(dtype=np.float32)
        
        t_train = np.asarray([i[1] for i in data["y_train"]])
        t_val = np.asarray([i[1] for i in data["y_val"]])        
        t_test = np.asarray([i[1] for i in data["y_test"]])

        if competing_risk is False:
            e_train = np.asarray([0 if i[0] == 0 else 1 for i in data["y_train"]])
            e_val = np.asarray([0 if i[0] == 0 else 1 for i in data["y_val"]])
            e_test = np.asarray([0 if i[0] == 0 else 1 for i in data["y_test"]])
        else:
            e_train = np.asarray([i[0] for i in data["y_train"]])
            e_val = np.asarray([i[0] for i in data["y_val"]])
            e_test = np.asarray([i[0] for i in data["y_test"]])

# batch_size = 32
# dataset_train = TensorDataset(*[torch.tensor(u,dtype=dtype_) for u, dtype_ in [(x_train,torch.float32),
#                                                                                (t_train,torch.float32),
#                                                                                (e_train,torch.long)]])
# data_loader_train = DataLoader(dataset_train, batch_size=batch_size, pin_memory=True, shuffle=True, drop_last=True)

# dataset_val = TensorDataset(*[torch.tensor(u,dtype=dtype_) for u, dtype_ in [(x_val,torch.float32),
#                                                                                (t_val,torch.float32),
#                                                                                (e_val,torch.long)]])
# data_loader_val = DataLoader(dataset_val, batch_size=batch_size, pin_memory=True, shuffle=True)

# dataset_test = TensorDataset(*[torch.tensor(u,dtype=dtype_) for u, dtype_ in [(x_test,torch.float32),
#                                                                                (t_test,torch.float32),
#                                                                                (e_test,torch.long)]])
# data_loader_test = DataLoader(dataset_test, batch_size=batch_size, pin_memory=True, shuffle=True)

display(x_train.shape)
display(type(x_train))
display(type(x_train[0,0]))
display(e_train.shape)
display(type(e_train))
display(type(e_train[0]))
display(t_train.shape)
display(type(t_train))
display(type(t_train[0]))

print(np.mean(e_train))
print(np.mean(t_train))
print(np.std(t_train))
print(np.mean(x_train))
print(t_train.min())
print(t_train.max())

print(np.unique(e_test, return_counts=True))

data.keys()


dict_keys(['X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test'])


(572096, 279)

numpy.ndarray

numpy.float32

(572096,)

numpy.ndarray

numpy.int64

(572096,)

numpy.ndarray

numpy.float64

0.21102227598165343
1.3744715215274252
1.0762837222632102
0.12590238
0.0005474090576171875
4.402192115783691
(array([0, 1]), array([28325,  7433]))


dict_keys(['X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test'])

In [4]:
print(type(e_train))

y_train = np.array([(_yk, _yt) for _yk, _yt in zip(e_train, t_train)],
                   dtype=[('cens', 'bool'), ('time', '<f8')])
print(y_train[:5])

y_test = np.array([(_yk, _yt) for _yk, _yt in zip(e_test, t_test)],
                   dtype=[('cens', 'bool'), ('time', '<f8')])
print(y_test[:5])
# display(Xt.head())

<class 'numpy.ndarray'>
[(False, 1.96876717) (False, 2.2164402 ) (False, 0.77205467)
 (False, 1.29315042) (False, 0.07013702)]
[( True, 1.11616468) (False, 0.34082222) (False, 2.12383461)
 (False, 1.8065753 ) (False, 0.76219177)]


# Train model

In [5]:
random_state = 20

# X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25, random_state=random_state)

rsf = RandomSurvivalForest(
    # n_estimators=100, min_samples_split=10, min_samples_leaf=15, n_jobs=-1, 
    bootstrap=True, max_samples=1000,    
    random_state=random_state, low_memory=False
)
rsf.fit(x_train, y_train)


In [6]:
# rsf.score(x_test, y_test)

In [7]:
display(x_test.shape)
rsf.unique_times_ = np.linspace(0, 1, 1000) 
surv = rsf.predict_survival_function(x_test, return_array=True)

# t_eval = np.linspace(0, 1, 1000) 
# surv = surv_funcs[0](t_eval)
# for fn in surv_funcs:
#    plt.step(t_eval, fn(t_eval), where="post")
    
display(surv)

(35758, 279)

array([[1.        , 1.        , 0.99991667, ..., 0.45752568, 0.45752568,
        0.45752568],
       [1.        , 1.        , 1.        , ..., 0.8550001 , 0.8550001 ,
        0.8550001 ],
       [1.        , 1.        , 1.        , ..., 0.75860176, 0.75860176,
        0.75860176],
       ...,
       [0.99971429, 0.99871429, 0.99871429, ..., 0.68479718, 0.68479718,
        0.68479718],
       [1.        , 0.99857143, 0.99703734, ..., 0.53051778, 0.53051778,
        0.53051778],
       [1.        , 1.        , 0.998     , ..., 0.69425414, 0.69425414,
        0.69425414]])

In [8]:
for fn in surv_funcs:
   plt.step(fn.x, fn(fn.x), where="post")

NameError: name 'surv_funcs' is not defined

In [None]:
surv = pd.DataFrame(np.transpose((1 - cdf.reshape((x_test.shape[0],t_eval.size)))), index=t_eval)

# Evaluate surv curve with unscaled index with unscaled test times to event 
print("Evaluating survival metrics")
ev = EvalSurv(surv, t_test, lbls, censor_surv='km')

print(ev.concordance_td())
print(ev.integrated_brier_score(time_grid))
print(ev.integrated_nbll(time_grid))