In [1]:
import os
# Move the working directory two levels up (presumably to the project root,
# so that the relative 'data/...' and 'model/...' paths used below resolve --
# confirm against the repository layout).
# os.chdir() returns None, so the target directory is computed first and
# stored in `main`; previously `main` was always None.
# NOTE: this cell is not idempotent -- running it again climbs two more
# levels. Restart the kernel before re-running it.
main = os.path.dirname(os.path.dirname(os.getcwd()))
os.chdir(main)

In [2]:
import json
import numpy as np
import pandas as pd

from model import model_architecture, output_results, utils
from pycox.models import CoxTime
from pycox.models.cox_time import MLPVanillaCoxTime
from sksurv.util import Surv as skSurv
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

Using TensorFlow backend.


# I. METABRIC data

We use the METABRIC data set, with clinical and pathological information and gene expression data. Missing values of the explanatory variables have been imputed. The data are first split into 5 folds for the double 5-fold cross-validation. The same 5 folds are used for all the models and are stored in the file "folds.json".

In [3]:
# Model identifier. In the training cell it selects the hyper-parameter file
# ('model/best_param_metabric/<name>_best_param.csv'), chooses the label
# transform branch, and is passed on to model_architecture.build_model.
name = "CoxTime"

In [4]:
# Load the imputed METABRIC table. The training cell uses the 'yy' and
# 'status' columns to build the training target and treats every other
# column as an input feature.
df = pd.read_csv("data/real_data/metabric_imputed.csv")

In [5]:
# Read the stored cross-validation folds. The training cell iterates over
# kfolds.items() and reads the 'train' and 'test' row positions of each fold.
with open('data/real_data/folds.json', 'r') as folds_file:
    kfolds = json.load(folds_file)

# II. Model's construction and training

The architecture parameters are those listed in the parameters dataframe, selected by a double 5-fold cross-validation among 400 sets of parameters. There are 5 sets of parameters, one for each fold of the outer loop.

In [6]:
# Outer loop of the double cross-validation: for every fold stored in
# `kfolds`, fit the network on the fold's 'train' rows with the
# hyper-parameters selected for that fold, then score it on the 'test' rows.
# The per-fold scores end up in `results_all`, one row per fold.

# The hyper-parameter table does not depend on the fold, so it is read once.
# Its first CSV column is the index and is looked up by fold name below.
param = pd.read_csv('model/best_param_metabric/'+name+'_best_param.csv', sep = ";",index_col=0)

# 'yy' and 'status' form the training target (see y_train below); every other
# column is an input feature.
list_all_cols = df.drop(columns = ['yy','status']).columns.to_list()
# Features passed through unscaled; all remaining features are standardised.
cols_leave = ['N', 'grade1','grade2', 'hormonotherapy', 'chemotherapy']
cols_standardize = [col for col in list_all_cols if col not in cols_leave]

# One record per fold, assembled into a DataFrame after the loop.
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
fold_results = []

for kname, val in kfolds.items():
    train_index = val['train']
    val_index = val['test']

    df_train, df_val = df.iloc[train_index], df.iloc[val_index]

    # Fresh scalers for each fold: fitted on the training rows only and then
    # applied unchanged to the held-out rows.
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    leave = [(col, None) for col in cols_leave]
    df_mapper = DataFrameMapper(standardize + leave, df_out = True)

    X_train = df_train.drop(["yy","status"], axis = 1)
    X_train = df_mapper.fit_transform(X_train).astype('float32')
    X_val = df_val.drop(["yy","status"], axis = 1)
    X_val = df_mapper.transform(X_val).astype('float32')

    if name=="DeepHit":
        # DeepHitSingle was used here without being imported anywhere, so this
        # branch raised NameError. pycox is already a dependency of the notebook.
        from pycox.models import DeepHitSingle
        num_durations = 10
        labtrans = DeepHitSingle.label_transform(num_durations)
        y_train = labtrans.fit_transform(df_train['yy'].values, df_train['status'].values)
    elif name=="CoxTime":
        labtrans = CoxTime.label_transform()
        y_train = labtrans.fit_transform(df_train['yy'].values, df_train['status'].values)
    else:
        # No label transform for the other models: the raw columns are used.
        labtrans=""
        y_train = (df_train['yy'].values, df_train['status'].values)

    model,callbacks  = model_architecture.build_model(np.array(X_train),
                               param['neurons'][kname],
                               param['dropout'][kname],
                               param['activation'][kname],
                               param['lr'][kname],
                               param['optimizer'][kname],
                               param['n_layers'][kname],
                               name,
                               labtrans)
    log = model.fit(np.array(X_train),
                    y_train,
                    int(param['batch_size'][kname]),
                    epochs=100,
                    callbacks = callbacks,
                    verbose = True)
    # NOTE(review): compute_baseline_hazards is called for every model name;
    # confirm that the object returned by build_model provides it in all cases.
    _ = model.compute_baseline_hazards()
    surv = model.predict_surv_df(np.array(X_val))
    res = output_results.output_stats(model,surv,X_train, df_train, X_val, df_val)
    fold_results.append(res)

# Assumes output_stats returns one dict (or Series) per fold keyed by these
# metric names -- TODO confirm against model/output_results.
results_all = pd.DataFrame(fold_results, columns=['unoc5', 'unoc10','ibs'])

0:	[0s / 0s],		train_loss: 0.7810
1:	[0s / 0s],		train_loss: 0.7401
2:	[0s / 0s],		train_loss: 0.7148
3:	[0s / 0s],		train_loss: 0.6741
4:	[0s / 0s],		train_loss: 0.7285
5:	[0s / 0s],		train_loss: 0.7105
6:	[0s / 0s],		train_loss: 0.6923
7:	[0s / 0s],		train_loss: 0.7013
8:	[0s / 0s],		train_loss: 0.6707
9:	[0s / 0s],		train_loss: 0.7168
10:	[0s / 0s],		train_loss: 0.6702
11:	[0s / 0s],		train_loss: 0.6574
12:	[0s / 0s],		train_loss: 0.6816
13:	[0s / 0s],		train_loss: 0.6727
14:	[0s / 0s],		train_loss: 0.6707
15:	[0s / 0s],		train_loss: 0.6437
16:	[0s / 0s],		train_loss: 0.6938
17:	[0s / 0s],		train_loss: 0.7307
18:	[0s / 0s],		train_loss: 0.6799
19:	[0s / 0s],		train_loss: 0.6184
20:	[0s / 0s],		train_loss: 0.6557
21:	[0s / 0s],		train_loss: 0.6109
22:	[0s / 0s],		train_loss: 0.6171
23:	[0s / 0s],		train_loss: 0.6487
24:	[0s / 1s],		train_loss: 0.6047
25:	[0s / 1s],		train_loss: 0.6313
26:	[0s / 1s],		train_loss: 0.6088
27:	[0s / 1s],		train_loss: 0.6141
28:	[0s / 1s],		train_loss: 0.

37:	[0s / 1s],		train_loss: 0.5462
38:	[0s / 1s],		train_loss: 0.5496
39:	[0s / 1s],		train_loss: 0.5523
40:	[0s / 1s],		train_loss: 0.5355
41:	[0s / 1s],		train_loss: 0.5295
42:	[0s / 1s],		train_loss: 0.5064
43:	[0s / 1s],		train_loss: 0.5762
44:	[0s / 1s],		train_loss: 0.5794
45:	[0s / 1s],		train_loss: 0.5473
46:	[0s / 1s],		train_loss: 0.5558
47:	[0s / 1s],		train_loss: 0.5703
48:	[0s / 1s],		train_loss: 0.5890
49:	[0s / 1s],		train_loss: 0.5466
50:	[0s / 1s],		train_loss: 0.5906
51:	[0s / 1s],		train_loss: 0.5661
52:	[0s / 1s],		train_loss: 0.5420
53:	[0s / 1s],		train_loss: 0.5423
54:	[0s / 1s],		train_loss: 0.5500
55:	[0s / 1s],		train_loss: 0.5302
56:	[0s / 1s],		train_loss: 0.5734
57:	[0s / 1s],		train_loss: 0.5584
58:	[0s / 1s],		train_loss: 0.5482
59:	[0s / 2s],		train_loss: 0.5422
60:	[0s / 2s],		train_loss: 0.5509
61:	[0s / 2s],		train_loss: 0.5801
62:	[0s / 2s],		train_loss: 0.5972
63:	[0s / 2s],		train_loss: 0.5513
64:	[0s / 2s],		train_loss: 0.5704
65:	[0s / 2s],		trai

71:	[0s / 2s],		train_loss: 0.2909
72:	[0s / 3s],		train_loss: 0.3255
73:	[0s / 3s],		train_loss: 0.3083
74:	[0s / 3s],		train_loss: 0.3235
75:	[0s / 3s],		train_loss: 0.2946
76:	[0s / 3s],		train_loss: 0.2974
77:	[0s / 3s],		train_loss: 0.3294
78:	[0s / 3s],		train_loss: 0.3050
79:	[0s / 3s],		train_loss: 0.2987
80:	[0s / 3s],		train_loss: 0.3034
81:	[0s / 3s],		train_loss: 0.3355
82:	[0s / 3s],		train_loss: 0.3165
83:	[0s / 3s],		train_loss: 0.2826
84:	[0s / 3s],		train_loss: 0.3463
85:	[0s / 3s],		train_loss: 0.3384
86:	[0s / 3s],		train_loss: 0.3039
87:	[0s / 3s],		train_loss: 0.3136
88:	[0s / 3s],		train_loss: 0.2895
89:	[0s / 3s],		train_loss: 0.2938
90:	[0s / 3s],		train_loss: 0.3076
91:	[0s / 3s],		train_loss: 0.2752
92:	[0s / 3s],		train_loss: 0.2613
93:	[0s / 3s],		train_loss: 0.2473
94:	[0s / 3s],		train_loss: 0.2995
95:	[0s / 3s],		train_loss: 0.2943
96:	[0s / 4s],		train_loss: 0.2919
97:	[0s / 4s],		train_loss: 0.2935
98:	[0s / 4s],		train_loss: 0.2784
99:	[0s / 4s],		trai

# III. Results

We output Uno's C-index at 5 and 10 years and the integrated Brier score (IBS), as the mean and standard deviation over the 5 folds.

In [7]:
# Summarise the per-fold scores: one row per metric, with its mean and its
# population standard deviation (ddof=0) over the folds.
mean_all = results_all.mean(axis=0)
std_all = results_all.std(axis=0, ddof=0)
# The dict keys become the column names of the summary table.
measures = pd.concat({'mean': mean_all, 'std': std_all}, axis=1)
measures

Unnamed: 0,mean,std
unoc5,0.667228,0.039833
unoc10,0.639136,0.031863
ibs,0.200904,0.022129
