In [1]:
import os
# Move the working directory to the project root (two levels above the
# directory the notebook is started from) so that the relative paths used
# further down ("data/real_data/...", "model/best_param_metabric/...") and the
# `from model import ...` statement resolve.
# The guard makes the cell idempotent: without it, re-running the cell climbs
# two more levels on every execution and every later relative path breaks.
if not os.path.isdir(os.path.join("data", "real_data")):
    os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
# os.chdir() returns None, so bind the resulting working directory instead of
# the call's return value.
main = os.getcwd()

In [2]:
import json
import numpy as np
import pandas as pd

from model import model_architecture, output_results, utils
from pycox.models import DeepHitSingle
from sksurv.util import Surv as skSurv
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

Using TensorFlow backend.


# I. METABRIC data

We use the METABRIC data set, with clinical and pathological information and gene expression data. Missing values of the explanatory variables have been imputed. The data are first split into 5 folds for the double 5-fold cross-validation. The same 5 folds are used for all the models and are stored in the file "folds.json".

In [3]:
# Model identifier: used below to build the path of the best-parameter CSV
# and to choose which label transform is applied to the training target.
name = "DeepHit"

In [4]:
# Load the METABRIC table. The 'yy' and 'status' columns are dropped from the
# feature matrix further down and used together as the training target.
df = pd.read_csv("data/real_data/metabric_imputed.csv")

In [5]:
# Read the fold definitions from disk; each entry is indexed with 'train' and
# 'test' in the training loop to obtain the row positions of that fold.
with open('data/real_data/folds.json') as folds_file:
    kfolds = json.load(folds_file)

# II. Model's construction and training

The architecture parameters are those listed in the parameters dataframe, selected by a double 5-fold cross-validation among 400 sets of parameters. There are 5 sets of parameters, one for each fold of the outer loop.

In [6]:
# Outer cross-validation loop: for every fold in `kfolds`, fit the model
# selected by `name` on the fold's 'train' rows using that fold's row of the
# best-parameter table, predict survival curves for the fold's 'test' rows and
# collect whatever `output_results.output_stats` returns for the fold.

result_cols = ['unoc5', 'unoc10', 'ibs']
target_cols = ['yy', 'status']
# Features passed through unchanged; every other feature is standardized.
cols_leave = ['N', 'grade1', 'grade2', 'hormonotherapy', 'chemotherapy']

# Loop-invariant work, done once instead of once per fold.
param = pd.read_csv('model/best_param_metabric/' + name + '_best_param.csv', sep=";", index_col=0)
list_all_cols = df.drop(columns=target_cols).columns.to_list()
cols_standardize = [col for col in list_all_cols if col not in cols_leave]

# Per-fold results are gathered in a list and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
fold_results = []

for kname, val in kfolds.items():
    train_index = val['train']
    val_index = val['test']
    df_train, df_val = df.iloc[train_index], df.iloc[val_index]

    # A fresh mapper per fold: the scalers are fitted on the training rows
    # only and then applied, unchanged, to the held-out rows.
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    leave = [(col, None) for col in cols_leave]
    df_mapper = DataFrameMapper(standardize + leave, df_out=True)

    X_train = df_mapper.fit_transform(df_train.drop(columns=target_cols)).astype('float32')
    X_val = df_mapper.transform(df_val.drop(columns=target_cols)).astype('float32')

    # Training target: ('yy', 'status') values, optionally passed through the
    # model-specific label transform.
    durations = df_train['yy'].values
    events = df_train['status'].values
    if name == "DeepHit":
        num_durations = 10
        labtrans = DeepHitSingle.label_transform(num_durations)
        y_train = labtrans.fit_transform(durations, events)
    elif name == "CoxTime":
        # CoxTime is not imported at the top of the notebook, so this branch
        # used to fail with a NameError; import it where it is needed.
        from pycox.models import CoxTime
        labtrans = CoxTime.label_transform()
        y_train = labtrans.fit_transform(durations, events)
    else:
        # No label transform for the other models; build_model receives an
        # empty string in that case.
        labtrans = ""
        y_train = (durations, events)

    x_train_array = np.array(X_train)
    model, callbacks = model_architecture.build_model(x_train_array,
                                                      param['neurons'][kname],
                                                      param['dropout'][kname],
                                                      param['activation'][kname],
                                                      param['lr'][kname],
                                                      param['optimizer'][kname],
                                                      param['n_layers'][kname],
                                                      name,
                                                      labtrans)

    # NOTE(review): the output saved with this cell shows the training loss
    # jumping from about 0.4 to above 40 late in at least one run, and no
    # validation data is passed to fit() here. Confirm that the callbacks
    # returned by build_model stop or restore the model before it is used
    # for prediction.
    log = model.fit(x_train_array,
                    y_train,
                    int(param['batch_size'][kname]),
                    epochs=100,
                    callbacks=callbacks,
                    verbose=True)

    surv = model.predict_surv_df(np.array(X_val))
    res = output_results.output_stats(model, surv, X_train, df_train, X_val, df_val)
    # A dict or Series becomes one row; anything else (e.g. a DataFrame) is
    # taken as already row-shaped, which mirrors what DataFrame.append accepted.
    if isinstance(res, (dict, pd.Series)):
        fold_results.append(pd.DataFrame([res]))
    else:
        fold_results.append(pd.DataFrame(res))

if fold_results:
    results_all = pd.concat(fold_results, ignore_index=True, sort=False)
    # Keep the expected metric columns first (and present even if a fold did
    # not report them), followed by any extra columns returned by output_stats.
    extra_cols = [col for col in results_all.columns if col not in result_cols]
    results_all = results_all.reindex(columns=result_cols + extra_cols)
else:
    # No folds: keep the empty, correctly-labelled frame the cell used to start from.
    results_all = pd.DataFrame(columns=result_cols)

0:	[0s / 0s],		train_loss: 0.6875
1:	[0s / 0s],		train_loss: 0.5881
2:	[0s / 0s],		train_loss: 0.5250
3:	[0s / 0s],		train_loss: 0.5111
4:	[0s / 0s],		train_loss: 0.5055
5:	[0s / 0s],		train_loss: 0.4888
6:	[0s / 0s],		train_loss: 0.4917
7:	[0s / 0s],		train_loss: 0.4939
8:	[0s / 1s],		train_loss: 0.4835
9:	[0s / 1s],		train_loss: 0.4870
10:	[0s / 1s],		train_loss: 0.4812
11:	[0s / 1s],		train_loss: 0.4776
12:	[0s / 1s],		train_loss: 0.4794
13:	[0s / 1s],		train_loss: 0.4790
14:	[0s / 1s],		train_loss: 0.4739
15:	[0s / 1s],		train_loss: 0.4823
16:	[0s / 2s],		train_loss: 0.4732
17:	[0s / 2s],		train_loss: 0.4720
18:	[0s / 2s],		train_loss: 0.4702
19:	[0s / 2s],		train_loss: 0.4627
20:	[0s / 2s],		train_loss: 0.4650
21:	[0s / 2s],		train_loss: 0.4660
22:	[0s / 2s],		train_loss: 0.4538
23:	[0s / 2s],		train_loss: 0.4582
24:	[0s / 3s],		train_loss: 0.4614
25:	[0s / 3s],		train_loss: 0.4588
26:	[0s / 3s],		train_loss: 0.4590
27:	[0s / 3s],		train_loss: 0.4488
28:	[0s / 3s],		train_loss: 0.

34:	[0s / 3s],		train_loss: 0.4333
35:	[0s / 3s],		train_loss: 0.4278
36:	[0s / 4s],		train_loss: 0.4307
37:	[0s / 4s],		train_loss: 0.4216
38:	[0s / 4s],		train_loss: 0.4210
39:	[0s / 4s],		train_loss: 0.4370
40:	[0s / 4s],		train_loss: 0.4494
41:	[0s / 4s],		train_loss: 0.4505
42:	[0s / 4s],		train_loss: 0.4731
43:	[0s / 4s],		train_loss: 0.4531
44:	[0s / 5s],		train_loss: 0.4813
45:	[0s / 5s],		train_loss: 0.4548
46:	[0s / 5s],		train_loss: 0.4388
47:	[0s / 5s],		train_loss: 0.4374
48:	[0s / 5s],		train_loss: 0.4352
49:	[0s / 5s],		train_loss: 0.4091
50:	[0s / 5s],		train_loss: 0.4200
51:	[0s / 5s],		train_loss: 0.4041
52:	[0s / 5s],		train_loss: 0.4134
53:	[0s / 6s],		train_loss: 0.4482
54:	[0s / 6s],		train_loss: 0.4104
55:	[0s / 6s],		train_loss: 0.4087
56:	[0s / 6s],		train_loss: 0.3963
57:	[0s / 6s],		train_loss: 0.3863
58:	[0s / 6s],		train_loss: 0.4107
59:	[0s / 6s],		train_loss: 0.3883
60:	[0s / 6s],		train_loss: 0.3949
61:	[0s / 6s],		train_loss: 0.4074
62:	[0s / 7s],		trai

69:	[0s / 10s],		train_loss: 0.4040
70:	[0s / 10s],		train_loss: 0.3923
71:	[0s / 10s],		train_loss: 0.3968
72:	[0s / 10s],		train_loss: 0.3968
73:	[0s / 10s],		train_loss: 0.3940
74:	[0s / 11s],		train_loss: 0.3962
75:	[0s / 11s],		train_loss: 0.3939
76:	[0s / 11s],		train_loss: 0.3795
77:	[0s / 11s],		train_loss: 0.3759
78:	[0s / 11s],		train_loss: 0.3824
79:	[0s / 11s],		train_loss: 0.3801
80:	[0s / 12s],		train_loss: 0.3832
81:	[0s / 12s],		train_loss: 0.3892
82:	[0s / 12s],		train_loss: 0.4296
83:	[0s / 12s],		train_loss: 0.4535
84:	[0s / 12s],		train_loss: 0.7279
85:	[0s / 12s],		train_loss: 4.6656
86:	[0s / 12s],		train_loss: 9.3101
87:	[0s / 13s],		train_loss: 25.7932
88:	[0s / 13s],		train_loss: 34.0997
89:	[0s / 13s],		train_loss: 22.8387
90:	[0s / 13s],		train_loss: 37.7394
91:	[0s / 13s],		train_loss: 34.9817
92:	[0s / 14s],		train_loss: 39.4611
93:	[0s / 14s],		train_loss: 42.0489
94:	[0s / 14s],		train_loss: 40.2749
95:	[0s / 14s],		train_loss: 43.7602
96:	[0s / 14s],		tr

# III. Results

We report Uno's C-index at 5 and 10 years (`unoc5`, `unoc10`) and the integrated Brier score (`ibs`), each as the mean and standard deviation over the 5 folds.

In [7]:
# Summarize the per-fold metrics column by column: mean and population
# standard deviation (ddof=0) across the folds, shown side by side.
mean_all = results_all.mean()
std_all = results_all.std(ddof=0)
measures = pd.concat([mean_all, std_all], axis=1, keys=['mean', 'std'])
measures

Unnamed: 0,mean,std
unoc5,0.621804,0.071709
unoc10,0.585146,0.050832
ibs,0.223809,0.065912
