In [1]:
import os
# Move the working directory two levels up, so that the relative paths used by
# the cells below ("data/...", "model/...") are resolved from there.
# NOTE: os.chdir() returns None, so `main` is always None, and re-running this
# cell climbs two further levels each time.
_two_levels_up = os.path.dirname(os.path.dirname(os.getcwd()))
main = os.chdir(_two_levels_up)

In [2]:
import json
import numpy as np
import pandas as pd

from model import model_architecture, output_results, utils
from sksurv.util import Surv as skSurv
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

Using TensorFlow backend.


# I. METABRIC data

We use the METABRIC data set, with clinical and pathological information and gene expression data. Missing values of the explanatory variables have been imputed. The data are first split into 5 folds for the double 5-fold cross-validation. The same 5 folds are used for all the models and are stored in the file "folds.json".

In [3]:
# Identifier of the survival model trained in this notebook. It selects the
# best-parameter CSV file and the label-transform branch in the training loop,
# and is passed on to model_architecture.build_model.
name = "CoxCC"

In [4]:
# Load the METABRIC table (path relative to the working directory set in the
# first cell). The training loop reads the "yy" and "status" columns as the
# targets and treats every other column as a feature.
df = pd.read_csv("data/real_data/metabric_imputed.csv")

In [5]:
# Read the stored cross-validation folds. The training loop iterates over the
# items of this mapping and reads the 'train' and 'test' entries of each value
# as positional row indices into `df`.
with open('data/real_data/folds.json') as folds_file:
    kfolds = json.load(folds_file)

# II. Model's construction and training

The architecture parameters are the ones listed in the parameters dataframe, selected by a double 5-fold cross-validation among 400 sets of parameters. There are 5 sets of parameters, one for each fold of the outer loop.

In [6]:
# Train one model per outer fold and collect its evaluation metrics.
#
# Inputs (defined in earlier cells): `name`, `df`, `kfolds`.
# Output: `results_all`, one row per fold with at least the columns below.
metric_cols = ['unoc5', 'unoc10', 'ibs']

# Defined up front so the name exists even if a fold fails part-way through.
results_all = pd.DataFrame(columns=metric_cols)

# The best hyper-parameters are the same file for every fold: read it once.
# Rows are looked up by fold name, columns by hyper-parameter name.
param = pd.read_csv('model/best_param_metabric/' + name + '_best_param.csv', sep=";", index_col=0)

# The feature partition only depends on the columns of `df`, not on the fold.
target_cols = ['yy', 'status']
list_all_cols = df.drop(columns=target_cols).columns.to_list()
cols_leave = ['N', 'grade1', 'grade2', 'hormonotherapy', 'chemotherapy']
cols_standardize = [col for col in list_all_cols if col not in cols_leave]

# Per-fold results are gathered here and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic.
fold_results = []

k = 0  # fold counter; not read inside this cell
for kname, val in kfolds.items():
    train_index = val['train']
    val_index = val['test']

    df_train, df_val = df.iloc[train_index], df.iloc[val_index]

    # A fresh mapper per fold: the scalers are fitted on the training rows
    # only and then applied unchanged to the held-out rows.
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    leave = [(col, None) for col in cols_leave]
    df_mapper = DataFrameMapper(standardize + leave, df_out=True)

    X_train = df_mapper.fit_transform(df_train.drop(columns=target_cols)).astype('float32')
    X_val = df_mapper.transform(df_val.drop(columns=target_cols)).astype('float32')

    # NOTE(review): DeepHitSingle and CoxTime are not imported anywhere in the
    # visible notebook; the first two branches raise NameError unless they are
    # imported in a cell not shown here. Only the final branch runs for "CoxCC".
    if name == "DeepHit":
        num_durations = 10
        labtrans = DeepHitSingle.label_transform(num_durations)
        y_train = labtrans.fit_transform(*(df_train['yy'].values, df_train['status'].values))
    elif name == "CoxTime":
        labtrans = CoxTime.label_transform()
        y_train = labtrans.fit_transform(*(df_train['yy'].values, df_train['status'].values))
    else:
        # No label transform: an empty string is passed to build_model.
        labtrans = ""
        y_train = (df_train['yy'].values, df_train['status'].values)

    # Convert once; the same array is used to build and to fit the model.
    x_train_arr = np.array(X_train)

    model, callbacks = model_architecture.build_model(x_train_arr,
                                                      param['neurons'][kname],
                                                      param['dropout'][kname],
                                                      param['activation'][kname],
                                                      param['lr'][kname],
                                                      param['optimizer'][kname],
                                                      param['n_layers'][kname],
                                                      name,
                                                      labtrans)
    log = model.fit(x_train_arr,
                    y_train,
                    int(param['batch_size'][kname]),
                    epochs=100,
                    callbacks=callbacks,
                    verbose=True)
    _ = model.compute_baseline_hazards()
    surv = model.predict_surv_df(np.array(X_val))
    res = output_results.output_stats(model, surv, X_train, df_train, X_val, df_val)

    # Assumes `res` is a dict, a Series or a DataFrame of metrics (the inputs
    # the original DataFrame.append call handled) -- TODO confirm against
    # output_results.output_stats.
    fold_results.append(res if isinstance(res, pd.DataFrame) else pd.DataFrame([res]))
    k += 1

if fold_results:
    results_all = pd.concat(fold_results, ignore_index=True, sort=False)
    # Expected metric columns first (NaN-filled if missing), then any extra
    # columns returned by output_stats, as the append-based version produced.
    extra_cols = [col for col in results_all.columns if col not in metric_cols]
    results_all = results_all.reindex(columns=metric_cols + extra_cols)

0:	[0s / 0s],		train_loss: 6.1692
1:	[0s / 0s],		train_loss: 7.7759
2:	[0s / 0s],		train_loss: 1.4317
3:	[0s / 0s],		train_loss: 0.8203
4:	[0s / 0s],		train_loss: 0.9107
5:	[0s / 0s],		train_loss: 0.6984
6:	[0s / 0s],		train_loss: 0.6807
7:	[0s / 0s],		train_loss: 0.6952
8:	[0s / 0s],		train_loss: 0.6996
9:	[0s / 0s],		train_loss: 0.6916
10:	[0s / 0s],		train_loss: 0.6928
11:	[0s / 0s],		train_loss: 0.7072
12:	[0s / 0s],		train_loss: 0.6864
13:	[0s / 0s],		train_loss: 0.6735
14:	[0s / 0s],		train_loss: 0.6861
15:	[0s / 0s],		train_loss: 0.6834
16:	[0s / 0s],		train_loss: 0.6807
17:	[0s / 0s],		train_loss: 0.6764
18:	[0s / 0s],		train_loss: 0.6777
19:	[0s / 0s],		train_loss: 0.6833
20:	[0s / 0s],		train_loss: 0.6768
21:	[0s / 0s],		train_loss: 0.6842
22:	[0s / 0s],		train_loss: 0.6743
23:	[0s / 0s],		train_loss: 0.6586
24:	[0s / 0s],		train_loss: 0.6581
25:	[0s / 0s],		train_loss: 0.6631
26:	[0s / 0s],		train_loss: 0.6545
27:	[0s / 0s],		train_loss: 0.6703
28:	[0s / 0s],		train_loss: 0.

37:	[0s / 2s],		train_loss: 0.1753
38:	[0s / 2s],		train_loss: 0.1583
39:	[0s / 2s],		train_loss: 0.1465
40:	[0s / 2s],		train_loss: 0.1798
41:	[0s / 2s],		train_loss: 0.1704
42:	[0s / 2s],		train_loss: 0.1529
43:	[0s / 2s],		train_loss: 0.1529
44:	[0s / 3s],		train_loss: 0.1485
45:	[0s / 3s],		train_loss: 0.1256
46:	[0s / 3s],		train_loss: 0.1427
47:	[0s / 3s],		train_loss: 0.1272
48:	[0s / 3s],		train_loss: 0.1385
49:	[0s / 3s],		train_loss: 0.1539
50:	[0s / 3s],		train_loss: 0.1189
51:	[0s / 3s],		train_loss: 0.1358
52:	[0s / 3s],		train_loss: 0.1613
53:	[0s / 3s],		train_loss: 0.1379
54:	[0s / 3s],		train_loss: 0.1537
55:	[0s / 3s],		train_loss: 0.1169
56:	[0s / 3s],		train_loss: 0.1361
57:	[0s / 3s],		train_loss: 0.1434
58:	[0s / 3s],		train_loss: 0.1296
59:	[0s / 3s],		train_loss: 0.1502
60:	[0s / 4s],		train_loss: 0.1289
61:	[0s / 4s],		train_loss: 0.1120
62:	[0s / 4s],		train_loss: 0.1385
63:	[0s / 4s],		train_loss: 0.1234
64:	[0s / 4s],		train_loss: 0.1296
65:	[0s / 4s],		trai

# III. Results

We output Uno's C-index at 5 and 10 years and the integrated Brier score (IBS), reported as the mean and standard deviation over the 5 folds.

In [7]:
# Summarise every metric over the folds: mean and population standard
# deviation (ddof=0), one row per metric and one column per statistic.
mean_all = results_all.mean(axis=0)
std_all = results_all.std(axis=0, ddof=0)
measures = pd.concat([mean_all, std_all], axis=1, keys=['mean', 'std'])
measures

Unnamed: 0,mean,std
unoc5,0.664347,0.026802
unoc10,0.633312,0.026915
ibs,0.21031,0.043608
