In [1]:
import os
# Switch the working directory to the directory two levels above the current
# one, so that relative paths used afterwards resolve from there.
_start_dir = os.getcwd()
_two_levels_up = os.path.dirname(os.path.dirname(_start_dir))
# os.chdir() returns None, so `main` is bound to None.
main = os.chdir(_two_levels_up)

In [2]:
import json
import numpy as np
import pandas as pd

from model import model_architecture, output_results, utils
from sksurv.util import Surv as skSurv
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

Using TensorFlow backend.


# I. METABRIC data

We first choose the type of pseudo-observation from the following options:
- "pseudo_optim"
- "pseudo_km"
- "pseudo_discrete"

In [3]:
# Pseudo-observation variant. It is used to build the names of the
# pseudo-observation CSV and of the best-parameter CSV, and it decides how the
# network predictions are turned into survival estimates.
name = "pseudo_km"

We use the METABRIC data set, with clinical and pathological information as well as gene expression data. Missing values of the explanatory variables have been imputed. The data is first split into 5 folds for the double 5-fold cross-validation. The same 5 folds are used for all the models and are stored in the file "folds.json".

In [4]:
# Covariate table, with a 1-based `id` column derived from the row index.
df = pd.read_csv("data/real_data/metabric_imputed.csv").assign(
    id=lambda frame: frame.index + 1
)
# Pseudo-observation table for the variant selected by `name`.
y = pd.read_csv(
    "data/real_data/" + name + ".csv"
)

In [5]:
# Read the stored fold definitions from the JSON file.
with open('data/real_data/folds.json') as folds_file:
    kfolds = json.load(folds_file)

# II. Model's construction and training

The parameters of the architecture are the ones listed in the parameters dataframe, selected by a double 5-fold cross-validation among 400 sets of parameters. There are 5 sets of parameters, one for each fold of the outer loop.

In [6]:
# Train one pseudo-observation network per fold and collect its test metrics.
results_all = pd.DataFrame(columns=['unoc5', 'unoc10','ibs'])
# Number of distinct time points `s` in the full pseudo-observation table.
n_picktime = int(y['s'].nunique())
# Selected hyper-parameters, indexed by fold name. The file does not depend on
# the fold, so it is read once instead of once per iteration.
param = pd.read_csv('model/best_param_metabric/'+name+'_best_param.csv', sep = ";",index_col=0)
k=0
for kname, val in kfolds.items():
    train_index = val['train']
    val_index = val['test']
    df_train, df_test = df.loc[train_index], df.loc[val_index]
    # Keep the pseudo-observation rows that belong to each side of the split.
    y_train = y[y['id'].isin(df_train['id'])]
    y_test = y[y['id'].isin(df_test['id'])]

    X_train_all, y_train_all = utils.prepare_pseudobs_metabric(y_train,df_train,name)
    X_train_all = X_train_all.drop(['id'], axis=1)
    df_test = df_test.drop(['id'], axis=1)

    model,callbacks  = model_architecture.build_model_pseudobs(X_train_all,
                               param['neurons'][kname],
                               param['dropout'][kname],
                               param['activation'][kname],
                               param['lr'][kname],
                               param['optimizer'][kname],
                               param['n_layers'][kname],
                               100)
    # The batch size is looked up by fold name like every other
    # hyper-parameter (it was previously always taken from the first row).
    history = model.fit(X_train_all,
             y_train_all,
             batch_size = int(param['batch_size'][kname]),
             epochs =100,
             callbacks = callbacks,
             verbose = 2)

    # Time grid of the training fold. It drives the stacking of the test
    # covariates, the reshape of the predictions and the final index, so the
    # three always agree.
    times = np.unique(y_train['s'])
    n_times = len(times)

    # Stack the test covariates once per time point: block t holds every test
    # row paired with times[t].
    X_test = df_test.drop(['yy','status'], axis = 1)
    X_test_all = pd.concat([X_test]*n_times, ignore_index=True)
    time_test = pd.DataFrame(np.repeat(times, len(X_test)))
    X_test_all = pd.concat([X_test_all, time_test], axis = 1)

    # One row per time point, one column per test row.
    y_pred = model.predict(X_test_all)
    y_pred = y_pred.reshape((n_times,len(X_test)))
    y_pred = pd.DataFrame(y_pred)

    if name == "pseudo_discrete" or name == "pseudo_optim2":
        # For these variants the estimate at a time point is the running
        # product of the predictions up to and including that time point.
        y_pred_all = y_pred.cumprod(axis=0)
        surv = y_pred_all
    else:
        surv = y_pred

    surv = surv.set_index(times)
    res = output_results.output_stats(model,surv,X_train_all, df_train, X_test_all, df_test)
    # NOTE(review): DataFrame.append is removed in pandas 2.0. It is kept here
    # because the type returned by output_stats is not visible from this cell.
    results_all = results_all.append(res, ignore_index  = True, sort = False)
    k+=1


Epoch 1/100
 - 1s - loss: 0.1781
Epoch 2/100
 - 0s - loss: 0.1427
Epoch 3/100
 - 1s - loss: 0.1335
Epoch 4/100
 - 1s - loss: 0.1299
Epoch 5/100
 - 1s - loss: 0.1266
Epoch 6/100
 - 1s - loss: 0.1233
Epoch 7/100
 - 0s - loss: 0.1208
Epoch 8/100
 - 0s - loss: 0.1182
Epoch 9/100
 - 0s - loss: 0.1170
Epoch 10/100
 - 0s - loss: 0.1150
Epoch 11/100
 - 0s - loss: 0.1135
Epoch 12/100
 - 0s - loss: 0.1130
Epoch 13/100
 - 1s - loss: 0.1113
Epoch 14/100
 - 0s - loss: 0.1096
Epoch 15/100
 - 0s - loss: 0.1088
Epoch 16/100
 - 0s - loss: 0.1074
Epoch 17/100
 - 0s - loss: 0.1075
Epoch 18/100
 - 1s - loss: 0.1068
Epoch 19/100
 - 1s - loss: 0.1051
Epoch 20/100
 - 1s - loss: 0.1040
Epoch 21/100
 - 1s - loss: 0.1022
Epoch 22/100
 - 1s - loss: 0.1024
Epoch 23/100
 - 1s - loss: 0.1018
Epoch 24/100
 - 1s - loss: 0.1015
Epoch 25/100
 - 1s - loss: 0.1010
Epoch 26/100
 - 1s - loss: 0.1005
Epoch 27/100
 - 1s - loss: 0.0984
Epoch 28/100
 - 1s - loss: 0.1001
Epoch 29/100
 - 1s - loss: 0.0986
Epoch 30/100
 - 1s - l

Epoch 37/100
 - 1s - loss: 0.0842
Epoch 38/100
 - 1s - loss: 0.0844
Epoch 39/100
 - 1s - loss: 0.0860
Epoch 40/100
 - 1s - loss: 0.0840
Epoch 41/100
 - 1s - loss: 0.0837
Epoch 42/100
 - 1s - loss: 0.0839
Epoch 43/100
 - 1s - loss: 0.0821
Epoch 44/100
 - 1s - loss: 0.0833
Epoch 45/100
 - 1s - loss: 0.0835
Epoch 46/100
 - 1s - loss: 0.0833
Epoch 47/100
 - 1s - loss: 0.0827
Epoch 48/100
 - 1s - loss: 0.0822
Epoch 49/100
 - 1s - loss: 0.0838
Epoch 50/100
 - 1s - loss: 0.0828
Epoch 51/100
 - 1s - loss: 0.0885
Epoch 52/100
 - 1s - loss: 0.0870
Epoch 53/100
 - 1s - loss: 0.0860
Epoch 54/100
 - 1s - loss: 0.0853
Epoch 55/100
 - 1s - loss: 0.0857
Epoch 56/100
 - 1s - loss: 0.0851
Epoch 57/100
 - 1s - loss: 0.0853
Epoch 58/100
 - 1s - loss: 0.0828
Epoch 59/100
 - 1s - loss: 0.0842
Epoch 60/100
 - 1s - loss: 0.0830
Epoch 61/100
 - 1s - loss: 0.0820
Epoch 62/100
 - 1s - loss: 0.0833
Epoch 63/100
 - 1s - loss: 0.0807
Epoch 64/100
 - 1s - loss: 0.0807
Epoch 65/100
 - 1s - loss: 0.0820
Epoch 66/100
 

Epoch 79/100
 - 1s - loss: 0.0775
Epoch 80/100
 - 1s - loss: 0.0771
Epoch 81/100
 - 1s - loss: 0.0765
Epoch 82/100
 - 1s - loss: 0.0749
Epoch 83/100
 - 1s - loss: 0.0739
Epoch 84/100
 - 1s - loss: 0.0742
Epoch 85/100
 - 1s - loss: 0.0728
Epoch 86/100
 - 1s - loss: 0.0731
Epoch 87/100
 - 1s - loss: 0.0736
Epoch 88/100
 - 1s - loss: 0.0732
Epoch 89/100
 - 1s - loss: 0.0719
Epoch 90/100
 - 1s - loss: 0.0722
Epoch 91/100
 - 1s - loss: 0.0733
Epoch 92/100
 - 1s - loss: 0.0705
Epoch 93/100
 - 1s - loss: 0.0729
Epoch 94/100
 - 1s - loss: 0.0730
Epoch 95/100
 - 1s - loss: 0.0717
Epoch 96/100
 - 1s - loss: 0.0719
Epoch 97/100
 - 1s - loss: 0.0710
Epoch 98/100
 - 1s - loss: 0.0712
Epoch 99/100
 - 1s - loss: 0.0709
Epoch 100/100
 - 1s - loss: 0.0710


# III. Results

We output Uno's C-index at 5 and 10 years and the integrated Brier score (IBS), with their mean and standard deviation over the 5 folds.

In [7]:
# Column-wise mean and standard deviation (ddof=0) of the collected results,
# shown side by side in one table.
mean_all = results_all.mean(axis=0)
std_all = results_all.std(axis=0, ddof=0)
measures = pd.concat(
    [mean_all, std_all],
    axis=1,
    keys=['mean', 'std'],
)
measures

Unnamed: 0,mean,std
unoc5,0.640961,0.041659
unoc10,0.614684,0.038053
ibs,0.238063,0.027767
