In [1]:
import pandas as pd
import numpy as np

# Data loading

##### Expression profiles

In [2]:
%%time

# DL with Python - François Chollet (pp. 162-163)
from PIL import Image
from keras.preprocessing import image
import glob

path = '../data/COMMON_GENES_CHR_LOCUS_TREEMAPS/'
filelist = glob.glob(path + '*.png')

brca_images = dict()
non_brca_images = dict()
w, h = (image.load_img(filelist[1]).size)

basewidth = 100
wpercent = (basewidth/float(w))
hsize = int((float(h)*float(wpercent)))

i = 0 
for filename in filelist:
    sample = filename.split(" ")[1]
    if(True):
        if('NON' in sample):
            non_brca_images[sample.split("_")[0].replace('.','-')] = image.load_img(filename).crop((11, 11, w-13, h-13)).resize((basewidth,hsize), Image.ANTIALIAS).convert('L')
        else:
            brca_images[sample.split("_")[0].replace('.','-')] = image.load_img(filename).crop((11, 11, w-13, h-13)).resize((basewidth,hsize), Image.ANTIALIAS).convert('L')

Using TensorFlow backend.


CPU times: user 24.9 s, sys: 171 ms, total: 25.1 s
Wall time: 25.1 s


##### Clinical data

In [3]:
brca_clinical = pd.read_hdf('../data/TCGA_data.h5', key='brca_clinical')
non_brca_clinical = pd.read_hdf('../data/TCGA_data.h5', key='non_brca_clinical')

In [4]:
# Deleting samples with 'NaN's in OS-columns 

## non_brca
non_brca_clinical['OS.time'] = non_brca_clinical['OS.time'].map(lambda x: np.nan if x == 'NaN' else x)
non_brca_clinical['OS'] = non_brca_clinical['OS'].map(lambda x: np.nan if x == 'NaN' else x)
non_brca_clinical.dropna(subset=['OS.time', 'OS'], inplace=True)
non_brca_clinical['OS.time'] = non_brca_clinical['OS.time'].astype(float)
non_brca_clinical['OS'] = non_brca_clinical['OS'].astype(float)

## brca
brca_clinical['OS.time'] = brca_clinical['OS.time'].map(lambda x: np.nan if x == 'NaN' else x)
brca_clinical['OS'] = brca_clinical['OS'].map(lambda x: np.nan if x == 'NaN' else x)
brca_clinical.dropna(subset=['OS.time', 'OS'], inplace=True)
brca_clinical['OS.time'] = brca_clinical['OS.time'].astype(float)
brca_clinical['OS'] = brca_clinical['OS'].astype(float)

In [5]:
print('There are {} brca clinical info and {} non-brca clinical info'.format(len(brca_clinical), len(non_brca_clinical)))

There are 1196 brca clinical info and 9224 non-brca clinical info


## Creating X and Y numpy-arrays

Note that there are differences between patients with survival info and with images; only patients in both datasets are mantained

In [6]:
common_pat_brca = set(brca_images.keys()).intersection(brca_clinical.index)

In [7]:
len(common_pat_brca)

1196

#### Deleting images and OS data from not common patients

In [8]:
# Mantaining brca patients with both X and Y available data
brca_clinical = brca_clinical.loc[common_pat_brca]

for k in set(list(brca_images.keys())).difference(common_pat_brca):
    del brca_images[k]

In [9]:
len(brca_images)

1196

In [10]:
len(brca_clinical)

1196

In [11]:
common_pat_non_brca = set(non_brca_images.keys()).intersection(non_brca_clinical.index)

In [12]:
len(common_pat_non_brca)

9224

In [13]:
# Mantaining brca patients with both X and Y available data
non_brca_clinical = non_brca_clinical.loc[common_pat_non_brca]

for k in set(list(non_brca_images.keys())).difference(common_pat_non_brca):
    del non_brca_images[k]

In [14]:
len(non_brca_images)

9224

In [15]:
len(non_brca_clinical)

9224

#### From OS to Survival

In [16]:
# Create 39 equally spaced intervals for follow-up time, from 0 to 5 years:
breaks = np.arange(0.,365.*5,365./8)
n_intervals = len(breaks)-1
timegap = breaks[1:] - breaks[:-1]

In [17]:
import nnet_survival
non_brca_y = nnet_survival.make_surv_array(non_brca_clinical['OS.time'],
                             non_brca_clinical['OS'],
                             breaks)
brca_y = nnet_survival.make_surv_array(brca_clinical['OS.time'],
                             brca_clinical['OS'],
                             breaks)

#### From PIL.Images to Numpy

In [18]:
%%time

import numpy as np
from keras.preprocessing import image

samples = list(non_brca_clinical.index)

X_non_brca = np.empty((len(samples), 10000))

for i in range(0, len(samples)):
    
    X_non_brca[i] = image.img_to_array(non_brca_images[samples[i]]).reshape(10000)

CPU times: user 211 ms, sys: 148 ms, total: 359 ms
Wall time: 358 ms


In [19]:
%%time

import numpy as np
from keras.preprocessing import image

samples = list(brca_clinical.index)

X_brca = np.empty((len(samples), 100*100))

for i in range(0, len(samples)):
    X_brca[i] = image.img_to_array(brca_images[samples[i]]).reshape(10000)

CPU times: user 29.1 ms, sys: 20 ms, total: 49.2 ms
Wall time: 48.3 ms


In [20]:
print("BRCA data shapes: \n X: {} \t Y: {}".format(X_brca.shape, brca_y.shape))

BRCA data shapes: 
 X: (1196, 10000) 	 Y: (1196, 78)


In [21]:
print("Non-BRCA data shapes: \n X: {} \t Y: {}".format(X_non_brca.shape, non_brca_y.shape))

Non-BRCA data shapes: 
 X: (9224, 10000) 	 Y: (9224, 78)


In [22]:
# to save memory
del non_brca_images
del brca_images

# Training models

In [25]:
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
import keras.backend as K
from keras.optimizers import SGD
import numpy
# fix random seed for reproducibility
numpy.random.seed(7)

In [26]:
# TRAINING MODELS

import keras.backend as K
from keras.models import Sequential
from keras.layers import Input, Dense, Conv1D, MaxPool1D, Activation, Dropout, Flatten
from keras import regularizers, optimizers
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
import warnings
from keras.regularizers import l1
from keras.optimizers import Adam
import tensorflow as tf

# Using C-index as evaluation metric (using a custom callback)

from lifelines.utils import concordance_index
from keras.callbacks import Callback

class CIndex(Callback):
    """
    Callback that computes the C-index metric both on training and test data after each epoch.
    
    Arguments:
        train_x: Numpy array containing the training dataset.
        train_time: Numpy array with survival time for training dataset.
        train_event: Numpy array with survival event for training dataset.
        val_x: Numpy array containing the test dataset.
        val_time: Numpy array with survival time for test dataset.
        val_event: Numpy array with survival event for test dataset.
        filepath: path to save the model.
        years: number of years to compute c-index
        patience: patience for early-stop training
    """
    
    def __init__(self, train_x, train_time, train_event, val_x, val_time, val_event, filepath, years=2, patience=5):
        super(Callback, self).__init__()
        self.X_tr = train_x
        self.time_tr = train_time
        self.event_tr = train_event
        self.X_val = val_x
        self.time_val = val_time
        self.event_val = val_event
        self.years = years
        self.patience = patience
        self.best = 0.0
        self.wait = 0  #counter for patience
        self.best_rounds = 1
        self.counter = 0
        self.filepath = filepath
        
    def on_train_begin(self, logs={}):
        self.c_index_dict = {'c-index': [], 'val_c-index': []}

    def on_epoch_end(self, epoch, logs={}):
        self.counter +=1

        y_tr=self.model.predict_proba(self.X_tr,verbose=0)
        yr_surv=np.cumprod(y_tr[:,0:np.nonzero(breaks>365*self.years)[0][0]], axis=1)[:,-1]
        c_index_tr = concordance_index(self.time_tr, yr_surv, self.event_tr)
        y_val=self.model.predict_proba(self.X_val,verbose=0)
        yr_surv_val=np.cumprod(y_val[:,0:np.nonzero(breaks>365*self.years)[0][0]], axis=1)[:,-1]
        c_index_val = concordance_index(self.time_val, yr_surv_val, self.event_val)

        #print('\tc-index: %s - val_c-index: %s' % (str(round(c_index_tr, 4)), str(round(c_index_val, 4))))
        
        if (c_index_val > self.best):
            self.c_index_dict['c-index'].append(c_index_tr)
            self.c_index_dict['val_c-index'].append(c_index_val)

            self.best = c_index_val
            self.best_rounds = self.counter
            self.wait = 0
            self.model.save_weights(self.filepath, overwrite=True)
        else:
            if self.wait >= self.patience: #no more patience, retrieve best model
                self.model.stop_training = True
                print('Best number of rounds: %d \nValidation C-Index: %f \n' % (self.best_rounds, self.best))
                self.model.load_weights(self.filepath)
            self.wait += 1

In [27]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.4
K.set_session(tf.Session(config=config))

In [32]:
# create model
model = Sequential()
model.add(Dense(702, input_dim=X_brca.shape[1], activation='relu'))
model.add(Dense(n_intervals, activation='sigmoid'))

In [29]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 702)               7020702   
_________________________________________________________________
dense_2 (Dense)              (None, 200)               140600    
_________________________________________________________________
dense_3 (Dense)              (None, 39)                7839      
Total params: 7,169,141
Trainable params: 7,169,141
Non-trainable params: 0
_________________________________________________________________


In [33]:
# BRCA
# Split dataset intor train and test
from sklearn.model_selection import train_test_split

X_brca_train, X_brca_test, y_brca_train, y_brca_test, train_index, test_index  = train_test_split(X_brca, brca_y, np.arange(len(X_brca)), test_size=0.2, random_state=42)
X_brca_train, X_brca_val, y_brca_train, y_brca_val, train_index, val_index = train_test_split(X_brca_train, y_brca_train, train_index, test_size=0.2, random_state=44)

In [35]:
optimizer = SGD(lr=0.0001)

model.compile(loss=nnet_survival.surv_likelihood(n_intervals), optimizer=optimizer)

hist_c_index = CIndex(X_brca_train,
                    brca_clinical.iloc[train_index]['OS.time'],
                    brca_clinical.iloc[train_index]['OS'],
                    X_brca_val,
                    brca_clinical.iloc[val_index]['OS.time'],
                    brca_clinical.iloc[val_index]['OS'],
                    filepath = 'keras-models/brca_simple_nnet.h5')

callbacks = [hist_c_index]   


model.fit(X_brca_train, 
          y_brca_train, 
          batch_size=128, 
          epochs=50, 
          validation_data = [X_brca_val, y_brca_val],
          callbacks=callbacks)

predictions = model.predict_proba(X_brca_test,verbose=0)
twoyr_surv=np.cumprod(predictions[:,0:np.nonzero(breaks>365*2)[0][0]], axis=1)[:,-1]
score = concordance_index(brca_clinical.iloc[test_index]['OS.time'], twoyr_surv, brca_clinical.iloc[test_index]['OS'])

print(score)

Train on 764 samples, validate on 192 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Best number of rounds: 1 
Validation C-Index: 0.500000 

0.5


In [38]:
predictions[0].shape

(39,)