Deep learning prediction of time to Breast Cancer recurrence. Data: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Prognostic)

@author feBueno, June 2020 fernando.bueno.gutie@gmail.com

In [9]:
%tensorflow_version 2.x 
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

import pathlib
import seaborn as sns


In [10]:
!pip install -q git+https://github.com/tensorflow/docs
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling


  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone


Load the data and define the train/test sets

In [None]:
SPLIT_SEED = 42  # fixed seed so the random train/test split is reproducible across runs

# Load the data file; it ships without a header row, so columns get integer labels.
data_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data', header=None)
data_df.columns = data_df.columns.map(str)  # set column names as strings

data_df = data_df.rename(columns={'0': 'Id', '1': 'Recurrence', '2': 'Time'})  # rename the first 3 columns

# Encode the outcome as numeric: R = 1 = recurrent, N = 0 = nonrecurrent.
# Any label other than 'N'/'R' maps to NaN, which makes the int64 cast fail loudly
# instead of letting a malformed value slip through.
data_df['Recurrence'] = data_df['Recurrence'].map({'N': 0, 'R': 1}).astype('int64')

data_df = data_df[data_df['Recurrence'] == 1]  # Only 47/198 samples are recurrent and will be considered

# Randomly sample 70% of the observations as the training set; the rest is the test set.
train_df = data_df.sample(frac=0.7, random_state=SPLIT_SEED)
test_df = data_df.drop(train_df.index)
print(test_df.shape)  # The test set will consist of only 14 instances (30% of 47)

# Separate the dependent variable (pop also removes 'Time' from the feature frames).
y_train_series = train_df.pop('Time')
y_test_series = test_df.pop('Time')

# Positional slice: skips Id and the Recurrence flag (constant after the filter above).
# NOTE(review): the upper bound 33 presumably also drops the last column of the file — confirm this is intended.
train_df = train_df.iloc[:, 2:33]
test_df = test_df.iloc[:, 2:33]

Normalize data

In [12]:
# Per-feature summary statistics of the training set, one row per feature.
train_stats = train_df.describe().transpose()


def norm(x):
  """Standardize x to z-scores using the training-set mean and std.

  The same training statistics are applied to any frame passed in, so the
  test set is standardized based on the training set as well.
  """
  centered = x - train_stats['mean']
  return centered / train_stats['std']


normed_train_df = norm(train_df)
normed_test_df = norm(test_df)

Data exploration

In [None]:
# Pairwise joint distributions (KDE on the diagonal) for the first 4 predictors.
first_predictors = ["3", "4", "5", "6"]
sns.pairplot(train_df[first_predictors], diag_kind="kde")

Model building

In [None]:
def build_model(n_features=None, learning_rate=0.001):
  """Build and compile a regression network with two densely connected hidden layers.

  Args:
    n_features: Number of input features. When None (the default), the number
      of columns of the notebook-level `train_df` is used.
    learning_rate: Learning rate passed to the RMSprop optimizer.

  Returns:
    A compiled `keras.Sequential` model with a single linear output unit,
    trained on MSE loss and reporting MAE and MSE as metrics.
  """
  if n_features is None:
    # Fall back to the notebook's training frame so existing calls keep working.
    n_features = len(train_df.keys())

  model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[n_features]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # single continuous output value per instance
  ])

  optimizer = keras.optimizers.RMSprop(learning_rate)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])  # mean absolute error and mean squared error
  return model

model = build_model()
model.summary()

# Smoke test on the first 10 training instances: the untrained model should
# return a single continuous output value per instance.
example_batch = normed_train_df.head(10)
example_result = model.predict(example_batch)
example_result

Model training

In [None]:
EPOCHS = 1000  # number of times the entire training set passes through the model

# Train silently (verbose=0) with a compact progress callback; 20% of the
# training data is held out for validation.
progress_dots = tfdocs.modeling.EpochDots()
history = model.fit(
  normed_train_df,
  y_train_series,
  epochs=EPOCHS,
  validation_split=0.2,
  verbose=0,
  callbacks=[progress_dots],
)


Evaluation of results

In [17]:
# Per-epoch training/validation metrics as a table, with the epoch number appended.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
# The validation error does not seem to keep improving in the last epochs.
hist.tail()

Unnamed: 0,loss,mae,mse,val_loss,val_mae,val_mse,epoch
995,0.410771,0.476079,0.410771,446.728668,16.445374,446.728668,995
996,0.408537,0.456787,0.408537,391.948059,15.260337,391.948059,996
997,0.407192,0.439724,0.407192,444.86618,16.41341,444.86618,997
998,0.370306,0.409784,0.370306,394.304688,15.301208,394.304688,998
999,0.346938,0.389682,0.346938,442.019775,16.37178,442.019775,999


In [None]:
# Smoothed learning curves for the mean absolute error.
# It seems that after ~300 epochs the model does not improve any more.
plotter = tfdocs.plots.HistoryPlotter(smoothing_std=2)
basic_run = dict(Basic=history)
plotter.plot(basic_run, metric="mae")
plt.ylabel('MAE [time]')


In [None]:
# Learning curves for the mean squared error; the same plateau seems to occur here.
plotter.plot(dict(Basic=history), metric="mse")
plt.ylabel('MSE [time^2]')


In [None]:
# Retrain a fresh model, stopping once the validation loss has not improved
# for 10 consecutive epochs.
model = build_model()

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
training_callbacks = [early_stop, tfdocs.modeling.EpochDots()]

early_history = model.fit(
    normed_train_df,
    y_train_series,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=training_callbacks,
)

In [None]:
# MAE learning curve of the early-stopped run; stopping seems to happen at ~35 epochs.
early_run = {'Early Stopping': early_history}
plotter.plot(early_run, metric="mae")
plt.ylabel('MAE [time]')

In [27]:
# Evaluate the early-stopped model on the held-out test set; the values come
# back in the order loss, mae, mse.
test_scores = model.evaluate(normed_test_df, y_test_series, verbose=2)
loss, mae, mse = test_scores

print("Testing set Mean Abs Error: {:5.2f} time".format(mae))

1/1 - 0s - loss: 294.7453 - mae: 13.4925 - mse: 294.7453
Testing set Mean Abs Error: 13.49 time


In [None]:
# Observed vs. predicted time on the test set; the diagonal marks perfect predictions.
# The number of points on both sides of the diagonal looks similar.
test_predictions = model.predict(normed_test_df).flatten()

# Keep at least the [0, 50] window, but widen it when an observed or predicted
# value exceeds 50 so that no point is silently cut off by fixed axis limits.
axis_max = max(50, float(y_test_series.max()), float(np.max(test_predictions)))
lims = [0, axis_max]

ax = plt.axes(aspect='equal')
ax.scatter(y_test_series, test_predictions)
ax.set_xlabel('True Values [time]')
ax.set_ylabel('Predictions [time]')
ax.set_xlim(lims)
ax.set_ylim(lims)
_ = ax.plot(lims, lims)  # identity line; assigning to _ suppresses the cell's repr output

In [None]:
# Distribution of the test-set residuals (prediction minus observed time);
# negative and positive residuals seem to be balanced.
error = test_predictions - y_test_series
n_bins = 10
plt.hist(error, bins=n_bins)
plt.xlabel("Prediction Error [time]")
_ = plt.ylabel("Count")