In [1]:
#Loading useful packages
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import os.path
import sys
import argparse
import warnings
warnings.filterwarnings('ignore')

#General purpose AI packages
from sklearn.cross_validation import train_test_split,KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import ParameterGrid
from sklearn.gaussian_process import GaussianProcess

#Keras packages
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, ActivityRegularization
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.optimizers import RMSprop
from keras import regularizers 

Using TensorFlow backend.


In [2]:
############## LOSSHISTORY CALLBACK CLASS ######################################
class LossHistory(Callback):
    """Keras callback that records the training and validation loss per epoch.

    After a call to ``fit`` the instance exposes two lists:

    * ``losses``      -- value of ``logs['loss']`` at the end of every epoch
    * ``val_losses``  -- value of ``logs['val_loss']`` at the end of every epoch
      (``None`` entries when the key is missing from ``logs``)

    Both lists are cleared at the start of every training run, so one instance
    only ever holds the history of the most recent ``fit`` call.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh history for the training run that is about to begin."""
        self.losses = []
        self.val_losses = []

    def on_epoch_end(self, batch, logs=None):
        """Append the current epoch's losses.

        ``batch`` is the first positional argument of the epoch-end hook; the
        name is kept for backward compatibility and the value is not used.
        ``logs`` is the metrics mapping supplied by the caller; ``None`` is
        treated as an empty mapping instead of raising on ``.get``.
        """
        # A None default avoids sharing one mutable dict between all calls.
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))

In [3]:
# Relative locations of the two input tables (both inside 'data') and of the
# folder that receives the results.
DATAFILE, TARGETFILE = (
    os.path.join('data', file_name) for file_name in ('data.csv', 'target.csv')
)
# A single path component needs no joining.
OUTDIR = 'results'

In [4]:
def _build_model(n_features):
    """Create and compile the fixed-architecture regression network.

    Three hidden Dense layers (436 relu, 969 sigmoid, 373 sigmoid) with dropout
    after the first two, followed by a single linear output unit. Compiled with
    mean squared error and RMSprop.

    Parameters
    ----------
    n_features : int
        Number of input columns fed to the first layer.
    """
    # Recorded hyperparameters (their origin is not shown in this notebook):
    #'activation_1': 0, 'activation_2': 1, 'activation_3': 1, 'dropout_1': 0.08813572098580352,
    #'dropout_2': 0.03155693545556867, 'fit_n_batch': 68.0, 'n_nodes_1': 436.0, 'n_nodes_2': 969.0, 'n_nodes_3': 373.0
    model = Sequential()
    model.add(Dense(units=436, input_dim=n_features, activity_regularizer=regularizers.l2(0)))
    model.add(Activation('relu'))
    model.add(Dropout(0.0881357))
    model.add(Dense(units=969,activity_regularizer=regularizers.l2(0)))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.0315569))
    model.add(Dense(units=373,activity_regularizer=regularizers.l2(0)))
    model.add(Activation('sigmoid'))
    model.add(Dense(units=1))

    opt = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='mse',optimizer=opt)
    return model


def train_nn(X_tr,Y_tr,X_val,Y_val,n_folds=5,n_epochs=5000,n_batch=68,verbose=1):
    """Cross-validate the network on the supplied data and return the last fold's model.

    The training and validation splits passed in are pooled and re-split into
    ``n_folds`` folds. Earlier versions ignored the arguments and read the
    notebook globals ``X_tr_val`` / ``Y_tr_val`` instead, so the function only
    worked when those happened to exist in the kernel. Pooling the two splits
    uses the same samples without depending on hidden state.

    Inside every fold the inputs are standardised with statistics of that
    fold's training part only, a new network is trained with early stopping,
    and the weights of the epoch with the lowest validation loss are restored.

    Parameters
    ----------
    X_tr, Y_tr : array-like
        Inputs (2-D) and targets of the training split.
    X_val, Y_val : array-like
        Inputs (2-D) and targets of the validation split.
    n_folds : int, default 5
        Number of cross-validation folds.
    n_epochs : int, default 5000
        Upper bound on epochs per fold (early stopping usually ends sooner).
    n_batch : int, default 68
        Mini-batch size.
    verbose : int, default 1
        Verbosity forwarded to ``model.fit``.

    Returns
    -------
    (model, score)
        The model trained in the last fold, carrying its best weights, and that
        fold's lowest validation loss. The mean and standard deviation over all
        folds are printed, not returned.
    """
    # Pool what the caller handed in; the folds below do their own splitting.
    X_all = np.concatenate([np.asarray(X_tr), np.asarray(X_val)], axis=0)
    Y_all = np.concatenate([np.asarray(Y_tr), np.asarray(Y_val)], axis=0)

    # The checkpoint callback cannot write into a folder that does not exist.
    outdir = 'results'
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    filepath = os.path.join(outdir,'weights.best.hdf5')

    kf = KFold(n = X_all.shape[0], n_folds = n_folds)
    performance_cv = []
    model = None

    for i, (tr_idx, val_idx) in enumerate(kf, start=1):
        model = _build_model(X_all.shape[1])

        # Callbacks are created per fold. A ModelCheckpoint with
        # save_best_only=True remembers its best value for the lifetime of the
        # instance, so a shared one could skip saving in a later fold and
        # load_weights() would then restore an earlier fold's weights.
        mdlcheck = ModelCheckpoint(filepath, verbose=0, save_best_only=True)
        mdllosses = LossHistory()
        mdlstop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto')

        print("Fold: ",i," of %d" % n_folds)
        X_train, X_valid = X_all[tr_idx], X_all[val_idx]
        Y_train, Y_valid = Y_all[tr_idx], Y_all[val_idx]

        # Scaling statistics come from the fold's training part only.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_valid = scaler.transform(X_valid)

        model.fit(X_train, Y_train, validation_data = (X_valid, Y_valid),  epochs = n_epochs, batch_size = n_batch, callbacks = [mdlstop,mdlcheck,mdllosses],verbose = verbose)

        #Recalling best weights and appending the fold's best validation loss
        model.load_weights(filepath)
        performance_cv.append(min(mdllosses.val_losses))

    #Calculating in-cv std
    loss_std = np.std(performance_cv)

    print('Obtained loss: ', np.mean(performance_cv), ' (', loss_std, ')')

    return model, performance_cv[-1]

In [4]:
# Read the comma-separated dataset and show per-column summary statistics.
dataset_trans = pd.read_csv(os.path.join('data', 'dataset_trans.csv'))
dataset_trans.describe()

Unnamed: 0,subj,Gc,CHO,ROC,Gt,IOB,var_class,Vmx,kp3,CR,CF,BW,u2ss,Ib,min_risk,Y
count,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0,9963.0
mean,51.591388,114.664258,74.62712,-0.333333,119.862853,1.929754,4.73773,0.070821,0.010829,16.013952,43.360333,70.012447,1.758747,106.129012,6.225203,22.04858
std,29.087193,63.364758,17.039728,1.547287,6.707839,2.457164,2.054412,0.023353,0.005171,5.238797,9.625555,12.529655,0.511385,17.855293,9.034714,85.405424
min,1.0,60.0,50.0,-2.0,107.53,0.0,1.0,0.026252,0.002596,7.0,26.0,46.692,0.64792,68.668,0.7721,-190.0
25%,25.5,70.0,60.0,-2.0,115.09,0.025917,4.0,0.05689,0.007041,12.0,36.0,60.371,1.4463,91.455,2.57985,-30.0
50%,53.0,80.0,70.0,-1.0,119.79,1.0335,5.0,0.064676,0.010244,15.0,43.0,68.404,1.6812,104.23,3.9314,30.0
75%,77.0,150.0,90.0,1.0,124.6,2.9579,6.0,0.085176,0.013947,20.0,50.0,76.51,2.0735,118.92,6.04225,80.0
max,100.0,250.0,100.0,2.0,137.8,16.508,7.0,0.14212,0.025736,30.0,67.0,106.09,3.5517,161.56,117.4,190.0


In [None]:
############## PREPARING DATA ##################################################

# Reload the table, then set aside the regression target and the 'subj' column.
dataset_trans = pd.read_table(os.path.join('data','dataset_trans.csv'),sep=',')
target = np.asarray(dataset_trans['Y'])
pazienti = np.asarray(dataset_trans['subj'])
# Neither the target nor 'min_risk' is used as a model input; both are removed in place.
del dataset_trans['Y'], dataset_trans['min_risk']

train = np.asarray(dataset_trans)
train_val_size = 0.8 #80% training+validation set and 20% test set
train_size = 0.7 #70% training set and 30% validation set

# Two nested, seeded random splits: test vs. the rest, then training vs. validation.
X_tr_val, X_te, Y_tr_val, Y_te = train_test_split(train, target, train_size=train_val_size, random_state=1)
X_tr, X_val, Y_tr, Y_val = train_test_split(X_tr_val, Y_tr_val, train_size=train_size, random_state=1)

# Column 0 is kept aside for every split (presumably the 'subj' id, as it is the
# first remaining column -- confirm); columns 1..13 are the model inputs.
paz_tr_val, paz_tr, paz_val, paz_te = (part[:,0] for part in (X_tr_val, X_tr, X_val, X_te))
X_tr_val, X_tr, X_val, X_te = (part[:,1:14] for part in (X_tr_val, X_tr, X_val, X_te))

# Standardise with statistics of the training split only; X_tr_val stays unscaled.
scaler = StandardScaler().fit(X_tr)
X_tr, X_val, X_te = (scaler.transform(part) for part in (X_tr, X_val, X_te))

model, score = train_nn(X_tr,Y_tr,X_val,Y_val)

Fold:  1  of 5
Train on 6376 samples, validate on 1594 samples
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68

In [None]:
############## EVALUATING RESULTS  #############################################
# Predict on the held-out test inputs and flatten both vectors to 1-D.
Y_NN = np.squeeze(model.predict(X_te))
Y_te = np.squeeze(Y_te)

# Test-set mean squared error (true values first, predictions second).
print('\n Score NN: ',mean_squared_error(Y_te,Y_NN))

# Loss curves are currently not plotted:
#plt.plot(loss.losses)
#plt.plot(loss.val_losses)
#plt.show()

# Residuals (actual minus estimated), wrapped in a list as a single data series.
data_to_plot = [Y_te-Y_NN]

# Box plot of the residuals.
plt.boxplot(data_to_plot)
plt.show()

# Histogram of the residuals, 40 bins.
plt.hist(data_to_plot,bins=40)
plt.show()

# Actual values (triangles) overlaid with the estimates (circles).
plt.plot(Y_te, marker='^')
plt.plot(Y_NN, marker='o')
plt.show()



In [None]:
# Persist the test-set arrays as .npy files in the working directory:
# scaled inputs, the column-0 values kept aside for the test rows,
# the true targets and the network's predictions.
np.save(file='X_te', arr=X_te)
np.save(file='paz_te', arr=paz_te)
np.save(file='Y_te', arr=Y_te)
np.save(file='Y_NN', arr=Y_NN)