In [7]:
#Ridge anti-overfitting
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV
#1 load dataset
#trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))

#2 LR+L1 training
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
y = trainset['actualiculos']#label
param_grid = {'fit_intercept':[True,False],'alpha':[0.01,0.05,0.1,0.5]}
clf = linear_model.Ridge(normalize=False,random_state=0) #max_iter
grid_clf = GridSearchCV(clf, param_grid, cv=5)
grid_clf.fit(X, y.ravel())

#4.3 prediction and evaluation
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos']#label 
y_pred = grid_clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of LS+L2 on eICU-CRD dataset is :", mae)  
r2 = r2_score(y_test, y_pred)
print("R^2 Score of LS+L2 on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of LS+L2 on eICU-CRD dataset is :", ev)
#trainset
y_pred_tr = grid_clf.predict(X)
mae = mean_absolute_error(y, y_pred_tr)
print("MAE Score of LS+L2 on eICU-CRD trainset is :", mae)  
r2 = r2_score(y, y_pred_tr)
print("R^2 Score of LS+L2 on eICU-CRD trainset is :", r2) 
ev = explained_variance_score(y, y_pred_tr)
print("EV Score of LS+L2 on eICU-CRD trainset is :", ev)

The shape of trainset is : 108988,53
The shape of testset is : 27248,53
MAE Score of LS+L2 on eICU-CRD dataset is : 2.010006863171179
R^2 Score of LS+L2 on eICU-CRD dataset is : 0.09309768755163184
EV Score of LS+L2 on eICU-CRD dataset is : 0.09312384144652552
MAE Score of LS+L2 on eICU-CRD trainset is : 1.9892872461008608
R^2 Score of LS+L2 on eICU-CRD trainset is : 0.11570251878989302
EV Score of LS+L2 on eICU-CRD trainset is : 0.11570251879049842


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt
import tensorflow as tf
import sys
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

#1.1.1 design model
class TF_DNNRegressor_eICU:
    def __init__(self, lr=0.001, dim=52, num_class=1, batchSize=1000):
        #global parameters
        self.lr = lr
        self.dim = dim # dimensions of sample
        self.num_class = num_class #output 
        self.hidden_layers = [16, 4]
        #set network structure
        self.add_placeholders()
        self.add_weight()
        self.add_model()
        self.add_loss()
        self.add_optimizer()
        self.init_sess()
        
    def add_placeholders(self):    
        self.X_input = tf.placeholder("float", [None, self.dim])
        self.Y_input = tf.placeholder("float", [None, self.num_class])
        self.keep_prob = tf.placeholder(tf.float32)  
    
    def add_weight(self):
        # Store layers weight & bias
        #init_uniform = tf.random_uniform_initializer(minval=0, maxval=1, seed=None, dtype=tf.float32)
        self.weights = {
            'w1': tf.Variable(tf.random_normal([self.dim, self.hidden_layers[0]])),
            'w2': tf.Variable(tf.random_normal([self.hidden_layers[0], self.hidden_layers[1]])),
            #'w3': tf.Variable(tf.random_normal([self.hidden_layers[1], self.hidden_layers[2]])),
            #'w4': tf.Variable(tf.random_normal([self.hidden_layers[2], self.hidden_layers[3]])),
            'wout': tf.Variable(tf.random_normal([self.hidden_layers[1], self.num_class]))
        }
        self.biases = {
            'b1': tf.Variable(tf.random_normal([self.hidden_layers[0]])),
            'b2': tf.Variable(tf.random_normal([self.hidden_layers[1]])),
            #'b3': tf.Variable(tf.random_normal([self.hidden_layers[2]])),
            #'b4': tf.Variable(tf.random_normal([self.hidden_layers[3]])),
            'bout': tf.Variable(tf.random_normal([self.num_class]))
        }
        
    def add_model(self):
        # Hidden fully connected layer with 1024 neurons
        layer_1 =  tf.add(tf.matmul(self.X_input, self.weights['w1']), self.biases['b1']) 
        # Hidden fully connected layer with 256 neurons
        layer_2 = tf.add(tf.matmul(layer_1, self.weights['w2']), self.biases['b2']) 
        # Hidden fully connected layer with 128 neurons
        #layer_3 =  tf.add(tf.matmul(layer_2, self.weights['w3']), self.biases['b3']) 
        # Hidden fully connected layer with 32 neurons 
        #layer_4 =  tf.add(tf.matmul(layer_3, self.weights['w4']), self.biases['b4']) 
        # Output fully connected layer with a neuron for each class
        out_layer =tf.matmul(layer_2, self.weights['wout']) + self.biases['bout'] 
        self.Y_output =  out_layer
    
    def add_loss(self):
         self.loss = tf.losses.mean_squared_error( self.Y_input , self.Y_output ) 
            
    def add_optimizer(self):
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_step = optimizer.minimize(self.loss)
        
    def init_sess(self):
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.allow_soft_placement = True
        #self.config.gpu_options.per_process_gpu_memory_fraction = 0.5
        self.sess = tf.Session(config=self.config)
        self.sess.run(tf.global_variables_initializer())

#1.1.2 train model
#load trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
#min-max scale the continous features
ss = MinMaxScaler()
scale_features = ['ph', 'creatinine', 'albumin','diagnosis']
trainset[scale_features] = ss.fit_transform(trainset[scale_features])
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
Y = trainset['actualiculos'].to_frame()#label, from series to array
#define model
tf_model = TF_DNNRegressor_eICU()
#set paramete
verbose = 10
batchSize=1000
num_batches = X.shape[0] // batchSize + 1 
pre_loss = 0.0
while True:#convergence
    losses = []
    for i in range(num_batches):
        min_idx = i * batchSize
        max_idx = np.min([X.shape[0], (i+1)*batchSize])
        X_batch = X[min_idx: max_idx]
        Y_batch = Y[min_idx: max_idx]
        #_, tmp_loss, y_out = tf_model.sess.run([tf_model.train_step, tf_model.loss, tf_model.Y_output], 
        #                                 feed_dict={tf_model.X_input: X_batch,tf_model.Y_input: Y_batch})
        _, tmp_loss,  = tf_model.sess.run([tf_model.train_step, tf_model.loss], 
                                         feed_dict={tf_model.X_input: X_batch,tf_model.Y_input: Y_batch, tf_model.keep_prob: 0.6})
        losses.append(tmp_loss)
        if verbose and i % verbose == 0:
            sys.stdout.write('\r{} / {} : loss = {}'.format(i, num_batches, np.mean(losses[-verbose:])))
            sys.stdout.flush()
    sys.stdout.write("\nMean loss in this epoch is: {}".format( np.mean(losses) ))
    sys.stdout.flush()
    #whether convergence
    if abs( np.mean(losses) - pre_loss)<0.001:
        break
    else:
        pre_loss = np.mean(losses)

The shape of trainset is : 108988,53
100 / 109 : loss = 117.19596862792969
100 / 109 : loss = 42.032379150390625260742188
100 / 109 : loss = 28.67951011657715020019531
100 / 109 : loss = 23.9923191070556643515625
100 / 109 : loss = 21.177944183349612381591797
100 / 109 : loss = 19.33827209472656200366211
100 / 109 : loss = 18.082189559936523302734375
100 / 109 : loss = 17.193670272827155791015625
100 / 109 : loss = 16.54924774169922921875
100 / 109 : loss = 16.074808120727544012451172
100 / 109 : loss = 15.723123550415039506835938
100 / 109 : loss = 15.462282180786133409973145
100 / 109 : loss = 15.269572257995605306396484
100 / 109 : loss = 15.128169059753418700073242
100 / 109 : loss = 15.025276184082031871276855
100 / 109 : loss = 14.951022148132324326538086
100 / 109 : loss = 14.897807121276855020385742
100 / 109 : loss = 14.85982704162597700213623
100 / 109 : loss = 14.832713127136231952514648
100 / 109 : loss = 14.813230514526367242675781
100 / 109 : loss = 14.7990493774414068239

In [6]:
#1.1.3  prediction and evaluation
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
teststet[scale_features] = ss.fit_transform(teststet[scale_features])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos'].to_frame()#label 
#prediction
y_pred = tf_model.sess.run(tf_model.Y_output, feed_dict={tf_model.X_input: X_test,tf_model.Y_input: y_test, tf_model.keep_prob: 1}) 
#y_pred = tf_model.sess.run(tf.nn.relu(y_pred))
#np.set_printoptions(precision=4)
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of DNN on eICU-CRD dataset is :", mae)  
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Score of DNN on eICU-CRD dataset is :", rmse)  
r2 = r2_score(y_test, y_pred)
print("R^2 Score of DNN on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of DNN on eICU-CRD dataset is :", ev)

#trainset
y_pred_tr = tf_model.sess.run(tf_model.Y_output, feed_dict={tf_model.X_input: X,tf_model.Y_input: Y, tf_model.keep_prob: 1}) 
mae = mean_absolute_error(Y, y_pred_tr)
print("MAE Score of RandomForest on eICU-CRD trainset is :", mae)  
r2 = r2_score(Y, y_pred_tr)
print("R^2 Score of RandomForest on eICU-CRD trainset is :", r2) 
ev = explained_variance_score(Y, y_pred_tr)
print("EV Score of RandomForest on eICU-CRD trainset is :", ev)

The shape of testset is : 27248,53
MAE Score of DNN on eICU-CRD dataset is : 2.0309516431815196
RMSE Score of DNN on eICU-CRD dataset is : 4.231293240333169
R^2 Score of DNN on eICU-CRD dataset is : 0.09257889620076531
EV Score of DNN on eICU-CRD dataset is : 0.09262862172546693
MAE Score of RandomForest on eICU-CRD trainset is : 2.0105510213615463
R^2 Score of RandomForest on eICU-CRD trainset is : 0.11467417707790517
EV Score of RandomForest on eICU-CRD trainset is : 0.11484501810532255


In [3]:
import itertools
import pandas as pd
import tensorflow as tf
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
import warnings
warnings.filterwarnings('ignore')

#1.2.1 load trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
#min-max scale the continous features
ss = MinMaxScaler()
scale_features = ['ph', 'creatinine', 'albumin','diagnosis']
trainset[scale_features] = ss.fit_transform(trainset[scale_features])
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
Y = trainset['actualiculos']#label,

#1.2.2 train model
#Features filtered
FEATURES = X.columns
def get_input_fn(X,Y, num_epochs=None, shuffle=True):
    return tf.estimator.inputs.pandas_input_fn(
          x=pd.DataFrame({k: X[k].values for k in FEATURES}),
          y=pd.Series(Y.values),
          num_epochs=num_epochs,
          shuffle=shuffle)
#model
feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols, hidden_units=[16,4], model_dir="/data/tmpexec/eICU_DNN_model")
regressor.train(input_fn=get_input_fn(X,Y), steps=20000)

#1.2.3 performance
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
teststet[scale_features] = ss.fit_transform(teststet[scale_features])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos']#label 

predictions = regressor.predict(input_fn=get_input_fn(X_test, y_test, num_epochs=1, shuffle=False))
#predictions = list(p["predictions"] for p in itertools.islice(y_pred, 6))
#print("Predictions: {}".format(str(predictions)))
y_pred = []
for it in list(predictions):
    y_pred.append(it.get('predictions'))
    
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of DNN on eICU-CRD dataset is :", mae)  
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Score of DNN on eICU-CRD dataset is :", rmse)  
r2 = r2_score(y_test, y_pred)
print("R^2 Score of DNN on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of DNN on eICU-CRD dataset is :", ev)

#trainset
predictions = regressor.predict(input_fn=get_input_fn(X, Y, num_epochs=1, shuffle=False))
y_pred = []
for it in list(predictions):
    y_pred.append(it.get('predictions'))
mae = mean_absolute_error(Y, y_pred_tr)
print("MAE Score of RandomForest on eICU-CRD trainset is :", mae)  
r2 = r2_score(Y, y_pred_tr)
print("R^2 Score of RandomForest on eICU-CRD trainset is :", r2) 
ev = explained_variance_score(Y, y_pred_tr)
print("EV Score of RandomForest on eICU-CRD trainset is :", ev)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


The shape of trainset is : 108988,53
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/data/tmpexec/eICU_DNN_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f647b9a6bd0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Use Variable.read_va

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /data/tmpexec/eICU_DNN_model/model.ckpt-30000
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 30000 into /data/tmpexec/eICU_DNN_model/model.ckpt.
INFO:tensorflow:loss = 1006.29846, step = 30000
INFO:tensorflow:global_step/sec: 45.4205
INFO:tensorflow:loss = 2431.926, step = 30100 (2.206 sec)
INFO:tensorflow:global_step/sec: 47.4375
INFO:tensorflow:loss = 1007.73895, step = 30200 (2.107 sec)
INFO:tensorflow:global_step/sec: 48.

INFO:tensorflow:global_step/sec: 49.3305
INFO:tensorflow:loss = 2839.2075, step = 36600 (2.015 sec)
INFO:tensorflow:global_step/sec: 49.8592
INFO:tensorflow:loss = 1868.3276, step = 36700 (2.018 sec)
INFO:tensorflow:global_step/sec: 49.6779
INFO:tensorflow:loss = 691.3883, step = 36800 (2.011 sec)
INFO:tensorflow:global_step/sec: 48.8434
INFO:tensorflow:loss = 1386.9429, step = 36900 (2.040 sec)
INFO:tensorflow:global_step/sec: 49.407
INFO:tensorflow:loss = 1233.8008, step = 37000 (2.021 sec)
INFO:tensorflow:global_step/sec: 48.4738
INFO:tensorflow:loss = 2187.7734, step = 37100 (2.062 sec)
INFO:tensorflow:global_step/sec: 48.3505
INFO:tensorflow:loss = 1098.9229, step = 37200 (2.076 sec)
INFO:tensorflow:global_step/sec: 50.0654
INFO:tensorflow:loss = 1101.0891, step = 37300 (1.991 sec)
INFO:tensorflow:global_step/sec: 48.6586
INFO:tensorflow:loss = 1959.6177, step = 37400 (2.062 sec)
INFO:tensorflow:global_step/sec: 47.3643
INFO:tensorflow:loss = 587.26306, step = 37500 (2.103 sec)
IN

INFO:tensorflow:loss = 1799.4369, step = 44800 (2.008 sec)
INFO:tensorflow:global_step/sec: 49.5439
INFO:tensorflow:loss = 5218.819, step = 44900 (2.021 sec)
INFO:tensorflow:global_step/sec: 49.3478
INFO:tensorflow:loss = 2307.9275, step = 45000 (2.024 sec)
INFO:tensorflow:global_step/sec: 49.4064
INFO:tensorflow:loss = 1622.9044, step = 45100 (2.024 sec)
INFO:tensorflow:global_step/sec: 49.2794
INFO:tensorflow:loss = 880.43604, step = 45200 (2.033 sec)
INFO:tensorflow:global_step/sec: 49.1769
INFO:tensorflow:loss = 11720.0205, step = 45300 (2.030 sec)
INFO:tensorflow:global_step/sec: 48.6671
INFO:tensorflow:loss = 1298.5867, step = 45400 (2.054 sec)
INFO:tensorflow:global_step/sec: 48.8057
INFO:tensorflow:loss = 2862.152, step = 45500 (2.062 sec)
INFO:tensorflow:global_step/sec: 50.1388
INFO:tensorflow:loss = 2928.2598, step = 45600 (1.981 sec)
INFO:tensorflow:global_step/sec: 48.7867
INFO:tensorflow:loss = 1163.9479, step = 45700 (2.053 sec)
INFO:tensorflow:global_step/sec: 48.1142
I

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tmpexec/eICU_DNN_model/model.ckpt-50000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
MAE Score of DNN on eICU-CRD dataset is : 1.9692119945962727
RMSE Score of DNN on eICU-CRD dataset is : 4.1974371623713385
R^2 Score of DNN on eICU-CRD dataset is : 0.1070419977675452
EV Score of DNN on eICU-CRD dataset is : 0.10721543757116703
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tmpexec/eICU_DNN_model/model.ckpt-50000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
MAE Score of RandomForest on eICU-CRD trainset is : 1.9975701087029465
R^2 Score of RandomForest on eICU-CRD trainset is : 0.10931889942768958
EV Score of RandomForest on eICU-CRD trainset is : 0.10932520512546218


In [2]:
#Lasso anti-overfitting
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV
#1 load dataset
#trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))

#2 LR+L1 training
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
y = trainset['actualiculos']#label
param_grid = {'fit_intercept':[True,False],'alpha':[0.01,0.05,0.1,0.5]}
clf = linear_model.Lasso(normalize=False,random_state=0) #max_iter
grid_clf = GridSearchCV(clf, param_grid, cv=5)
grid_clf.fit(X, y.ravel())

#4.3 prediction and evaluation
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos']#label 
y_pred = grid_clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of LS+L1 on eICU-CRD dataset is :", mae)  
r2 = r2_score(y_test, y_pred)
print("R^2 Score of LS+L1 on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of LS+L1 on eICU-CRD dataset is :", ev)
#trainset
y_pred_tr = grid_clf.predict(X)
mae = mean_absolute_error(y, y_pred_tr)
print("MAE Score of RandomForest on eICU-CRD trainset is :", mae)  
r2 = r2_score(y, y_pred_tr)
print("R^2 Score of RandomForest on eICU-CRD trainset is :", r2) 
ev = explained_variance_score(y, y_pred_tr)
print("EV Score of RandomForest on eICU-CRD trainset is :", ev)

The shape of trainset is : 108988,53
The shape of testset is : 27248,53
MAE Score of LS+L1 on eICU-CRD dataset is : 2.0137342960250932
R^2 Score of LS+L1 on eICU-CRD dataset is : 0.0906148821108681
EV Score of LS+L1 on eICU-CRD dataset is : 0.09066660186217923
MAE Score of RandomForest on eICU-CRD trainset is : 1.9975701087029465
R^2 Score of RandomForest on eICU-CRD trainset is : 0.10931889942768958
EV Score of RandomForest on eICU-CRD trainset is : 0.10932520512546218


In [22]:
#Rondom Forest anti-overfitting
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV
#1 load dataset
#trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))

#2 RF training
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
y = trainset['actualiculos']#label
#param_grid = { 'n_estimators': [5, 10, 15, 20], 'max_depth': [10, 20, 30, 50] }
param_grid = { 'n_estimators': [5, 10, 15, 20], 'max_depth': [10, 20, 30, 50] }
clf = RandomForestRegressor(max_features='sqrt', min_samples_split=110, min_samples_leaf=20, oob_score=False, random_state=0)
grid_clf = GridSearchCV(clf, param_grid, cv=5)
grid_clf.fit(X, y.ravel())

#3 prediction and evaluation
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos']#label 
y_pred = grid_clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of RandomForest on eICU-CRD dataset is :", mae)   
r2 = r2_score(y_test, y_pred)
print("R^2 Score of RandomForest on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of RandomForest on eICU-CRD dataset is :", ev)

#trainset
y_pred_tr = grid_clf.predict(X)
mae = mean_absolute_error(y, y_pred_tr)
print("MAE Score of RandomForest on eICU-CRD trainset is :", mae)  
r2 = r2_score(y, y_pred_tr)
print("R^2 Score of RandomForest on eICU-CRD trainset is :", r2) 
ev = explained_variance_score(y, y_pred_tr)
print("EV Score of RandomForest on eICU-CRD trainset is :", ev)

The shape of trainset is : 108988,53
The shape of testset is : 27248,53
MAE Score of RandomForest on eICU-CRD dataset is : 1.9682482116184594
R^2 Score of RandomForest on eICU-CRD dataset is : 0.1118119960986127
EV Score of RandomForest on eICU-CRD dataset is : 0.11184313747406005
MAE Score of RandomForest on eICU-CRD trainset is : 1.8671441237747783
R^2 Score of RandomForest on eICU-CRD trainset is : 0.20101395441038294
EV Score of RandomForest on eICU-CRD trainset is : 0.2010139544312547


In [20]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
import sys
if "PRML/" not in sys.path:
    sys.path.append("PRML/")
from prml import nn
np.random.seed(1234)
class Gaussian(nn.Network):

    def __init__(self, shape):
        super().__init__()
        with self.set_parameter():
            self.m = nn.zeros(shape)
            self.s = nn.zeros(shape)

    def __call__(self):
        self.q = nn.Gaussian(self.m, nn.softplus(self.s) + 1e-8)
        return self.q.draw()


class BayesianNetwork(nn.Network):
    
    def __init__(self, n_input, n_output):
        super().__init__()
        with self.set_parameter():
            self.qw1 = Gaussian((n_input, 16))
            self.qb1 = Gaussian(16)
            self.qw2 = Gaussian((16, 4))
            self.qb2 = Gaussian(4)
            self.qw3 = Gaussian((4, n_output))
            self.qb3 = Gaussian(n_output)
        self.posterior = [self.qw1, self.qb1, self.qw2, self.qb2, self.qw3, self.qb3]
        self.prior = nn.Gaussian(0, 1)

    def __call__(self, x):
        h = x @ self.qw1() + self.qb1()
        h = h @ self.qw2() + self.qb2()
        return nn.Gaussian(h @ self.qw3() + self.qb3(), 1)
    
    def kl(self):
        kl = 0
        for pos in self.posterior:
            kl += nn.loss.kl_divergence(pos.q, self.prior).mean()
        return kl
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
#min-max scale the continous features
ss = MinMaxScaler()
scale_features = ['ph', 'creatinine', 'albumin','diagnosis']
trainset[scale_features] = ss.fit_transform(trainset[scale_features])
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
X = np.array(X)
Y = trainset['actualiculos'].to_frame()#label
model = BayesianNetwork(n_input=52, n_output=1)
optimizer = nn.optimizer.Adam(model.parameter, 0.1)
for i in range(20000):
    model.clear()
    py = model(X)
    elbo = py.log_pdf(Y).mean(0).sum() - model.kl() / len(X)
    optimizer.maximize(elbo)
    #if i % 100 == 0:
        #optimizer.learning_rate *= 0.9
    if i % 100 == 0:
        sys.stdout.write('\r{} / {}'.format(i, 20000))
        sys.stdout.flush()
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
teststet[scale_features] = ss.fit_transform(teststet[scale_features])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
X_test = np.array(X_test)
y_test = teststet['actualiculos'].to_frame()#label

#prediction 3
mae = 5.0
r2 = 0.0
ev = 0.0
for i in range(5000):#sample 2000, get best performance
    y_pred = model(X_test).mean.value
    t_mae = mean_absolute_error(y_test, y_pred)
    if t_mae<mae: mae = t_mae
    t_r2 = r2_score(y_test, y_pred)
    if t_r2>r2: r2 = t_r2
    t_ev = explained_variance_score(y_test, y_pred)
    if t_ev>ev: ev = t_ev
print("MAE Score of BNN on eICU-CRD dataset is :",mae)
print("R^2 Score of BNN on eICU-CRD dataset is :", r2) 
print("EV Score of BNN on eICU-CRD dataset is :", ev)

mae = 5.0
r2 = 0.0
ev = 0.0
for i in range(5000):#sample 2000, get best performance
    y_pred = model(X).mean.value
    t_mae = mean_absolute_error(Y, y_pred)
    if t_mae<mae: mae = t_mae
    t_r2 = r2_score(Y, y_pred)
    if t_r2>r2: r2 = t_r2
    t_ev = explained_variance_score(Y, y_pred)
    if t_ev>ev: ev = t_ev
print("MAE Score of BNN on eICU-CRD trainset is :",mae)
print("R^2 Score of BNN on eICU-CRD trainset is :", r2) 
print("EV Score of BNN on eICU-CRD trainset is :", ev)

The shape of trainset is : 108988,53
19900 / 20000The shape of testset is : 27248,53
MAE Score of BNN on eICU-CRD dataset is : 1.8815541461451715
RMSE Score of BNN on eICU-CRD dataset is : 4.229972967926525
R^2 Score of BNN on eICU-CRD dataset is : 0.09314508533793098
EV Score of BNN on eICU-CRD dataset is : 0.09337936295761196
MAE Score of BNN on eICU-CRD trainset is : 1.865988546279092
RMSE Score of BNN on eICU-CRD trainset is : 3.7454114650412134
R^2 Score of BNN on eICU-CRD trainset is : 0.11554723260836264
EV Score of BNN on eICU-CRD trainset is : 0.11561402012165778


In [14]:
import pandas as pd
import time
import copy
import math
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Optimizer
from sklearn.model_selection import KFold
from torchvision import datasets, transforms
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
print (torch.cuda.is_available())
print (torch.version.cuda)
print (torch.cuda.get_device_name(torch.cuda.current_device()))

True
9.0.176
GeForce RTX 2080 Ti


In [118]:
def to_variable(var=(), cuda=True, volatile=False):
    out = []
    for v in var:
        
        if isinstance(v, np.ndarray):
            v = torch.from_numpy(v).type(torch.FloatTensor)

        if not v.is_cuda and cuda:
            v = v.cuda()

        if not isinstance(v, Variable):
            v = Variable(v, volatile=volatile)

        out.append(v)
    return out

class gaussian:
    def __init__(self, mu, sigma):
        self.mu = mu
        self.sigma = sigma
        
    def loglik(self, weights):
        exponent = -0.5*(weights - self.mu)**2/self.sigma**2
        log_coeff = -0.5*(np.log(2*np.pi) + 2*np.log(self.sigma))
        
        return (exponent + log_coeff).sum()
    
class BayesLinear_Normalq(nn.Module):
    def __init__(self, input_dim, output_dim, prior):
        super(BayesLinear_Normalq, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.prior = prior
        
        self.weight_mus = nn.Parameter(torch.Tensor(self.input_dim, self.output_dim).uniform_(-0.01, 0.01))
        self.weight_rhos = nn.Parameter(torch.Tensor(self.input_dim, self.output_dim).uniform_(-3, -3))
        
    def forward(self, x):
        # sample gaussian noise for each weight
        weight_epsilons = Variable(self.weight_mus.data.new(self.weight_mus.size()).normal_())      
        # calculate the weight stds from the rho parameters
        weight_stds = torch.log(1 + torch.exp(self.weight_rhos))
        # calculate samples from the posterior from the sampled noise and mus/stds
        weight_sample = self.weight_mus + weight_epsilons*weight_stds
            
        torch.cuda.synchronize()
        output = torch.mm(x, weight_sample)
            
        # computing the KL loss term
        #reference: https://github.com/jojonki/AutoEncoders/blob/master/kl_divergence_between_two_gaussians.pdf
        prior_cov, varpost_cov = self.prior.sigma**2, weight_stds**2
        KL_loss = 0.5*(torch.log(prior_cov/varpost_cov)).sum() - 0.5*weight_stds.numel()
        KL_loss = KL_loss + 0.5*(varpost_cov/prior_cov).sum()
        KL_loss = KL_loss + 0.5*((self.weight_mus - self.prior.mu)**2/prior_cov).sum()
            
        return output, KL_loss
    
class BBP_Heteroscedastic_Model(nn.Module):
    def __init__(self, input_dim, output_dim, num_units):
        super(BBP_Heteroscedastic_Model, self).__init__()
        
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        # network with two hidden and one output layer
        self.layer1 = BayesLinear_Normalq(input_dim, num_units[0], gaussian(0, 1))
        self.layer2 = BayesLinear_Normalq(num_units[0], num_units[1], gaussian(0, 1))
        self.layer3 = BayesLinear_Normalq(num_units[1], output_dim, gaussian(0, 1))
        
        # activation to be used between hidden layers
        self.activation = nn.ReLU(inplace = True)
    
    def forward(self, x):
        
        KL_loss_total = 0
        x = x.view(-1, self.input_dim)
        
        x, KL_loss = self.layer1(x)
        KL_loss_total = KL_loss_total + KL_loss
        x = self.activation(x)
        
        x, KL_loss = self.layer2(x)
        KL_loss_total = KL_loss_total + KL_loss
        x = self.activation(x)
        
        x, KL_loss = self.layer3(x)
        KL_loss_total = KL_loss_total + KL_loss
        x = self.activation(x)
        
        return x, KL_loss_total
    
class BBP_Heteroscedastic_Model_Wrapper:
    def __init__(self, network, learn_rate, batch_size, no_batches):
        
        self.learn_rate = learn_rate
        self.batch_size = batch_size
        self.no_batches = no_batches
        
        self.network = network
        self.network.cuda()
        
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr = self.learn_rate)
        self.loss_func = nn.MSELoss() 
    
    def fit(self, x, y, no_samples):
        
        x, y = to_variable(var=(x, y), cuda=True)
        
        # reset gradient and total loss
        self.optimizer.zero_grad()
        fit_loss_total = 0
        
        for i in range(no_samples):
            output, KL_loss_total = self.network(x)
            
            # calculate fit loss based on mean and standard deviation of output
            fit_loss = self.loss_func(output,y)
            fit_loss_total = fit_loss_total + fit_loss
        
        KL_loss_total = KL_loss_total/self.no_batches
        total_loss = (fit_loss_total + KL_loss_total)/(no_samples*x.shape[0])
        total_loss.backward()
        self.optimizer.step()

        return fit_loss_total/no_samples, KL_loss_total

In [120]:
#load dataset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
#min-max scale the continous features
ss = MinMaxScaler()
scale_features = ['ph', 'creatinine', 'albumin','diagnosis']
trainset[scale_features] = ss.fit_transform(trainset[scale_features])
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
trainset = np.array(trainset)

#model build and trained
kf = KFold(n_splits=10)
in_dim = trainset.shape[1]-1
train_logliks, val_logliks = [], []
train_rmses, val_rmses = [], []
num_epochs = 100
log_every=10
best_net, best_loss = None, float('inf')

x_train, y_train = trainset[:, :in_dim], trainset[:, in_dim:]
batch_size, nb_train = len(x_train), len(x_train)

net = BBP_Heteroscedastic_Model_Wrapper(network=BBP_Heteroscedastic_Model(input_dim=in_dim, output_dim=1, num_units=[16,4]),
                                                learn_rate=1e-2, batch_size=batch_size, no_batches=1)

fit_loss_train = np.zeros(num_epochs)
KL_loss_train = np.zeros(num_epochs)
total_loss = np.zeros(num_epochs)

for i in range(num_epochs):
        
    fit_loss, KL_loss = net.fit(x_train, y_train, no_samples = 20)
    fit_loss_train[i] += fit_loss.cpu().data.numpy()
    KL_loss_train[i] += KL_loss.cpu().data.numpy()

    total_loss[i] = fit_loss_train[i] + KL_loss_train[i]

    if fit_loss < best_loss:
        best_loss = fit_loss
        best_net = copy.deepcopy(net.network)
            
    if i % log_every == 0 or i == num_epochs - 1:
        print('Epoch: %s/%d, fit_loss = %.3f' %(str(i+1).zfill(3), num_epochs, fit_loss))

The shape of trainset is : 108988,53
Epoch: 001/100, fit_loss = 24.491
Epoch: 011/100, fit_loss = 24.384
Epoch: 021/100, fit_loss = 19.911
Epoch: 031/100, fit_loss = 16.031
Epoch: 041/100, fit_loss = 14.950
Epoch: 051/100, fit_loss = 14.476
Epoch: 061/100, fit_loss = 14.615
Epoch: 071/100, fit_loss = 14.400
Epoch: 081/100, fit_loss = 14.462
Epoch: 091/100, fit_loss = 14.403
Epoch: 100/100, fit_loss = 14.519


In [123]:
#load testset
testset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
testset[scale_features] = ss.fit_transform(testset[scale_features])
print ('The shape of testset is : %d,%d'%(testset.shape[0],testset.shape[1]))
testset = np.array(testset)
x_test, y_test = testset[:, :in_dim], testset[:, in_dim:]
x, y = to_variable(var=(x_test, y_test), cuda=True)
#performance
'''
mae = []
rmse = []
r2 = []
ev = []
for i in range(200):#sample 
    output, KL_loss_total = best_net(x)
    y_pred = output.cpu().data.numpy()
    mae.append(mean_absolute_error(y_test, y_pred))
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    ev = explained_variance_score(y_test, y_pred)
print("MAE Score of BNN on eICU-CRD dataset is :", np.mean(mae))  
print("RMSE Score of BNN on eICU-CRD dataset is :", np.mean(rmse))   
print("R^2 Score of BNN on eICU-CRD dataset is :", np.mean(r2))  
print("EV Score of BNN on eICU-CRD dataset is :", np.mean(ev)) 
'''
mae = 5.0
rmse = 5.0
r2 = 0.0
ev = 0.0
for i in range(100):#sample 100, get best performance
    output, KL_loss_total = best_net(x)
    y_pred = output.cpu().data.numpy()
    t_mae = mean_absolute_error(y_test, y_pred)
    if t_mae<mae: mae = t_mae
    t_rmse = sqrt(mean_squared_error(y_test, y_pred))
    if t_rmse<rmse: rmse = t_rmse
    t_r2 = r2_score(y_test, y_pred)
    if t_r2>r2: r2 = t_r2
    t_ev = explained_variance_score(y_test, y_pred)
    if t_ev>ev: ev = t_ev
print("MAE Score of BNN on eICU-CRD dataset is :",mae)
print("RMSE Score of BNN on eICU-CRD dataset is :", rmse)  
print("R^2 Score of BNN on eICU-CRD dataset is :", r2) 
print("EV Score of BNN on eICU-CRD dataset is :", ev)

The shape of testset is : 27248,53
MAE Score of BNN on eICU-CRD dataset is : 1.8314177600661938
RMSE Score of BNN on eICU-CRD dataset is : 4.246108811476909
R^2 Score of BNN on eICU-CRD dataset is : 0.08621323081570487
EV Score of BNN on eICU-CRD dataset is : 0.08761417641399072
