In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt
from sklearn_rvm import RVR

In [None]:
#6.1 load dataset
#trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))

#6.2 SVR training
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
y = trainset['actualiculos']#label
clf = RVR(kernel='rbf',max_iter=2000)
clf.fit(X, y.ravel())

#6.3  prediction and evaluation
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos']#label 
y_pred = clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of RVR on eICU-CRD dataset is :", mae)  
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Score of RVR on eICU-CRD dataset is :", rmse)  
r2 = r2_score(y_test, y_pred)
print("R^2 Score of RVR on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of RVR on eICU-CRD dataset is :", ev)

The shape of trainset is : 108988,53
The shape of testset is : 27248,53




In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt

In [2]:
#5.1 load dataset
#trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))

#5.2 SVR training
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
y = trainset['actualiculos']#label
clf = SVR(kernel='rbf', gamma=0.1, max_iter=2000, C=0.5)
clf.fit(X, y.ravel())

#5.3  prediction and evaluation
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos']#label 
y_pred = clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of SVR on eICU-CRD dataset is :", mae)  
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Score of SVR on eICU-CRD dataset is :", rmse)  
r2 = r2_score(y_test, y_pred)
print("R^2 Score of SVR on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of SVR on eICU-CRD dataset is :", ev)

The shape of trainset is : 108988,53
The shape of testset is : 27248,53




MAE Score of SVR on eICU-CRD dataset is : 5.57542699480618
RMSE Score of SVR on eICU-CRD dataset is : 6.451687645777499
R^2 Score of SVR on eICU-CRD dataset is : -1.109644575741434
EV Score of SVR on eICU-CRD dataset is : -0.01209747607288203


In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV

In [2]:
#4.1 load dataset
#trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))

#4.2 LR+L1 training
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
y = trainset['actualiculos']#label
param_grid = {'fit_intercept':[True,False],'alpha':[0.01,0.05,0.1,0.5]}
clf = linear_model.Lasso(normalize=False,random_state=0) #max_iter
grid_clf = GridSearchCV(clf, param_grid, cv=5)
grid_clf.fit(X, y.ravel())

#4.3 prediction and evaluation
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos']#label 
y_pred = grid_clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of LS+L1 on eICU-CRD dataset is :", mae)  
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Score of LS+L1 on eICU-CRD dataset is :", rmse)  
r2 = r2_score(y_test, y_pred)
print("R^2 Score of LS+L1 on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of LS+L1 on eICU-CRD dataset is :", ev)

The shape of trainset is : 108988,53
The shape of testset is : 27248,53
MAE Score of LS+L1 on eICU-CRD dataset is : 2.0137342960250932
RMSE Score of LS+L1 on eICU-CRD dataset is : 4.2358698518271085
R^2 Score of LS+L1 on eICU-CRD dataset is : 0.0906148821108681
EV Score of LS+L1 on eICU-CRD dataset is : 0.09066660186217923


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV

In [2]:
#3.1 load dataset
#trainset
trainset = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv",sep=',',index_col=['patientunitstayid']) 
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
#testset
teststet = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv",sep=',',index_col=['patientunitstayid'])
print ('The shape of testset is : %d,%d'%(teststet.shape[0],teststet.shape[1]))

#3.2 RF training
X = trainset.drop(columns=["actualiculos"], inplace=False)  #feature
y = trainset['actualiculos']#label
param_grid = { 'n_estimators': [5, 10, 15, 20], 'max_depth': [10, 20, 30, 50] }
clf = RandomForestRegressor(max_features='sqrt', min_samples_split=110, min_samples_leaf=20, oob_score=False, random_state=0)
grid_clf = GridSearchCV(clf, param_grid, cv=5)
grid_clf.fit(X, y.ravel())

#3.3 prediction and evaluation
X_test = teststet.drop(columns=["actualiculos"], inplace=False)  #feature
y_test = teststet['actualiculos']#label 
y_pred = grid_clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE Score of RandomForest on eICU-CRD dataset is :", mae)  
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Score of RandomForest on eICU-CRD dataset is :", rmse)  
r2 = r2_score(y_test, y_pred)
print("R^2 Score of RandomForest on eICU-CRD dataset is :", r2) 
ev = explained_variance_score(y_test, y_pred)
print("EV Score of RandomForest on eICU-CRD dataset is :", ev)

The shape of trainset is : 108988,53
The shape of testset is : 27248,53
MAE Score of RandomForest on eICU-CRD dataset is : 1.9682482116184594
RMSE Score of RandomForest on eICU-CRD dataset is : 4.186211226825111
R^2 Score of RandomForest on eICU-CRD dataset is : 0.1118119960986127
EV Score of RandomForest on eICU-CRD dataset is : 0.11184313747406005


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
from math import sqrt

In [2]:
#2.1 load prediction 
#prediction of ApacheIV
ApacheIV = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/ApacheIV.csv",sep=',',index_col=['patientunitstayid']) 
print ('The shape of ApacheIV is : %d,%d'%(ApacheIV.shape[0],ApacheIV.shape[1]))
#prediction of ApacheIVa
ApacheIVa = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/EMBC2020/ApacheIVa.csv",sep=',',index_col=['patientunitstayid'])
print ('The shape of ApacheIVa is : %d,%d'%(ApacheIVa.shape[0],ApacheIVa.shape[1]))

#2.2 Performance metrics of ApacheIV: MAE,RMSE,R^2,EV
apacheIV_mae = mean_absolute_error(ApacheIV['actualiculos'], ApacheIV['predictediculos'])
print("MAE Score of ApacheIV on eICU-CRD dataset is :", apacheIV_mae)  
apacheIV_rmse = sqrt(mean_squared_error(ApacheIV['actualiculos'], ApacheIV['predictediculos']))
print("RMSE Score of ApacheIV on eICU-CRD dataset is :", apacheIV_rmse)  
apacheIV_r2 = r2_score(ApacheIV['actualiculos'], ApacheIV['predictediculos'])
print("R^2 Score of ApacheIV on eICU-CRD dataset is :", apacheIV_r2) 
apacheIV_ev = explained_variance_score(ApacheIV['actualiculos'], ApacheIV['predictediculos'])
print("EV Score of ApacheIV on eICU-CRD dataset is :", apacheIV_ev) 

#2.3 Performance metrics of ApacheIVa: MAE,RMSE,R^2,EV
apacheIVa_mae = mean_absolute_error(ApacheIVa['actualiculos'], ApacheIVa['predictediculos'])
print("MAE Score of ApacheIVa on eICU-CRD dataset is :", apacheIVa_mae)  
apacheIVa_rmse = sqrt(mean_squared_error(ApacheIVa['actualiculos'], ApacheIVa['predictediculos']))
print("RMSE Score of ApacheIVa on eICU-CRD dataset is :", apacheIVa_rmse)  
apacheIVa_r2 = r2_score(ApacheIVa['actualiculos'], ApacheIVa['predictediculos'])
print("R^2 Score of ApacheIVa on eICU-CRD dataset is :", apacheIVa_r2) 
apacheIVa_ev = explained_variance_score(ApacheIVa['actualiculos'], ApacheIVa['predictediculos'])
print("EV Score of ApacheIVa on eICU-CRD dataset is :", apacheIVa_ev)

The shape of ApacheIV is : 27248,2
The shape of ApacheIVa is : 27248,2
MAE Score of ApacheIV on eICU-CRD dataset is : 2.250560756750709
RMSE Score of ApacheIV on eICU-CRD dataset is : 4.289074218220265
R^2 Score of ApacheIV on eICU-CRD dataset is : 0.06762686874463264
EV Score of ApacheIV on eICU-CRD dataset is : 0.07628212872943518
MAE Score of ApacheIV on eICU-CRD dataset is : 2.4540459386771154
RMSE Score of ApacheIV on eICU-CRD dataset is : 4.379843254866504
R^2 Score of ApacheIV on eICU-CRD dataset is : 0.027745943839827536
EV Score of ApacheIV on eICU-CRD dataset is : 0.0658870874517633


In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
#1.1 load APS data
dataApsVar = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/apacheApsVar.csv")    # Acute Physiology Score, APS
dataApsVar.drop(columns=["apacheapsvarid"], inplace=True)  # get rid of "apacheapsvarid"

#1.2 load chronic data
dataPredVar = pd.read_csv("/data/fjsdata/physionet/eICU-CRD/apachePredVar.csv")  # Chronic and patient information
#get the diagnostic text firstly.
admit_diagnosis = dataPredVar.loc[:, ['admitdiagnosis']]  #diagnostic text
admit_diagnosis.fillna('-1', inplace=True)  # fill NaN with unknown, add a category
admit_diagnosis = LabelEncoder().fit_transform(admit_diagnosis)  #encoder
#Select necessary features
dataPredVar = dataPredVar[['age','thrombolytics','aids','hepaticfailure','lymphoma','metastaticcancer',
                            'leukemia','immunosuppression','cirrhosis','electivesurgery','readmit','visitnumber']] 

#1.3 Concate table apacheApsVar and apachePredVar to form features dataset.
dataApache = pd.concat([dataApsVar, dataPredVar], axis=1)
dataApache['age'].fillna(dataApache['age'].mean(), inplace=True)  # fill NaN of field age with mean
dataApache['electivesurgery'].fillna(-1, inplace=True)  # fill NaN of field electivesurgery with -1, add a category
dataApache.set_index(["patientunitstayid"], inplace=True)  # set patientunitstayid as index
dataApache['diagnosis'] = admit_diagnosis #add feature diagnosis

#1.4 load label dataset, includes Length of Stay  and Mordality.
dataPatientResult =  pd.read_csv("/data/fjsdata/physionet/eICU-CRD/apachePatientResult.csv")

#1.5 Summary the valid index according to the table apachePatientResult
neg_index = dataPatientResult[dataPatientResult['predictedhospitallos'].values == -1].index  # find index with invalid label 
dataPatientResult.drop(index=neg_index, inplace=True)  #drop invalid index
saveid = dataPatientResult['patientunitstayid']  # valid index
saveid.drop_duplicates(keep='first',inplace=True)  # delete duplicate index
dataApache = dataApache.loc[saveid.values]  # get index with valid label
print ('The name of columns of dataset are: %s'%(dataApache.columns))

#1.6 Feature processing
#min-max scale the continous features
ss = MinMaxScaler()
scale_features = ['urine', 'wbc', 'temperature','respiratoryrate','sodium','heartrate','meanbp',
                  'hematocrit','pao2','pco2','bun','glucose','fio2','age']
dataApache[scale_features] = ss.fit_transform(dataApache[scale_features])
#one-hot the discrete features
categorical_features = ['eyes', 'motor', 'verbal']
dataApache = pd.get_dummies(dataApache, columns = categorical_features)#onehot

#1.7 get Label LoS
gp = dataPatientResult.groupby(['apacheversion'])
val = []
for i in gp:
    val.append(i[1])
# Separate according to version IV and IVa
val[1].index = dataApache.index   
val[0].index = dataApache.index
ApacheIV = val[0][['predictediculos','actualiculos']] #ApacheIV prediction
ApacheIVa = val[1][['predictediculos','actualiculos']] #ApacheIVa prediction
dataApache_LoS = val[1]['actualiculos']  # Label of LoS

#1.8 Split and store the datset into trainset and testset
X_train, X_test, y_train, y_test = train_test_split(dataApache, dataApache_LoS, test_size=0.2, random_state=0) #8:2
trainset = pd.concat([X_train, y_train], axis=1)
trainset.to_csv('/data/fjsdata/physionet/eICU-CRD/EMBC2020/trainset.csv', sep=',', encoding='utf-8') #store index
print ('The shape of trainset is : %d,%d'%(trainset.shape[0],trainset.shape[1]))
testset = pd.concat([X_test, y_test], axis=1)
testset.to_csv('/data/fjsdata/physionet/eICU-CRD/EMBC2020/testset.csv', sep=',', encoding='utf-8') #store index
print ('The shape of testset is : %d,%d'%(testset.shape[0],testset.shape[1]))
ApacheIV = ApacheIV.loc[testset.index]
ApacheIV.to_csv('/data/fjsdata/physionet/eICU-CRD/EMBC2020/ApacheIV.csv', sep=',', encoding='utf-8') #store index
ApacheIVa = ApacheIVa.loc[testset.index]
ApacheIVa.to_csv('/data/fjsdata/physionet/eICU-CRD/EMBC2020/ApacheIVa.csv', sep=',', encoding='utf-8') #store index
assert ApacheIV.shape[0] == ApacheIVa.shape[0] and ApacheIV.shape[0] == testset.shape[0]

  y = column_or_1d(y, warn=True)


The name of columns of dataset are: Index(['intubated', 'vent', 'dialysis', 'eyes', 'motor', 'verbal', 'meds',
       'urine', 'wbc', 'temperature', 'respiratoryrate', 'sodium', 'heartrate',
       'meanbp', 'ph', 'hematocrit', 'creatinine', 'albumin', 'pao2', 'pco2',
       'bun', 'glucose', 'bilirubin', 'fio2', 'age', 'thrombolytics', 'aids',
       'hepaticfailure', 'lymphoma', 'metastaticcancer', 'leukemia',
       'immunosuppression', 'cirrhosis', 'electivesurgery', 'readmit',
       'visitnumber', 'diagnosis'],
      dtype='object')
The shape of trainset is : 108988,53
The shape of testset is : 27248,53
