In [1]:
import pandas as pd

# Load data

In [2]:
df = pd.read_csv('hospital_mortality.csv')
df.head(5)

Unnamed: 0,PATIENT_ID,Age,Gender,Height,ICUType,BUN_Min,Creatinine_Min,GCS_Min,Glucose_Min,HCO3_Min,...,Na_Range,PaCO2_Range,PaO2_Range,Platelets_Range,Temp_Range,Urine_Range,WBC_Range,Weight_Range,pH_Range,In-hospital_death
0,132539.0,54.0,0.0,-1.0,4.0,8.0,0.7,14.0,115.0,26.0,...,1.0,,,36.0,3.1,900.0,1.8,0.0,,0
1,132540.0,76.0,1.0,175.3,2.0,16.0,0.8,3.0,105.0,21.0,...,4.0,13.0,363.0,91.0,3.4,770.0,5.9,5.6,0.11,0
2,132541.0,44.0,0.0,-1.0,3.0,3.0,0.3,5.0,119.0,24.0,...,3.0,4.0,167.0,41.0,2.3,407.0,2.5,0.0,0.04,0
3,132543.0,68.0,1.0,180.3,3.0,10.0,0.7,14.0,106.0,27.0,...,4.0,,,107.0,1.9,600.0,3.6,0.0,,0
4,132545.0,88.0,0.0,-1.0,3.0,25.0,1.0,15.0,92.0,18.0,...,1.0,,,12.0,2.0,204.0,1.0,0.0,,0


### How many patients have 20 or more missing feature values? 

In [3]:
#### creating a column that counts the number of missing features 
df['miss_cnt']=df.isnull().sum(axis=1)

#### since the number of patients is asked, here we are making sure each record in the dataset corresponds to one patient (not duplicate)
assert len(df)==len(df['PATIENT_ID'].unique())

#### print out the result 
print( "Number of patients that have 20 or more missing feature values is:", len(df[df['miss_cnt']>=20]))

Number of patients that have 20 or more missing feature values is: 210


### What’s the difference in means of HR_min for patients that died vs survived (not counting patients with HR_min of  0)?

In [4]:
#### filter null value and zero value HR_Min rows 
df_cleaned=df[(df['HR_Min'].notnull())&(df['HR_Min']!=0)]

#### filter the dataframe into two groups: survivors and dead 
df_survived=df_cleaned[df_cleaned['In-hospital_death']==0]
df_death=df_cleaned[df_cleaned['In-hospital_death']==1]

#### print out the result 
print("The difference in means of hr_min for patients that dies and survived is:", df_death['HR_Min'].mean()-df_survived['HR_Min'].mean())

The difference in means of hr_min for patients that dies and survived is: 1.6746448633591484


### What is the median maximum heart rate for patients whose maximum temperature was 2 degrees Celsius higher than their minimum temperature?

In [5]:
#### print the descrition of these two columns to make sure there is no outlier values 
#### For example there might be extremly high or low temperature, and in those cases dataset 
#### needs to be further cleaned
print("description of Temp max: ")
print(df['Temp_Max'].describe())

print("descrition of Temp min: ")
print(df['Temp_Min'].describe())

description of Temp max: 
count    3936.000000
mean       37.929065
std         0.747168
min        35.600000
25%        37.400000
50%        37.800000
75%        38.400000
max        42.100000
Name: Temp_Max, dtype: float64
descrition of Temp min: 
count    3936.000000
mean       35.026118
std         6.010021
min       -17.800000
25%        35.400000
50%        35.900000
75%        36.400000
max        38.800000
Name: Temp_Min, dtype: float64


In [6]:
#### As we see from above, min value for Temp_min is -17.8000 , which is very low. We need to clean this dataset.  
#### We print out the Temp_min of all the patiens who have Temp_min values of lower than 0 

print(df[df['Temp_Min']<=0]['Temp_Min'].describe())

count    63.000000
mean     -6.696825
std       8.630571
min     -17.800000
25%     -17.800000
50%       0.000000
75%       0.000000
max       0.000000
Name: Temp_Min, dtype: float64


In [30]:
#### As we can see above, there are 63 patients who have Temp_Min equal to or smaller than 0; 
#### Suspecting these patients might be dead petients, we looked at their In-hospital_death as well; 

In [7]:
print(df[df['Temp_Min']<=0][['In-hospital_death','Temp_Min']])

      In-hospital_death  Temp_Min
45                    0       0.0
102                   0       0.0
149                   0     -17.8
190                   0       0.0
389                   0       0.0
418                   0       0.0
608                   1     -17.8
673                   0       0.0
702                   0       0.0
877                   0       0.0
895                   0       0.0
992                   0       0.0
1017                  1       0.0
1030                  0       0.0
1113                  0       0.0
1144                  1       0.0
1160                  0       0.0
1166                  0       0.0
1182                  0     -17.8
1217                  0       0.0
1233                  0       0.0
1356                  0       0.0
1367                  0     -17.8
1413                  0     -17.8
1423                  0       0.0
1560                  0     -17.8
1589                  0     -17.8
1643                  0       0.0
1646          

In [8]:
#### As can be seen above, most of these patients did not die in hospital. But they have Temp_min of negative value 
#### This could be a data quality issue. We need to further investigate this. 
#### In our calculation, we filtered out patienst who have Temp_min values of equal to or smaller than 0 
#### Depending on medical knowledge, we could further filter out. For example, we could filter out patients who have 
#### Temp_min value of smaller than 36.5 

df_temp_cleaned=df[(df['Temp_Max'].notnull())&(df['Temp_Min'].notnull())&(df['HR_Max'].notnull())&(df['Temp_Min']>0)]
df_two_c=df_temp_cleaned[(df_temp_cleaned['Temp_Max']==df_temp_cleaned['Temp_Min']+2)]
print("The median maximum heart rate for patients whose maximum temperature was 2 degrees Celsius higher than their minimum temperature is:", df_two_c['HR_Max'].median())

The median maximum heart rate for patients whose maximum temperature was 2 degrees Celsius higher than their minimum temperature is: 109.0


### Create a plot to analyze the relationship between median temperature, outcome (died vs. survived), and age.

In [9]:
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly import tools
import plotly.graph_objs as graph_objs
init_notebook_mode(connected=True)

In [10]:
df_m_temp_clean=df[df['Temp_Median'].notnull()]
df_clean_dead=df_m_temp_clean[(df_m_temp_clean['In-hospital_death']==1)&(df_m_temp_clean['Age'].notnull())]
df_clean_survive=df_m_temp_clean[(df_m_temp_clean['In-hospital_death']==0)&(df_m_temp_clean['Age'].notnull())]

In [12]:
trace1 = {"x":df_clean_dead['Age'].tolist() , 
          "y":df_clean_dead['Temp_Median'].tolist()  , 
          "marker": {"color": "pink", "size": 12}, 
          "mode": "markers", 
          "name": "dead", 
          "type": "scatter"
}

trace2 = {"x": df_clean_survive['Age'].tolist() , 
          "y": df_clean_survive['Temp_Median'].tolist(), 
          "marker": {"color": "green", "size": 12}, 
          "mode": "markers", 
          "name": "survice", 
          "type": "scatter", 
}
data=[trace1]
fig = graph_objs.Figure(data=data)
fig['layout'].update(title='Median Temperature and Age (dead)',
                    xaxis=dict( title='Age'), 
                    yaxis=dict( title='Median Median Temperature '),
                                        ), 
iplot(fig)

In [13]:
data=[trace2]
fig = graph_objs.Figure(data=data)
fig['layout'].update(title='Median Temperature and Age (survive)',
                    xaxis=dict( title='Age'), 
                    yaxis=dict( title='Median Median Temperature '),
                                        ), 
iplot(fig)

### Create a plot to analyze the relationship between median temperature, outcome (died vs. survived), and gender.

In [14]:
## Your code here
df_m_temp_clean=df[df['Temp_Median'].notnull()]

df_clean_dead_male=df_m_temp_clean[(df_m_temp_clean['In-hospital_death']==1)&(df_m_temp_clean['Gender']==1)]
df_clean_dead_female=df_m_temp_clean[(df_m_temp_clean['In-hospital_death']==1)&(df_m_temp_clean['Gender']==0)]

df_clean_survive_male=df_m_temp_clean[(df_m_temp_clean['In-hospital_death']==0)&(df_m_temp_clean['Gender']==1)]
df_clean_survive_female=df_m_temp_clean[(df_m_temp_clean['In-hospital_death']==0)&(df_m_temp_clean['Gender']==0)]

In [15]:
trace1 = {"x":df_clean_dead_female['Age'].tolist() , 
          "y":df_clean_dead_female['Temp_Median'].tolist()  , 
          "marker": {"color": "pink", "size": 12}, 
          "mode": "markers", 
          "name": "FEMALE", 
          "type": "scatter"
}

trace2 = {"x": df_clean_dead_male['Age'].tolist() , 
          "y": df_clean_dead_male['Temp_Median'].tolist(), 
          "marker": {"color": "blue", "size": 12}, 
          "mode": "markers", 
          "name": "MALE", 
          "type": "scatter", 
}
data=[trace1,trace2]
fig = graph_objs.Figure(data=data)
fig['layout'].update(title='Median Temperature and Gender (dead)',
                    xaxis=dict( title='Age'), 
                    yaxis=dict( title='Median Median Temperature '),
                                        ), 
iplot(fig)

In [16]:
trace1 = {"x":df_clean_survive_female['Age'].tolist() , 
          "y":df_clean_survive_female['Temp_Median'].tolist()  , 
          "marker": {"color": "pink", "size": 12}, 
          "mode": "markers", 
          "name": "FEMALE", 
          "type": "scatter"
}

trace2 = {"x": df_clean_survive_male['Age'].tolist() , 
          "y": df_clean_survive_male['Temp_Median'].tolist(), 
          "marker": {"color": "green", "size": 12}, 
          "mode": "markers", 
          "name": "MALE", 
          "type": "scatter", 
}
data=[trace1,trace2]
fig = graph_objs.Figure(data=data)
fig['layout'].update(title='Median Temperature and Gender (dead)',
                    xaxis=dict( title='Age'), 
                    yaxis=dict( title='Median Median Temperature '),
                                        ), 
iplot(fig)

### Build and summarize the results of a machine learning model that predicts whether a patient dies or survives in the hospital. (We do not expect a perfect or highest-accuracy solution; this is primarily an opportunity for you to show your machine learning project workflow and to document your thought process, approach to solving problems, and interpretation of results.)

There are many steps in this prediction model. The steps are : 

1. clean the dataset (drop na values)
2. split the dataset into training and testing data (80% training, 20% testing)
3. train a random forest model and get evaluation result on testing data (Accuracy and AUC score)
4. get feature importance from the model trained in step 3 
5. select most useful n features from the entire features (we will explain this later in details)
6. train a random forest model on the features selected in step 5. 
7. further improve model accuracy through hyperparameter tuning process. 
8. provide further insights on the model parameters and feature importance 

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

In [18]:
#### drop patient_id column, drop rows with nan values. There are other ways we can clean this dataset. 
#### But for simplicity, we chose to drop nan values. 
print("Length of dataframe before cleaning is:", len(df))
df.drop(['PATIENT_ID'], axis=1, inplace=True)
df.dropna(inplace=True)
print("Length of dataframe after cleaning is:", len(df))

Length of dataframe before cleaning is: 4000
Length of dataframe after cleaning is: 2450


In [19]:
#### create features and label list 

all_columns=list(df.columns.values)
features=[x for x in all_columns if x !='In-hospital_death']
label=['In-hospital_death']
assert len(all_columns)==len(features)+len(label)

In [20]:
X=df[features]  # Features
y=df[label]  # Labels


# Split dataset into training set and test set. We set the random state at 99 to make sure the result is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

In [21]:
#### create a random forest model and train the model. We set the random state at 18 to make sure the result is reproducible
clf=RandomForestClassifier(class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,random_state=18)

#### fit the model 
clf.fit(X_train,y_train)

#### make prediction 
y_pred=clf.predict(X_test)


#### evaluate our model 
print("Accuracy:",accuracy_score(y_test, y_pred))
print ("Auc score is", roc_auc_score(y_test, y_pred))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Accuracy: 0.8734693877551021
Auc score is 0.5812780071274832


In [22]:
####  Now we get the feature importance and select top n features. n here is a hyperparameter of the model and we can 
#### Optimize for n through cross validation 

In [23]:
#### getting feature importance 
feature_imp = pd.Series(clf.feature_importances_,index=features).sort_values(ascending=False)

##### getting top 20 features

top_features=feature_imp.index[:40]

X=df[top_features]  # Features
y=df[label]  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99) 

In [24]:
#### now train a random forest model only considering these 40 features 
clf_selected=RandomForestClassifier(class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,random_state=18)

#### train the model and make prediction 
clf_selected.fit(X_train,y_train)
y_pred=clf_selected.predict(X_test)

#### model metrics 
print("Accuracy:",accuracy_score(y_test, y_pred))
print ("Auc score is", roc_auc_score(y_test, y_pred))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Accuracy: 0.8591836734693877
Auc score is 0.5918457358597086


In [25]:
#### our accuracy did not improve, but our AUC score improved 

In [26]:
#### now, we conduct hyperparameter tuning to further improve our model 

In [27]:
rfc=RandomForestClassifier(random_state=18)
param_grid = {
    'n_estimators': [50,100,200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,10,20],
    'criterion' :['gini', 'entropy'],
    'min_samples_split':[2,3,4]
}
CV_rfc = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid, cv=5,verbose=10)
CV_rfc.fit(X_train,y_train)
best_param=CV_rfc.best_params_

print("best parameters of the model is", best_param)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini, score=0.8447837150127226, total=   0.5s
[CV] n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini, score=0.8549618320610687, total=   0.5s
[CV] n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini, score=0.8469387755102041, total=   0.5s
[CV] n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.6s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini, score=0.8465473145780051, total=   0.5s
[CV] n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.2s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=3, max_features=log2, max_depth=4, criterion=gini, score=0.8414322250639387, total=   0.6s
[CV] n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini 
[CV]  n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini, score=0.8524173027989822, total=   0.2s
[CV] n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.8s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.0s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini, score=0.8447837150127226, total=   0.2s
[CV] n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.2s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini, score=0.8571428571428571, total=   0.2s
[CV] n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.5s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini, score=0.8491048593350383, total=   0.2s
[CV] n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    3.7s remaining:    0.0s

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=log2, max_depth=8, criterion=gini, score=0.8439897698209718, total=   0.2s
[CV] n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy, score=0.8473282442748091, total=   0.3s
[CV] n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy, score=0.8549618320610687, total=   0.3s
[CV] n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy, score=0.8494897959183674, total=   0.2s
[CV] n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy, score=0.8363171355498721, total=   0.3s
[CV] n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=2, max_features=sqrt, max_depth=7, criterion=entropy, score=0.8491048593350383, total=   0.3s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy, score=0.8473282442748091, total=   1.4s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy, score=0.8549618320610687, total=   2.0s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy, score=0.8469387755102041, total=   1.2s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy, score=0.8465473145780051, total=   1.4s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=7, criterion=entropy, score=0.8465473145780051, total=   1.1s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy, score=0.8473282442748091, total=   0.2s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy, score=0.8473282442748091, total=   0.2s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy 
[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy, score=0.8443877551020408, total=   0.2s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy, score=0.8465473145780051, total=   0.2s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy 
[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=4, criterion=entropy, score=0.8465473145780051, total=   0.2s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini, score=0.8473282442748091, total=   0.7s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini, score=0.8549618320610687, total=   0.7s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini, score=0.8494897959183674, total=   0.7s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini, score=0.8465473145780051, total=   0.7s
[CV] n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=2, max_features=auto, max_depth=5, criterion=gini, score=0.8516624040920716, total=   0.7s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy, score=0.8422391857506362, total=   0.3s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy, score=0.8651399491094147, total=   0.3s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy, score=0.8469387755102041, total=   0.4s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy, score=0.8388746803069054, total=   0.3s
[CV] n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=3, max_features=auto, max_depth=7, criterion=entropy, score=0.8465473145780051, total=   0.3s
[CV] n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini, score=0.8473282442748091, total=   0.6s
[CV] n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini, score=0.8447837150127226, total=   0.3s
[CV] n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini, score=0.8469387755102041, total=   0.4s
[CV] n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini, score=0.8465473145780051, total=   0.4s
[CV] n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=100, min_samples_split=2, max_features=sqrt, max_depth=4, criterion=gini, score=0.8439897698209718, total=   0.3s
[CV] n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini, score=0.8498727735368957, total=   0.2s
[CV] n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini, score=0.8422391857506362, total=   0.2s
[CV] n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini, score=0.8494897959183674, total=   0.3s
[CV] n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini, score=0.8465473145780051, total=   0.3s
[CV] n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=50, min_samples_split=4, max_features=log2, max_depth=7, criterion=gini, score=0.8516624040920716, total=   0.3s
[CV] n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini, score=0.8447837150127226, total=   1.0s
[CV] n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini, score=0.8524173027989822, total=   1.2s
[CV] n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini, score=0.8520408163265306, total=   1.2s
[CV] n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini, score=0.8414322250639387, total=   1.0s
[CV] n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[CV]  n_estimators=200, min_samples_split=4, max_features=sqrt, max_depth=6, criterion=gini, score=0.8516624040920716, total=   0.9s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   28.0s finished

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



best parameters of the model is {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'auto', 'max_depth': 5, 'criterion': 'gini'}


In [29]:
##### now create a model with best parameters and evaluate the result 
rfc_best=RandomForestClassifier(random_state=18, max_features=CV_rfc.best_params_['max_features'], n_estimators=CV_rfc.best_params_['n_estimators'], max_depth=CV_rfc.best_params_['max_depth'], criterion=CV_rfc.best_params_['criterion'])
rfc_best.fit(X_train,y_train)
y_pred=rfc_best.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_pred))
print ("Auc score is", roc_auc_score(y_test, y_pred))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Accuracy: 0.863265306122449
Auc score is 0.5502452277618997


In [32]:
####Insights: After tuning the parameter, our accuracy has improved, and the auc score dropped. 
####Depending on what metric we are optimizing for we could choose to optimize for auc in the future.
####From the model of best parameters, we obtained the feature importance score. As can be seen, GCS_mean is the most important feature. 
#### Top five important features that impact the survivial in the hospital are: GCS_Mean, Urine_Mean, GCS_Median, GCS_Max, BUN_Median.
feature_imp_best = pd.Series(rfc_best.feature_importances_,index=top_features).sort_values(ascending=False)
feature_imp_best

GCS_Mean            0.077794
Urine_Mean          0.075312
GCS_Median          0.058797
GCS_Max             0.051129
BUN_Median          0.043245
Urine_Median        0.037374
Urine_Max           0.035143
BUN_Min             0.033719
HCO3_Mean           0.031275
BUN_Mean            0.030640
Urine_Range         0.027253
BUN_Max             0.026234
Temp_Mean           0.025559
PaCO2_Mean          0.023178
Temp_Median         0.022809
HCO3_Min            0.021997
Platelets_Median    0.021226
Platelets_Mean      0.021014
Glucose_Mean        0.020927
HCO3_Median         0.020860
pH_Mean             0.020207
Age                 0.019252
pH_Min              0.019232
PaCO2_Median        0.017885
NIMAP_Median        0.017878
WBC_Mean            0.017870
Platelets_Min       0.017620
NIMAP_Mean          0.017417
Glucose_Max         0.015290
WBC_Min             0.015097
WBC_Max             0.013446
HR_Mean             0.013362
HR_Min              0.012870
PaO2_Mean           0.012826
NISysABP_Max  