In [169]:
%matplotlib inline
# Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [170]:
# Reading in data from the 50-50 readmit vs. nonreadmit dataset

df = pd.read_csv('dfd.csv')
df.head()

Unnamed: 0,subject_id,hadm_id,ethnicity,insurance,gender,admittime,dischtime,age_on_admiss,length_of_stay,readmit_time,...,vancomycin,neosynephrine,neostigmine,glycopyrrolate,ceftriaxone,atropine sulfate,chlorhexidine,nitroglycerin,nitroprusside,lansoprazole
0,36,182104,WHITE,Medicare,M,2131-04-30,2131-05-08,70.0,8,0.0,...,True,True,True,True,False,False,False,True,False,False
1,68,170467,BLACK/AFRICAN AMERICAN,Medicare,F,2173-12-15,2174-01-03,42.0,19,0.0,...,True,False,False,False,False,False,False,False,False,False
2,103,130744,UNKNOWN/NOT SPECIFIED,Private,F,2144-08-12,2144-08-20,60.0,8,0.0,...,False,False,False,False,False,False,False,False,True,False
3,105,161160,BLACK/AFRICAN AMERICAN,Medicare,F,2189-01-28,2189-02-02,35.0,5,0.0,...,False,False,False,False,True,False,False,False,False,False
4,109,164029,BLACK/AFRICAN AMERICAN,Medicaid,F,2140-01-19,2140-01-21,22.0,2,0.0,...,False,False,False,False,False,True,False,False,False,False


In [171]:
# Dropping certain columns

# Dropping ID numbers and dates
df = df.drop(columns=['subject_id', 'hadm_id', 'admittime', 'dischtime'])

# Dropping labevents and chartevents values of less importance based on feature selection (2/3 values for each measurement)
df = df.drop(columns=['rdw_min', 'rdw_max', 'hemoglobin_min', 'hemoglobin_max', 'creatinine_median', 'creatinine_min', 
                      'hematocrit_median', 'hematocrit_min', 'tempc_median', 'tempc_max', 'resprate_median', 
                      'resprate_min', 'wbc_median', 'wbc_max', 'inr_min', 'inr_median', 'ptt_median', 'ptt_max', 
                      'lactate_median', 'lactate_max', 'sysbp_median', 'sysbp_min', 'spo2_median', 'spo2_max', 
                      'bilirubin_median', 'bilirubin_max', 'platelet_median', 'platelet_max', 'heartrate_min',
                      'heartrate_median'])
df.head()

Unnamed: 0,ethnicity,insurance,gender,age_on_admiss,length_of_stay,readmit_time,is_readmit,followed_by_readmit,bilirubin_min,creatinine_max,...,vancomycin,neosynephrine,neostigmine,glycopyrrolate,ceftriaxone,atropine sulfate,chlorhexidine,nitroglycerin,nitroprusside,lansoprazole
0,WHITE,Medicare,M,70.0,8,0.0,False,True,0.5,1.3,...,True,True,True,True,False,False,False,True,False,False
1,BLACK/AFRICAN AMERICAN,Medicare,F,42.0,19,0.0,False,True,0.3,3.1,...,True,False,False,False,False,False,False,False,False,False
2,UNKNOWN/NOT SPECIFIED,Private,F,60.0,8,0.0,False,True,0.5,0.4,...,False,False,False,False,False,False,False,False,True,False
3,BLACK/AFRICAN AMERICAN,Medicare,F,35.0,5,0.0,False,True,0.3,1.7,...,False,False,False,False,True,False,False,False,False,False
4,BLACK/AFRICAN AMERICAN,Medicaid,F,22.0,2,0.0,False,True,0.5,6.6,...,False,False,False,False,False,True,False,False,False,False


In [172]:
# Converting categorical features into dummy variables

df_converted = pd.get_dummies(df)
df_converted.head()


Unnamed: 0,age_on_admiss,length_of_stay,readmit_time,is_readmit,followed_by_readmit,bilirubin_min,creatinine_max,hematocrit_max,hemoglobin_median,lactate_min,...,Depart1_diseases of the nervous system and sense organs,Depart1_diseases of the respiratory system,Depart1_diseases of the skin and subcutaneous tissue,"Depart1_endocrine, nutritional and metabolic diseases, and immunity disorders",Depart1_external causes of injury and supplemental classification,Depart1_infectious and parasitic diseases,Depart1_injury and poisoning,Depart1_mental disorders,Depart1_neoplasms,"Depart1_symptoms, signs, and ill-defined conditions"
0,70.0,8,0.0,False,True,0.5,1.3,42.1,10.4,1.2,...,0,0,0,0,0,0,0,0,0,0
1,42.0,19,0.0,False,True,0.3,3.1,35.3,8.7,1.0,...,0,0,0,0,0,0,0,0,0,0
2,60.0,8,0.0,False,True,0.5,0.4,30.2,9.1,0.9,...,0,0,0,0,0,0,0,0,1,0
3,35.0,5,0.0,False,True,0.3,1.7,33.7,9.8,1.2,...,0,0,0,0,0,0,0,0,0,0
4,22.0,2,0.0,False,True,0.5,6.6,40.7,12.8,1.2,...,0,0,0,0,0,0,0,0,0,0


In [173]:
# Splitting dataframe into data (predictors) vs. label (attributed to be predicted)

label_df = df_converted.pop('followed_by_readmit')
data_df = df_converted
names = data_df.columns.values
#print('label_df:\n', label_df.head(), 2*'\n', 'data_df:\n', data_df.head())

In [174]:
names

array(['age_on_admiss', 'length_of_stay', 'readmit_time', 'is_readmit',
       'bilirubin_min', 'creatinine_max', 'hematocrit_max',
       'hemoglobin_median', 'lactate_min', 'platelet_min', 'ptt_min',
       'inr_max', 'wbc_min', 'rdw_median', 'heartrate_max', 'sysbp_max',
       'resprate_max', 'tempc_min', 'spo2_min',
       'diseases of the circulatory system',
       'diseases of the genitourinary system',
       'diseases of the respiratory system',
       'endocrine, nutritional and metabolic diseases, and immunity disorders',
       'diseases of the musculoskeletal system and connective tissue',
       'diseases of the blood and blood-forming organs',
       'diseases of the digestive system',
       'diseases of the nervous system and sense organs', 'neoplasms',
       'symptoms, signs, and ill-defined conditions',
       'external causes of injury and supplemental classification',
       'mental disorders', 'injury and poisoning',
       'diseases of the skin and subcutaneous

In [27]:
# Converting dataframes to NumPy arrays

label = label_df.values
data = data_df.values

In [28]:
# 80/20 train-test split

from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(data, label, train_size=0.8, test_size=0.2, random_state=10)
print('Training data:', train_data.shape, '\tTest data:', test_data.shape)
print('Training labels:', train_label.shape, '\tTest labels:', test_label.shape)

Training data: (4873, 110) 	Test data: (1219, 110)
Training labels: (4873,) 	Test labels: (1219,)


In [190]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
model = RandomForestClassifier(n_estimators = 700,max_depth=6, oob_score = True, n_jobs = -1,random_state =50,  max_features = "auto" , min_samples_leaf = 20)

In [191]:
model.fit(train_data,train_label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=-1,
            oob_score=True, random_state=50, verbose=0, warm_start=False)

In [192]:
print(model.oob_score_)

0.629591627334291


In [193]:
# Use the forest's predict method on the test data
predictions = model.predict(test_data)

In [194]:
accuracy_score(test_label, predictions)

0.6570959803117309

In [195]:
print(model.feature_importances_)

[2.09535992e-02 4.90980972e-02 7.87284062e-02 7.41060276e-02
 1.10264823e-02 4.50631963e-02 2.90470177e-02 4.53768595e-02
 1.88551701e-02 2.35858357e-02 2.69918193e-02 1.78000031e-02
 3.74510854e-02 1.26938058e-01 2.09613186e-02 2.23661214e-02
 2.46945862e-02 2.60834079e-02 2.28420601e-02 3.30120619e-03
 2.16977929e-02 7.22959784e-03 2.03890787e-03 2.62809846e-03
 1.10615878e-02 6.53201719e-03 5.73146338e-03 1.76994244e-03
 4.66502551e-03 7.53052159e-03 3.03522664e-03 3.20070775e-03
 1.64852103e-03 9.60405020e-03 3.03199264e-04 6.79976641e-02
 9.95138907e-04 3.15699197e-04 2.13731501e-04 7.80066101e-05
 9.92193338e-03 6.73694422e-03 2.55702365e-04 2.34676545e-04
 2.44186876e-03 2.35081574e-03 5.41652396e-03 7.35181962e-03
 1.14280003e-03 1.11478029e-03 1.60801610e-03 2.74965187e-03
 7.19017641e-04 5.65404891e-03 0.00000000e+00 2.35409773e-05
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.21053945e-02 0.00000000e+00
 0.00000000e+00 2.941350

In [144]:
param_grid = { 
    'n_estimators': [200, 300,400,500,700],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [159]:
from sklearn.grid_search import GridSearchCV
CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 10)

In [160]:
CV_rfc.fit(train_data, train_label)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=-1,
            oob_score=True, random_state=50, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 700], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [161]:
print (CV_rfc.best_params_)

{'max_features': 'auto', 'n_estimators': 200}


In [196]:
print (sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), names), 
             reverse=True))

[(0.1269, 'rdw_median'), (0.0787, 'readmit_time'), (0.0741, 'is_readmit'), (0.068, 'No_of_Depart'), (0.0491, 'length_of_stay'), (0.0454, 'hemoglobin_median'), (0.0451, 'creatinine_max'), (0.0375, 'wbc_min'), (0.029, 'hematocrit_max'), (0.027, 'ptt_min'), (0.0261, 'tempc_min'), (0.0247, 'resprate_max'), (0.0236, 'platelet_min'), (0.0228, 'spo2_min'), (0.0224, 'sysbp_max'), (0.0217, 'diseases of the genitourinary system'), (0.021, 'heartrate_max'), (0.021, 'age_on_admiss'), (0.0202, 'ethnicity_UNKNOWN/NOT SPECIFIED'), (0.0189, 'lactate_min'), (0.0178, 'inr_max'), (0.0121, 'ethnicity_BLACK/AFRICAN AMERICAN'), (0.0111, 'diseases of the blood and blood-forming organs'), (0.011, 'bilirubin_min'), (0.0099, 'single_int_mammCAB'), (0.0096, 'infectious and parasitic diseases'), (0.0075, 'external causes of injury and supplemental classification'), (0.0074, 'glycopyrrolate'), (0.0072, 'diseases of the respiratory system'), (0.0067, 'aorto_CBT'), (0.0065, 'diseases of the digestive system'), (0.00