In [1]:
import pandas as pd
import numpy as np
import time

from sklearn import linear_model, svm
from sklearn.linear_model import LogisticRegression

import sklearn.model_selection as ms
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit,\
    cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,\
    roc_auc_score

import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import eli5

pd.set_option('display.max_columns',99)
pd.set_option('display.max_rows',300)



In [3]:
#READ CSVs

#xing
train_unique_orig = pd.read_csv('dataset_diabetes/data2/train_unique_cleaned_df.csv')
test_unique_orig = pd.read_csv('dataset_diabetes/data2/test_unique_cleaned_df.csv')

#15,20,25,30 whole and unique

train_unique_15_orig = pd.read_csv('dataset_diabetes/experiment/15/unique_train_cleaned_15.csv')
train_whole_15_orig = pd.read_csv('dataset_diabetes/experiment/15/whole_train_cleaned_15.csv')
test_unique_15_orig = pd.read_csv('dataset_diabetes/experiment/15/unique_test_cleaned_15.csv')

train_unique_20_orig = pd.read_csv('dataset_diabetes/experiment/20/unique_train_cleaned_20.csv')
train_whole_20_orig = pd.read_csv('dataset_diabetes/experiment/20/whole_train_cleaned_20.csv')
test_unique_20_orig = pd.read_csv('dataset_diabetes/experiment/20/unique_test_cleaned_20.csv')

train_unique_25_orig = pd.read_csv('dataset_diabetes/experiment/25/unique_train_cleaned_25.csv')
train_whole_25_orig = pd.read_csv('dataset_diabetes/experiment/25/whole_train_cleaned_25.csv')
test_unique_25_orig = pd.read_csv('dataset_diabetes/experiment/25/unique_test_cleaned_25.csv')

train_unique_30_orig = pd.read_csv('dataset_diabetes/experiment/30/unique_train_cleaned_30.csv')
train_whole_30_orig = pd.read_csv('dataset_diabetes/experiment/30/whole_train_cleaned_30.csv')
test_unique_30_orig = pd.read_csv('dataset_diabetes/experiment/30/unique_test_cleaned_30.csv')

In [4]:
#make copies

train_unique_xing = train_unique_orig.copy()
test_unique_xing = test_unique_orig.copy()


train_unique_15 = train_unique_15_orig.copy()
test_unique_15 = test_unique_15_orig.copy()
train_whole_15 = train_whole_15_orig.copy()

train_unique_20 = train_unique_20_orig.copy()
test_unique_20 = test_unique_20_orig.copy()
train_whole_20 = train_whole_20_orig.copy()

train_unique_25 = train_unique_25_orig.copy()
test_unique_25 = test_unique_25_orig.copy()
train_whole_25 = train_whole_25_orig.copy()

train_unique_30 = train_unique_30_orig.copy()
test_unique_30 = test_unique_30_orig.copy()
train_whole_30 = train_whole_30_orig.copy()

In [5]:
def print_report(y_actual, y_pred, y_pred_proba):
    tp, fp, fn, tn = confusion_matrix(y_actual, y_pred).ravel()
    fpr = fp/(fp+tn)
#     auc = roc_auc_score(y_actual, y_pred_proba)
#     print('AUC:%.3f'%auc)
    print(f'False positive rate is {fpr}')

In [6]:
def prep_dfs(split,train_unique,train_whole,test_unique):
    
    train_unique_y = train_unique['readmitted']
    unique_encounter_patient = train_unique[['encounter_id','patient_nbr']]
    train_unique.drop(['readmitted','encounter_id','patient_nbr'], inplace = True, axis = 1)
    
    train_whole_y = train_whole['readmitted']
    whole_encounter_patient = train_whole[['encounter_id','patient_nbr']]
    train_whole.drop(['readmitted','encounter_id','patient_nbr'], inplace = True, axis = 1)
    
    test_unique_y = test_unique['readmitted']
    test_unique.drop(['readmitted','encounter_id','patient_nbr'], inplace = True, axis = 1)
    
    print('{split} percent X_train_unique shape is {shape}'.format(split = split, shape = train_unique.shape))
    print('{split} percent y_train_unique shape is {shape}'.format(split = split, shape = train_unique_y.shape))
    print('{split} percent X_train_whole shape is {shape}'.format(split = split, shape = train_whole.shape))
    print('{split} percent y_train_whole shape is {shape}'.format(split = split, shape = train_whole_y.shape))
    print('{split} percent X_test_unique shape is {shape}'.format(split = split, shape = test_unique.shape))
    print('{split} percent y_test_unique shape is {shape}'.format(split = split, shape = test_unique_y.shape))
    print(' ')
    
    return train_unique, train_unique_y, unique_encounter_patient, train_whole, train_whole_y,\
        whole_encounter_patient,test_unique,test_unique_y

In [7]:
#create dataframes for models
X_train_unique_15, y_train_unique_15, unique_15_encounter_patient, X_train_whole_15,\
y_train_whole_15, whole_15_encounter_patient,X_test_unique_15,y_test_unique_15 = prep_dfs(15,train_unique_15,\
                                                                       train_whole_15,\
                                                                       test_unique_15)

X_train_unique_20, y_train_unique_20, unique_20_encounter_patient, X_train_whole_20,\
y_train_whole_20, whole_20_encounter_patient,X_test_unique_20,y_test_unique_20 = prep_dfs(20,train_unique_20,\
                                                                       train_whole_20,\
                                                                       test_unique_20)

X_train_unique_25, y_train_unique_25, unique_25_encounter_patient, X_train_whole_25,\
y_train_whole_25, whole_25_encounter_patient,X_test_unique_25,y_test_unique_25 = prep_dfs(25,train_unique_25,\
                                                                        train_whole_25,\
                                                                        test_unique_25)

X_train_unique_30, y_train_unique_30, unique_30_encounter_patient, X_train_whole_30,\
y_train_whole_30, whole_30_encounter_patient,X_test_unique_30,y_test_unique_30 = prep_dfs(30,train_unique_30,\
                                                                       train_whole_30,\
                                                                       test_unique_30)

15 percent X_train_unique shape is (61674, 91)
15 percent y_train_unique shape is (61674,)
15 percent X_train_whole shape is (84422, 91)
15 percent y_train_whole shape is (84422,)
15 percent X_test_unique shape is (13835, 91)
15 percent y_test_unique shape is (13835,)
 
20 percent X_train_unique shape is (58793, 91)
20 percent y_train_unique shape is (58793,)
20 percent X_train_whole shape is (79456, 91)
20 percent y_train_whole shape is (79456,)
20 percent X_test_unique shape is (18019, 91)
20 percent y_test_unique shape is (18019,)
 
25 percent X_train_unique shape is (55815, 91)
25 percent y_train_unique shape is (55815,)
25 percent X_train_whole shape is (74490, 91)
25 percent y_train_whole shape is (74490,)
25 percent X_test_unique shape is (22049, 91)
25 percent y_test_unique shape is (22049,)
 
30 percent X_train_unique shape is (52825, 91)
30 percent y_train_unique shape is (52825,)
30 percent X_train_whole shape is (69524, 91)
30 percent y_train_whole shape is (69524,)
30 perc

In [8]:
#dictionary to store all the different dataframes with different splits
versions = {'X_train':{'15w':X_train_whole_15,'15u':X_train_unique_15,\
                       '20w':X_train_whole_20,'20u':X_train_unique_20,\
                       '25w':X_train_whole_25,'25u':X_train_unique_25,\
                       '30w':X_train_whole_30,'30u':X_train_unique_30},\
            'y_train':{'15w':y_train_whole_15,'15u':y_train_unique_15,\
                       '20w':y_train_whole_20,'20u':y_train_unique_20,\
                       '25w':y_train_whole_25,'25u':y_train_unique_25,\
                       '30w':y_train_whole_30,'30u':y_train_unique_30},
            'X_test':{ '15u':X_test_unique_15,\
                       '20u':X_test_unique_20,\
                       '25u':X_test_unique_25,\
                       '30u':X_test_unique_30},\
            'y_test':{ '15u':y_test_unique_15,\
                       '20u':y_test_unique_20,\
                       '25u':y_test_unique_25,\
                       '30u':y_test_unique_30}}

In [9]:
#versions['X_train']['15w'].head()

In [10]:
#versions['X_test']['15u'].head()

In [11]:
#versions['y_test']['15u'].head()

In [12]:
#declare which version to run in model below:

def choose_version(data_split,selection):

    X_train = versions['X_train'][str(data_split)+selection] 
    y_train = versions['y_train'][str(data_split)+selection]
    X_test = versions['X_test'][str(data_split)+'u']
    y_test = versions['y_test'][str(data_split)+'u']
    
    return X_train, y_train, X_test, y_test

In [66]:
#++++++++++++++++RUN THIS CELL TO SET WHICH SPLIT VERSION:++++++++++++++++++++

X_train, y_train, X_test, y_test = choose_version(25,'u')

# X_train, y_train, X_test, y_test = choose_version(25,'u')
# X_train, y_train, X_test, y_test = choose_version(30,'w')

#++++++++++++++++RUN THIS CELL TO SET WHICH SPLIT VERSION:++++++++++++++++++++

#### Logistic Regression

In [31]:
# use stratifiedKFold
skfl = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state=99)

In [32]:
logit1 = LogisticRegression()
logit1.set_params(class_weight = 'balanced')#turn on Ridge for heavily penalized coefficients. 
params1 = {'C':np.logspace(-4,4, 50)}

gs_logit1 = ms.GridSearchCV(estimator = logit1,
                           param_grid = params1,
                           cv = skfl,
                           verbose = True, n_jobs = -1)

In [34]:
# gs_logit1.fit(X_train, y_train)
# gs_logit1.best_params_

# logit_best = gs_logit1.best_estimator_
# logit_best_predict = logit_best.predict(X_train)
# cm1 = confusion_matrix(y_train, logit_best.predict(X_train))
# cm1

#### LightGBM w/Bayesian Optimization

In [None]:
#to plot ROC curve, base model
from sklearn.metrics import roc_auc_score, roc_curve

# Fit the model with the default paramter values
pipe.fit(X_train, y_train)

# Score with the test data
y_score = pipe.predict_proba(X_test)

auc_score_base = roc_auc_score(y_test, y_score[:,1])
print('Test AUC = {:.4f}'.format(auc_score_base))

# Compute false positive rate (fpr) and true positive rate (tpr)
fpr_base, tpr_base, _ = roc_curve(y_test,  y_score[:,1])

# Plot the ROC curve
plt.plot(fpr_base, tpr_base, label='Base ROC curve (area = %0.3f)' % auc_score_base)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
#plot for tuned model
fpr_tuned, tpr_tuned, _ = roc_curve(y_test,  y_score[:,1])

# Plot ROC curve
plt.plot(fpr_base, tpr_base, label='Base ROC curve (area = %0.3f)' % auc_score_base)
plt.plot(fpr_tuned, tpr_tuned, label='Tuned ROC curve (area = %0.3f)' % auc_score_tuned)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [99]:
#declare objective function for the optimizer
def objective_function(params):
    clf = lgb.LGBMClassifier(**params)
    skf = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state=99)
    score = cross_val_score(clf, X_train, y_train, scoring = 'roc_auc', cv=skf, n_jobs=1)
    return {'loss': 1-score.mean(), 'status': STATUS_OK}     

In [101]:
num_eval = 100
#params to consider
param_hyperopt= {
                'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(2)),
                'max_depth': scope.int(hp.quniform('max_depth', 5, 19, 1)),
                'n_estimators': scope.int(hp.quniform('n_estimators', 100, 250, 5)), #5,55,1
                'num_leaves': scope.int(hp.quniform('num_leaves', 5, 38, 1)),
                'boosting_type': hp.choice('boosting_type', ['gbdt','dart','goss']), #check lightgbm for types
                'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1.0),
                'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.5), #ridge
                #'drop_rate': hp.uniform('drop_rate', 0.0, 1.0),
                'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.1),
                'max_bin': scope.int(hp.quniform('max_bin', 255, 265, 1)),
                #'scale_pos_weight': hp.uniform('scale_pos_weight', 1.0,12.0),
                #'early_stopping_round': scope.int(hp.quniform('early_stopping_round', 35, 95, 1))
                }

#loss minimizer, then store parameters
trials = Trials()
best_param = fmin(objective_function, 
                  param_hyperopt, 
                  algo=tpe.suggest, 
                  max_evals=num_eval, 
                  trials=trials,
                  rstate= np.random.RandomState(1))
loss = [x['result']['loss'] for x in trials.trials]
best_param_values = [x for x in best_param.values()]

100%|██████████| 100/100 [4:50:27<00:00, 215.74s/it, best loss: 0.32513890999313]     


In [102]:
best_param

{'boosting_type': 2,
 'colsample_bytree': 0.6407227412301181,
 'learning_rate': 0.001701726773701295,
 'max_bin': 264.0,
 'max_depth': 8.0,
 'n_estimators': 2400.0,
 'num_leaves': 34.0,
 'reg_alpha': 0.8820270956084394,
 'reg_lambda': 1.409512918662029}

In [44]:
best_param

{'boosting_type': 2,
 'colsample_bytree': 0.537929469927223,
 'learning_rate': 0.0048541258946577055,
 'max_bin': 264.0,
 'max_depth': 8.0,
 'n_estimators': 38.0,
 'num_leaves': 25.0,
 'reg_alpha': 0.2509921763653179,
 'reg_lambda': 0.013648040562265595,
 'scale_pos_weight': 5.850282307622445}

In [19]:
selected_params = best_param.copy()

In [103]:
if best_param_values[0] == 0:
    boosting_type = 'gbdt'

elif best_param_values[0] == 1:
    boosting_type = 'dart'

else:
    #best_param_values[0] == 2
    boosting_type = 'goss'    
    
# else:
#     boosting_type = 'goss'
        
    
clf_best = lgb.LGBMClassifier(boosting_type=boosting_type,
                              colsample_bytree = best_param['colsample_bytree'],
                              #drop_rate = best_param['drop_rate'],
                              learning_rate = best_param['learning_rate'],
                              max_bin = int(best_param['max_bin']),
                              max_depth = int(best_param['max_depth']),
                              n_estimators = int(best_param['n_estimators']),
                              num_leaves = int(best_param['num_leaves']),
                              reg_alpha = best_param['reg_alpha'],
                              reg_lambda = best_param['reg_lambda'],
                              objective = 'binary',
                              metric = 'binary_logloss',
                              #scale_pos_weight = best_param['scale_pos_weight'],
                              #bagging_fraction = 
                              #early_stopping_round = int(best_param['early_stopping_round'])
                              )
                                  
clf_best.fit(X_train, y_train)

LGBMClassifier(boosting_type='goss', class_weight=None,
               colsample_bytree=0.6407227412301181, importance_type='split',
               learning_rate=0.001701726773701295, max_bin=264, max_depth=8,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=2400,
               n_jobs=-1, num_leaves=34, objective='binary', random_state=None,
               reg_alpha=0.8820270956084394, reg_lambda=1.409512918662029,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [104]:
print("")
print('='*25,'Results','='*25)
print('')
print("Score best parameters: ", min(loss)*-1)
print("Best parameters: ", best_param)
print("Test Score: ", clf_best.score(X_test, y_test))
#print("Time elapsed: ", time.time() - start)
print("Parameter combinations evaluated: ", num_eval)
auc = roc_auc_score(y_test, clf_best.predict_proba(X_test)[:,1])
auct = roc_auc_score(y_train, clf_best.predict_proba(X_train)[:,1])
print('AUC train is: ',auct)
print('AUC test is: ',auc)
print_report(y_test,clf_best.predict(X_test),clf_best.predict_proba(X_test)[:,1])
cmlgbm = confusion_matrix(y_test,clf_best.predict(X_test))
cmlgbm



Score best parameters:  -0.32513890999313
Best parameters:  {'boosting_type': 2, 'colsample_bytree': 0.6407227412301181, 'learning_rate': 0.001701726773701295, 'max_bin': 264.0, 'max_depth': 8.0, 'n_estimators': 2400.0, 'num_leaves': 34.0, 'reg_alpha': 0.8820270956084394, 'reg_lambda': 1.409512918662029}
Test Score:  0.9016281917547281
Parameter combinations evaluated:  100
AUC train is:  0.7328579278514924
AUC test is:  0.6468167696821778


  This is separate from the ipykernel package so we can avoid doing imports until


False positive rate is nan


array([[19880,     0],
       [ 2169,     0]])

In [25]:
clf_best.predict(X_test)

6

0.2

In [28]:
sum(np.array(y_test))

2169

In [86]:
results = {'30u':0.6535859962867702, '30w':0.6545053440555348, '30w':0.6559087210856963, 'selected':selected_params, '25w':}


In [None]:
model_results = {'Best parameters':selected_params,'auc':0.6559202603322818, 'FPR':0.16666666666666666}

In [64]:
# del X_test['actual_readmitted']

In [None]:
dir(clf_best)

In [None]:
clf_best.feature_importances_

In [None]:
# sorted(zip(X_train.columns,clf_best.feature_importances_), key= lambda t:t[1], reverse=True)

In [107]:
feature_weights = eli5.explain_weights(clf_best)

['__attrs_attrs__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_repr_html_',
 'decision_tree',
 'description',
 'error',
 'estimator',
 'feature_importances',
 'highlight_spaces',
 'image',
 'is_regression',
 'method',
 'targets',
 'transition_features']

In [122]:
#SELECTED

feature_weights

Weight,Feature
0.3248,number_inpatient
0.0906,discharge_disposition_id_home
0.0756,num_lab_procedures
0.0664,num_medications
0.0512,number_emergency
0.0505,number_diagnoses
0.0480,time_in_hospital
0.0431,age
0.0290,num_procedures
0.0253,discharge_disposition_id_hospital


In [24]:
clf_best.predict_proba(X_test)[:,1]

array([0.09661054, 0.09197153, 0.09168702, ..., 0.06743366, 0.17572086,
       0.10721664])

In [26]:
probabilities = clf_best.predict_proba(X_test)[:,1]
probabilities

array([0.09661054, 0.09197153, 0.09168702, ..., 0.06743366, 0.17572086,
       0.10721664])

In [40]:
tmp = pd.concat([X_test,y_test,pd.DataFrame(probabilities)], axis = 1, ignore_index = False)

In [38]:
pd.qcut(probabilities,10)

[(0.0913, 0.104], (0.0913, 0.104], (0.0913, 0.104], (0.0259, 0.0541], (0.0815, 0.0913], ..., (0.144, 0.18], (0.12, 0.144], (0.064, 0.0725], (0.144, 0.18], (0.104, 0.12]]
Length: 25902
Categories (10, interval[float64]): [(0.0259, 0.0541] < (0.0541, 0.064] < (0.064, 0.0725] < (0.0725, 0.0815] ... (0.104, 0.12] < (0.12, 0.144] < (0.144, 0.18] < (0.18, 0.584]]

In [45]:
cols = tmp.columns.tolist()

In [46]:
cols.append('probabilities')

In [47]:
tmp.shape

(25902, 93)

In [48]:
len(cols)

94

In [53]:
tmp.columns = cols

In [54]:
tmp.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,med_dosage_change,number_of_medicine,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_unknown,gender_Male,admission_type_id_na,admission_type_id_newborn,admission_type_id_urgent,discharge_disposition_id_home,discharge_disposition_id_hospital,discharge_disposition_id_na,admission_source_id_other,admission_source_id_referral,admission_source_id_transfer,admission_source_id_urgent,diag_1_diabetes,diag_1_digestive,diag_1_genitourinary,diag_1_injury,diag_1_musculoskeletal,diag_1_neoplasms,diag_1_other,diag_1_respiratory,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,metformin_No,metformin_Steady,metformin_Up,repaglinide_No,repaglinide_Steady,repaglinide_Up,nateglinide_No,nateglinide_Steady,nateglinide_Up,chlorpropamide_No,chlorpropamide_Steady,chlorpropamide_Up,glimepiride_No,glimepiride_Steady,glimepiride_Up,acetohexamide_Steady,glipizide_No,glipizide_Steady,glipizide_Up,glyburide_No,glyburide_Steady,glyburide_Up,tolbutamide_Steady,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,rosiglitazone_No,rosiglitazone_Steady,rosiglitazone_Up,acarbose_No,acarbose_Steady,acarbose_Up,miglitol_No,miglitol_Steady,miglitol_Up,troglitazone_Steady,tolazamide_Steady,tolazamide_Up,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes,readmitted,probabilities
0,75,2,32,1,31,0,0,1,5,0,2,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0.096611
1,85,12,17,0,20,1,0,0,9,0,3,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0.091972
2,75,1,30,6,20,0,0,1,9,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0.091687
3,5,2,25,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0.027571
4,65,7,30,0,14,0,0,0,9,0,4,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0.084098


In [56]:
tmp.sort_values(by = 'probabilities', inplace = True)

In [57]:
tmp.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,med_dosage_change,number_of_medicine,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_unknown,gender_Male,admission_type_id_na,admission_type_id_newborn,admission_type_id_urgent,discharge_disposition_id_home,discharge_disposition_id_hospital,discharge_disposition_id_na,admission_source_id_other,admission_source_id_referral,admission_source_id_transfer,admission_source_id_urgent,diag_1_diabetes,diag_1_digestive,diag_1_genitourinary,diag_1_injury,diag_1_musculoskeletal,diag_1_neoplasms,diag_1_other,diag_1_respiratory,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,metformin_No,metformin_Steady,metformin_Up,repaglinide_No,repaglinide_Steady,repaglinide_Up,nateglinide_No,nateglinide_Steady,nateglinide_Up,chlorpropamide_No,chlorpropamide_Steady,chlorpropamide_Up,glimepiride_No,glimepiride_Steady,glimepiride_Up,acetohexamide_Steady,glipizide_No,glipizide_Steady,glipizide_Up,glyburide_No,glyburide_Steady,glyburide_Up,tolbutamide_Steady,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,rosiglitazone_No,rosiglitazone_Steady,rosiglitazone_Up,acarbose_No,acarbose_Steady,acarbose_Up,miglitol_No,miglitol_Steady,miglitol_Up,troglitazone_Steady,tolazamide_Steady,tolazamide_Up,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes,readmitted,probabilities
2052,45,1,10,1,11,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0.026874
3,5,2,25,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0.027571
7331,45,1,23,4,10,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0.02952
16075,35,1,1,1,3,0,0,0,2,0,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0.029915
8589,35,1,20,4,20,0,0,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0.030426


In [82]:
#pd.qcut(tmp['probabilities'],10)

In [65]:
liftdf = X_test.copy()

liftdf['predicted_probaility'] = probabilities
liftdf['predicted_readmitted'] = clf_best.predict(X_test)
liftdf['actual_readmitted'] = y_test
#adding the quantiles as another columns
liftdf['Quantile_rank'] = pd.qcut(liftdf['predicted_probaility'],10,labels=np.arange(1,11))

In [69]:
liftdf['Quantile_rank'] = pd.qcut(liftdf['predicted_probaility'],10,labels=np.arange(1,11))

In [68]:
np.arange(1,11)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [70]:
liftdf

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,med_dosage_change,number_of_medicine,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_unknown,gender_Male,admission_type_id_na,admission_type_id_newborn,admission_type_id_urgent,discharge_disposition_id_home,discharge_disposition_id_hospital,discharge_disposition_id_na,admission_source_id_other,admission_source_id_referral,admission_source_id_transfer,admission_source_id_urgent,diag_1_diabetes,diag_1_digestive,diag_1_genitourinary,diag_1_injury,diag_1_musculoskeletal,diag_1_neoplasms,diag_1_other,diag_1_respiratory,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,metformin_No,metformin_Steady,metformin_Up,repaglinide_No,repaglinide_Steady,repaglinide_Up,nateglinide_No,nateglinide_Steady,nateglinide_Up,chlorpropamide_No,chlorpropamide_Steady,chlorpropamide_Up,glimepiride_No,glimepiride_Steady,glimepiride_Up,acetohexamide_Steady,glipizide_No,glipizide_Steady,glipizide_Up,glyburide_No,glyburide_Steady,glyburide_Up,tolbutamide_Steady,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,rosiglitazone_No,rosiglitazone_Steady,rosiglitazone_Up,acarbose_No,acarbose_Steady,acarbose_Up,miglitol_No,miglitol_Steady,miglitol_Up,troglitazone_Steady,tolazamide_Steady,tolazamide_Up,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes,predicted_probaility,predicted_readmitted,actual_readmitted,Quantile_rank
0,75,2,32,1,31,0,0,1,5,0,2,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0.096611,0,0,6
1,85,12,17,0,20,1,0,0,9,0,3,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0.091972,0,0,6
2,75,1,30,6,20,0,0,1,9,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0.091687,0,0,6
3,5,2,25,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0.027571,0,0,1
4,65,7,30,0,14,0,0,0,9,0,4,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0.084098,0,0,5
5,75,4,53,2,12,0,0,1,9,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0.088345,0,0,5
6,75,4,42,3,26,0,0,0,6,0,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0.098444,0,0,6
7,65,3,52,5,18,0,0,0,8,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0.080434,0,0,4
8,75,2,12,0,10,0,0,0,9,0,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0.091408,0,0,6
9,75,8,78,1,17,0,0,1,9,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0.150868,0,0,9


In [80]:
liftdf['Quantile_rank'].value_counts()

10    2591
1     2591
9     2590
8     2590
7     2590
6     2590
5     2590
4     2590
3     2590
2     2590
Name: Quantile_rank, dtype: int64

In [81]:
lift_chart = liftdf.groupby('Quantile_rank')['predicted_probaility'].agg(['min','max','mean'])
lift_chart

Unnamed: 0_level_0,min,max,mean
Quantile_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.026874,0.054112,0.046614
2,0.054114,0.064019,0.059175
3,0.06402,0.072515,0.068281
4,0.07252,0.081455,0.076916
5,0.081455,0.091278,0.086181
6,0.091279,0.103811,0.097254
7,0.103817,0.120335,0.111636
8,0.12034,0.143671,0.131306
9,0.143674,0.180462,0.160151
10,0.18047,0.58443,0.233106


In [None]:
feature_names = X_train.columns
feature_imports = rf_en.feature_importances_
most_imp_features = pd.DataFrame([f for f in zip(feature_names,feature_imports)], columns=["Feature", "Importance"]).nlargest(10, "Importance")
most_imp_features.sort_values(by="Importance", inplace=True)
plt.figure(figsize=(10,6))
plt.barh(range(len(most_imp_features)), most_imp_features.Importance, align='center', alpha=0.8)
plt.yticks(range(len(most_imp_features)), most_imp_features.Feature, fontsize=14)
plt.xlabel('Importance')
plt.title('Most important features - Random Forest (entropy)')
plt.show()