### data prepping

In [46]:
import pandas as pd
import numpy as np

In [47]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [48]:
diabetic = pd.read_csv('../../data/diabetic_data_initial.csv')
diabetic.shape

(101766, 50)

In [49]:
# cleaning
diabetic_df = diabetic.replace('?', np.nan)

diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)

diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index, inplace=True)

diabetic_df.drop(['encounter_id','patient_nbr','weight','medical_specialty','payer_code'],\
                 axis = 1, inplace = True)

diabetic_df['admission_type_id'] = diabetic_df['admission_type_id'].astype('str')
diabetic_df['admission_source_id'] = diabetic_df['admission_source_id'].astype('str')
diabetic_df['discharge_disposition_id'] = diabetic_df['discharge_disposition_id'].astype('str')

In [50]:
diabetic_df.shape

(69970, 45)

In [51]:
# feature engineering
diabetic_df['readmit_30d'] = (diabetic_df['readmitted'] == '<30')

diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [0,10,20,30,40,50,60,70,80,90])

diabetic_df['diag_1'] = diabetic_df['diag_1'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_2'] = diabetic_df['diag_2'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_3'] = diabetic_df['diag_3'].str.split('.', expand = True).drop(1, axis = 1)
count_1 = diabetic_df['diag_1'].value_counts()
index_1 = count_1[count_1>=500].index.tolist()
count_2 = diabetic_df['diag_2'].value_counts()
index_2 = count_2[count_2>=500].index.tolist()
count_3 = diabetic_df['diag_3'].value_counts()
index_3 = count_3[count_3>=500].index.tolist()
diagnoses = set(index_1 + index_2 + index_3)
for d in diagnoses:
    diabetic_df[d+'_diag'] = ((diabetic_df['diag_1']==d)|
                              (diabetic_df['diag_2']==d)|
                              (diabetic_df['diag_3']==d))
    
for col in ['metformin','repaglinide','glimepiride','glipizide', 'glyburide','pioglitazone',
            'rosiglitazone','insulin']:
    diabetic_df[col+'_used'] = np.where(diabetic_df[col]=='No', False, True)

In [52]:
diabetic_df = diabetic_df.drop(['age',
                                'diag_1','diag_2','diag_3',
                                'metformin','repaglinide','nateglinide','chlorpropamide','glimepiride',
                                'acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone',
                                'rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide',
                                'citoglipton','insulin','glyburide-metformin','glipizide-metformin',
                                'glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone',
                                'readmitted'],
                                 axis = 1)

In [53]:
diabetic_dum = pd.get_dummies(diabetic_df, drop_first = True)
diabetic_dum.shape

(69970, 119)

In [54]:
# dropping variables with <500 minority classes
descr = diabetic_dum.describe().T
under_500 = descr[descr['mean'] <= (500/len(diabetic_dum))].index.tolist()
under_500.remove('race_Asian')
diabetic_final = diabetic_dum.drop(under_500, axis = 1)
diabetic_final.shape

(69970, 94)

train test split

In [55]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [56]:
features = diabetic_final.drop(['readmit_30d'], axis = 1)
target = diabetic_final['readmit_30d']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2,stratify = target,
                                                    random_state = 42)

checking VIF

In [58]:
# X_vif = pd.DataFrame(X_train, dtype=float)

# from statsmodels.stats.outliers_influence import variance_inflation_factor 
# vif_data = pd.DataFrame() 
# vif_data["feature"] = X_vif.columns
# vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
#                           for i in range(len(X_vif.columns))] 
# print(vif_data)

### model testing

logistic regression with no regularization or CV (93 features)

In [59]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [60]:
logit = LogisticRegression(C = 1e7, solver='liblinear', random_state = 42, class_weight = 'balanced')
logit.fit(X_train, y_train)

LogisticRegression(C=10000000.0, class_weight='balanced', random_state=42,
                   solver='liblinear')

In [61]:
print('accuracy:')
print(logit.score(X_train, y_train))
print(logit.score(X_test, y_test))
print('--'*30)
y_train_pred = logit.predict(X_train)
y_test_pred = logit.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.661354866371302
0.6634271830784622
------------------------------------------------------------
confusion matrix:
[[34226 16728]
 [ 2228  2794]]
[[8621 4118]
 [ 592  663]]
------------------------------------------------------------
AUC-ROC:
0.6140279703793241
0.6025137850356953


logistic regression with no regularization or CV (top 43 features from previous model)

In [62]:
logit_coefs = pd.DataFrame({'features': X_train.columns,
                            'coef': logit.coef_[0]})
logit_coefs['abs_coef'] = np.abs(logit_coefs['coef'])
logit_43 = logit_coefs.sort_values('abs_coef', ascending = False).iloc[:,0:2].head(43).features.tolist()

In [63]:
X_train_43 = X_train[logit_43]
X_test_43 = X_test[logit_43]

In [64]:
logit = LogisticRegression(C = 1e7, solver='liblinear', random_state = 42, class_weight = 'balanced')
logit.fit(X_train_43, y_train)

print('accuracy:')
print(logit.score(X_train_43, y_train))
print(logit.score(X_test_43, y_test))
print('--'*30)
y_train_pred = logit.predict(X_train_43)
y_test_pred = logit.predict(X_test_43)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.6620515935400886
0.6642132342432471
------------------------------------------------------------
confusion matrix:
[[34359 16595]
 [ 2322  2700]]
[[8647 4092]
 [ 607  648]]
------------------------------------------------------------
AUC-ROC:
0.6059742479090354
0.5975581776825504


In [65]:
logit_coefs = pd.DataFrame({'features': X_train_43.columns,
                            'coef': logit.coef_[0]})
logit_coefs['abs_coef'] = np.abs(logit_coefs['coef'])
logit_coefs.sort_values('abs_coef', ascending = False).iloc[:,0:2].head(15)

Unnamed: 0,features,coef
0,discharge_disposition_id_22,1.519637
1,discharge_disposition_id_5,1.251283
2,discharge_disposition_id_2,0.719786
4,discharge_disposition_id_3,0.683762
3,722_diag,-0.619348
6,discharge_disposition_id_4,0.57737
5,discharge_disposition_id_18,0.543966
8,786_diag,-0.475954
10,403_diag,0.441048
9,440_diag,0.426389


logistic regression with regularization and CV (93 features)

In [66]:
# standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [67]:
logit = LogisticRegression(max_iter = 1000, class_weight = 'balanced', random_state = 42)
logit.fit(X_train_scaled, y_train)

logit_grid_params = [{'C': np.linspace(1e-4,0.5,50),'penalty':['l1','l2'],'solver': ['liblinear']}]
grid_search_logit = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_logit.fit(X_train_scaled, y_train)

print(grid_search_logit.best_estimator_)
print('--'*30)
print('accuracy:')
print(grid_search_logit.best_estimator_.score(X_train_scaled, y_train))
print(grid_search_logit.best_estimator_.score(X_test_scaled, y_test))
print('--'*30)
y_train_pred = grid_search_logit.best_estimator_.predict(X_train_scaled)
y_test_pred = grid_search_logit.best_estimator_.predict(X_test_scaled)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.0min finished


Wall time: 2min 2s
LogisticRegression(C=0.01030204081632653, class_weight='balanced',
                   max_iter=1000, penalty='l1', random_state=42,
                   solver='liblinear')
------------------------------------------------------------
accuracy:
0.6644097470344433
0.6652136629984279
------------------------------------------------------------
confusion matrix:
[[34443 16511]
 [ 2274  2748]]
[[8633 4106]
 [ 579  676]]
------------------------------------------------------------
AUC-ROC:
0.6115774933035156
0.6081640624877833


logistic regression with regularization and CV (top 43 features from previous model)

In [69]:
logit_cv_coefs = pd.DataFrame({'features': X_train.columns,'coef': grid_search_logit.best_estimator_.coef_[0]})
logit_cv_coefs['abs_coef'] = np.abs(logit_cv_coefs['coef'])
logit_cv_43 = logit_cv_coefs.sort_values('abs_coef', ascending = False).iloc[:,0:2].head(43).features.tolist()

In [71]:
X_train_43 = X_train[logit_cv_43]
X_test_43 = X_test[logit_cv_43]

# standardization
scaler = StandardScaler().fit(X_train_43)
X_train_scaled = scaler.transform(X_train_43)
X_test_scaled = scaler.transform(X_test_43)

In [86]:
logit = LogisticRegression(max_iter = 1000, class_weight = 'balanced', random_state = 42)
logit.fit(X_train_scaled, y_train)

logit_grid_params = [{'C': np.linspace(1e-3,1,100),'penalty':['l1'],'solver': ['liblinear']}]
grid_search_logit = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_logit.fit(X_train_scaled, y_train)

print(grid_search_logit.best_params_)
print('--'*30)
print('accuracy:')
print(grid_search_logit.best_estimator_.score(X_train_scaled, y_train))
print(grid_search_logit.best_estimator_.score(X_test_scaled, y_test))
print('--'*30)
y_train_pred = grid_search_logit.best_estimator_.predict(X_train_scaled)
y_test_pred = grid_search_logit.best_estimator_.predict(X_test_scaled)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   36.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   41.8s finished


Wall time: 42.1 s
{'C': 0.19272727272727275, 'penalty': 'l1', 'solver': 'liblinear'}
------------------------------------------------------------
accuracy:
0.6618193511504931
0.6619265399456911
------------------------------------------------------------
confusion matrix:
[[34293 16661]
 [ 2269  2753]]
[[8597 4142]
 [ 589  666]]
------------------------------------------------------------
AUC-ROC:
0.6106033870954455
0.6027670149920765


In [87]:
logit_cv_coefs = pd.DataFrame({'feature': X_train_43.columns,'coef': grid_search_logit.best_estimator_.coef_[0]})
logit_cv_coefs['abs_coef'] = np.abs(logit_cv_coefs['coef'])
logit_cv_coefs.sort_values('abs_coef', ascending = False).iloc[:,0:2].head(15)

Unnamed: 0,features,coef
0,number_inpatient,0.202739
1,discharge_disposition_id_22,0.201254
2,discharge_disposition_id_3,0.185901
3,discharge_disposition_id_5,0.132137
5,diabetesMed_Yes,0.102001
4,discharge_disposition_id_2,0.09825
6,discharge_disposition_id_18,0.095155
7,786_diag,-0.090354
8,403_diag,0.082011
9,number_emergency,0.079422


In [94]:
logit_cv_coefs.coef[0:15]

0     0.202739
1     0.201254
2     0.185901
3     0.132137
4     0.098250
5     0.102001
6     0.095155
7    -0.090354
8     0.082011
9     0.079422
10    0.073152
11    0.071927
12    0.063272
13    0.050810
14   -0.058058
Name: coef, dtype: float64

In [88]:
predicted_probs = pd.Series(grid_search_logit.best_estimator_.predict_proba(X_test_scaled)[:,1],
                            index=X_test_43.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])
overall = y_test.sum()/len(y_test)
print([(i.sum())/(len(i)) for i in results])
print([(i.sum())/(len(i))/overall for i in results])

tmp = pd.DataFrame([[(i.sum())/(len(i)) for i in results],[(i.sum())/(len(i))/overall for i in results]]).T
tmp.columns = ['% Rehospitalized','Lift']
dec = ['Decile 1','Decile 2','Decile 3','Decile 4','Decile 5','Decile 6','Decile 7','Decile 8','Decile 9','Decile 10']
tmp.index = dec
tmp

[0.037142857142857144, 0.04789135096497498, 0.061472480343102216, 0.06714285714285714, 0.07719799857040743, 0.07719799857040743, 0.09571428571428571, 0.11007862759113653, 0.12080057183702644, 0.20214285714285715]
[0.41416505406943654, 0.534017183588733, 0.6854548923676275, 0.7486829823562892, 0.8608038183221368, 0.8608038183221368, 1.067271485486625, 1.2274424816815654, 1.3469985675596399, 2.254013659647126]


Unnamed: 0,% Rehospitalized,Lift
Decile 1,0.037143,0.414165
Decile 2,0.047891,0.534017
Decile 3,0.061472,0.685455
Decile 4,0.067143,0.748683
Decile 5,0.077198,0.860804
Decile 6,0.077198,0.860804
Decile 7,0.095714,1.067271
Decile 8,0.110079,1.227442
Decile 9,0.120801,1.346999
Decile 10,0.202143,2.254014


In [89]:
print(f'Overall rate of readmission:\n{y_test.value_counts()/len(y_test)}')
print([i.sum()for i in results])
pd.DataFrame(bins).reset_index().groupby(0).count().T

Overall rate of readmission:
False    0.910319
True     0.089681
Name: readmit_30d, dtype: float64
[52, 67, 86, 94, 108, 108, 134, 154, 169, 283]


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
index,1400,1399,1399,1400,1399,1399,1400,1399,1399,1400


random forest with CV (93 features)

In [95]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(oob_score=True, class_weight = 'balanced_subsample', random_state = 42)

In [100]:
RF_grid_params = [{
    'n_estimators': [600], #range(100,800,100),
    'max_depth': [6], #[4,5,6],
    'max_features': ['sqrt'], #['sqrt','log2'],
    'criterion': ['gini'], #['gini','entropy'],
    'min_samples_leaf': [50], #[40,50,60],
    'min_samples_split': [300], #range(200,400,50),
    'class_weight': ['balanced_subsample'],
    'random_state': [42]}]
grid_search_RF = GridSearchCV(RF, RF_grid_params, scoring='roc_auc', cv=5, n_jobs=-1)
%time grid_search_RF.fit(X_train, y_train)

print(grid_search_RF.best_estimator_)
print('--'*30)
print('accuracy')
print(grid_search_RF.best_estimator_.score(X_train, y_train))
print(grid_search_RF.best_estimator_.score(X_test, y_test))
print('--'*30)
y_train_pred = grid_search_RF.best_estimator_.predict(X_train)
y_test_pred = grid_search_RF.best_estimator_.predict(X_test)
print('confusion matrix')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Wall time: 58 s
RandomForestClassifier(class_weight='balanced_subsample', max_depth=6,
                       max_features='sqrt', min_samples_leaf=50,
                       min_samples_split=300, n_estimators=600, oob_score=True,
                       random_state=42)
------------------------------------------------------------
accuracy
0.6519401171930828
0.6498499356867229
------------------------------------------------------------
confusion matrix
[[33668 17286]
 [ 2197  2825]]
[[8404 4335]
 [ 565  690]]
------------------------------------------------------------
AUC-ROC
0.6116388631865379
0.6047536050944976


random forest with CV (top 43 features from previous model)

In [101]:
RF_coefs = pd.DataFrame({'feature':X_train.columns,
                      'importance':grid_search_RF.best_estimator_.feature_importances_})
RF_43 = RF_coefs.sort_values('importance',ascending = False).head(43).feature.tolist()

In [102]:
X_train_43 = X_train[RF_43]
X_test_43 = X_test[RF_43]

In [103]:
RF_grid_params = [{
    'n_estimators': [300], #range(100,800,100),
    'max_depth': [6], #[4,5,6],
    'max_features': ['sqrt'], #['sqrt','log2'],
    'criterion': ['gini','entropy'], #['gini','entropy'],
    'min_samples_leaf': [50], #[40,50,60],
    'min_samples_split': [300], #range(200,400,50),
    'class_weight': ['balanced_subsample'],
    'random_state': [42]}]
grid_search_RF = GridSearchCV(RF, RF_grid_params, scoring='roc_auc', cv=5, n_jobs=-1)
%time grid_search_RF.fit(X_train_43, y_train)

print(grid_search_RF.best_estimator_)
print('--'*30)
print('accuracy')
print(grid_search_RF.best_estimator_.score(X_train_43, y_train))
print(grid_search_RF.best_estimator_.score(X_test_43, y_test))
print('--'*30)
y_train_pred = grid_search_RF.best_estimator_.predict(X_train_43)
y_test_pred = grid_search_RF.best_estimator_.predict(X_test_43)
print('confusion matrix')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Wall time: 43.2 s
RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
                       max_depth=6, max_features='sqrt', min_samples_leaf=50,
                       min_samples_split=300, n_estimators=300, oob_score=True,
                       random_state=42)
------------------------------------------------------------
accuracy
0.6550843218522224
0.6529941403458626
------------------------------------------------------------
confusion matrix
[[33860 17094]
 [ 2213  2809]]
[[8456 4283]
 [ 573  682]]
------------------------------------------------------------
AUC-ROC
0.611929924628686
0.6036073306272516


In [104]:
RF_coefs = pd.DataFrame({'feature':X_train_43.columns,
                         'importance':grid_search_RF.best_estimator_.feature_importances_})
RF_coefs.sort_values('importance',ascending = False).head(15)

Unnamed: 0,feature,importance
0,number_inpatient,0.194918
1,discharge_disposition_id_22,0.134458
3,time_in_hospital,0.10096
2,discharge_disposition_id_3,0.084984
4,age_num,0.072847
6,number_diagnoses,0.051148
5,num_medications,0.048522
7,num_lab_procedures,0.034522
8,discharge_disposition_id_5,0.034134
9,786_diag,0.024871


In [109]:
RF_coefs.importance[0:15]

0     0.194918
1     0.134458
2     0.084984
3     0.100960
4     0.072847
5     0.048522
6     0.051148
7     0.034522
8     0.034134
9     0.024871
10    0.021440
11    0.022759
12    0.017334
13    0.011777
14    0.014945
Name: importance, dtype: float64

In [105]:
predicted_probs = pd.Series(grid_search_RF.best_estimator_.predict_proba(X_test_43)[:,1],
                            index=X_test_43.index)
bins = pd.qcut(predicted_probs.sort_values(),q=10,labels=range(1,11))
results = []
for i in range(1,11):
    results.append(y_test[bins[bins==i].index])
overall = y_test.sum()/len(y_test)
print([(i.sum())/(len(i)) for i in results])
print([(i.sum())/(len(i))/overall for i in results])

tmp = pd.DataFrame([[(i.sum())/(len(i)) for i in results],[(i.sum())/(len(i))/overall for i in results]]).T
tmp.columns = ['% Rehospitalized','Lift']
dec = ['Decile 1','Decile 2','Decile 3','Decile 4','Decile 5','Decile 6','Decile 7','Decile 8','Decile 9','Decile 10']
tmp.index = dec
tmp

[0.04214285714285714, 0.05003573981415296, 0.06647605432451752, 0.06857142857142857, 0.062187276626161546, 0.08005718370264475, 0.08857142857142856, 0.10793423874195854, 0.12651894210150108, 0.2042857142857143]
[0.4699180421172453, 0.5579284007643479, 0.7412477324440623, 0.764612407512806, 0.6934252980928324, 0.8926854412229567, 0.9876243597040409, 1.2035312645059504, 1.4107618133612798, 2.2779077973819013]


Unnamed: 0,% Rehospitalized,Lift
Decile 1,0.042143,0.469918
Decile 2,0.050036,0.557928
Decile 3,0.066476,0.741248
Decile 4,0.068571,0.764612
Decile 5,0.062187,0.693425
Decile 6,0.080057,0.892685
Decile 7,0.088571,0.987624
Decile 8,0.107934,1.203531
Decile 9,0.126519,1.410762
Decile 10,0.204286,2.277908


In [106]:
print(f'Overall rate of readmission:\n{y_test.value_counts()/len(y_test)}')
print([i.sum()for i in results])
pd.DataFrame(bins).reset_index().groupby(0).count().T

Overall rate of readmission:
False    0.910319
True     0.089681
Name: readmit_30d, dtype: float64
[59, 70, 93, 96, 87, 112, 124, 151, 177, 286]


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
index,1400,1399,1399,1400,1399,1399,1400,1399,1399,1400


In [110]:
y_test.value_counts()

False    12739
True      1255
Name: readmit_30d, dtype: int64

gradient boosting with CV (93 features)

In [37]:
# resampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state = 42)
X_train, y_train = ros.fit_sample(X_train, y_train)
print(X_train.shape)
y_train.value_counts()

(101908, 93)


True     50954
False    50954
Name: readmit_30d, dtype: int64

In [38]:
from sklearn.ensemble import GradientBoostingClassifier
GB = GradientBoostingClassifier(random_state = 42)

In [39]:
GB_grid_params = [{
    'learning_rate': [1], #[0.8,0,9,1.0,1.1,1.2],
    'n_estimators': [300], #range(100,800,100),
    'max_depth': [4],
    'subsample': [0.8], #np.linspace(0.6,0.9,10),
#     'max_features':['sqrt','log2'],
#     'criterion': ['mse','friedman_mse','mae'],
    'min_samples_leaf': [50], #range(50,80,5),
    'min_samples_split': [300], #[400,500,600,700,800],
    'random_state':[42]
}]
grid_search_GB = GridSearchCV(GB, GB_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_GB.fit(X_train, y_train)

print(grid_search_GB.best_estimator_)
print('--'*30)
y_train_pred = grid_search_GB.best_estimator_.predict(X_train)
y_test_pred = grid_search_GB.best_estimator_.predict(X_test)
print('accuracy:')
print(grid_search_GB.best_estimator_.score(X_train, y_train))
print(grid_search_GB.best_estimator_.score(X_test, y_test))
print('confusion matrices:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.1min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.1min finished


Wall time: 3min 44s
GradientBoostingClassifier(learning_rate=1, max_depth=4, min_samples_leaf=50,
                           min_samples_split=300, n_estimators=300,
                           random_state=42, subsample=0.8)
------------------------------------------------------------
accuracy:
0.8317011422066962
0.723024153208518
confusion matrices:
[[40894 10060]
 [ 7091 43863]]
[[9680 3059]
 [ 817  438]]
------------------------------------------------------------
AUC-ROC:
0.8317011422066962
0.5544376227721189


gradient boosting with CV (top 43 features from previous model)

In [40]:
GB_coefs = pd.DataFrame({'feature':X_train.columns,
                      'importance':grid_search_GB.best_estimator_.feature_importances_})
GB_43 = RF_coefs.sort_values('importance',ascending = False).head(43).feature.tolist()

In [41]:
X_train_43 = X_train[GB_43]
X_test_43 = X_test[GB_43]

In [42]:
GB_grid_params = [{
#     'learning_rate': [0.8,0,9,1.0,1.1,1.2],
    'n_estimators': [600], #range(100,800,100),
    'max_depth': [6],
    'subsample': [0.8], #np.linspace(0.6,0.9,10),
    'max_features':['sqrt'],
#     'criterion': ['mse','friedman_mse','mae'],
    'min_samples_leaf': [50], #range(50,80,5),
    'min_samples_split': [500], #[400,500,600,700,800],
    'random_state':[42]
}]
grid_search_GB = GridSearchCV(GB, GB_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_GB.fit(X_train_43, y_train)

print(grid_search_GB.best_estimator_)
print('--'*30)
y_train_pred = grid_search_GB.best_estimator_.predict(X_train_43)
y_test_pred = grid_search_GB.best_estimator_.predict(X_test_43)
print('accuracy:')
print(grid_search_GB.best_estimator_.score(X_train_43, y_train))
print(grid_search_GB.best_estimator_.score(X_test_43, y_test))
print('confusion matrices:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   58.5s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   58.9s finished


Wall time: 1min 33s
GradientBoostingClassifier(max_depth=6, max_features='sqrt',
                           min_samples_leaf=50, min_samples_split=500,
                           n_estimators=600, random_state=42, subsample=0.8)
------------------------------------------------------------
accuracy:
0.7749146288809514
0.7190224381877948
confusion matrices:
[[39561 11393]
 [11545 39409]]
[[9537 3202]
 [ 730  525]]
------------------------------------------------------------
AUC-ROC:
0.7749146288809515
0.583486291899675


In [45]:
GB_coefs = pd.DataFrame({'feature':X_train_43.columns,
                      'importance':grid_search_GB.best_estimator_.feature_importances_})
GB_coefs.sort_values('importance', ascending = False).head(15)

Unnamed: 0,feature,importance
7,num_lab_procedures,0.125331
6,num_medications,0.105178
2,time_in_hospital,0.07168
0,number_inpatient,0.062906
4,age_num,0.06185
5,number_diagnoses,0.054292
17,num_procedures,0.044664
1,discharge_disposition_id_22,0.038652
21,number_outpatient,0.031803
3,discharge_disposition_id_3,0.030096
