### data prepping

In [144]:
import pandas as pd
import numpy as np

In [145]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [146]:
diabetic = pd.read_csv('../../data/diabetic_data_initial.csv')
diabetic.shape

(101766, 50)

In [147]:
# cleaning
diabetic_df = diabetic.replace('?', np.nan)

diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)

diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index, inplace=True)

diabetic_df.drop(['encounter_id','patient_nbr','weight','medical_specialty','payer_code'],\
                 axis = 1, inplace = True)

diabetic_df['admission_type_id'] = diabetic_df['admission_type_id'].astype('str')
diabetic_df['admission_source_id'] = diabetic_df['admission_source_id'].astype('str')
diabetic_df['discharge_disposition_id'] = diabetic_df['discharge_disposition_id'].astype('str')

In [148]:
diabetic_df.shape

(69970, 45)

In [149]:
# feature engineering
diabetic_df['readmit_30d'] = (diabetic_df['readmitted'] == '<30')

diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [0,10,20,30,40,50,60,70,80,90])

diabetic_df['diag_1'] = diabetic_df['diag_1'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_2'] = diabetic_df['diag_2'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_3'] = diabetic_df['diag_3'].str.split('.', expand = True).drop(1, axis = 1)
count_1 = diabetic_df['diag_1'].value_counts()
index_1 = count_1[count_1>=500].index.tolist()
count_2 = diabetic_df['diag_2'].value_counts()
index_2 = count_2[count_2>=500].index.tolist()
count_3 = diabetic_df['diag_3'].value_counts()
index_3 = count_3[count_3>=500].index.tolist()
diagnoses = set(index_1 + index_2 + index_3)
for d in diagnoses:
    diabetic_df[d+'_diag'] = ((diabetic_df['diag_1']==d)|
                              (diabetic_df['diag_2']==d)|
                              (diabetic_df['diag_3']==d))
    
for col in ['metformin','repaglinide','glimepiride','glipizide', 'glyburide','pioglitazone',
            'rosiglitazone','insulin']:
    diabetic_df[col+'_used'] = np.where(diabetic_df[col]=='No', False, True)

In [150]:
diabetic_df = diabetic_df.drop(['age',
                                'diag_1','diag_2','diag_3',
                                'metformin','repaglinide','nateglinide','chlorpropamide','glimepiride',
                                'acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone',
                                'rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide',
                                'citoglipton','insulin','glyburide-metformin','glipizide-metformin',
                                'glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone',
                                'readmitted'],
                                 axis = 1)

In [151]:
diabetic_dum = pd.get_dummies(diabetic_df, drop_first = True)
diabetic_dum.shape

(69970, 119)

In [152]:
# dropping variables with <500 minority classes
descr = diabetic_dum.describe().T
under_500 = descr[descr['mean'] <= (500/len(diabetic_dum))].index.tolist()
under_500.remove('race_Asian')
diabetic_final = diabetic_dum.drop(under_500, axis = 1)
diabetic_final.shape

(69970, 94)

train test split

In [153]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [154]:
features = diabetic_final.drop(['readmit_30d'], axis = 1)
target = diabetic_final['readmit_30d']

In [155]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2,stratify = target,
                                                    random_state = 42)

checking VIF

In [156]:
# X_vif = pd.DataFrame(X_train, dtype=float)

# from statsmodels.stats.outliers_influence import variance_inflation_factor 
# vif_data = pd.DataFrame() 
# vif_data["feature"] = X_vif.columns
# vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
#                           for i in range(len(X_vif.columns))] 
# print(vif_data)

### model testing

logistic regression with no regularization or CV (93 features)

In [157]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [158]:
logit = LogisticRegression(C = 1e7, solver='liblinear', random_state = 42, class_weight = 'balanced')
logit.fit(X_train, y_train)

LogisticRegression(C=10000000.0, class_weight='balanced', random_state=42,
                   solver='liblinear')

In [159]:
print('accuracy:')
print(logit.score(X_train, y_train))
print(logit.score(X_test, y_test))
print('--'*30)
y_train_pred = logit.predict(X_train)
y_test_pred = logit.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.6613727311705017
0.6632842646848649
------------------------------------------------------------
confusion matrix:
[[34227 16727]
 [ 2228  2794]]
[[8620 4119]
 [ 593  662]]
------------------------------------------------------------
AUC-ROC:
0.6140377831516285
0.602076129112563


logistic regression with no regularization or CV (top 43 features from previous model)

In [160]:
logit_coefs = pd.DataFrame({'features': X_train.columns,
                            'coef': logit.coef_[0]})
logit_coefs['abs_coef'] = np.abs(logit_coefs['coef'])
logit_43 = logit_coefs.sort_values('abs_coef', ascending = False).iloc[:,0:2].head(43).features.tolist()

In [161]:
X_train_43 = X_train[logit_43]
X_test_43 = X_test[logit_43]

In [162]:
logit = LogisticRegression(C = 1e7, solver='liblinear', random_state = 42, class_weight = 'balanced')
logit.fit(X_train_43, y_train)

print('accuracy:')
print(logit.score(X_train_43, y_train))
print(logit.score(X_test_43, y_test))
print('--'*30)
y_train_pred = logit.predict(X_train_43)
y_test_pred = logit.predict(X_test_43)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.6620515935400886
0.6642132342432471
------------------------------------------------------------
confusion matrix:
[[34359 16595]
 [ 2322  2700]]
[[8647 4092]
 [ 607  648]]
------------------------------------------------------------
AUC-ROC:
0.6059742479090354
0.5975581776825504


In [163]:
logit_coefs = pd.DataFrame({'features': X_train_43.columns,
                            'coef': logit.coef_[0]})
logit_coefs['abs_coef'] = np.abs(logit_coefs['coef'])
logit_coefs.sort_values('abs_coef', ascending = False).iloc[:,0:2].head(15)

Unnamed: 0,features,coef
0,discharge_disposition_id_22,1.519637
1,discharge_disposition_id_5,1.251283
2,discharge_disposition_id_2,0.719786
4,discharge_disposition_id_3,0.683762
3,722_diag,-0.619348
6,discharge_disposition_id_4,0.57737
5,discharge_disposition_id_18,0.543966
8,786_diag,-0.475954
10,403_diag,0.441048
9,440_diag,0.426389


logistic regression with regularization and CV (93 features)

In [164]:
# standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [165]:
logit = LogisticRegression(max_iter = 1000, class_weight = 'balanced', random_state = 42)
logit.fit(X_train_scaled, y_train)

logit_grid_params = [{'C': np.linspace(1e-3,0.5,50),'penalty':['l1','l2'],
                      'class_weight':[None,'balanced'],'solver': ['liblinear']}]
grid_search_logit = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_logit.fit(X_train_scaled, y_train)

print(grid_search_logit.best_estimator_)
print('--'*30)
print('accuracy:')
print(grid_search_logit.best_estimator_.score(X_train_scaled, y_train))
print(grid_search_logit.best_estimator_.score(X_test_scaled, y_test))
print('--'*30)
y_train_pred = grid_search_logit.best_estimator_.predict(X_train_scaled)
y_test_pred = grid_search_logit.best_estimator_.predict(X_test_scaled)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.8min finished


Wall time: 3min 50s
LogisticRegression(C=0.011183673469387756, class_weight='balanced',
                   max_iter=1000, penalty='l1', random_state=42,
                   solver='liblinear')
------------------------------------------------------------
accuracy:
0.6642132342432471
0.6654994997856224
------------------------------------------------------------
confusion matrix:
[[34428 16526]
 [ 2270  2752]]
[[8636 4103]
 [ 578  677]]
------------------------------------------------------------
AUC-ROC:
0.6118285494290249
0.6086802175081758


logistic regression with regularization and CV (top 43 features from previous model)

In [166]:
logit_cv_coefs = pd.DataFrame({'features': X_train.columns,'coef': grid_search_logit.best_estimator_.coef_[0]})
logit_cv_coefs['abs_coef'] = np.abs(logit_cv_coefs['coef'])
logit_cv_43 = logit_cv_coefs.sort_values('abs_coef', ascending = False).iloc[:,0:2].head(43).features.tolist()

In [136]:
X_train_43 = X_train[logit_cv_43]
X_test_43 = X_test[logit_cv_43]

# # standardization
# scaler = StandardScaler().fit(X_train_43)
# X_train_scaled = scaler.transform(X_train_43)
# X_test_scaled = scaler.transform(X_test_43)

In [142]:
logit = LogisticRegression(max_iter = 1000, class_weight = 'balanced', random_state = 42)
logit.fit(X_train_scaled, y_train)

logit_grid_params = [{'C': np.linspace(1e-4,0.1,50),'penalty':['l1','l2'],
                      'class_weight':['balanced'],'solver': ['liblinear']}]
grid_search_logit = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_logit.fit(X_train, y_train)

print(grid_search_logit.best_estimator_)
print('--'*30)
print('accuracy:')
print(grid_search_logit.best_estimator_.score(X_train, y_train))
print(grid_search_logit.best_estimator_.score(X_test, y_test))
print('--'*30)
y_train_pred = grid_search_logit.best_estimator_.predict(X_train)
y_test_pred = grid_search_logit.best_estimator_.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   35.2s finished


Wall time: 35.5 s
LogisticRegression(C=0.0009061224489795917, class_weight='balanced',
                   max_iter=1000, random_state=42, solver='liblinear')
------------------------------------------------------------
accuracy:
0.6574246105473774
0.6576389881377733
------------------------------------------------------------
confusion matrix:
[[34016 16938]
 [ 2238  2784]]
[[8532 4207]
 [ 584  671]]
------------------------------------------------------------
AUC-ROC:
0.6109716689202044
0.6022078262036241


In [110]:
logit_cv_coefs = pd.DataFrame({'features': X_train_43.columns,'coef': grid_search_logit.best_estimator_.coef_[0]})
logit_cv_coefs['abs_coef'] = np.abs(logit_cv_coefs['coef'])
logit_cv_coefs.sort_values('abs_coef', ascending = False).iloc[:,0:2].head(15)

Unnamed: 0,features,coef
0,number_inpatient,0.18185
1,discharge_disposition_id_22,0.174372
2,discharge_disposition_id_3,0.154015
3,discharge_disposition_id_5,0.113616
5,diabetesMed_Yes,0.084625
4,discharge_disposition_id_2,0.083586
7,786_diag,-0.077248
6,discharge_disposition_id_18,0.07358
8,403_diag,0.072752
9,number_emergency,0.069256


random forest with CV (93 features)

In [28]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(oob_score=True, class_weight = 'balanced_subsample', random_state = 42)

In [35]:
RF_grid_params = [{
    'n_estimators': [600], #range(100,800,100),
    'max_depth': [6], #[4,5,6],
    'max_features': ['sqrt'], #['sqrt','log2'],
    'criterion': ['gini'], #['gini','entropy'],
    'min_samples_leaf': [50], #[40,50,60],
    'min_samples_split': [300], #range(200,400,50),
    'class_weight': ['balanced_subsample'],
    'random_state': [42]}]
grid_search_RF = GridSearchCV(RF, RF_grid_params, scoring='roc_auc', cv=5, n_jobs=-1)
%time grid_search_RF.fit(X_train, y_train)

print(grid_search_RF.best_estimator_)
print('--'*30)
print('accuracy')
print(grid_search_RF.best_estimator_.score(X_train, y_train))
print(grid_search_RF.best_estimator_.score(X_test, y_test))
print('--'*30)
y_train_pred = grid_search_RF.best_estimator_.predict(X_train)
y_test_pred = grid_search_RF.best_estimator_.predict(X_test)
print('confusion matrix')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Wall time: 1min 56s
RandomForestClassifier(class_weight='balanced_subsample', max_depth=6,
                       min_samples_leaf=50, min_samples_split=300,
                       n_estimators=600, oob_score=True, random_state=42)
------------------------------------------------------------
accuracy
0.6506002572531084
0.6489924253251393
------------------------------------------------------------
confusion matrix
[[33551 17403]
 [ 2155  2867]]
[[8380 4359]
 [ 553  702]]
------------------------------------------------------------
AUC-ROC
0.6146723697827139
0.6085924924213969


random forest with CV (top 43 features from previous model)

In [40]:
RF_coefs = pd.DataFrame({'feature':X_train.columns,
                      'importance':grid_search_RF.best_estimator_.feature_importances_})
RF_43 = RF_coefs.sort_values('importance',ascending = False).head(43).feature.tolist()

In [42]:
X_train_43 = X_train[RF_43]
X_test_43 = X_test[RF_43]

In [45]:
RF_grid_params = [{
    'n_estimators': [300], #range(100,800,100),
    'max_depth': [6], #[4,5,6],
    'max_features': ['sqrt'], #['sqrt','log2'],
    'criterion': ['gini'], #['gini','entropy'],
    'min_samples_leaf': [50], #[40,50,60],
    'min_samples_split': [250], #range(200,400,50),
    'class_weight': ['balanced_subsample'],
    'random_state': [42]}]
grid_search_RF = GridSearchCV(RF, RF_grid_params, scoring='roc_auc', cv=5, n_jobs=-1)
%time grid_search_RF.fit(X_train_43, y_train)

print(grid_search_RF.best_estimator_)
print('--'*30)
print('accuracy')
print(grid_search_RF.best_estimator_.score(X_train_43, y_train))
print(grid_search_RF.best_estimator_.score(X_test_43, y_test))
print('--'*30)
y_train_pred = grid_search_RF.best_estimator_.predict(X_train_43)
y_test_pred = grid_search_RF.best_estimator_.predict(X_test_43)
print('confusion matrix')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Wall time: 1min 7s
RandomForestClassifier(class_weight='balanced_subsample', max_depth=6,
                       max_features='sqrt', min_samples_leaf=50,
                       min_samples_split=250, n_estimators=300, oob_score=True,
                       random_state=42)
------------------------------------------------------------
accuracy
0.6550843218522224
0.6547091610690295
------------------------------------------------------------
confusion matrix
[[33858 17096]
 [ 2211  2811]]
[[8478 4261]
 [ 571  684]]
------------------------------------------------------------
AUC-ROC
0.612109422939115
0.6052676334461198


In [None]:
RF_coefs = pd.DataFrame({'feature':X_train_43.columns,
                         'importance':grid_search_RF.best_estimator_.feature_importances_})
RF_coefs.sort_values('importance',ascending = False).head(15)

gradient boosting with CV (93 features)

In [46]:
# resampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state = 42)
X_train, y_train = ros.fit_sample(X_train, y_train)
print(X_train.shape)
y_train.value_counts()

(101908, 93)


True     50954
False    50954
Name: readmit_30d, dtype: int64

In [47]:
from sklearn.ensemble import GradientBoostingClassifier
GB = GradientBoostingClassifier(random_state = 42)

In [59]:
GB_grid_params = [{
    'learning_rate': [1], #[0.8,0,9,1.0,1.1,1.2],
    'n_estimators': [300], #range(100,800,100),
    'max_depth': [4],
    'subsample': [0.8], #np.linspace(0.6,0.9,10),
#     'max_features':['sqrt','log2'],
#     'criterion': ['mse','friedman_mse','mae'],
    'min_samples_leaf': [50], #range(50,80,5),
    'min_samples_split': [300], #[400,500,600,700,800],
    'random_state':[42]
}]
grid_search_GB = GridSearchCV(GB, GB_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_GB.fit(X_train, y_train)

print(grid_search_GB.best_estimator_)
print('--'*30)
y_train_pred = grid_search_GB.best_estimator_.predict(X_train)
y_test_pred = grid_search_GB.best_estimator_.predict(X_test)
print('accuracy:')
print(grid_search_GB.best_estimator_.score(X_train, y_train))
print(grid_search_GB.best_estimator_.score(X_test, y_test))
print('confusion matrices:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.8min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min finished


Wall time: 3min
GradientBoostingClassifier(learning_rate=1, max_depth=4, min_samples_leaf=50,
                           min_samples_split=300, n_estimators=300,
                           random_state=42, subsample=0.8)
------------------------------------------------------------
accuracy:
0.8317011422066962
0.723024153208518
confusion matrices:
[[40894 10060]
 [ 7091 43863]]
[[9680 3059]
 [ 817  438]]
------------------------------------------------------------
AUC-ROC:
0.8317011422066962
0.5544376227721189


gradient boosting with CV (top 43 features from previous model)

In [51]:
GB_coefs = pd.DataFrame({'feature':X_train.columns,
                      'importance':grid_search_GB.best_estimator_.feature_importances_})
GB_43 = RF_coefs.sort_values('importance',ascending = False).head(43).feature.tolist()

In [53]:
X_train_43 = X_train[GB_43]
X_test_43 = X_test[GB_43]

In [None]:
GB_grid_params = [{
#     'learning_rate': [0.8,0,9,1.0,1.1,1.2],
    'n_estimators': [600], #range(100,800,100),
    'max_depth': [6],
    'subsample': [0.8], #np.linspace(0.6,0.9,10),
    'max_features':['sqrt'],
#     'criterion': ['mse','friedman_mse','mae'],
    'min_samples_leaf': [50], #range(50,80,5),
    'min_samples_split': [500], #[400,500,600,700,800],
    'random_state':[42]
}]
grid_search_GB = GridSearchCV(GB, GB_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_GB.fit(X_train_43, y_train)

print(grid_search_GB.best_estimator_)
print('--'*30)
y_train_pred = grid_search_GB.best_estimator_.predict(X_train_43)
y_test_pred = grid_search_GB.best_estimator_.predict(X_test_43)
print('accuracy:')
print(grid_search_GB.best_estimator_.score(X_train_43, y_train))
print(grid_search_GB.best_estimator_.score(X_test_43, y_test))
print('confusion matrices:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

In [None]:
GB_coefs = pd.DataFrame({'feature':X_train.columns,
                      'importance':grid_search_GB.best_estimator_.feature_importances_})
GB_coef.sort_values('importance', ascending = False).head(15)