### data prep

In [2]:
import pandas as pd
import numpy as np

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
diabetic = pd.read_csv('../../data/diabetic_data_initial.csv')

In [5]:
diabetic_df = diabetic.replace('?', np.nan)

diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)

diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index, inplace=True)

diabetic_df.drop(['encounter_id','patient_nbr','weight','medical_specialty','payer_code'],\
                 axis = 1, inplace = True)

diabetic_df['admission_type_id'] = diabetic_df['admission_type_id'].astype('str')
diabetic_df['admission_source_id'] = diabetic_df['admission_source_id'].astype('str')
diabetic_df['discharge_disposition_id'] = diabetic_df['discharge_disposition_id'].astype('str')

In [6]:
diabetic_df.shape

(69970, 45)

In [7]:
diabetic_df['readmit_30d'] = (diabetic_df['readmitted'] == '<30')

# diabetic_df['age_group'] = diabetic_df['age'].replace(
#     ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
#     ['inf to adole','inf to adole','adult','adult','mid-age','mid-age','senior','senior','senior','senior'])
diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [0,10,20,30,40,50,60,70,80,90])

In [8]:
diabetic_df['diag_1'] = diabetic_df['diag_1'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_2'] = diabetic_df['diag_2'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_3'] = diabetic_df['diag_3'].str.split('.', expand = True).drop(1, axis = 1)

count_1 = diabetic_df['diag_1'].value_counts()
index_1 = count_1[count_1>=500].index.tolist()

count_2 = diabetic_df['diag_2'].value_counts()
index_2 = count_2[count_2>=500].index.tolist()

count_3 = diabetic_df['diag_3'].value_counts()
index_3 = count_3[count_3>=500].index.tolist()

diagnoses = set(index_1 + index_2 + index_3)

for d in diagnoses:
    diabetic_df[d+'_diag'] = ((diabetic_df['diag_1']==d)|
                              (diabetic_df['diag_2']==d)|
                              (diabetic_df['diag_3']==d))

In [9]:
for col in ['metformin','repaglinide','glimepiride','glipizide', 'glyburide','pioglitazone',
            'rosiglitazone','insulin']:
    diabetic_df[col+'_used'] = np.where(diabetic_df[col]=='No', False, True)
# 'nateglinide','glyburide-metformin' (close to 500)

In [10]:
diabetic_df = diabetic_df.drop(['age',
                               'diag_1','diag_2','diag_3',
                               'metformin','repaglinide','nateglinide','chlorpropamide','glimepiride',
                               'acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone',
                               'rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide',
                               'citoglipton','insulin','glyburide-metformin','glipizide-metformin',
                               'glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone',
                               'readmitted'],
                               axis = 1)

In [11]:
diabetic_dum = pd.get_dummies(diabetic_df, drop_first = True)
diabetic_dum.shape

(69970, 119)

In [12]:
# dropping variables with <500 minority classes
descr = diabetic_dum.describe().T
under_500 = descr[descr['mean'] <= (500/len(diabetic_dum))].index.tolist()
under_500.remove('race_Asian')
diabetic_final = diabetic_dum.drop(under_500, axis = 1)
diabetic_final.shape

(69970, 94)

train test split

In [13]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [14]:
features = diabetic_final.drop(['readmit_30d'], axis = 1)
target = diabetic_final['readmit_30d']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size=0.2,
                                                    stratify = target,
                                                    random_state = 42)

over- or under- sample

In [16]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [17]:
# RandomOverSampler
ros = RandomOverSampler(random_state = 42)
X_train_ros, y_train_ros = ros.fit_sample(X_train, y_train)
print(f'RandomOverSampler: {X_train_ros.shape}')

# # RandomUnderSample
# rus = RandomUnderSampler(random_state = 42)
# X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
# print(f'RandomUnderSampler: {X_train_rus.shape}')

RandomOverSampler: (101908, 93)


### model testing

random forest with oversampled data

In [18]:
from sklearn.metrics import confusion_matrix, roc_auc_score

In [19]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(oob_score=True, class_weight = 'balanced_subsample', random_state = 108)
RFC

RandomForestClassifier(class_weight='balanced_subsample', oob_score=True,
                       random_state=108)

In [20]:
# RF grid search
RF_grid_params = [{
    'n_estimators': [700], #range(100,900,100),
    'max_depth': [6],
#     'max_features': ['sqrt',10,11],
    'criterion': ['gini'], #['gini','entropy'],
    'min_samples_leaf': [60,70], #[60,70,80],
    'min_samples_split': [750], #range(500,800,50)
    'class_weight': ['balanced_subsample'],
    'random_state': [108]}]
grid_search_RF = GridSearchCV(RFC, RF_grid_params, scoring='roc_auc', cv=5, n_jobs=-1)
%time grid_search_RF.fit(X_train_ros, y_train_ros)

Wall time: 11min 59s


GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced_subsample',
                                              oob_score=True,
                                              random_state=108),
             n_jobs=-1,
             param_grid=[{'class_weight': ['balanced_subsample'],
                          'criterion': ['gini'], 'max_depth': [6],
                          'min_samples_leaf': [60, 70],
                          'min_samples_split': [750], 'n_estimators': [700],
                          'random_state': [108]}],
             scoring='roc_auc')

In [21]:
# best params and estimator
print(grid_search_RF.best_params_)
grid_search_RF.best_estimator_

{'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 60, 'min_samples_split': 750, 'n_estimators': 700, 'random_state': 108}


RandomForestClassifier(class_weight='balanced_subsample', max_depth=6,
                       min_samples_leaf=60, min_samples_split=750,
                       n_estimators=700, oob_score=True, random_state=108)

In [22]:
print('accuracy')
print(grid_search_RF.best_estimator_.score(X_train_ros, y_train_ros))
print(grid_search_RF.best_estimator_.score(X_test, y_test))
print('--'*30)
y_train_pred = grid_search_RF.best_estimator_.predict(X_train_ros)
y_test_pred = grid_search_RF.best_estimator_.predict(X_test)
print('confusion matrix')
print(confusion_matrix(y_train_ros, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC')
print(roc_auc_score(y_train_ros, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy
0.6137398437806649
0.6407746176932971
------------------------------------------------------------
confusion matrix
[[33078 17876]
 [21487 29467]]
[[8258 4481]
 [ 546  709]]
------------------------------------------------------------
AUC-ROC
0.6137398437806649
0.6065928921100276


In [30]:
RF_fp = pd.DataFrame({'feature':X_train_ros.columns,
                      'importance':grid_search_RF.best_estimator_.feature_importances_}).\
sort_values('importance',ascending = False)
RF_fp.head(15)

Unnamed: 0,feature,importance
6,number_inpatient,0.181835
73,discharge_disposition_id_22,0.131854
75,discharge_disposition_id_3,0.092326
0,time_in_hospital,0.090017
8,age_num,0.063736
3,num_medications,0.047339
7,number_diagnoses,0.046058
77,discharge_disposition_id_5,0.040184
37,786_diag,0.032154
41,434_diag,0.028988


gradient boosting

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
GB = GradientBoostingClassifier(random_state = 108)
GB

# GB grid search
GB_grid_params = [{
    'learning_rate': [1.0], #[0.8,1.0,1.2],
    'n_estimators': [700], #range(200,800,100),
    'max_depth': [6],
    'subsample': [0.7,0.8], #np.linspace(0.6,0.9,10),
#     'max_features':['sqrt',7,9],
#     'criterion': ['mse','friedman_mse','mae'],
    'min_samples_leaf': [60], #[50], #range(60,90,5),
    'min_samples_split': [750], #[500,600,700,800,900],
    'random_state':[108]
}]
grid_search_GB = GridSearchCV(GB, GB_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_GB.fit(X_train_ros, y_train_ros)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 10.7min remaining:  7.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 15.0min finished


Wall time: 20min 9s


GridSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=108),
             n_jobs=-1,
             param_grid=[{'learning_rate': [1.0], 'max_depth': [6],
                          'min_samples_leaf': [60], 'min_samples_split': [750],
                          'n_estimators': [700], 'random_state': [108],
                          'subsample': [0.7, 0.8]}],
             scoring='roc_auc', verbose=1)

In [33]:
print(grid_search_GB.best_params_)
print(grid_search_GB.best_params_)
# print(grid_search_GB.cv_results_['mean_test_score'])
print('--'*30)

# using grid search best estimator
y_train_pred = grid_search_GB.best_estimator_.predict(X_train_ros)
y_test_pred = grid_search_GB.best_estimator_.predict(X_test)
print('accuracy:')
print(grid_search_GB.best_estimator_.score(X_train_ros, y_train_ros))
print(grid_search_GB.best_estimator_.score(X_test, y_test))
print('confusion matrices:')
print(confusion_matrix(y_train_ros, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train_ros, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

{'learning_rate': 1.0, 'max_depth': 6, 'min_samples_leaf': 60, 'min_samples_split': 750, 'n_estimators': 700, 'random_state': 108, 'subsample': 0.8}
{'learning_rate': 1.0, 'max_depth': 6, 'min_samples_leaf': 60, 'min_samples_split': 750, 'n_estimators': 700, 'random_state': 108, 'subsample': 0.8}
------------------------------------------------------------
accuracy:
0.989196137692821
0.8274260397313135
confusion matrices:
[[49897  1057]
 [   44 50910]]
[[11378  1361]
 [ 1054   201]]
------------------------------------------------------------
AUC-ROC:
0.989196137692821
0.5266610455892108


In [34]:
pd.DataFrame({'feature': X_train_ros.columns, 'importance':grid_search_GB.best_estimator_.feature_importances_}).\
sort_values('importance', ascending = False).head(20)

Unnamed: 0,feature,importance
1,num_lab_procedures,0.127352
3,num_medications,0.103645
0,time_in_hospital,0.065149
8,age_num,0.050293
7,number_diagnoses,0.043815
6,number_inpatient,0.038345
2,num_procedures,0.038268
75,discharge_disposition_id_3,0.021616
4,number_outpatient,0.019608
73,discharge_disposition_id_22,0.018804


kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
print('accuracy')
print(knn.score(X_train, y_train))
print(knn.score(X_test, y_test))

print('--'*30)
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

print('confusion matrix')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))

print('--'*30)
print('AUC-ROC')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

In [None]:
# kNN CV

RFE CV

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
logit.set_params(C = 0.1)

In [None]:
logit_rfecv = RFECV(estimator=logit, step=1, cv=3, scoring='roc_auc')

In [None]:
logit_rfecv.fit(X_train, y_train)

In [None]:
logit_rfecv.grid_scores_

In [None]:
rfecv_ranking = pd.DataFrame({'feature':X_train.columns, 'ranking':logit_rfecv.ranking_})
rfecv_ranking[rfecv_ranking['ranking'] == 1]

In [None]:
pd.qcut()