### data prep

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Lift pandas' display truncation so every column and every row of a
# displayed frame is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [3]:
# Raw encounter-level data; the path is relative to the notebook's directory.
raw_csv_path = '../../data/diabetic_data_initial.csv'
diabetic = pd.read_csv(raw_csv_path)

In [4]:
# Build the modelling cohort from the raw frame. Every step returns a new
# object, so `diabetic` is left untouched and the cell can be re-run safely.
excluded_dispositions = [11, 13, 14, 19, 20, 21]
unused_columns = ['encounter_id', 'patient_nbr', 'weight', 'medical_specialty', 'payer_code']
id_columns = ['admission_type_id', 'admission_source_id', 'discharge_disposition_id']

# Treat '?' as missing, then keep one row per patient: the first one after
# ordering by encounter_id.
diabetic_df = (
    diabetic
    .replace('?', np.nan)
    .sort_values('encounter_id')
    .drop_duplicates(subset=['patient_nbr'], keep='first')
)

# Row filters: drop the excluded discharge dispositions and the rows whose
# gender is recorded as 'Unknown/Invalid'.
# NOTE(review): the clinical meaning of the excluded disposition codes is not
# visible in this notebook -- confirm against the dataset's ID mapping.
keep_rows = (
    ~diabetic_df['discharge_disposition_id'].isin(excluded_dispositions)
    & (diabetic_df['gender'] != 'Unknown/Invalid')
)
diabetic_df = diabetic_df[keep_rows].drop(columns=unused_columns)

# The *_id columns are cast to strings so that they are one-hot encoded later
# instead of being treated as numeric quantities.
diabetic_df = diabetic_df.astype({column: 'str' for column in id_columns})

In [5]:
# Sanity check: (rows, columns) remaining after the cleaning cell above.
diabetic_df.shape

(69970, 45)

In [6]:
# Binary target: True only when `readmitted` equals '<30'; every other value
# of that column maps to False.
diabetic_df['readmit_30d'] = (diabetic_df['readmitted'] == '<30')

# diabetic_df['age_group'] = diabetic_df['age'].replace(
#     ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
#     ['inf to adole','inf to adole','adult','adult','mid-age','mid-age','senior','senior','senior','senior'])

# Encode each ten-year age bracket by its lower bound: '[0-10)' -> 0, ...,
# '[90-100)' -> 90.
# An explicit `.map` is used instead of `.replace` so the result is always a
# numeric column. `.replace` from strings to ints relies on pandas silently
# downcasting an object column, which newer pandas versions deprecate; an
# object-typed `age_num` would be one-hot encoded by `get_dummies` later on.
# Any label outside the ten brackets becomes NaN rather than staying a string.
age_lower_bound = {f'[{low}-{low + 10})': low for low in range(0, 100, 10)}
diabetic_df['age_num'] = diabetic_df['age'].map(age_lower_bound)

In [7]:
# Reduce each diagnosis code to the part before the decimal point and create
# one boolean flag per frequently occurring code.
diag_cols = ['diag_1', 'diag_2', 'diag_3']
MIN_DIAG_COUNT = 500  # a code must occur at least this often in one diag column

# `.str.split('.').str[0]` keeps the text before the first '.', leaves codes
# without a dot unchanged and propagates NaN. Unlike `expand=True` followed by
# `.drop(1, axis=1)`, it does not depend on how many pieces the split yields.
for diag_col in diag_cols:
    diabetic_df[diag_col] = diabetic_df[diag_col].str.split('.').str[0]

# Per-column frequency tables and the codes that reach the threshold.
count_1, count_2, count_3 = (diabetic_df[diag_col].value_counts() for diag_col in diag_cols)
index_1 = count_1[count_1 >= MIN_DIAG_COUNT].index.tolist()
index_2 = count_2[count_2 >= MIN_DIAG_COUNT].index.tolist()
index_3 = count_3[count_3 >= MIN_DIAG_COUNT].index.tolist()

diagnoses = set(index_1 + index_2 + index_3)

# Iterate in sorted order: the iteration order of a set of strings can change
# between interpreter sessions, which would reorder the generated columns from
# run to run.
for d in sorted(diagnoses):
    # True when the code appears in any of the three diagnosis columns.
    diabetic_df[d+'_diag'] = diabetic_df[diag_cols].eq(d).any(axis=1)

In [8]:
# Medications that get a boolean "<name>_used" flag: True for any recorded
# value other than 'No'.
# Left out per the original note: 'nateglinide','glyburide-metformin' (close to 500)
tracked_drugs = ['metformin', 'repaglinide', 'glimepiride', 'glipizide',
                 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin']
for drug in tracked_drugs:
    diabetic_df[drug+'_used'] = diabetic_df[drug].ne('No')

In [9]:
# Columns removed before encoding: the raw age bracket, the raw diagnosis
# codes, every raw medication column and the raw readmission label.
raw_medication_columns = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone',
]
columns_to_drop = ['age', 'diag_1', 'diag_2', 'diag_3'] + raw_medication_columns + ['readmitted']
diabetic_df = diabetic_df.drop(columns=columns_to_drop)

In [10]:
# One-hot encode the remaining string/categorical columns, dropping the first
# level of each to avoid a redundant indicator.
# `dtype='uint8'` pins the indicators to a numeric dtype. pandas >= 2.0 emits
# bool indicators by default, and the rarity filter in the next cell is built
# on `describe()`, which only summarises numeric columns.
diabetic_dum = pd.get_dummies(diabetic_df, drop_first = True, dtype = 'uint8')
diabetic_dum.shape

(69970, 119)

In [11]:
# dropping variables with <500 minority classes
descr = diabetic_dum.describe().T
under_500 = descr[descr['mean'] <= (500/len(diabetic_dum))].index.tolist()
under_500.remove('race_Asian')
diabetic_final = diabetic_dum.drop(under_500, axis = 1)
diabetic_final.shape

(69970, 94)

#### train/test split

In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [13]:
# Separate the label from the predictors.
target = diabetic_final['readmit_30d']
features = diabetic_final.drop(columns=['readmit_30d'])

In [14]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both parts; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=42,
)

#### resampling

In [16]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [17]:
# RandomOverSampler
ros = RandomOverSampler(random_state = 42)
X_train, y_train = ros.fit_sample(X_train, y_train)
print(f'RandomOverSampler: {X_train.shape}')

# # RandomUnderSample
# rus = RandomUnderSampler(random_state = 42)
# X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
# print(f'RandomUnderSampler: {X_train_rus.shape}')

RandomOverSampler: (101908, 93)


#### select important features

In [None]:
# Subset of predictors used by every model below.
selected = [
    'number_inpatient', 'discharge_disposition_id_22', 'time_in_hospital',
    'discharge_disposition_id_3', 'age_num', 'num_medications',
    'num_lab_procedures', 'number_diagnoses', 'discharge_disposition_id_5', 'num_procedures',
    '786_diag', '434_diag', 'number_emergency', '428_diag', 'diabetesMed_Yes',
]
X_train_s, X_test_s = X_train[selected], X_test[selected]

# Row/column counts of the reduced train and test matrices.
print(X_train_s.shape)
print(X_test_s.shape)

### model testing

#### linear models

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [None]:
logit = LogisticRegression(random_state = 108)
logit.fit(X_train_s, y_train_ros)

logit_grid_params = [{
    'C': np.linspace(1e-3,0.1,50),
    'penalty':['l1','l2'],
    'class_weight': [None, 'balanced'],
    'solver': ['liblinear'],
    'random_state':[108]
}]
grid_search_logit = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=5,
                                 verbose = 1, n_jobs=-1, )
%time grid_search_logit.fit(X_train_s, y_train_ros)

print(grid_search_logit.best_params_)
print('accuracy:')
print(grid_search_logit.best_estimator_.score(X_train_s, y_train_ros))
print(grid_search_logit.best_estimator_.score(X_test_s, y_test))
print('--'*30)
y_train_pred = grid_search_logit.best_estimator_.predict(X_train_s)
y_test_pred = grid_search_logit.best_estimator_.predict(X_test_s)
print('confusion matrix:')
print(confusion_matrix(y_train_ros, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train_ros, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

#### random forest

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
RFC = RandomForestClassifier(oob_score=True, class_weight = 'balanced_subsample', random_state = 108)

# RF grid search
RF_grid_params = [{
    'n_estimators': [500], #range(100,600,100),
    'max_depth': [5], #[2,4,6,8],
    'max_features': ['sqrt'],
    'criterion': ['gini'],
    'min_samples_leaf': [50], 
    'min_samples_split': [250,350], 
    'class_weight': ['balanced_subsample']
}]
grid_search_RF = GridSearchCV(RFC, RF_grid_params, scoring='roc_auc', cv=5, n_jobs=-1)
%time grid_search_RF.fit(X_train_s, y_train_ros)

In [None]:
# best params
print(grid_search_RF.best_params_)
grid_search_RF.best_estimator_

In [None]:
# Evaluate the tuned random forest on the resampled training data and on the
# untouched test split. `y_train` holds the labels aligned with `X_train_s`;
# the name `y_train_ros` used previously is not defined by any cell shown in
# this notebook.
best_RF = grid_search_RF.best_estimator_

print('accuracy')
print(best_RF.score(X_train_s, y_train))
print(best_RF.score(X_test_s, y_test))
print('--'*30)
y_train_pred = best_RF.predict(X_train_s)
y_test_pred = best_RF.predict(X_test_s)
print('confusion matrix')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC')
# ROC-AUC needs a continuous score: use the predicted probability of the
# positive class (column 1 = True) rather than the thresholded predictions.
y_train_score = best_RF.predict_proba(X_train_s)[:, 1]
y_test_score = best_RF.predict_proba(X_test_s)[:, 1]
print(roc_auc_score(y_train, y_train_score))
print(roc_auc_score(y_test, y_test_score))

In [None]:
# Rank the selected features by the tuned forest's importances, highest first.
best_forest = grid_search_RF.best_estimator_
RF_fp = (
    pd.DataFrame({'feature': X_train_s.columns,
                  'importance': best_forest.feature_importances_})
    .sort_values('importance', ascending=False)
)
RF_fp.head(15)