### data prepping

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
diabetic = pd.read_csv('../../data/diabetic_data_initial.csv')

In [4]:
diabetic.shape

(101766, 50)

In [5]:
diabetic_df = diabetic.replace('?', np.nan)

diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)

diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index, inplace=True)

diabetic_df.drop(['encounter_id','patient_nbr','weight','medical_specialty','payer_code'],\
                 axis = 1, inplace = True)

diabetic_df['admission_type_id'] = diabetic_df['admission_type_id'].astype('str')
diabetic_df['admission_source_id'] = diabetic_df['admission_source_id'].astype('str')
diabetic_df['discharge_disposition_id'] = diabetic_df['discharge_disposition_id'].astype('str')

In [6]:
diabetic_df.shape

(69970, 45)

In [7]:
diabetic_df['readmit_30d'] = (diabetic_df['readmitted'] == '<30')

# diabetic_df['age_group'] = diabetic_df['age'].replace(
#     ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
#     ['inf to adole','inf to adole','adult','adult','mid-age','mid-age','senior','senior','senior','senior'])
diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [0,10,20,30,40,50,60,70,80,90])

In [8]:
diabetic_df['diag_1'] = diabetic_df['diag_1'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_2'] = diabetic_df['diag_2'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_3'] = diabetic_df['diag_3'].str.split('.', expand = True).drop(1, axis = 1)

count_1 = diabetic_df['diag_1'].value_counts()
index_1 = count_1[count_1>=500].index.tolist()

count_2 = diabetic_df['diag_2'].value_counts()
index_2 = count_2[count_2>=500].index.tolist()

count_3 = diabetic_df['diag_3'].value_counts()
index_3 = count_3[count_3>=500].index.tolist()

diagnoses = set(index_1 + index_2 + index_3)

for d in diagnoses:
    diabetic_df[d+'_diag'] = ((diabetic_df['diag_1']==d)|
                              (diabetic_df['diag_2']==d)|
                              (diabetic_df['diag_3']==d))

In [9]:
for col in ['metformin','repaglinide','glimepiride','glipizide', 'glyburide','pioglitazone',
            'rosiglitazone','insulin']:
    diabetic_df[col+'_used'] = np.where(diabetic_df[col]=='No', False, True)
# 'nateglinide','glyburide-metformin' (close to 500)

In [10]:
diabetic_df = diabetic_df.drop(['age',
                                'diag_1','diag_2','diag_3',
                                'metformin','repaglinide','nateglinide','chlorpropamide','glimepiride',
                                'acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone',
                                'rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide',
                                'citoglipton','insulin','glyburide-metformin','glipizide-metformin',
                                'glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone',
                                'readmitted'],
                                 axis = 1)
#'max_glu_serum','A1Cresult'

In [11]:
diabetic_dum = pd.get_dummies(diabetic_df, drop_first = True)
diabetic_dum.shape

(69970, 119)

In [12]:
# dropping variables with <500 minority classes
descr = diabetic_dum.describe().T
under_500 = descr[descr['mean'] <= (500/len(diabetic_dum))].index.tolist()
under_500.remove('race_Asian')
diabetic_final = diabetic_dum.drop(under_500, axis = 1)
diabetic_final.shape

(69970, 94)

train test split

In [13]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [14]:
features = diabetic_final.drop(['readmit_30d'], axis = 1)
target = diabetic_final['readmit_30d']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size=0.2,
                                                    stratify = target,
                                                    random_state = 42)

In [16]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler, NearMiss

# SMOTE doesn't provide close-to-true oversampling at k=319
# smote = SMOTE(random_state = 42, k_neighbors=319)
# X_train, y_train = smote.fit_sample(X_train, y_train)

ros = RandomOverSampler(random_state = 42)
X_train, y_train = ros.fit_sample(X_train, y_train)
print(X_train.shape)
print(y_train.shape)

(101908, 93)
(101908,)


standardization

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

checking VIF

In [19]:
# X_vif = pd.DataFrame(X_train, dtype=float)

# from statsmodels.stats.outliers_influence import variance_inflation_factor 
# vif_data = pd.DataFrame() 
# vif_data["feature"] = X_vif.columns
# vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
#                           for i in range(len(X_vif.columns))] 
# print(vif_data)

### model testing

logistic regression with no regularization

In [20]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [21]:
# logistic regression with no regularization or tuning
logit = LogisticRegression(solver='liblinear', random_state = 108)
logit.fit(X_train, y_train)

LogisticRegression(random_state=108, solver='liblinear')

In [22]:
print('accuracy:')
print(logit.score(X_train, y_train))
print(logit.score(X_test, y_test))
print('--'*30)
y_train_pred = logit.predict(X_train)
y_test_pred = logit.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.6125524983318287
0.656924396169787
------------------------------------------------------------
confusion matrix:
[[34148 16806]
 [22678 28276]]
[[8537 4202]
 [ 599  656]]
------------------------------------------------------------
AUC-ROC:
0.6125524983318287
0.5964279783292453


logistic regression with CV

In [31]:
logit = LogisticRegression(random_state = 108, max_iter = 1000)
logit.fit(X_train, y_train)

logit_grid_params = [{
    'C': np.linspace(5e-3,1,50),
    'penalty':['l1','l2'],
    'solver': ['liblinear'],
    'class_weight':[None,'balanced'],
    'random_state':[108]
}]

grid_search_logit = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=10,
                                 verbose = 1, n_jobs=-1)
%time grid_search_logit.fit(X_train, y_train)

print(grid_search_logit.best_params_)
print(grid_search_logit.best_estimator_)
print('--'*30)
print('accuracy:')
print(grid_search_logit.best_estimator_.score(X_train, y_train))
print(grid_search_logit.best_estimator_.score(X_test, y_test))
print('--'*30)
y_train_pred = grid_search_logit.best_estimator_.predict(X_train)
y_test_pred = grid_search_logit.best_estimator_.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 18.9min finished


Wall time: 18min 58s
{'C': 0.025306122448979593, 'class_weight': None, 'penalty': 'l1', 'random_state': 108, 'solver': 'liblinear'}
LogisticRegression(C=0.025306122448979593, max_iter=1000, penalty='l1',
                   random_state=108, solver='liblinear')
------------------------------------------------------------
accuracy:
0.6124838089256976
0.6604258968129199
------------------------------------------------------------
confusion matrix:
[[34310 16644]
 [22847 28107]]
[[8583 4156]
 [ 596  659]]
------------------------------------------------------------
AUC-ROC:
0.6124838089256978
0.59942867668974


In [37]:
print(grid_search_logit.best_estimator_)

LogisticRegression(C=0.025306122448979593, max_iter=1000, penalty='l1',
                   random_state=108, solver='liblinear')


In [36]:
cv_coefs = pd.DataFrame({'features': features.columns,
                         'coef': grid_search_logit.best_estimator_.coef_[0]}).\
sort_values('coef', ascending = False)
cv_coefs['abs. coef'] = np.abs(cv_coefs['coef'])
cv_coefs.sort_values(['abs. coef'], ascending = False).iloc[:,0:2].head(15)

Unnamed: 0,features,coef
73,discharge_disposition_id_22,0.264303
6,number_inpatient,0.260653
75,discharge_disposition_id_3,0.199311
77,discharge_disposition_id_5,0.169602
72,discharge_disposition_id_2,0.103426
71,discharge_disposition_id_18,0.097572
92,diabetesMed_Yes,0.096132
39,786_diag,-0.084026
20,403_diag,0.082585
5,number_emergency,0.078046


In [30]:
print(len(cv_coefs[cv_coefs['coef']!=0]))
cv_coefs[cv_coefs['coef']!=0]

88


Unnamed: 0,features,coef
73,discharge_disposition_id_22,0.265281
6,number_inpatient,0.261026
75,discharge_disposition_id_3,0.200323
77,discharge_disposition_id_5,0.170518
72,discharge_disposition_id_2,0.10409
71,discharge_disposition_id_18,0.098634
92,diabetesMed_Yes,0.097055
20,403_diag,0.082926
5,number_emergency,0.078775
78,discharge_disposition_id_6,0.07487


In [None]:
# from sklearn import metrics
# sorted(metrics.SCORERS.keys())

logistic regression with lasso

In [None]:
logit_l1 = LogisticRegression(solver='liblinear', class_weight = 'balanced', penalty = 'l1', random_state = 108)
logit_l1

In [None]:
# logistic regression lasso CV
logit_l1_cv = LogisticRegressionCV(Cs = np.linspace(1e-3, 0.3, 50), cv = 3, penalty = 'l1',
                                scoring = 'roc_auc', solver='liblinear', random_state = 108)
logit_l1_cv.fit(X_train, y_train)

# best C
logit_l1_cv.C_

In [None]:
logit_l1_best = LogisticRegression(C = logit_l1_cv.C_[0], solver='liblinear',
                                   penalty = 'l1', random_state = 108)
logit_l1_best.fit(X_train, y_train)

print(grid_search_logit.best_params_)
print(grid_search_logit.best_estimator_)
print('--'*30)
print('accuracy:')
print(logit_l1_best.score(X_train, y_train))
print(logit_l1_best.score(X_test, y_test))
print('--'*30)
y_train_pred = logit_l1_best.predict(X_train)
y_test_pred = logit_l1_best.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

In [None]:
lasso_cv_coefs = pd.DataFrame({'features': X_train.columns,
                               'coef': logit_l1_cv.coef_[0]}).sort_values('coef', ascending = False)

# non-zero coefficients
print(len(lasso_cv_coefs[lasso_cv_coefs['coef']!=0]))
lasso_cv_coefs[lasso_cv_coefs['coef']!=0]

logistic and let CV decide all parameters

In [None]:
logit = LogisticRegression(solver='liblinear', class_weight = 'balanced', random_state = 108, )
logit

In [None]:
# logicstic regression CV
logit_cv = LogisticRegressionCV(Cs = np.linspace(1e-4, 0.1, 20), 
                                cv = 3, class_weight = 'balanced',
                                penalty = ['l1','l2'],
                                scoring = 'roc_auc',
                                solver='liblinear',
                                random_state = 108)
logit_cv.fit(X_train, y_train)

# best C
logit_cv.C_

SGD

In [None]:
sgd = SGDClassifier(class_weight = 'balanced', random_state = 108)
sgd

In [None]:
# SGD CV
SGD_grid_params = [{
    'class_weight':['balanced'],
    'learning_rate':['optimal'],
    'alpha': np.linspace(0.001, 0.01, 30),
    'random_state': [108]}]
grid_search_SGD = GridSearchCV(sgd, SGD_grid_params, scoring='roc_auc', cv=3, n_jobs=-1)
%time grid_search_SGD.fit(X_train, y_train)

In [None]:
grid_search_SGD.best_params_

In [None]:
sgd_best = SGDClassifier(alpha = grid_search_SGD.best_params_['alpha'], class_weight = 'balanced', 
                         random_state = 108)
sgd_best.fit(X_train, y_train)

In [None]:
print('accuracy')
print(sgd_best.score(X_train, y_train))
print(sgd_best.score(X_test, y_test))

print('--'*30)
y_train_pred = sgd_best.predict(X_train)
y_test_pred = sgd_best.predict(X_test)

print('confusion matrix')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))

print('--'*30)
print('AUC-ROC')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))