### data prepping

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
diabetic = pd.read_csv('../../data/diabetic_data_initial.csv')

In [4]:
diabetic_df = diabetic.replace('?', np.nan)

diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)

diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index, inplace=True)

diabetic_df.drop(['encounter_id','patient_nbr','weight','medical_specialty','payer_code'],\
                 axis = 1, inplace = True)

diabetic_df['admission_type_id'] = diabetic_df['admission_type_id'].astype('str')
diabetic_df['admission_source_id'] = diabetic_df['admission_source_id'].astype('str')
diabetic_df['discharge_disposition_id'] = diabetic_df['discharge_disposition_id'].astype('str')

In [5]:
diabetic_df.shape

(69970, 45)

In [6]:
diabetic_df['readmit_30d'] = (diabetic_df['readmitted'] == '<30')
diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [0,10,20,30,40,50,60,70,80,90])

In [7]:
diabetic_df['diag_1'] = diabetic_df['diag_1'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_2'] = diabetic_df['diag_2'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_3'] = diabetic_df['diag_3'].str.split('.', expand = True).drop(1, axis = 1)

count_1 = diabetic_df['diag_1'].value_counts()
index_1 = count_1[count_1>=500].index.tolist()
count_2 = diabetic_df['diag_2'].value_counts()
index_2 = count_2[count_2>=500].index.tolist()
count_3 = diabetic_df['diag_3'].value_counts()
index_3 = count_3[count_3>=500].index.tolist()
diagnoses = set(index_1 + index_2 + index_3)

for d in diagnoses:
    diabetic_df[d+'_diag'] = ((diabetic_df['diag_1']==d)|
                              (diabetic_df['diag_2']==d)|
                              (diabetic_df['diag_3']==d))

In [8]:
for col in ['metformin','repaglinide','glimepiride','glipizide', 'glyburide','pioglitazone',
            'rosiglitazone','insulin']:
    diabetic_df[col+'_used'] = np.where(diabetic_df[col]=='No', False, True)
# 'nateglinide','glyburide-metformin' (close to 500)

In [9]:
diabetic_df = diabetic_df.drop(['age',
                                'diag_1','diag_2','diag_3',
                                'metformin','repaglinide','nateglinide','chlorpropamide','glimepiride',
                                'acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone',
                                'rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide',
                                'citoglipton','insulin','glyburide-metformin','glipizide-metformin',
                                'glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone',
                                'readmitted'],
                                 axis = 1)
#'max_glu_serum','A1Cresult'

In [10]:
diabetic_dum = pd.get_dummies(diabetic_df, drop_first = True)
diabetic_dum.shape

(69970, 119)

In [11]:
# dropping variables with <500 minority classes
descr = diabetic_dum.describe().T
under_500 = descr[descr['mean'] <= (500/len(diabetic_dum))].index.tolist()
under_500.remove('race_Asian')
diabetic_final = diabetic_dum.drop(under_500, axis = 1)
diabetic_final.shape

(69970, 94)

train test split

In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [13]:
features = diabetic_final.drop(['readmit_30d'], axis = 1)
target = diabetic_final['readmit_30d']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(features, target, 
                                                    test_size=0.2, stratify = target,
                                                    random_state = 42)

resampling methods

In [15]:
from imblearn.over_sampling import SMOTE, SVMSMOTE, RandomOverSampler
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks, NearMiss, RandomUnderSampler
from imblearn.combine import SMOTETomek

In [27]:
# SMOTE
sm = SMOTE(random_state = 42, k_neighbors = 23)
X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)
print(f'SMOTE: {X_train_sm.shape}')

# SVMSMOTE
svmsm = SVMSMOTE(k_neighbors = 23)
X_train_svmsm, y_train_svmsm = svmsm.fit_sample(X_train, y_train)
print(f'SVMSMOTE: {X_train_svmsm.shape}')

# RandomOverSampler
ros = RandomOverSampler(random_state = 42)
X_train_ros, y_train_ros = ros.fit_sample(X_train, y_train)
print(f'RandomOverSampler: {X_train_ros.shape}')

print('--'*30)
# EditedNearestNeighbours
enn = EditedNearestNeighbours(n_neighbors = 23)
X_train_enn, y_train_enn = enn.fit_sample(X_train, y_train)
print(f'EditedNearestNeighbours: {X_train_enn.shape}')

# TomekLinks
tl = TomekLinks()
X_train_tl, y_train_tl = tl.fit_sample(X_train, y_train)
print(f'TomekLinks: {X_train_tl.shape}')

# NearMiss
nm = NearMiss(n_neighbors = 23)
X_train_nm, y_train_nm = nm.fit_sample(X_train, y_train)
print(f'NearMiss: {X_train_nm.shape}')

# RandomUnderSample
rus = RandomUnderSampler(random_state = 42)
X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
print(f'RandomUnderSampler: {X_train_rus.shape}')

print('--'*30)
# SMOTETomek
smtom = SMOTETomek(random_state = 42, smote = sm)
X_train_smtom, y_train_smtom = smtom.fit_sample(X_train, y_train)
print(f'SMOTETomek: {X_train_smtom.shape}')

SMOTE: (101908, 93)
SVMSMOTE: (101908, 93)
RandomOverSampler: (101908, 93)
------------------------------------------------------------
EditedNearestNeighbours: (13233, 93)
TomekLinks: (54316, 93)
NearMiss: (10044, 93)
RandomUnderSampler: (10044, 93)
------------------------------------------------------------
SMOTETomek: (101346, 93)


### model testing

logistic regression

In [17]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [18]:
def resampling_models(X_re, y_re):
    logit_cv = LogisticRegressionCV(Cs = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100, 1e5], #class_weight = 'balanced',
                                    cv = 5, scoring = 'roc_auc', solver='liblinear',
                                    random_state = 108, n_jobs = -1)
    logit_cv.fit(X_re, y_re)

    print(f'best C: {logit_cv.C_[0]}')
    logit_best = LogisticRegression(C = logit_cv.C_[0], solver = 'liblinear', random_state = 108)
    logit_best.fit(X_re, y_re)

    print('accuracy:')
    print(logit_best.score(X_re, y_re))
    print(logit_best.score(X_test, y_test))
    print('--'*30)
    y_train_pred = logit_best.predict(X_re)
    y_test_pred = logit_best.predict(X_test)
    print('confusion matrix:')
    print(confusion_matrix(y_re, y_train_pred))
    print(confusion_matrix(y_test, y_test_pred))
    print('--'*30)
    print('AUC-ROC:')
    print(roc_auc_score(y_re, y_train_pred))
    print(roc_auc_score(y_test, y_test_pred))

In [19]:
# SMOTE
resampling_models(X_train_sm, y_train_sm)

best C: 0.1
accuracy:
0.8920987557404718
0.8520080034300415
------------------------------------------------------------
confusion matrix:
[[47066  3888]
 [ 7108 43846]]
[[11810   929]
 [ 1142   113]]
------------------------------------------------------------
AUC-ROC:
0.8920987557404718
0.5085570896412779


In [20]:
# SVMSMOTE
resampling_models(X_train_svmsm, y_train_svmsm)

best C: 0.1
accuracy:
0.9027456136907799
0.8571530655995426
------------------------------------------------------------
confusion matrix:
[[47436  3518]
 [ 6393 44561]]
[[11874   865]
 [ 1134   121]]
------------------------------------------------------------
AUC-ROC:
0.9027456136907799
0.5142563117496259


In [21]:
# RandomOverSampler
resampling_models(X_train_ros, y_train_ros)

best C: 0.1
accuracy:
0.6132688307100522
0.6577819065313706
------------------------------------------------------------
confusion matrix:
[[34167 16787]
 [22624 28330]]
[[8546 4193]
 [ 596  659]]
------------------------------------------------------------
AUC-ROC:
0.6132688307100522
0.597976443390423


In [22]:
# EditedNearestNeighbours
resampling_models(X_train_enn, y_train_enn)

best C: 1.0
accuracy:
0.8872217605175086
0.9093897384593397
------------------------------------------------------------
confusion matrix:
[[39436    63]
 [ 4958    64]]
[[12710    29]
 [ 1239    16]]
------------------------------------------------------------
AUC-ROC:
0.505574474817197
0.5052362650817562


In [23]:
# TomekLinks
resampling_models(X_train_tl, y_train_tl)

best C: 1.0
accuracy:
0.9073017158848221
0.9103901672145205
------------------------------------------------------------
confusion matrix:
[[49262    32]
 [ 5003    19]]
[[12733     6]
 [ 1248     7]]
------------------------------------------------------------
AUC-ROC:
0.5015670935092958
0.5025533473297328


In [24]:
# NearMiss
resampling_models(X_train_nm, y_train_nm)

best C: 10.0
accuracy:
0.784647550776583
0.37394597684722025
------------------------------------------------------------
confusion matrix:
[[4140  882]
 [1281 3741]]
[[4337 8402]
 [ 359  896]]
------------------------------------------------------------
AUC-ROC:
0.784647550776583
0.5271974039629221


In [25]:
# RandomUnderSampler
resampling_models(X_train_rus, y_train_rus)

best C: 0.1
accuracy:
0.6149940262843488
0.6529941403458626
------------------------------------------------------------
confusion matrix:
[[3354 1668]
 [2199 2823]]
[[8477 4262]
 [ 594  661]]
------------------------------------------------------------
AUC-ROC:
0.6149940262843488
0.5960650372839437


In [26]:
# SMOTETomek
resampling_models(X_train_smtom, y_train_smtom)

best C: 0.1
accuracy:
0.8919345608114775
0.8520794626268401
------------------------------------------------------------
confusion matrix:
[[46801  3872]
 [ 7080 43593]]
[[11810   929]
 [ 1141   114]]
------------------------------------------------------------
AUC-ROC:
0.8919345608114776
0.5089554960157798


In [42]:
# running original X_train and y_train on class_weight = 'balanced'
logit_cv = LogisticRegressionCV(Cs = np.linspace(1e-2, 1, 30), cv = 5, class_weight = 'balanced',
                                scoring = 'roc_auc', solver='liblinear', random_state = 108, n_jobs = -1)
logit_cv.fit(X_train, y_train)

print(f'best C: {logit_cv.C_[0]}')
logit_best = LogisticRegression(C = logit_cv.C_[0], solver = 'liblinear', 
                                random_state = 108)
logit_best.fit(X_train, y_train)

print('accuracy:')
print(logit_best.score(X_train, y_train))
print(logit_best.score(X_test, y_test))
print('--'*30)
y_train_pred = logit_best.predict(X_train)
y_test_pred = logit_best.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

best C: 0.41965517241379313
accuracy:
0.9100686008289267
0.9102472488209232
------------------------------------------------------------
confusion matrix:
[[50923    31]
 [ 5003    19]]
[[12733     6]
 [ 1250     5]]
------------------------------------------------------------
AUC-ROC:
0.501587480681422
0.5017565345807289


In [45]:
logit_cv.scores_

{True: array([[0.61455455, 0.62661953, 0.62790883, 0.62870272, 0.62896342,
         0.6291585 , 0.62929538, 0.62936646, 0.62944867, 0.62947269,
         0.62952083, 0.62961066, 0.62965449, 0.62965918, 0.62965098,
         0.62968291, 0.62966397, 0.62952864, 0.62967324, 0.62912842,
         0.62970966, 0.62955266, 0.62955598, 0.62955852, 0.62956066,
         0.6296916 , 0.62955578, 0.62956359, 0.62955793, 0.62955188],
        [0.62678527, 0.64653915, 0.64906921, 0.6497816 , 0.65019062,
         0.65024848, 0.65046721, 0.65050767, 0.65064177, 0.65067021,
         0.65070764, 0.65074009, 0.65075103, 0.65073549, 0.65073569,
         0.65075309, 0.65076745, 0.65074927, 0.65077009, 0.65075318,
         0.65076843, 0.65076462, 0.6507699 , 0.65076853, 0.65077459,
         0.65079032, 0.65078827, 0.65075934, 0.65077146, 0.65082912],
        [0.63200143, 0.64772814, 0.64909067, 0.64981654, 0.65010515,
         0.65028108, 0.65035809, 0.65043403, 0.65048847, 0.65061328,
         0.65055942, 0.650

In [44]:
logit_best = LogisticRegression(solver = 'liblinear', random_state = 108)
logit_best.fit(X_train, y_train)

print('accuracy:')
print(logit_best.score(X_train, y_train))
print(logit_best.score(X_test, y_test))
print('--'*30)
y_train_pred = logit_best.predict(X_train)
y_test_pred = logit_best.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.9100507360297271
0.9102472488209232
------------------------------------------------------------
confusion matrix:
[[50922    32]
 [ 5003    19]]
[[12733     6]
 [ 1250     5]]
------------------------------------------------------------
AUC-ROC:
0.5015776679091176
0.5017565345807289


In [82]:
from imblearn.ensemble import RUSBoostClassifier

In [83]:
rusb = RUSBoostClassifier(learning_rate = 0.11)
rusb.fit(X_train_ros, y_train_ros)
y_train_pred = rusb.predict(X_train_ros)
y_test_pred = rusb.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train_ros, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train_ros, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

confusion matrix:
[[36595 14359]
 [26725 24229]]
[[9171 3568]
 [ 646  609]]
------------------------------------------------------------
AUC-ROC:
0.5968520626447383
0.6025870925591926


In [85]:
RUS = RUSBoostClassifier()

RUSBoostClassifier()

In [91]:
# GB grid search
RUS_grid_params = [{
    'learning_rate':[0.6,0.8,1.0,1.2],
    'n_estimators': [80], #range(50,90,10),
    'sampling_strategy': ['auto'],
    'random_state':[42]
}]
grid_search_RUS = GridSearchCV(RUS, RUS_grid_params, scoring='roc_auc', cv=5, verbose = 1, n_jobs=-1)
%time grid_search_RUS.fit(X_train_ros, y_train_ros)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  7.1min finished


Wall time: 9min 1s


GridSearchCV(cv=5, estimator=RUSBoostClassifier(), n_jobs=-1,
             param_grid=[{'learning_rate': [0.6, 0.8, 1.0, 1.2],
                          'n_estimators': [80], 'random_state': [42],
                          'sampling_strategy': ['auto']}],
             scoring='roc_auc', verbose=1)

In [92]:
grid_search_RUS.best_params_

{'learning_rate': 1.0,
 'n_estimators': 80,
 'random_state': 42,
 'sampling_strategy': 'auto'}

In [93]:
y_train_pred = grid_search_RUS.best_estimator_.predict(X_train_ros)
y_test_pred = grid_search_RUS.best_estimator_.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train_ros, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train_ros, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

confusion matrix:
[[34092 16862]
 [22290 28664]]
[[8529 4210]
 [ 573  682]]
------------------------------------------------------------
AUC-ROC:
0.6158103387368999
0.6064725476772556
