preparing final dataset

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw encounter-level CSV.
# pandas' built-in NA markers are disabled: newer pandas versions count the
# literal string 'None' among them, which would turn the 'None' category of
# max_glu_serum / A1Cresult into NaN and drop the *_None dummy columns this
# notebook produces later. Only empty fields are parsed as NaN here; the '?'
# placeholder is still converted explicitly in the following cell.
diabetic = pd.read_csv('../../data/diabetic_data_initial.csv',
                       keep_default_na=False,
                       na_values=[''])

In [5]:
# (rows, columns) of the raw file exactly as loaded.
diabetic.shape

(101766, 50)

In [6]:
# Work on a copy in which the '?' placeholder is replaced by a real NaN.
diabetic_df = diabetic.replace(to_replace='?', value=np.nan)

In [7]:
# One row per patient: order by encounter_id, then keep the first (lowest-id)
# encounter seen for each patient_nbr.
diabetic_df = (
    diabetic_df
    .sort_values(by='encounter_id')
    .drop_duplicates(subset='patient_nbr', keep='first')
)

In [8]:
# Size after keeping a single encounter per patient.
diabetic_df.shape

(71518, 50)

In [9]:
# Remove encounters whose discharge disposition code is one of the listed ids.
# NOTE(review): these codes presumably correspond to hospice/death outcomes —
# confirm against the dataset's ID mapping, and check whether code 11 should be
# excluded as well.
diabetic_df = diabetic_df.loc[
    ~diabetic_df['discharge_disposition_id'].isin([13, 14, 19, 20, 21])
]

In [10]:
# Size after removing the excluded discharge dispositions.
diabetic_df.shape

(71050, 50)

dropping encounter_id, patient_nbr, and columns with high missingness

In [11]:
# Discard the two identifier columns plus the columns the notebook treats as
# too incomplete to use.
diabetic_df = diabetic_df.drop(
    columns=[
        'encounter_id', 'patient_nbr',
        'admission_source_id', 'weight', 'medical_specialty', 'payer_code',
    ]
)

binarizing target variable

In [12]:
# Boolean target: True only where 'readmitted' equals '<30'.
diabetic_df['readmit_30d'] = diabetic_df['readmitted'].eq('<30')

In [13]:
# Columns remaining after the drops above, including the new readmit_30d flag.
diabetic_df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'readmit_30d'],
      dtype='object')

In [34]:
# Among rows with insulin == 'No', how often is any diabetes medication recorded?
diabetic_df.loc[diabetic_df['insulin'].eq('No'), 'diabetesMed'].value_counts()

Yes    17666
No     17049
Name: diabetesMed, dtype: int64

In [30]:
# Count of insulin == 'No' rows per primary diagnosis code, largest first.
(
    diabetic_df.loc[diabetic_df['insulin'] == 'No']
    .groupby('diag_1')[['insulin']]
    .count()
    .reset_index()
    .sort_values(by='insulin', ascending=False)
)

Unnamed: 0,diag_1,insulin
205,414,2717
217,428,1926
459,786,1920
86,250,1452
201,410,1302
...,...,...
145,335,1
402,683,1
502,842,1
148,34,1


discretizing id columns

In [17]:
# Store the admission type id as text so it is handled as a label, not a number.
diabetic_df['admission_type_id'] = diabetic_df['admission_type_id'].astype(str)

simplifying age column

In [47]:
# Collapse the ten 10-year age brackets into four coarse groups.
diabetic_df['age_group'] = diabetic_df['age'].replace({
    '[0-10)': 'inf to adole',
    '[10-20)': 'inf to adole',
    '[20-30)': 'adult',
    '[30-40)': 'adult',
    '[40-50)': 'mid-age',
    '[50-60)': 'mid-age',
    '[60-70)': 'senior',
    '[70-80)': 'senior',
    '[80-90)': 'senior',
    '[90-100)': 'senior',
})

In [48]:
# Distinct labels present in age_group after the mapping.
diabetic_df['age_group'].unique()

array(['senior', 'mid-age', 'adult', 'inf to adole'], dtype=object)

In [49]:
# Number of rows per age_group label.
diabetic_df['age_group'].value_counts()

senior          47223
mid-age         19313
adult            3825
inf to adole      689
Name: age_group, dtype: int64

removing sub-ICDs and adding diag-based features

In [18]:
# Keep only the part of each diagnosis code before the first '.', i.e. drop the
# sub-classification. Taking element 0 of the split yields a Series directly and
# works whether or not any code in the column contains a '.'; the previous
# expand/drop form raised KeyError when no value had a '.' and produced extra
# columns when a value had more than one. Missing codes stay NaN.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    diabetic_df[diag_col] = diabetic_df[diag_col].str.split('.', n=1).str[0]

In [19]:
# True when code '250' appears in any of the three diagnosis columns.
diabetic_df['diag_250_top3'] = (
    diabetic_df[['diag_1', 'diag_2', 'diag_3']].eq('250').any(axis=1)
)

remove columns used to engineer features

In [50]:
# Build the modelling frame by removing the raw id, age, diagnosis and
# per-drug columns. diabetic_df itself is left untouched.
# NOTE(review): 'readmitted' is not removed here, so it is still present when
# the frame is dummy-encoded; make sure it is excluded before modelling.
diabetic_final = diabetic_df.drop(
    columns=[
        # id / demographic source columns
        'admission_type_id', 'discharge_disposition_id', 'age',
        # raw diagnosis codes
        'diag_1', 'diag_2', 'diag_3',
        # individual medication columns
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
        'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
        'miglitol', 'troglitazone', 'tolazamide', 'examide',
        'citoglipton', 'insulin',
        # combination medication columns
        'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone',
    ]
)

In [51]:
# Columns left in diabetic_final after the drop above.
diabetic_final.columns

Index(['race', 'gender', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed', 'readmitted',
       'readmit_30d', 'diag_250_top3', 'age_group'],
      dtype='object')

dummification

In [53]:
# One-hot encode the remaining text columns; drop_first removes the first level
# of each so that level acts as the reference.
# NOTE(review): rows with a missing value in an encoded column get all-zero
# indicators and so look identical to the reference level — confirm this is
# acceptable.
diabetic_final = pd.get_dummies(data=diabetic_final, drop_first=True)
diabetic_final.columns

Index(['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'readmit_30d', 'diag_250_top3',
       'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other',
       'gender_Male', 'gender_Unknown/Invalid', 'max_glu_serum_>300',
       'max_glu_serum_None', 'max_glu_serum_Norm', 'A1Cresult_>8',
       'A1Cresult_None', 'A1Cresult_Norm', 'change_No', 'diabetesMed_Yes',
       'readmitted_>30', 'readmitted_NO', 'age_group_inf to adole',
       'age_group_mid-age', 'age_group_senior'],
      dtype='object')

train test split

In [22]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [None]:
# 'readmit_30d' was derived from 'readmitted', and the one-hot step leaves
# 'readmitted_*' indicator columns in diabetic_final. Using those as predictors
# would leak the label into the model, so every column that encodes the outcome
# is removed from the feature matrix.
outcome_cols = [col for col in diabetic_final.columns
                if col == 'readmit_30d' or col.startswith('readmitted')]
features = diabetic_final.drop(columns=outcome_cols)
target = diabetic_final['readmit_30d']

In [None]:
# 80/20 hold-out split, stratified on the target so both parts keep the same
# class balance; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target,
    test_size=0.2, random_state=42, stratify=target,
)

logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix 

# Base estimator for the grid search. The target is a single boolean column, so
# the explicit multi_class='ovr' setting added nothing and is dropped: the
# parameter is deprecated in recent scikit-learn releases, and liblinear fits a
# binary problem the same way without it.
logit = LogisticRegression(solver='liblinear')


CV

In [None]:
# Search 20 log-spaced values of the inverse regularisation strength C with
# 3-fold cross-validation. The former iid=True argument is omitted: GridSearchCV
# no longer accepts it (deprecated in scikit-learn 0.22, removed in 0.24), so
# passing it raises TypeError on current versions. Fold scores are now averaged
# without weighting by fold size.
params = {'C': np.logspace(-4, 4, 20)}
gs_logit = GridSearchCV(logit, params, cv=3)

model with best params