## Supervised Learning Capstone: Diabetic patient readmission prediction

In [164]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy as stats
import seaborn as sns

In [165]:
diabetes= pd.read_csv('diabetic_data1.csv')
#preview of dataset
diabetes.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [166]:
#let's explore data!
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  101766 non-null object
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            10176

In [167]:
diabetes.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [122]:
#Let's look for null
diabetes.isnull().sum()*100/diabetes.shape[0]

encounter_id                0.0
patient_nbr                 0.0
race                        0.0
gender                      0.0
age                         0.0
weight                      0.0
admission_type_id           0.0
discharge_disposition_id    0.0
admission_source_id         0.0
time_in_hospital            0.0
payer_code                  0.0
medical_specialty           0.0
num_lab_procedures          0.0
num_procedures              0.0
num_medications             0.0
number_outpatient           0.0
number_emergency            0.0
number_inpatient            0.0
diag_1                      0.0
diag_2                      0.0
diag_3                      0.0
number_diagnoses            0.0
max_glu_serum               0.0
A1Cresult                   0.0
metformin                   0.0
repaglinide                 0.0
nateglinide                 0.0
chlorpropamide              0.0
glimepiride                 0.0
acetohexamide               0.0
glipizide                   0.0
glyburid

In [168]:
#define our target variable
diabetes['target'] = (diabetes.readmitted == '<30').astype('int')

### I want to seperate my numerical values from my categorical values. 

In [169]:
#Let's look at categorical columns first. 
cat_cols= diabetes.select_dtypes(include=['object'])

cat_cols.head()

Unnamed: 0,race,gender,age,weight,payer_code,medical_specialty,diag_1,diag_2,diag_3,max_glu_serum,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),?,?,Pediatrics-Endocrinology,250.83,?,?,,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),?,?,?,276.0,250.01,255,,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),?,?,?,648.0,250,V27,,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),?,?,?,8.0,250.43,403,,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),?,?,?,197.0,157,250,,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


### It's saying there are no null values, but we can see there are '?' values. Let's dig deeper. 

In [125]:
#looking for ? values in categorical columns
for col in diabetes.columns:
    if diabetes[col].dtype == object:
         print(col,diabetes[col][diabetes[col] == '?'].count())

race 2273
gender 0
age 0
weight 98569
payer_code 40256
medical_specialty 49949
diag_1 21
diag_2 358
diag_3 1423
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0


In [126]:
#let's replace ? with 'Other'
diabetes.race.replace('?', 'Other', inplace=True)
diabetes.race.value_counts()

Caucasian          76099
AfricanAmerican    19210
Other               3779
Hispanic            2037
Asian                641
Name: race, dtype: int64

In [127]:
missing_weight= diabetes[(diabetes.weight=='?')].shape[0]/len(diabetes)
missing_payer_code=diabetes[(diabetes.payer_code=='?')].shape[0]/len(diabetes)
missing_med_spec=diabetes[(diabetes.medical_specialty=='?')].shape[0]/len(diabetes)

print ('Weight has {:.2%} missing. Payer code has {:.2%} missing and medical specialty has {:.2%} missing'.format
(missing_weight, missing_payer_code,missing_med_spec))


Weight has 96.86% missing. Payer code has 39.56% missing and medical specialty has 49.08% missing


In [128]:
#here I'm dropping columns that have plenty of missing values. Weight would have been ideal as 
#demographic variable, unfortunately, 96% of data is missing.
#payer code is for insurance purposes and medical specialty is about specialty of admitting doctors, which doesn't 
#really apply to the patient's care or outcome.
diabetes = diabetes.drop(['medical_specialty','payer_code','weight'],axis=1)

In [129]:
#dropping columns that have high cardinality
#max_cardinality = 100

#high_cardinality = [col for col in diabetes.select_dtypes(exclude=np.number)
#                   if diabetes[col].nunique() > max_cardinality]
#diabetes = diabetes.drop(columns=high_cardinality)
#diabetes.di

In [130]:
#creating my categorical columns of value. For now I leave out diag_1, diag_2, diag_3 which are high cardinality 
#columns. Those columns represent ICD-9 codes, which may be useful. Especially when looking at readmission rates for 
# patients with comorbidities. 
#We will explore diag_1 , diag_2, diag_3 in a seperate section.
cat_cols = ['race', 'gender', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed']

In [131]:
#I want to create dummy variable for these 3 columns, making them into strings first. 
cols_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

diabetes[cols_cat_num] = diabetes[cols_cat_num].astype('str')


In [134]:
df_cat = pd.get_dummies(diabetes[cat_cols + cols_cat_num ],drop_first = True)
diabetes = pd.concat([diabetes,df_cat], axis = 1)
diabetes.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,41,...,0,0,0,0,0,0,0,0,0,0
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,...,0,0,0,0,0,0,0,1,0,0
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,...,0,0,0,0,0,0,0,1,0,0
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,0,0,0,0,0,0,0,1,0,0
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,...,0,0,0,0,0,0,0,1,0,0


In [135]:
#creating new feature for age
new_age = {'[0-10)':0, 
          '[10-20)':10, 
          '[20-30)':20, 
          '[30-40)':30, 
          '[40-50)':40, 
          '[50-60)':50,
          '[60-70)':60, 
          '[70-80)':70, 
          '[80-90)':80, 
          '[90-100)':90}
diabetes['age_group'] = diabetes.age.replace(new_age)

In [136]:
cols_all_cat = list(df_cat.columns)

### Now let's look at numerical values

In [160]:
num_cols= diabetes.select_dtypes(include=['int', 'float'])
num_cols.isnull().sum().sort_values(ascending=False)

age_group             0
target                0
number_diagnoses      0
number_inpatient      0
number_emergency      0
number_outpatient     0
num_medications       0
num_procedures        0
num_lab_procedures    0
time_in_hospital      0
patient_nbr           0
encounter_id          0
dtype: int64

In [161]:
#making sure there is no '?' in variables.
for col in diabetes.columns:
    if diabetes[col].dtype == int:
         print(col,diabetes[col][diabetes[col] == '?'].count())

encounter_id 0
patient_nbr 0
time_in_hospital 0
num_lab_procedures 0
num_procedures 0
num_medications 0
number_outpatient 0
number_emergency 0
number_inpatient 0
number_diagnoses 0
target 0


AttributeError: 'DataFrame' object has no attribute 'dtype'

In [162]:
# I will not use encounter ID and patient_nbr since these are just patient identifiers. 
# I will create seperate features for admission source and type and discharge disposition.

In [163]:
new_num_cols= ['number_diagnoses', 'number_inpatient', 'number_emergency', 'number_outpatient', 'num_medications',
               'num_procedures', 'num_lab_procedures', 'time_in_hospital']

### Based on the seperate file for mapping admission_type_id, discharge_disposition and admission_source_id, there are some patients whose dispositions are expired or are on hospice care. We will drop those since there is no chance of a patient being readmitted. 

In [147]:
diabetes= diabetes.loc[~diabetes.discharge_disposition_id.isin([11,13,14,19,20,21])]

In [148]:
diabetes.discharge_disposition_id.unique()

array(['25', '1', '3', '6', '2', '5', '11', '7', '10', '4', '14', '18',
       '8', '13', '12', '16', '17', '22', '23', '9', '20', '15', '24',
       '28', '19', '27'], dtype=object)

In [149]:
#creating the new dataframe of interesting columns
extra_feat= ['age_group']
new_columns = extra_feat + cols_all_cat + new_num_cols
df_data = diabetes[new_columns + ['target']]

In [151]:
df_data.head()

Unnamed: 0,age_group,race_Asian,race_Asian.1,race_Caucasian,race_Caucasian.1,race_Hispanic,race_Hispanic.1,race_Other,race_Other.1,gender_Male,...,admission_source_id_9,number_diagnoses,number_inpatient,number_emergency,number_outpatient,num_medications,num_procedures,num_lab_procedures,time_in_hospital,target
0,0,0,0,1,1,0,0,0,0,0,...,0,1,0,0,0,1,0,41,1,0
1,10,0,0,1,1,0,0,0,0,0,...,0,9,0,0,0,18,0,59,3,0
2,20,0,0,0,0,0,0,0,0,0,...,0,6,1,0,2,13,5,11,2,0
3,30,0,0,1,1,0,0,0,0,1,...,0,7,0,0,0,16,1,44,2,0
4,40,0,0,1,1,0,0,0,0,1,...,0,5,0,0,0,8,0,51,1,0


In [None]:
#Trying out our data

In [152]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [153]:
X = df_data.drop(['target'],axis=1)
Y = df_data['target']
print(X.shape)
print(Y.shape)

(101766, 229)
(101766,)


In [154]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((71236, 229), (30530, 229), (71236,), (30530,))

In [155]:
#Logistic Regression
m1=LogisticRegression()
m1.fit(X_train,y_train)
y_pred_lr=m1.predict(X_test)
Train_Score_lr = m1.score(X_train,y_train)
Test_Score_lr = accuracy_score(y_test,y_pred_lr)


print('Training Accuracy is:',Train_Score_lr)
print('Testing Accuracy is:',Test_Score_lr)
print(classification_report(y_test,y_pred_lr))



Training Accuracy is: 0.8876410803526307
Testing Accuracy is: 0.8890271863740583
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     27159
           1       0.45      0.02      0.04      3371

    accuracy                           0.89     30530
   macro avg       0.67      0.51      0.49     30530
weighted avg       0.84      0.89      0.84     30530



In [156]:
m2 = KNeighborsClassifier()
m2.fit(X_train,y_train)
y_pred_knn = m2.predict(X_test)
Train_Score_knn = m2.score(X_train,y_train)
Test_Score_knn = accuracy_score(y_test,y_pred_knn)

print('Training Accuracy is :',Train_Score_knn)
print('Testing Accuracy is:',Test_Score_knn)
print(classification_report(y_test,y_pred_knn))

Training Accuracy is : 0.8935650513785165
Testing Accuracy is: 0.881035047494268
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     27159
           1       0.22      0.03      0.05      3371

    accuracy                           0.88     30530
   macro avg       0.55      0.51      0.49     30530
weighted avg       0.82      0.88      0.84     30530



In [157]:
m3=BernoulliNB()
m3.fit(X_train,y_train)
y_pred_bnb=m3.predict(X_test)
Train_Score_bnb = m3.score(X_train,y_train)
Test_Score_bnb = accuracy_score(y_test,y_pred_bnb)

print('Training Accuracy :',Train_Score_bnb)
print('Testing Accuracy  :',Test_Score_bnb)
print(classification_report(y_test,y_pred_bnb))

Training Accuracy : 0.8775478690549722
Testing Accuracy  : 0.8786112020962987
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     27159
           1       0.26      0.05      0.09      3371

    accuracy                           0.88     30530
   macro avg       0.58      0.52      0.51     30530
weighted avg       0.82      0.88      0.84     30530

