In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

%matplotlib inline

In [2]:
# Load the raw data from a path relative to the current working directory.
df = pd.read_csv('data/diabetic_data.csv')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning
df.shape

(101766, 50)

In [4]:
# list every column name (the df.head() preview above elides the middle columns)
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [5]:
# look at a count of missing (?) values in the dataset
# Each printed line is: <column> <number of '?' entries> <percentage of all rows>.
n_rows = len(df)  # take the denominator from the frame itself instead of hard-coding 101766

for col in df.columns:
    # only string-typed (object) columns can hold the '?' placeholder
    if df[col].dtype == object:
        n_missing = int((df[col] == '?').sum())  # computed once, reused for count and percentage
        if n_missing != 0:
            print(col, n_missing, (n_missing / n_rows) * 100)

# gender was coded differently ('Unknown/Invalid' instead of '?') so we use a custom count for this one
n_gender_missing = int((df['gender'] == 'Unknown/Invalid').sum())
print('gender', n_gender_missing, (n_gender_missing / n_rows) * 100)

race 2273 2.2335554114340743
weight 98569 96.85847925633315
payer_code 40256 39.5574160328597
medical_specialty 49949 49.08220820313268
diag_1 21 0.02063557573256294
diag_2 358 0.3517874339170253
diag_3 1423 1.398305917497003
gender 3 0.002947939390366134


Weight is missing in over 96% of the records in the dataset. We should drop this column entirely, since very few records actually contain a value for it.

Also, Payer Code and Medical Specialty of the physician are missing 40-50% of the time. I am going to drop these columns from the dataset as well, although I may revisit this in the future since it would be interesting to see the effect that these values have on readmissions.

In [6]:
# dropping weight, payer_code, and medical_specialty from the dataset
# (reassign the result instead of mutating in place, so the step is an explicit
# new binding of df rather than a hidden side effect)
df = df.drop(columns=['weight', 'payer_code', 'medical_specialty'])

# check the column count
df.shape

(101766, 47)

In [7]:
# keep only the records whose gender is not coded as 'Unknown/Invalid'
is_known_gender = df['gender'].ne('Unknown/Invalid')
df = df.loc[is_known_gender]

# check the row count
df.shape

(101763, 47)

In [8]:
def diag_missing_label(row):
    """Flag a record whose three diagnosis fields are all the '?' placeholder.

    `row` is indexed by the keys 'diag_1', 'diag_2' and 'diag_3'.
    Returns 1 when all three values equal '?', otherwise 0.
    """
    diag_fields = ('diag_1', 'diag_2', 'diag_3')
    # all() stops at the first field that is not '?', like a chained `and`
    all_missing = all(row[field] == '?' for field in diag_fields)
    return int(all_missing)

In [9]:
# flag records where all three diag attributes are missing ('?'): 1 if so, else 0
# (same result as applying diag_missing_label to every row, but computed column-wise;
# assign() returns a new frame instead of writing a column into a filtered frame)
diag_cols = ['diag_1', 'diag_2', 'diag_3']
df = df.assign(diag_missing_label=(df[diag_cols] == '?').all(axis=1).astype('int64'))

In [10]:
# how many records are flagged (1 = all three diag attributes missing) vs. not flagged (0)
df['diag_missing_label'].value_counts()

0    101762
1         1
Name: diag_missing_label, dtype: int64

In [11]:
# keep only the records that are not flagged as having every diag attribute missing
has_some_diag = df['diag_missing_label'].ne(1)
df = df.loc[has_some_diag]

df.shape

(101762, 48)

In [12]:
# number of records per discharge disposition code
df.discharge_disposition_id.value_counts()

1     60232
3     13954
6     12902
18     3691
2      2128
22     1992
11     1642
5      1184
25      989
4       815
7       623
23      412
13      398
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: discharge_disposition_id, dtype: int64

In [13]:
# exclude records whose discharge disposition is 11 (patient died)
not_expired = df['discharge_disposition_id'].ne(11)
df = df.loc[not_expired]

df.shape

(100120, 48)

In [14]:
# are there multiple records per patient id?
# compare the number of distinct patient_nbr values with the row count from df.shape above
df.patient_nbr.nunique()

70442

In [16]:
# drop duplicates of patient id to match assumption of logistic regression model
# keep='first' (the pandas default, spelled out here) retains each patient's first
# record in the current row order; the result is reassigned rather than mutated in place
df = df.drop_duplicates(subset='patient_nbr', keep='first')

df.shape

In [22]:
def get_unique_values(df):
    """Print one line per column of `df`: the column name and its number of distinct values.

    Nothing is returned; the report goes to standard output.
    """
    for name, values in df.items():
        print(str(name), values.nunique())

In [23]:
# print the number of distinct values in every column of the cleaned frame
get_unique_values(df)

encounter_id 70442
patient_nbr 70442
race 6
gender 2
age 10
admission_type_id 8
discharge_disposition_id 25
admission_source_id 17
time_in_hospital 14
num_lab_procedures 116
num_procedures 7
num_medications 75
number_outpatient 33
number_emergency 18
number_inpatient 13
diag_1 696
diag_2 725
diag_3 758
number_diagnoses 16
max_glu_serum 4
A1Cresult 4
metformin 4
repaglinide 4
nateglinide 4
chlorpropamide 4
glimepiride 4
acetohexamide 2
glipizide 4
glyburide 4
tolbutamide 2
pioglitazone 4
rosiglitazone 4
acarbose 3
miglitol 4
troglitazone 2
tolazamide 2
examide 1
citoglipton 1
insulin 4
glyburide-metformin 4
glipizide-metformin 2
glimepiride-pioglitazone 1
metformin-rosiglitazone 2
metformin-pioglitazone 2
change 2
diabetesMed 2
readmitted 3
diag_missing_label 1


There are four columns that only have 1 value for all records. These cannot provide any reliable information for predicting readmission, so let's drop the following columns:

examide, citoglipton, glimepiride-pioglitazone, diag_missing_label (which was created above)

In [24]:
# drop the four columns that only have 1 value for all records
# (reassign the result instead of using inplace=True on a frame derived by filtering)
single_value_cols = ['examide', 'citoglipton', 'glimepiride-pioglitazone', 'diag_missing_label']
df = df.drop(columns=single_value_cols)

df.shape

(70442, 44)