In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

%matplotlib inline

In [23]:
df = pd.read_csv('data/diabetic_data.csv')
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [24]:
df.shape

(101766, 50)

In [25]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [26]:
# look at a count of missing (?) values in the dataset
for col in df.columns:
    if df[col].dtype == object:
        if df[col][df[col] == '?'].count() != 0:
            print(col,df[col][df[col] == '?'].count(),(df[col][df[col] == '?'].count()/101766)*100)
            
# gender was coded differently so we use a custom count for this one
print('gender', df['gender'][df['gender'] == 'Unknown/Invalid'].count(),
      (df['gender'][df['gender'] == 'Unknown/Invalid'].count()/101766)*100)

race 2273 2.2335554114340743
weight 98569 96.85847925633315
payer_code 40256 39.5574160328597
medical_specialty 49949 49.08220820313268
diag_1 21 0.02063557573256294
diag_2 358 0.3517874339170253
diag_3 1423 1.398305917497003
gender 3 0.002947939390366134


Weight is missing in over 96% of records in the dataset. Probably should drop this entire column since there are very few records that actually have data.

Also, Payer Code and Medical Specialty of the physician are missing 40-50% of the time. I am going to drop these columns from the dataset as well, although I may revisit this in the future since it would be interesting to see the effect that these values have on readmissions.

In [27]:
# dropping weight, payer_code, and medical_specialty from the dataset
df.drop(['weight', 'payer_code', 'medical_specialty'], axis = 1, inplace=True)

# check the row count
df.shape

(101766, 47)

In [28]:
# dropping records where gender is missing
df = df[df.gender != Unknown/Invalid]

# check the row count
df.shape

(101766, 47)