### Importing neccessary libraries

In [36]:
# Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


#### Data Collection and Exploration



In [37]:
data = pd.read_csv('diabetic_data.csv')
mapping = pd.read_csv('IDS_mapping.csv') # mapping for admission_type_id, discharge_disposition_id, admission_source_id
# we should map data cause numerical values do not reflect any order, 
# a model might infer that admission_type_id = 2 (Urgent) is "closer" to admission_type_id = 3 (Elective) 
# than to admission_type_id = 7 (Trauma Center), which is not meaningful.

In [38]:
# general summary
print(data.shape)  # Rows and columns
print(data.columns)

(101766, 50)
Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')


In [39]:
num_features = data.select_dtypes(include=['float64', 'int64']).columns
cat_features = data.select_dtypes(include=['object', 'category']).columns

In [40]:
num_data = data[num_features]
num_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
encounter_id,101766.0,165201600.0,102640300.0,12522.0,84961194.0,152388987.0,230270900.0,443867222.0
patient_nbr,101766.0,54330400.0,38696360.0,135.0,23413221.0,45505143.0,87545950.0,189502619.0
admission_type_id,101766.0,2.024006,1.445403,1.0,1.0,1.0,3.0,8.0
discharge_disposition_id,101766.0,3.715642,5.280166,1.0,1.0,1.0,4.0,28.0
admission_source_id,101766.0,5.754437,4.064081,1.0,1.0,7.0,7.0,25.0
time_in_hospital,101766.0,4.395987,2.985108,1.0,2.0,4.0,6.0,14.0
num_lab_procedures,101766.0,43.09564,19.67436,1.0,31.0,44.0,57.0,132.0
num_procedures,101766.0,1.33973,1.705807,0.0,0.0,1.0,2.0,6.0
num_medications,101766.0,16.02184,8.127566,1.0,10.0,15.0,20.0,81.0
number_outpatient,101766.0,0.3693572,1.267265,0.0,0.0,0.0,0.0,42.0


Features that contain missing values according to documentation


In [41]:
missing_features=['race', 'weight', 'payer_code','medical_specialty','diag_1', 'diag_2', 'diag_3','max_glu_serum', 'A1Cresult']
missing_info = data[missing_features].isnull().sum().sort_values(ascending=False)
print(missing_info/data.shape[0]) # of missing values 

for col in cat_features:
    print(data[col].value_counts(dropna=False)/data.shape[0])


max_glu_serum        0.947468
A1Cresult            0.832773
race                 0.000000
weight               0.000000
payer_code           0.000000
medical_specialty    0.000000
diag_1               0.000000
diag_2               0.000000
diag_3               0.000000
dtype: float64
race
Caucasian          0.747784
AfricanAmerican    0.188766
?                  0.022336
Hispanic           0.020017
Other              0.014799
Asian              0.006299
Name: count, dtype: float64
gender
Female             0.537586
Male               0.462384
Unknown/Invalid    0.000029
Name: count, dtype: float64
age
[70-80)     0.256156
[60-70)     0.220928
[50-60)     0.169565
[80-90)     0.168986
[40-50)     0.095169
[30-40)     0.037095
[90-100)    0.027445
[20-30)     0.016282
[10-20)     0.006790
[0-10)      0.001582
Name: count, dtype: float64
weight
?            0.968585
[75-100)     0.013128
[50-75)      0.008814
[100-125)    0.006142
[125-150)    0.001425
[25-50)      0.000953
[0-25)       0

In [42]:
# dropping features which has more than 85% of missing values or has only one value
data = data.drop(columns= ['max_glu_serum','A1Cresult','weight','citoglipton','examide'])
print(data.shape)

(101766, 45)


In [43]:
# Handling missing data
data['race'] = data['race'].replace('?', 'unknown') # we can not assume what is the race
data['payer_code'] = data['payer_code'].replace('?', 'unknown') 
data['medical_specialty'] = data['medical_specialty'].replace('?', 'unknown')




#### Feature Engineering

In [46]:
from sklearn.preprocessing import LabelEncoder

# Label encode the age ranges 
encoder = LabelEncoder()
data['age'] = encoder.fit_transform(data['age']) + 1
