In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("diabetic_data.csv")

In [3]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
df.shape

(101766, 50)

In [5]:
df.drop(columns=['encounter_id'], inplace=True)

In [6]:
df.shape

(101766, 49)

In [7]:
missin_values = df.isnull().sum()

In [8]:
missin_values[missin_values>0]

race                  2273
weight               98569
payer_code           40256
medical_specialty    49949
diag_1                  21
diag_2                 358
diag_3                1423
max_glu_serum        96420
A1Cresult            84748
dtype: int64

In [9]:
df["max_glu_serum"]

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
101761    NaN
101762    NaN
101763    NaN
101764    NaN
101765    NaN
Name: max_glu_serum, Length: 101766, dtype: object

In [10]:
# replace all '?' with NAN
df = df.replace("?", np.nan)

In [11]:
df.isna().sum()

patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [12]:
df["readmitted"].dtype

dtype('O')

In [13]:
# replace categorical data '<30' with 1 and '>30'/NO => 0
def replace_readmitted_data(value):
    return 1 if value == "<30" else 0

In [14]:
df["readmitted"].map(lambda x: 1 if x == "<30" else 0)

0         0
1         0
2         0
3         0
4         0
         ..
101761    0
101762    0
101763    0
101764    0
101765    0
Name: readmitted, Length: 101766, dtype: int64

In [15]:
df

Unnamed: 0,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,8222157,Caucasian,Female,[0-10),,6,25,1,1,,...,No,No,No,No,No,No,No,No,No,NO
1,55629189,Caucasian,Female,[10-20),,1,1,7,3,,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,,...,No,No,No,No,No,No,No,No,Yes,NO
3,82442376,Caucasian,Male,[30-40),,1,1,7,2,,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,42519267,Caucasian,Male,[40-50),,1,1,7,1,,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,100162476,AfricanAmerican,Male,[70-80),,1,3,7,3,MC,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,74694222,AfricanAmerican,Female,[80-90),,1,4,5,5,MC,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,41088789,Caucasian,Male,[70-80),,1,1,7,1,MC,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,31693671,Caucasian,Female,[80-90),,2,3,7,10,MC,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [16]:
df.shape

(101766, 49)

In [17]:
# function to calculate % of missing columns
def calculate_percentage(num_missing, size):
    return (num_missing/size) * 100
    
def calculate_percentage_missing(frame):
    for column in frame.columns:
        percentage_missing = (frame[column].isna().sum() / len(frame)) * 100
        if  percentage_missing > 90:
            frame.drop(column, axis=1, inplace=True)
    return frame
        

In [18]:
calculate_percentage_missing(df)

Unnamed: 0,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,8222157,Caucasian,Female,[0-10),6,25,1,1,,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,55629189,Caucasian,Female,[10-20),1,1,7,3,,,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,,,...,No,No,No,No,No,No,No,No,Yes,NO
3,82442376,Caucasian,Male,[30-40),1,1,7,2,,,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,42519267,Caucasian,Male,[40-50),1,1,7,1,,,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,100162476,AfricanAmerican,Male,[70-80),1,3,7,3,MC,,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,74694222,AfricanAmerican,Female,[80-90),1,4,5,5,MC,,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,41088789,Caucasian,Male,[70-80),1,1,7,1,MC,,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,31693671,Caucasian,Female,[80-90),2,3,7,10,MC,Surgery-General,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [19]:
df.shape

(101766, 47)

In [20]:
# dropping null rows
df.dropna(axis=1)

Unnamed: 0,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,8222157,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,55629189,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,86047875,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,82442376,Male,[30-40),1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,42519267,Male,[40-50),1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,100162476,Male,[70-80),1,3,7,3,51,0,16,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,74694222,Female,[80-90),1,4,5,5,33,3,18,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,41088789,Male,[70-80),1,1,7,1,53,0,9,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,31693671,Female,[80-90),2,3,7,10,45,2,21,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [21]:
df.shape

(101766, 47)

In [22]:
# drop near zero-variance columns
cols_to_drop = ['repaglinide',
'nateglinide','chlorpropamide','glimepiride','acetohexamide','tolbutamide','acarbose','miglitol','troglitazone','tolazamide','examide','citoglipton','glyburide-metformin',
'glipizide-metformin','glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone']

In [23]:
df.drop(columns=cols_to_drop,axis=1,inplace=True)

In [32]:
df.shape

(101766, 30)

In [43]:
df.describe()

Unnamed: 0,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [44]:
# select only numerical columns
df.select_dtypes(include="int64")

Unnamed: 0,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,8222157,6,25,1,1,41,0,1,0,0,0,1
1,55629189,1,1,7,3,59,0,18,0,0,0,9
2,86047875,1,1,7,2,11,5,13,2,0,1,6
3,82442376,1,1,7,2,44,1,16,0,0,0,7
4,42519267,1,1,7,1,51,0,8,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
101761,100162476,1,3,7,3,51,0,16,0,0,0,9
101762,74694222,1,4,5,5,33,3,18,0,0,1,9
101763,41088789,1,1,7,1,53,0,9,1,0,0,13
101764,31693671,2,3,7,10,45,2,21,0,0,1,9


In [26]:
selected_numerical_columns

Unnamed: 0,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,8222157,6,25,1,1,41,0,1,0,0,0,1
1,55629189,1,1,7,3,59,0,18,0,0,0,9
2,86047875,1,1,7,2,11,5,13,2,0,1,6
3,82442376,1,1,7,2,44,1,16,0,0,0,7
4,42519267,1,1,7,1,51,0,8,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
101761,100162476,1,3,7,3,51,0,16,0,0,0,9
101762,74694222,1,4,5,5,33,3,18,0,0,1,9
101763,41088789,1,1,7,1,53,0,9,1,0,0,13
101764,31693671,2,3,7,10,45,2,21,0,0,1,9


In [27]:
# finding and removing outliers
def find_remove_outliers(frame):
    # loop through all columns
    # get the 1st and 3rd QTR
    # calculate the interquartile range
    df_filtered = None
    for column in frame.columns:
        Q1 = frame[column].quantile(0.25)
        Q3 = frame[column].quantile(0.75)
        IQR = Q3 - Q1
        
        # Defining bounds for the outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Filtering the DataFrame to remove outliers
        df_filtered = frame[(frame[column] >= lower_bound) & (frame[column] <= upper_bound)]
    return df_filtered

In [40]:
outliers_removed = find_remove_outliers(df.select_dtypes(include="int64"))

In [42]:
outliers_removed.sum()

patient_nbr                 5516653819528
admission_type_id                  205487
discharge_disposition_id           377300
admission_source_id                584028
time_in_hospital                   446509
num_lab_procedures                4373486
num_procedures                     136211
num_medications                   1627754
number_outpatient                   37531
number_emergency                    20093
number_inpatient                    64554
number_diagnoses                   754182
dtype: int64

In [30]:
# z score normalization

# This gives some wrong answers because I messed up something up there. 

df_standardized = (outliers_removed - outliers_removed.mean()) / outliers_removed.std()

In [31]:
df_standardized

Unnamed: 0,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
1,0.032833,-0.708872,-0.514692,0.306249,-0.468790,0.808271,-0.786651,0.241513,-0.291772,-0.212675,-0.503831,0.824541
2,0.819299,-0.708872,-0.514692,0.306249,-0.803698,-1.631010,2.143850,-0.374389,1.286149,-0.212675,0.288239,-0.752485
3,0.726080,-0.708872,-0.514692,0.306249,-0.803698,0.045996,-0.200551,-0.004848,-0.291772,-0.212675,-0.503831,-0.226809
4,-0.306121,-0.708872,-0.514692,0.306249,-1.138607,0.401724,-0.786651,-0.990291,-0.291772,-0.212675,-0.503831,-1.278160
5,0.731124,-0.017156,-0.514692,-0.923489,-0.468790,-0.614643,2.729950,-0.004848,-0.291772,-0.212675,-0.503831,0.824541
...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1.184228,-0.708872,-0.135934,0.306249,-0.468790,0.401724,-0.786651,-0.004848,-0.291772,-0.212675,-0.503831,0.824541
101762,0.525754,-0.708872,0.053444,-0.185646,0.201028,-0.513006,0.971649,0.241513,-0.291772,-0.212675,0.288239,0.824541
101763,-0.343105,-0.708872,-0.514692,0.306249,-1.138607,0.503361,-0.786651,-0.867111,0.497189,-0.212675,-0.503831,2.927242
101764,-0.586013,-0.017156,-0.135934,0.306249,1.875571,0.096814,0.385549,0.611054,-0.291772,-0.212675,0.288239,0.824541


In [45]:
type(None)

NoneType