In [1]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [2]:
# Load the raw dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("../data/raw/diabetic_data.csv")

## Data cleaning

In [4]:
# Missing values may also be encoded in other ways in this data; start by
# counting only the ones pandas itself recognises as null.

df.isnull().sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [5]:
# Columns that are almost entirely empty in the raw data.
mostly_empty = ["max_glu_serum", "A1Cresult"]

# Report the share of missing values for each of them.
for sparse_col in mostly_empty:
    column = df[sparse_col]
    pct_missing = (column.isnull().sum() * 100 / len(column)).round(2)
    print(f"{sparse_col} column is {pct_missing}% empty")

# Work on a copy so the raw frame stays untouched, and leave the
# mostly-empty columns out of it.
df_cleaned = df.copy().drop(columns=mostly_empty)

max_glu_serum column is 94.75% empty
A1Cresult column is 83.28% empty


In [6]:
# Split the working frame by dtype. Note that both results are DataFrames
# (sub-frames of df_cleaned), not lists of column names.
numeric_columns = df_cleaned.select_dtypes(include="number")
categorical_columns = df_cleaned.select_dtypes(include="object")

In [7]:
def replace_question_marks_with_nan(df):
    """Replace every ``'?'`` placeholder in ``df`` with ``np.nan``.

    The frame is modified in place and is also returned, so the call can be
    used either as a bare statement or in an assignment / chain. Previously
    the function returned the result of ``replace(..., inplace=True)``,
    which is always ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``'?'`` cells should become missing values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the replacement.
    """
    df.replace('?', np.nan, inplace=True)
    return df

# Apply the helper to the working copy; it uses replace(..., inplace=True),
# so df_cleaned itself is modified and no reassignment is needed.
replace_question_marks_with_nan(df_cleaned)

In [8]:
# Show which columns contain missing values before any rows are dropped.
# (A bare expression that is not the last line of a cell is never displayed,
# so the counts are printed explicitly.)
missing_before = df_cleaned.isnull().sum()
print(missing_before[missing_before > 0])

# NOTE(review): dropna() removes a row if ANY column is missing, including
# columns that a later cell deletes anyway (medical_specialty, payer_code,
# diag_1..diag_3). The final recorded shape is (1043, 39), versus well over
# 96,000 rows in the raw data, so consider dropping the unwanted columns
# first or passing subset=[...] here.
df_cleaned = df_cleaned.dropna()

# Confirm that no missing values remain.
df_cleaned.isnull().sum()

encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
weight                      0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
payer_code                  0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose  

In [9]:
# Count fully duplicated rows in the cleaned frame (the recorded result is 0).
df_cleaned.duplicated().sum()

0

In [12]:
# Columns excluded from the cleaned dataset:
#   - medical_specialty, payer_code: considered irrelevant here
#   - discharge_disposition_id, admission_source_id: left out to keep things simple
#   - diag_1, diag_2, diag_3: have a lot of unique values
#   - admission_type_id, encounter_id: also removed
columns_to_remove = [
    'medical_specialty',
    'payer_code',
    'discharge_disposition_id',
    'admission_source_id',
    'admission_type_id',
    'encounter_id',
    'diag_1',
    'diag_2',
    'diag_3',
]

df_cleaned = df_cleaned.drop(columns=columns_to_remove)



In [13]:
# Lower-case every string cell so categorical values are spelled uniformly;
# non-string cells pass through untouched.

def _lowercase_if_str(value):
    """Return ``value`` lower-cased when it is a string, otherwise unchanged."""
    return value.lower() if isinstance(value, str) else value

df_cleaned = df_cleaned.map(_lowercase_if_str)


In [None]:
# Number of distinct values in each remaining column.
df_cleaned.nunique()

In [14]:
# Save the cleaned frame as CSV.
# NOTE(review): to_csv writes the row index by default; after dropna() the
# index is no longer contiguous — confirm downstream readers expect that
# extra leading column, or pass index=False.

df_cleaned.to_csv(path_or_buf="../data/clean/df_cleaned.csv")

In [15]:
# Final dimensions (rows, columns) of the cleaned frame.
df_cleaned.shape

(1043, 39)