## Cleaning and pre-processing the raw data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw encounter-level extract into memory.
# NOTE(review): no na_values is passed, so placeholder strings such as the "?"
# visible in the weight column of the preview are kept as literal text.
raw_csv_path = "diabetic_data.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first five rows.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
# Show the full list of column names; the head() preview elides the middle columns.
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [4]:
# Remove columns that are not carried into the cleaned dataset.
# NOTE(review): weight shows "?" in every previewed row; payer_code is presumably
# dropped for a similar reason — confirm.
# errors="ignore" makes this cell safe to re-run: without it, a second execution
# raises KeyError because the columns have already been removed from df.
df = df.drop(columns=["weight", "payer_code"], errors="ignore")

In [5]:
# Inspect the distinct target labels before recoding them.
df["readmitted"].unique()

array(['NO', '>30', '<30'], dtype=object)

In [6]:
# Collapse the three-level target into a binary label:
#   ">30" is folded into the existing "NO" class, "<30" becomes "YES".
# Values already equal to "NO" are left untouched.
readmitted_recode = {">30": "NO", "<30": "YES"}
df["readmitted"] = df["readmitted"].replace(readmitted_recode)

In [7]:
# Group each patient's rows together, ordered by encounter_id within the patient.
sort_keys = ["patient_nbr", "encounter_id"]
df = df.sort_values(sort_keys)

# Preview the reordered frame.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
4267,24437208,135,Caucasian,Female,[50-60),2,1,1,8,Cardiology,...,No,Steady,No,No,No,No,No,Ch,Yes,YES
4780,26264286,135,Caucasian,Female,[50-60),1,1,7,3,Surgery-Cardiovascular/Thoracic,...,No,No,No,No,No,No,No,Ch,Yes,NO
5827,29758806,378,Caucasian,Female,[50-60),3,1,1,2,Surgery-Neuro,...,No,No,No,No,No,No,No,No,No,NO
67608,189899286,729,Caucasian,Female,[80-90),1,3,7,4,InternalMedicine,...,No,No,No,No,No,No,No,No,Yes,NO
17494,64331490,774,Caucasian,Female,[80-90),1,1,7,3,InternalMedicine,...,No,No,No,No,No,No,No,Ch,Yes,NO


In [52]:
# Keep exactly one row per patient: the encounter with the lowest encounter_id.
# keep="first" picks whichever row happens to come first, so the ordering is
# established here instead of relying on a separate cell having been executed
# beforehand. This keeps the result deterministic under out-of-order execution.
df = (
    df.sort_values(by=["patient_nbr", "encounter_id"])
    .drop_duplicates(subset="patient_nbr", keep="first")
)

# Preview: each patient_nbr should now appear once.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
4267,24437208,135,Caucasian,Female,[50-60),2,1,1,8,Cardiology,...,No,Steady,No,No,No,No,No,Ch,Yes,YES
5827,29758806,378,Caucasian,Female,[50-60),3,1,1,2,Surgery-Neuro,...,No,No,No,No,No,No,No,No,No,NO
67608,189899286,729,Caucasian,Female,[80-90),1,3,7,4,InternalMedicine,...,No,No,No,No,No,No,No,No,Yes,NO
17494,64331490,774,Caucasian,Female,[80-90),1,1,7,3,InternalMedicine,...,No,No,No,No,No,No,No,Ch,Yes,NO
2270,14824206,927,AfricanAmerican,Female,[30-40),1,1,7,5,InternalMedicine,...,No,No,No,No,No,No,No,No,Yes,NO


In [53]:
# Discharge disposition codes whose encounters are excluded from the cleaned data.
# NOTE(review): presumably the expired / hospice codes from the dataset's ID
# mapping (such patients cannot be readmitted) — confirm against the mapping file.
excluded_discharge_ids = [11, 13, 14, 19, 20, 21]

is_excluded = df["discharge_disposition_id"].isin(excluded_discharge_ids)
df = df.loc[~is_excluded]

In [54]:
# Persist the cleaned frame; the row index is omitted from the file.
cleaned_csv_path = "cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)