In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Show every column when a wide DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

#Load the diabetes data; the path is relative to the notebook's working directory.
df = pd.read_csv('diabetes_cleaned.csv')

In [2]:
#Index labels of the three patients with Gender not specified
#(they're also missing race and they're all not readmitted which seems weird).
unknown_gender_rows = df.index[df['gender']=='Unknown/Invalid']

#Every column that is removed outright, grouped by the reason for removing it.
columns_to_discard = [
    #Identifier numbers have no information. Unnamed: 0 is just a redundant index column.
    'encounter_id','patient_nbr','Unnamed: 0',
    #So much missingness that they are dropped, per the authors.
    'weight','payer_code','medical_specialty',
    #Medication columns that are the same for all patients and have no information.
    'examide','citoglipton','glimepiride-pioglitazone',
    #Medication columns with very few, but not zero, patients with information (less than 30).
    'acetohexamide','tolbutamide','miglitol','troglitazone','tolazamide',
    'glipizide-metformin','metformin-rosiglitazone','metformin-pioglitazone',
]

#Remove the rows first, then the columns, and rebind df to the result.
df = df.drop(index=unknown_gender_rows).drop(columns=columns_to_discard)

#These medication columns have between 30-500 patients, so Steady, Up, and Down are grouped together as 1, and No as 0.
#chlorpropamide, acarbose, nateglinide, glyburide-metformin (treated with drug_dict)

#Clean up dictionaries, indicating which strings are coded to which numbers
gender_dict = {'Female':0,'Male':1}
#Each age bracket is coded as the lower bound of its decade.
age_dict = {'[0-10)':0,'[10-20)':10,'[20-30)':20,'[30-40)':30,'[40-50)':40,'[50-60)':50,
            '[60-70)':60,'[70-80)':70,'[80-90)':80,'[90-100)':90}
readmitted_dict = {'NO':0,'>30':1,'<30':2}
#Steady, Up and Down all collapse to 1; only No maps to 0.
drug_dict = {'No':0,'Steady':1,'Up':1,'Down':1}
change_dict = {'No':0,'Ch':1}
diabetesmed_dict = {'No':0,'Yes':1}

#Final dictionary indicating which columns to treat with which dictionary
cleanup_dict = {'gender':gender_dict,
                'age':age_dict,
                'chlorpropamide':drug_dict,
                'acarbose':drug_dict,
                'nateglinide':drug_dict,
                'glyburide-metformin':drug_dict,
                'readmitted':readmitted_dict,
                'change':change_dict,
                'diabetesMed':diabetesmed_dict}

#Replace the string/object values with numeric.
#cleanup_dict is a nested {column: {old: new}} mapping. The pandas documentation says `value`
#should not be specified for this form, and newer pandas versions honour an explicit value=None
#instead of treating it as "not given", so the argument is left out here.
df = df.replace(to_replace=cleanup_dict)

#Function to specify a given column, make some dummies and drop the most abundant category
def dummify_and_drop_max(df,column):
    """One-hot encode ``df[column]`` and leave out the most frequent category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode; it is not modified.
    column : str
        Name of the column to encode. It is also used as the prefix of the
        dummy column names (``<column>_<category>``).

    Returns
    -------
    pandas.DataFrame
        The dummy columns for every category except the one that comes first
        in ``value_counts()`` (the most abundant one), which acts as the
        reference level.
    """
    values = df[column]
    #value_counts() sorts by descending frequency, so the first label is the most abundant.
    most_common = values.value_counts().index[0]
    reference_column = '{}_{}'.format(column, most_common)
    return pd.get_dummies(data=values,prefix=column).drop(columns=[reference_column])

#Columns that get replaced by dummy (one-hot) columns
columns_to_dummify = [
    'race','admission_type_id','discharge_disposition_id','admission_source_id',
    'max_glu_serum','A1Cresult','metformin','repaglinide','glimepiride',
    'glipizide','glyburide','pioglitazone','rosiglitazone','insulin',
]

#Build the dummies for each listed column, then swap them in with a single concat:
#the original columns are removed so there's no redundant information, and the dummy
#columns are appended to the end of the dataframe in the order of the list above.
dummy_frames = [dummify_and_drop_max(df,column) for column in columns_to_dummify]
df = pd.concat([df.drop(columns=columns_to_dummify)]+dummy_frames,axis=1)

In [3]:
#Print the column names, non-null counts and dtypes of the transformed dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71047 entries, 0 to 71049
Data columns (total 99 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       71047 non-null  int64 
 1   age                          71047 non-null  int64 
 2   time_in_hospital             71047 non-null  int64 
 3   num_lab_procedures           71047 non-null  int64 
 4   num_procedures               71047 non-null  int64 
 5   num_medications              71047 non-null  int64 
 6   number_outpatient            71047 non-null  int64 
 7   number_emergency             71047 non-null  int64 
 8   number_inpatient             71047 non-null  int64 
 9   diag_1                       71047 non-null  object
 10  diag_2                       71047 non-null  object
 11  diag_3                       71047 non-null  object
 12  number_diagnoses             71047 non-null  int64 
 13  nateglinide                  71

In [4]:
#Write the transformed dataframe to a CSV in the working directory.
#NOTE(review): no index= argument is passed, so pandas also writes the row index as an unnamed
#first column; a plain read_csv of this file would show it as 'Unnamed: 0'. Consider index=False
#if no downstream reader relies on that column - TODO confirm.
df.to_csv('diabetes_cleaned_numeric.csv')