In [1]:
import pandas as pd
import numpy as np

In [15]:
# Load the raw encounter table.
# Later cells expect the literal category "None" to survive loading (they
# select the one-hot columns 'A1Cresult_None' and 'max_glu_serum_None').
# pandas >= 2.0 treats "None" as a missing-value marker by default, which
# would silently remove that category, so the default NA markers are turned
# off and only empty fields are read as NaN. The '?' placeholder is converted
# to NaN in a later cell.
# NOTE(review): assumes the file marks missing data only with '?' or empty
# fields (no 'NA'/'null' style markers) - confirm against the raw file.
diabetic = pd.read_csv('diabetic_data.csv', keep_default_na=False, na_values=[''])

In [3]:
# Sanity check: (rows, columns) of the raw table as loaded.
print(diabetic.shape)

(101766, 50)


In [16]:
# Convert the '?' placeholder to NaN and order rows by encounter_id, so that
# keep='first' below retains the lowest encounter_id for each patient.
diabetic_df = diabetic.replace('?', np.nan).sort_values('encounter_id')

# One row per patient.
diabetic_df = diabetic_df.drop_duplicates(subset=['patient_nbr'], keep='first')
print(diabetic_df.shape)

# Exclude encounters with these discharge disposition codes.
# NOTE(review): presumably the death/hospice codes - confirm against the ID mapping.
excluded_dispositions = diabetic_df['discharge_disposition_id'].isin([11, 13, 14, 19, 20, 21])
diabetic_df = diabetic_df[~excluded_dispositions]
print(diabetic_df.shape)

# Remove rows whose gender is recorded as 'Unknown/Invalid'.
invalid_gender_rows = diabetic_df.index[diabetic_df['gender'] == 'Unknown/Invalid']
diabetic_df = diabetic_df.drop(index=invalid_gender_rows)
print(diabetic_df.shape)

(71518, 50)
(69973, 50)
(69970, 50)


In [17]:
# Remove the two identifier columns plus weight, medical_specialty and
# payer_code; none of them is used by the cells that follow.
diabetic_df = diabetic_df.drop(
    columns=[
        'encounter_id',
        'patient_nbr',
        'weight',
        'medical_specialty',
        'payer_code',
    ]
)

In [18]:
# Boolean target: True when 'readmitted' is exactly '<30'.
diabetic_df['readmit_30d'] = diabetic_df['readmitted'].eq('<30')

In [8]:
# Class balance of the target.
diabetic_df['readmit_30d'].value_counts()

False    63693
True      6277
Name: readmit_30d, dtype: int64

In [9]:
# Cast the integer ID codes to strings so that get_dummies later treats them
# as categories rather than as numeric features.
for id_col in ('admission_type_id', 'admission_source_id', 'discharge_disposition_id'):
    diabetic_df[id_col] = diabetic_df[id_col].astype(str)

In [7]:
# Map each 10-year age bracket label to the numeric midpoint of the bracket.
# An explicit mapping via Series.map produces a numeric column directly.
# The previous list-to-list replace() relied on pandas silently downcasting
# the object column to integers, which is deprecated; without it 'age_num'
# stays object-typed and would be one-hot encoded by get_dummies later.
# Any label not listed here becomes NaN instead of remaining a string.
age_midpoints = {
    '[0-10)': 5,
    '[10-20)': 15,
    '[20-30)': 25,
    '[30-40)': 35,
    '[40-50)': 45,
    '[50-60)': 55,
    '[60-70)': 65,
    '[70-80)': 75,
    '[80-90)': 85,
    '[90-100)': 95,
}
diabetic_df['age_num'] = diabetic_df['age'].map(age_midpoints)

In [10]:
# Keep only the part of each diagnosis code before the first '.'
# (e.g. '250.83' -> '250'); codes without a '.' are unchanged and missing
# values stay NaN.
# .str.split('.').str[0] is used instead of split(expand=True).drop(1, axis=1):
# the expand form raises KeyError when no value in the column contains a '.',
# and yields several columns when a value contains more than one.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    diabetic_df[diag_col] = diabetic_df[diag_col].str.split('.').str[0]

In [11]:
# Primary-diagnosis codes that occur in more than 500 rows.
count_1 = diabetic_df['diag_1'].value_counts()
index_1 = count_1.loc[count_1.gt(500)].index.tolist()

In [12]:
# Second-diagnosis codes that occur in more than 500 rows.
count_2 = diabetic_df['diag_2'].value_counts()
index_2 = count_2.loc[count_2.gt(500)].index.tolist()

In [13]:
# Third-diagnosis codes that occur in more than 500 rows.
count_3 = diabetic_df['diag_3'].value_counts()
index_3 = count_3.loc[count_3.gt(500)].index.tolist()

In [14]:
# Union of the frequent codes found in any of the three diagnosis columns.
diags = set(index_1).union(index_2, index_3)

In [13]:
# One boolean column per frequent diagnosis code: True when the code appears
# in diag_1, diag_2 or diag_3 of the row (missing diagnoses compare as False).
# The columns are named '<code>_in_diag' because the feature selection further
# down looks them up under that name; the former '<code>_diag' suffix made
# that lookup fail with a KeyError.
# Iterating in sorted order keeps the column order the same on every run.
for d in sorted(diags):
    diabetic_df[d + '_in_diag'] = (
        diabetic_df[['diag_1', 'diag_2', 'diag_3']].eq(d).any(axis=1)
    )

In [14]:
# Binary '<drug>_used' flag per medication: 0 when the recorded value is 'No',
# 1 for every other value.
flagged_medications = [
    'metformin',
    'repaglinide',
    'glimepiride',
    'glipizide',
    'glyburide',
    'pioglitazone',
    'rosiglitazone',
    'insulin',
]
for col in flagged_medications:
    not_prescribed = diabetic_df[col] == 'No'
    diabetic_df[col + '_used'] = np.where(not_prescribed, 0, 1)

# Not flagged: 'nateglinide', 'glyburide-metformin' (close to 500)

In [15]:
# Build the modelling frame by dropping the raw columns that have been
# replaced by derived features (age -> age_num, diag_* -> *_in_diag,
# medications -> *_used, readmitted -> readmit_30d) together with the
# remaining raw medication columns.
diabetic_final = diabetic_df.drop(
    columns=[
        # raw age bracket
        'age',
        # raw diagnosis codes
        'diag_1', 'diag_2', 'diag_3',
        # raw medication columns
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
        'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
        'miglitol', 'troglitazone', 'tolazamide', 'examide',
        'citoglipton', 'insulin', 'glyburide-metformin',
        'glipizide-metformin', 'glimepiride-pioglitazone',
        'metformin-rosiglitazone', 'metformin-pioglitazone',
        # raw target
        'readmitted',
    ]
)

In [16]:
# One-hot encode the remaining categorical columns, dropping the first level
# of each, then show how many columns result.
diabetic_final = pd.get_dummies(data=diabetic_final, drop_first=True)
diabetic_final.shape[1]

119

In [17]:
# Drop the one-hot columns for admission type, discharge disposition and
# admission source levels that, per the original note, have fewer than 500
# patients.
cols_to_drop = (
    ['admission_type_id_' + code for code in ('4', '7', '8')]
    + ['discharge_disposition_id_' + code
       for code in ('10', '12', '15', '16', '17', '23', '24', '27', '28', '7', '8', '9')]
    + ['admission_source_id_' + code
       for code in ('10', '11', '13', '14', '20', '22', '25', '3', '8', '9')]
)

diabetic_final = diabetic_final.drop(columns=cols_to_drop)

In [23]:
# Final feature set (plus the 'readmit_30d' target, kept last). The order of
# this list is the column order of the exported file.
cols_to_keep = (
    # diagnosis indicator columns
    [code + '_in_diag' for code in (
        '41', '250', '272', '278', '401', '403', '428', '434', '440',
        '486', '574', '577', '682', '707', '715', '722', '780', '786',
    )]
    + ['A1Cresult_None']
    + ['admission_source_id_' + code for code in ('5', '7', '17', '4')]
    + ['admission_type_id_' + code for code in ('3', '6')]
    + ['age_num', 'diabetesMed_Yes']
    + ['discharge_disposition_id_' + code
       for code in ('2', '3', '4', '5', '6', '18', '22', '25')]
    + [
        'max_glu_serum_None',
        'metformin_used',
        'number_diagnoses',
        'number_emergency',
        'number_inpatient',
        'num_lab_procedures',
        'pioglitazone_used',
        'time_in_hospital',
        'readmit_30d',
    ]
)
diabetic_final = diabetic_final.loc[:, cols_to_keep]

In [24]:
# Summarise the selected columns: dtype and non-null count for each.
diabetic_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69970 entries, 8 to 101765
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   41_in_diag                   69970 non-null  bool 
 1   250_in_diag                  69970 non-null  bool 
 2   272_in_diag                  69970 non-null  bool 
 3   278_in_diag                  69970 non-null  bool 
 4   401_in_diag                  69970 non-null  bool 
 5   403_in_diag                  69970 non-null  bool 
 6   428_in_diag                  69970 non-null  bool 
 7   434_in_diag                  69970 non-null  bool 
 8   440_in_diag                  69970 non-null  bool 
 9   486_in_diag                  69970 non-null  bool 
 10  574_in_diag                  69970 non-null  bool 
 11  577_in_diag                  69970 non-null  bool 
 12  682_in_diag                  69970 non-null  bool 
 13  707_in_diag                  69970 non-null  

In [25]:
# Replace the leftover row labels from the raw table with a fresh 0..n-1
# index; the old labels are discarded.
diabetic_final = diabetic_final.reset_index(drop=True)

In [20]:
#Previous version without dropping low patient numbers from admission type and discharge
#diabetic_final.to_csv('diabetes_cleaned_12-14-20.csv',index=False)

In [21]:
# Version from 12-14-20 with low patient numbers, but before paring down to the top 45 features
#diabetic_final.to_csv('diabetes_cleaned_12-14-20B.csv',index=False)

In [26]:
# Write the cleaned feature table to disk without the row index.
diabetic_final.to_csv(
    path_or_buf='diabetes_cleaned_12-15-20.csv',
    index=False,
)
