In [1]:
import pandas as pd
import numpy as np

In [2]:
diabetic = pd.read_csv('diabetic_data.csv')

In [3]:
diabetic_df = diabetic.replace('?', np.nan)
diabetic_df = diabetic_df.sort_values('encounter_id')
diabetic_df.drop_duplicates(subset = ['patient_nbr'], keep = 'first', inplace = True)
diabetic_df = diabetic_df[~diabetic_df['discharge_disposition_id'].isin([11,13,14,19,20,21])]
diabetic_df.drop(diabetic_df.loc[diabetic_df.gender=='Unknown/Invalid'].index,inplace=True)

In [4]:
diabetic_df.drop(['encounter_id','patient_nbr','weight','medical_specialty','payer_code'],\
                 axis = 1, inplace = True)

In [5]:
diabetic_df['readmit_30d'] = (diabetic_df['readmitted'] == '<30')

In [6]:
diabetic_df['admission_type_id'] = diabetic_df['admission_type_id'].astype('str')
diabetic_df['admission_source_id'] = diabetic_df['admission_source_id'].astype('str')
diabetic_df['discharge_disposition_id'] = diabetic_df['discharge_disposition_id'].astype('str')

In [7]:
diabetic_df['age_num'] = diabetic_df['age'].replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [5,15,25,35,45,55,65,75,85,95])

In [8]:
diabetic_df['diag_1'] = diabetic_df['diag_1'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_2'] = diabetic_df['diag_2'].str.split('.', expand = True).drop(1, axis = 1)
diabetic_df['diag_3'] = diabetic_df['diag_3'].str.split('.', expand = True).drop(1, axis = 1)

In [9]:
count_1 = diabetic_df.diag_1.value_counts()
index_1 = count_1[count_1>500].index.tolist()

In [10]:
count_2 = diabetic_df.diag_2.value_counts()
index_2 = count_2[count_2>500].index.tolist()

In [11]:
count_3 = diabetic_df.diag_3.value_counts()
index_3 = count_3[count_3>500].index.tolist()

In [12]:
diags = set(index_1+index_2+index_3)

In [13]:
for d in diags:
    diabetic_df[d+'_in_diag'] = ((diabetic_df['diag_1']==d)|
                                (diabetic_df['diag_2']==d)|
                                (diabetic_df['diag_3']==d))

In [14]:
for col in ['metformin','repaglinide','glimepiride','glipizide', 'glyburide','pioglitazone',
            'rosiglitazone','insulin']:
    diabetic_df[col+'_used'] = np.where(diabetic_df[col]=='No', 0, 1)

# 'nateglinide','glyburide-metformin' (close to 500)

In [15]:
diabetic_final = diabetic_df.drop(['age',
                                   'diag_1','diag_2','diag_3',
                                   'metformin','repaglinide','nateglinide','chlorpropamide','glimepiride',
                                   'acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone',
                                   'rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide',
                                   'citoglipton','insulin','glyburide-metformin','glipizide-metformin',
                                   'glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone',
                                   'readmitted'],
                                 axis = 1)

In [16]:
diabetic_final = pd.get_dummies(diabetic_final, drop_first = True)
len(diabetic_final.columns)

119

In [17]:
#Droping columns with less than 500 patients from admission source and discharge disposition.
cols_to_drop = ['admission_type_id_4',
                'admission_type_id_7',
                'admission_type_id_8',
                'discharge_disposition_id_10',
                'discharge_disposition_id_12',
                'discharge_disposition_id_15',
                'discharge_disposition_id_16',
                'discharge_disposition_id_17',
                'discharge_disposition_id_23',
                'discharge_disposition_id_24',
                'discharge_disposition_id_27',
                'discharge_disposition_id_28',
                'discharge_disposition_id_7',
                'discharge_disposition_id_8',
                'discharge_disposition_id_9',
                'admission_source_id_10',
                'admission_source_id_11',
                'admission_source_id_13',
                'admission_source_id_14',
                'admission_source_id_20',
                'admission_source_id_22',
                'admission_source_id_25',
                'admission_source_id_3',
                'admission_source_id_8',
                'admission_source_id_9']

diabetic_final.drop(columns=cols_to_drop,inplace=True)

In [18]:
diabetic_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69970 entries, 8 to 101765
Data columns (total 94 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   time_in_hospital             69970 non-null  int64
 1   num_lab_procedures           69970 non-null  int64
 2   num_procedures               69970 non-null  int64
 3   num_medications              69970 non-null  int64
 4   number_outpatient            69970 non-null  int64
 5   number_emergency             69970 non-null  int64
 6   number_inpatient             69970 non-null  int64
 7   number_diagnoses             69970 non-null  int64
 8   readmit_30d                  69970 non-null  bool 
 9   age_num                      69970 non-null  int64
 10  424_in_diag                  69970 non-null  bool 
 11  599_in_diag                  69970 non-null  bool 
 12  276_in_diag                  69970 non-null  bool 
 13  562_in_diag                  69970 non-null  

In [18]:
diabetic_final.reset_index(drop=True,inplace=True)

In [20]:
#Previous version without droping low patient numbers from admission type and discharge
#diabetic_final.to_csv('diabetes_cleaned_12-14-20.csv',index=False)

In [21]:
diabetic_final.to_csv('diabetes_cleaned_12-14-20B.csv',index=False)

In [19]:
118-25


93