In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy

In [2]:
dataset = pd.read_csv("diabetic_data.csv")

The target have unbalanced classes

In [3]:
dataset.readmitted.value_counts()

readmitted
NO     54864
>30    35545
<30    11357
Name: count, dtype: int64

Rplacing "?" with np.nan to handle easier missing values

In [4]:
dataset.replace("?", np.nan, inplace=True)

In [5]:
legend = pd.read_csv("IDS_mapping.csv")
legend

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
...,...,...
62,22,Transfer from hospital inpt/same fac reslt in...
63,23,Born inside this hospital
64,24,Born outside this hospital
65,25,Transfer from Ambulatory Surgery Center


First of all, we noticed that some patient had multiple visits encounters registered in the database, and this was also confirmed by the papers related to the dataset. So we decided to remove all the visits apart from the first to all the patients, to guarantee homogeneity.

In [6]:
dataset_sorted = dataset.sort_values(by='encounter_id')
dataset_first_visits = dataset_sorted.groupby('patient_nbr', as_index=False).nth(0)

In [7]:
dataset_first_visits.shape

(71518, 50)

In [8]:
dataset_first_visits[dataset_first_visits.patient_nbr == 1660293]

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
269,2967810,1660293,Caucasian,Female,[60-70),,6,25,7,10,...,No,Steady,No,No,No,No,No,No,Yes,>30


In [9]:
dataset_first_visits.readmitted.value_counts()

readmitted
NO     42985
>30    22240
<30     6293
Name: count, dtype: int64

Using all the visits related to one patient, trying to fill missing values of variable "race" on the first visit 

In [10]:
race_map = dataset.dropna(subset=['race']).drop_duplicates(subset='patient_nbr')[['patient_nbr', 'race']]
race_map = race_map.set_index('patient_nbr')['race']

dataset_first_visits['race'] = dataset_first_visits['race'].fillna(dataset_first_visits['patient_nbr'].map(race_map))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_first_visits['race'] = dataset_first_visits['race'].fillna(dataset_first_visits['patient_nbr'].map(race_map))


Because the problem persists, later we will drop the rows associated to missing values of race, because filling them with the most frequent value seems too biasing, and each other categorical variable showed a division of the races that reflected the total distribution for the variable

In [11]:
dataset_first_visits.race.isna().sum()

np.int64(1850)

After that, we were able to quantify the missing data in the dataset, but taking a particular attention to some variables, for example AC1results, for which we knew from the literature that some missing data were associated to patients for which there were such an high evidence of diabetes that the test was unnecessary. 
Notice that here we are not considering the missing data coming form the categorical variables such as admission_type_id, this problem will be handled later separately (for them, we will classify "NULL" and "Unknown" as missing values, "NULL" because it is an invalid value, "Unknown" because it seemed reasonable, also talking with the tutors).

In [12]:
nan_percentages = dataset_first_visits.isna().mean() * 100
print(nan_percentages)


encounter_id                 0.000000
patient_nbr                  0.000000
race                         2.586761
gender                       0.000000
age                          0.000000
weight                      96.010794
admission_type_id            0.000000
discharge_disposition_id     0.000000
admission_source_id          0.000000
time_in_hospital             0.000000
payer_code                  43.405856
medical_specialty           48.207444
num_lab_procedures           0.000000
num_procedures               0.000000
num_medications              0.000000
number_outpatient            0.000000
number_emergency             0.000000
number_inpatient             0.000000
diag_1                       0.015381
diag_2                       0.411085
diag_3                       1.712856
number_diagnoses             0.000000
max_glu_serum               95.167650
A1Cresult                   81.842333
metformin                    0.000000
repaglinide                  0.000000
nateglinide 

In [13]:
print(nan_percentages > 20)

encounter_id                False
patient_nbr                 False
race                        False
gender                      False
age                         False
weight                       True
admission_type_id           False
discharge_disposition_id    False
admission_source_id         False
time_in_hospital            False
payer_code                   True
medical_specialty            True
num_lab_procedures          False
num_procedures              False
num_medications             False
number_outpatient           False
number_emergency            False
number_inpatient            False
diag_1                      False
diag_2                      False
diag_3                      False
number_diagnoses            False
max_glu_serum                True
A1Cresult                    True
metformin                   False
repaglinide                 False
nateglinide                 False
chlorpropamide              False
glimepiride                 False
acetohexamide 

From the group of variables with a consistent number of missing data (threshold of 20%) the decision was to remove the variables "weight" and "payer_id", according to the papers and the previous analysis that used this dataset. The other variables were at first kept because of their potential significancy in the analysis ( also $max\_glu\_serum$ have a percentage of missing values over the threshold, it will be removed later in the code).

In [14]:
dataset_first_visits.drop(['weight', 'payer_code'], axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_first_visits.drop(['weight', 'payer_code'], axis=1, inplace=True)


### Dealing with admission\_source\_id, admission\_type\_id, discharge\_disposition\_id

The idea is, also to differentiate our pre processing a little from the work of (cita paper), to group the categories for these three variables as the paper done, including a variable "others" that will collect the Not Mapped values and others, but before, we wanted to remove from the dataset all the not frequent categories for each variable. The idea under our procedure is to aggregate in category "others" not too much heterogeneous information. 

In [15]:
def summary_categorical_variable(df, var_categorica, target="readmitted"):
    frequenze = pd.crosstab(df[var_categorica], df[target])

    frequenze['Totale'] = frequenze.sum(axis=1)

    colonne_ordinate = ['Totale'] + [col for col in frequenze.columns if col != 'Totale']
    frequenze = frequenze[colonne_ordinate]

    return frequenze


In [16]:
summary_admission_type_id = summary_categorical_variable(dataset_first_visits, "admission_type_id")


In [17]:
summary_admission_type_id

readmitted,Totale,<30,>30,NO
admission_type_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,36490,3262,11450,21778
2,13028,1149,4073,7806
3,13917,1143,3732,9042
4,9,1,2,6
5,3174,265,1039,1870
6,4588,452,1875,2261
7,21,0,0,21
8,291,21,69,201


In [18]:
def infrequent_categories(summary, threshold = 200, N=71518):
    infrequent_list = [] 
    for subcategory,tot in zip(summary.index, summary["Totale"]): 
        if tot < threshold:
            print(f"subcategory {subcategory} is infrequent")
            infrequent_list.append(subcategory)
    return infrequent_list

In [19]:
infrequent_categories(summary_admission_type_id)

subcategory 4 is infrequent
subcategory 7 is infrequent


[4, 7]

In [20]:
categorical_variables = dataset_first_visits.select_dtypes(include=['object', 'category']).columns.tolist()

In [21]:
categorical_variables.append('admission_type_id')
categorical_variables.append('discharge_disposition_id')
categorical_variables.append('admission_source_id')

In [22]:
infrequent_subcategories_for_cat_var = {}

In [23]:
for variable in categorical_variables: 
    print(f"\n Analyzing variable {variable}")
    summary_variable = summary_categorical_variable(dataset_first_visits, variable, target="readmitted")
    inf = infrequent_categories(summary_variable)
    if len(inf) >= 1:
        infrequent_subcategories_for_cat_var[variable] = inf


 Analyzing variable race

 Analyzing variable gender
subcategory Unknown/Invalid is infrequent

 Analyzing variable age
subcategory [0-10) is infrequent

 Analyzing variable medical_specialty
subcategory AllergyandImmunology is infrequent
subcategory Anesthesiology is infrequent
subcategory Anesthesiology-Pediatric is infrequent
subcategory Cardiology-Pediatric is infrequent
subcategory DCPTEAM is infrequent
subcategory Dentistry is infrequent
subcategory Dermatology is infrequent
subcategory Endocrinology is infrequent
subcategory Endocrinology-Metabolism is infrequent
subcategory Gynecology is infrequent
subcategory Hematology is infrequent
subcategory Hematology/Oncology is infrequent
subcategory Hospitalist is infrequent
subcategory InfectiousDiseases is infrequent
subcategory Neurology is infrequent
subcategory Neurophysiology is infrequent
subcategory Obsterics&Gynecology-GynecologicOnco is infrequent
subcategory Obstetrics is infrequent
subcategory Ophthalmology is infrequent
s

For example, for admission\_type\_id the categories with absolute frequency under 200 observations are

In [24]:
infrequent_subcategories_for_cat_var['admission_type_id']

[4, 7]

Here we count the invalid values for admission\_type\_id, discharge\_disposition\_id, admission\_source\_id, and decide to remove the observations related to them.

In [25]:
dataset_first_visits[(dataset_first_visits.admission_type_id == 5) | (dataset_first_visits.admission_type_id == 6)].shape[0]

7762

In [26]:
dataset_first_visits[(dataset_first_visits.discharge_disposition_id == 18) | (dataset_first_visits.discharge_disposition_id == 26)].shape[0]

2474

In [27]:
dataset_first_visits[(dataset_first_visits.admission_source_id == 17) | (dataset_first_visits.admission_source_id == 21)].shape[0]

4949

In [28]:
dataset_first_visits[(dataset_first_visits.admission_type_id == 5) | (dataset_first_visits.admission_type_id == 6) | (dataset_first_visits.discharge_disposition_id == 18) | (dataset_first_visits.discharge_disposition_id == 26) | (dataset_first_visits.admission_source_id == 17) | (dataset_first_visits.admission_source_id == 21)].shape[0]

10791

In [29]:
dataset_without_na = dataset_first_visits[~((dataset_first_visits.admission_type_id == 5) |
                                             (dataset_first_visits.admission_type_id == 6) |
                                               (dataset_first_visits.discharge_disposition_id == 18) |
                                                 (dataset_first_visits.discharge_disposition_id == 26) |
                                                   (dataset_first_visits.admission_source_id == 17) |
                                                     (dataset_first_visits.admission_source_id == 21))]

In [30]:
dataset_without_na.shape

(60727, 48)

Remove non frequent categories from admission_type_id, discharge_disposition_id and admission_source_id because they were considered not significative

In [31]:
dataset_without_infrequent_categories = dataset_without_na[~( (dataset_without_na['admission_type_id'].isin(infrequent_subcategories_for_cat_var['admission_type_id'])) |
                                      (dataset_without_na['discharge_disposition_id'].isin(infrequent_subcategories_for_cat_var['discharge_disposition_id'])) |
                                        (dataset_without_na['admission_source_id'].isin(infrequent_subcategories_for_cat_var['admission_source_id'])))]

In [32]:
dataset_without_infrequent_categories.shape

(60142, 48)

In [33]:
dataset_without_infrequent_categories[dataset_without_infrequent_categories.admission_type_id == 4]

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted


### Gathering categories in `admission_type_id`, `admission_source_id`, `discharge_disposition_id`

After removing the infrequent categories from these three variables, the next step is to **simplify their structure**, following the approach proposed by Beata Strack et al. (2014).

The main goal is to **minimize the number of categories** for each variable as much as possible, since a large number of categories can increase model complexity—especially when using techniques like `OneHotEncoder`.

In [34]:
#creation of a copy of the dataset up to now 
dataset_without_infrequent_categories_2 = dataset_without_infrequent_categories.copy()

Let's start with `discharge_disposition_id`.

First of all, patients classified as expired in the `discharge_disposition_id` variable, as well as those discharged to a hospice, are considered medical outliers and therefore need to be excluded from the analysis.

In [35]:
hospice = [13, 14]
dataset_without_infrequent_categories_2['discharge_disposition_id'] = dataset_without_infrequent_categories['discharge_disposition_id'].replace(dict.fromkeys(hospice, 'hospice'))

In [39]:
dataset_without_infrequent_categories_2 = dataset_without_infrequent_categories_2[~(dataset_without_infrequent_categories_2.discharge_disposition_id == 11)] 

In [40]:
dataset_without_infrequent_categories_2 = dataset_without_infrequent_categories_2[~(dataset_without_infrequent_categories_2.discharge_disposition_id == 'hospice')] 

Now, let's move finally to the recategorization part.

In [43]:
another_medical_facilities = [2,3,4,5,22,23]
dataset_without_infrequent_categories_2['discharge_disposition_id'] = dataset_without_infrequent_categories_2['discharge_disposition_id'].replace(dict.fromkeys(another_medical_facilities, 'another_medical_facility'))

In [44]:
dataset_without_infrequent_categories_2.discharge_disposition_id.value_counts()

discharge_disposition_id
1                           39806
another_medical_facility    11536
6                            7069
7                             366
25                             17
Name: count, dtype: int64

In [45]:
to_home = [1,6]
dataset_without_infrequent_categories_2['discharge_disposition_id'] = dataset_without_infrequent_categories_2['discharge_disposition_id'].replace(dict.fromkeys(to_home, 'to_home'))

In [46]:
dataset_without_infrequent_categories_2.discharge_disposition_id.value_counts()

discharge_disposition_id
to_home                     46875
another_medical_facility    11536
7                             366
25                             17
Name: count, dtype: int64

In [47]:
other = [7,25]
dataset_without_infrequent_categories_2['discharge_disposition_id'] = dataset_without_infrequent_categories_2['discharge_disposition_id'].replace(dict.fromkeys(other, 'other'))

Let's see for clarification the final structure of discharge`disposition_id`

In [48]:
dataset_without_infrequent_categories_2.discharge_disposition_id.value_counts()

discharge_disposition_id
to_home                     46875
another_medical_facility    11536
other                         383
Name: count, dtype: int64

Now let's move to `admission_source_id`

In [50]:
dataset_without_infrequent_categories_2.admission_source_id.value_counts()


admission_source_id
7    34456
1    19205
4     2126
6     1763
2      808
5      436
Name: count, dtype: int64

In [51]:
another_medical_facility_other = [4,5,6]
dataset_without_infrequent_categories_2['admission_source_id'] = dataset_without_infrequent_categories_2['admission_source_id'].replace(dict.fromkeys(another_medical_facility_other, 'another_medical_facility/other'))

In [52]:
referral = [1,2]
dataset_without_infrequent_categories_2['admission_source_id'] = dataset_without_infrequent_categories_2['admission_source_id'].replace(dict.fromkeys(referral, 'referral'))

In [53]:
emergency = [7]
dataset_without_infrequent_categories_2['admission_source_id'] = dataset_without_infrequent_categories_2['admission_source_id'].replace(dict.fromkeys(emergency, 'emergency'))

The final structure for the variable is:

In [54]:
dataset_without_infrequent_categories_2.admission_source_id.value_counts()


admission_source_id
emergency                         34456
referral                          20013
another_medical_facility/other     4325
Name: count, dtype: int64

Let's also to align `admission_type_id` with the format of the other variables. 

In [56]:
emergency = [1]
urgent = [2]
elective = [3]
other = [8]
dataset_without_infrequent_categories_2['admission_type_id'] = dataset_without_infrequent_categories_2['admission_type_id'].replace(dict.fromkeys(emergency, 'emergency'))
dataset_without_infrequent_categories_2['admission_type_id'] = dataset_without_infrequent_categories_2['admission_type_id'].replace(dict.fromkeys(urgent, 'urgent'))
dataset_without_infrequent_categories_2['admission_type_id'] = dataset_without_infrequent_categories_2['admission_type_id'].replace(dict.fromkeys(elective, 'elective'))
dataset_without_infrequent_categories_2['admission_type_id'] = dataset_without_infrequent_categories_2['admission_type_id'].replace(dict.fromkeys(other, 'other'))

The final structure for `admission_type_id` is:

In [57]:
dataset_without_infrequent_categories_2.admission_type_id.value_counts()


admission_type_id
emergency    33559
elective     12920
urgent       12026
other          289
Name: count, dtype: int64

Now let' move back to the variable `race`, removing the observations associated to the remaining missing values.

In [59]:
dataset_without_infrequent_categories_2.isna().sum()

encounter_id                    0
patient_nbr                     0
race                         1641
gender                          0
age                             0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
medical_specialty           27963
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          9
diag_2                        268
diag_3                       1008
number_diagnoses                0
max_glu_serum               58666
A1Cresult                   47767
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide     

In [61]:
def summary_categorical_variable_perc(df, var_categorica, target="readmitted"):
    frequenze = pd.crosstab(df[var_categorica], df[target], normalize='index') * 100
    frequenze['Totale'] = 100
    colonne_ordinate = ['Totale'] + [col for col in frequenze.columns if col != 'Totale']
    frequenze = frequenze[colonne_ordinate]
    return frequenze

In [62]:
dataset_without_infrequent_categories_2 = dataset_without_infrequent_categories_2.dropna(subset=['race'])

In [64]:
dataset_without_infrequent_categories_2.isna().sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
medical_specialty           27099
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          9
diag_2                        250
diag_3                        952
number_diagnoses                0
max_glu_serum               57025
A1Cresult                   46464
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide     

### Handling missing values for influential variables (`A1cResult` and `medical_specialty`)

Based on the work of *Beata Strack et al.* (2014), we decide to handle missing values for the HbA1c test and for medical specialty differently than before, as these variables are considered strong predictors.

Specifically, we do not drop any missing values; instead, we replace them with a dedicated `missing` category.

The high number of missing values of `A1cResult` may be due to the fact that in certain cases of hospitalization for critical conditions, the test was not conducted because it was not considered a priority.

Another reasonable hypothesis is that some missing data are associated to patients for which there is such an high evidence of diabetes that the test is unnecessary.

In [65]:
dataset_without_infrequent_categories_2['medical_specialty'] = dataset_without_infrequent_categories_2['medical_specialty'].fillna('missing')

In [66]:
dataset_without_infrequent_categories_2['A1Cresult'] = dataset_without_infrequent_categories_2['A1Cresult'].fillna('missing')

In [67]:
dataset_without_infrequent_categories_2.A1Cresult.value_counts()

A1Cresult
missing    46464
>8          4999
Norm        3296
>7          2394
Name: count, dtype: int64

In [68]:
dataset_without_infrequent_categories_2.medical_specialty.value_counts()

medical_specialty
missing                   27099
InternalMedicine           9050
Emergency/Trauma           4344
Family/GeneralPractice     3528
Cardiology                 3206
                          ...  
Dermatology                   1
SportsMedicine                1
Perinatology                  1
Neurophysiology               1
Resident                      1
Name: count, Length: 69, dtype: int64

For `max_glu_serum`, we decide to drop the variable as it has more than $90 \%$ of N.A. 

In [None]:
dataset_without_infrequent_categories_3 = dataset_without_infrequent_categories_2.drop('max_glu_serum', axis=1)

### Dealing with missing values of `diag_1`, `diag_2`, `diag_3`

Variable `diag_1` presents at this point a small amount of N.A., so we drop them.

Moreover, we decide to drop `diag_2` and `daig_3` and to keep only the information deriving from the principal diagnosis.

In [None]:
dataset_without_infrequent_categories_3[dataset_without_infrequent_categories_3['diag_1'].isna()]

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1006,7599132,9946782,Caucasian,Male,[80-90),elective,another_medical_facility,another_medical_facility/other,3,Urology,...,No,No,No,No,No,No,No,No,No,>30
1267,8927178,520452,Caucasian,Male,[60-70),elective,to_home,referral,3,Urology,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
1488,10122996,3650130,AfricanAmerican,Male,[60-70),elective,to_home,referral,5,Urology,...,No,No,No,No,No,No,No,Ch,Yes,NO
3197,20095914,916947,AfricanAmerican,Male,[70-80),elective,another_medical_facility,referral,1,Urology,...,No,Steady,No,No,No,No,No,No,Yes,>30
37693,117010956,25300467,Hispanic,Male,[40-50),emergency,another_medical_facility,referral,4,missing,...,No,Steady,No,No,No,No,No,Ch,Yes,<30
57058,163172034,27758448,Caucasian,Female,[80-90),emergency,to_home,referral,2,missing,...,No,Steady,No,No,No,No,No,No,Yes,NO
57737,164326842,39688524,AfricanAmerican,Male,[40-50),elective,other,emergency,3,Emergency/Trauma,...,No,Down,No,No,No,No,No,Ch,Yes,NO
60314,169067490,59785542,AfricanAmerican,Female,[80-90),emergency,to_home,emergency,8,missing,...,No,Up,No,No,No,No,No,Ch,Yes,NO
86018,273014598,113173146,Hispanic,Male,[60-70),emergency,another_medical_facility,referral,4,InternalMedicine,...,No,No,No,No,No,No,No,No,Yes,<30


In [None]:
dataset_without_infrequent_categories_3.dropna(subset=['diag_1'], inplace=True)

In [None]:
dataset_without_infrequent_categories_4 = dataset_without_infrequent_categories_3.drop(['diag_2','diag_3'], axis=1)

At this point, handling of missing values is concluded.

In [75]:
dataset_without_infrequent_categories_4.isna().sum()

encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
number_diagnoses            0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide

In [76]:
dataset_without_infrequent_categories_4.shape

(57144, 45)

### Recategorization of `medical_specialy` and `diag_1`

This part is done following *Beata Strack et al.* (2014), where principal diagnoses (or principal diagnoses) are grouped in classes.

The main goal is to **simplify the structure** of these variables as much as possible, in a logical and coherent way.

In [77]:
dataset_without_infrequent_categories_4['medical_specialty'] = dataset_without_infrequent_categories_4['medical_specialty'].apply(
    lambda x: 'Surgery' if x.startswith('Sur') else x
)

In [78]:
dataset_without_infrequent_categories_4.medical_specialty.value_counts()

medical_specialty
missing                                 27096
InternalMedicine                         9049
Emergency/Trauma                         4343
Family/GeneralPractice                   3528
Cardiology                               3206
Surgery                                  3199
Orthopedics                               896
Radiologist                               800
Nephrology                                667
Orthopedics-Reconstructive                632
ObstetricsandGynecology                   495
Psychiatry                                475
Pulmonology                               473
Urology                                   413
Gastroenterology                          332
Oncology                                  202
Neurology                                 162
Pediatrics                                155
PhysicalMedicineandRehabilitation         146
Pediatrics-Endocrinology                  133
Endocrinology                              86
Otolaryngology  

In [79]:
categorie_mantenute = ['missing', 'Surgery', 'Cardiology', 'Emergency/Trauma','Family/GeneralPractice','InternalMedicine' ]

# Ricategorizzazione: tutto ciò che non è in categorie_mantenute diventa 'other'
dataset_without_infrequent_categories_4['medical_specialty'] = dataset_without_infrequent_categories_4['medical_specialty'].apply(
    lambda x: x if x in categorie_mantenute else 'other'
)


In [80]:
dataset_without_infrequent_categories_4.medical_specialty.value_counts()

medical_specialty
missing                   27096
InternalMedicine           9049
other                      6723
Emergency/Trauma           4343
Family/GeneralPractice     3528
Cardiology                 3206
Surgery                    3199
Name: count, dtype: int64

Now for `diag_1`

In [82]:
circulatory = [i for i in range(390,460)]
circulatory.append(785)
circulatory_str = [str(num) for num in circulatory]

In [83]:
respiratory = [i for i in range(460,520)]
respiratory.append(786)
respiratory_str = [str(num) for num in respiratory]

In [84]:
digestive = [i for i in range(520,580)]
digestive.append(787)
digestive_str = [str(num) for num in digestive]

In [85]:
injury = [i for i in range(800,1000)]
injury_str = [str(num) for num in injury]

In [86]:
muscoloskeletal = [i for i in range(710,740)]
muscoloskeletal_str = [str(num) for num in muscoloskeletal]

In [87]:
genitourinary = [i for i in range(580,630)]
genitourinary.append(788)
genitourinary_str = [str(num) for num in genitourinary]

In [88]:
neoplasms = [i for i in range(140,240)]

neoplasms_str = [str(num) for num in neoplasms]

In [89]:
dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].apply(
    lambda x: 'diabetes' if x.startswith('250') else x
)

In [90]:
dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].replace(dict.fromkeys(circulatory_str, 'circulatory'))
dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].replace(dict.fromkeys(respiratory_str, 'respiratory'))
dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].replace(dict.fromkeys(digestive_str, 'digestive'))
dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].replace(dict.fromkeys(injury_str, 'injury'))
dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].replace(dict.fromkeys(muscoloskeletal_str, 'muscoloskeletal'))
dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].replace(dict.fromkeys(genitourinary_str, 'genitourinary'))
dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].replace(dict.fromkeys(neoplasms_str, 'neoplasms'))

In [91]:
categorie_mantenute_diag_1 = ['circulatory', 'respiratory', 'digestive', 'injury','muscoloskeletal','genitourinary', 'neoplasms', 'diabetes' ]

dataset_without_infrequent_categories_4['diag_1'] = dataset_without_infrequent_categories_4['diag_1'].apply(
    lambda x: x if x in categorie_mantenute_diag_1 else 'other'
)


In [92]:
dataset_without_infrequent_categories_4.diag_1.value_counts()

diag_1
circulatory        17448
other               9853
respiratory         7640
digestive           5308
diabetes            4761
injury              3881
muscoloskeletal     3233
genitourinary       2898
neoplasms           2122
Name: count, dtype: int64

In [93]:
dataset_without_infrequent_categories_4.shape

(57144, 45)

## Handling of diabetes treatments features

The second half of the dataset includes a collection of potential **diabetes treatments**, indicating whether each treatment is prescribed to the patient and, if so, whether the dosage is kept steady, increased, or reduced. Moreover, an additional binary variable indicates if at least one of the treatments is prescribed to the patient.

Our idea is to condense all of these information in 3 numerical variables: 
- `n_treatments` indicating the total number of treatments for each patient
- `rate_up_treatments` indicating the number of time a dosage has been increased
- `rate_down_treatments` indicating the number of time a dosage has been decreased

In [94]:
dataset_without_infrequent_categories_4.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'number_diagnoses',
       'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

Let's collect all the variables related to the treatments.

In [97]:
drugs_col = dataset_without_infrequent_categories_4.columns[19:19+23]

In [98]:
drugs_col

Index(['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone'],
      dtype='object')

Before creating the synthetic variables, we analyze each treatment by computing the **entropy** of them.

The idea is to see if there is any treatment worth keeping in the dataset.

In [99]:
H = []
drugs_col = dataset_without_infrequent_categories_4.columns[19:19+23]
for col in drugs_col:
    freq = dataset_without_infrequent_categories_4[col].value_counts(normalize=True)
    h = entropy(freq)
    print(f"Entropy of {col}: {h}")
    H.append(h)


Entropy of metformin: 0.5927167477128397
Entropy of repaglinide: 0.08283696738545238
Entropy of nateglinide: 0.04914283084201372
Entropy of chlorpropamide: 0.005922468270788561
Entropy of glimepiride: 0.23837750616613848
Entropy of acetohexamide: 0.00020917893264309475
Entropy of glipizide: 0.43633117338981814
Entropy of glyburide: 0.3827303209959923
Entropy of tolbutamide: 0.0018393658065174786
Entropy of pioglitazone: 0.2897150304136706
Entropy of rosiglitazone: 0.25849081712838823
Entropy of acarbose: 0.01882049934026931
Entropy of miglitol: 0.0029888646119657827
Entropy of troglitazone: 0.0003940978929169154
Entropy of tolazamide: 0.002426802870871497
Entropy of examide: 0.0
Entropy of citoglipton: 0.0
Entropy of insulin: 1.1870988930968884
Entropy of glyburide-metformin: 0.048638430621396514
Entropy of glipizide-metformin: 0.0012258768713342436
Entropy of glimepiride-pioglitazone: 0.0
Entropy of metformin-rosiglitazone: 0.0
Entropy of metformin-pioglitazone: 0.00020917893264309475

The entropy in this case of 4 categories for each treatment goes from 0 to $\log_2 (4) = 2$. **Insulin** is the only with a consistent value of entropy ($>1$), so we decide to keep it.

Now we create the **3 synthetic variables** that will replace all the others apart from `insulin`

In [101]:
dataset_without_infrequent_categories_4['n_up'] = (dataset_without_infrequent_categories_4[drugs_col] == 'Up').sum(axis=1)
dataset_without_infrequent_categories_4['n_steady'] = (dataset_without_infrequent_categories_4[drugs_col] == 'Steady').sum(axis=1)
dataset_without_infrequent_categories_4['n_down'] = (dataset_without_infrequent_categories_4[drugs_col] == 'Down').sum(axis=1)
dataset_without_infrequent_categories_4['n_no'] = (dataset_without_infrequent_categories_4[drugs_col] == 'No').sum(axis=1)


In [None]:
dataset_without_infrequent_categories_4['n_treatments'] = dataset_without_infrequent_categories_4['n_up'] + dataset_without_infrequent_categories_4['n_steady'] + dataset_without_infrequent_categories_4['n_down']

In [104]:
dataset_without_infrequent_categories_4['rate_down_treatments'] = dataset_without_infrequent_categories_4['n_down'] / dataset_without_infrequent_categories_4['n_treatments']

In [105]:
dataset_without_infrequent_categories_4['rate_up_treatments'] = dataset_without_infrequent_categories_4['n_up'] / dataset_without_infrequent_categories_4['n_treatments']

In [106]:
dataset_without_infrequent_categories_4

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,change,diabetesMed,readmitted,n_up,n_steady,n_down,n_no,n_treatments,rate_down_treatments,rate_up_treatments
8,12522,48330783,Caucasian,Female,[80-90),urgent,to_home,another_medical_facility/other,13,missing,...,Ch,Yes,NO,0,2,0,21,2,0.0,0.0
9,15738,63555939,Caucasian,Female,[90-100),elective,another_medical_facility,another_medical_facility/other,12,InternalMedicine,...,Ch,Yes,NO,0,2,0,21,2,0.0,0.0
4,16680,42519267,Caucasian,Male,[40-50),emergency,to_home,emergency,1,missing,...,Ch,Yes,NO,0,2,0,21,2,0.0,0.0
10,28236,89869032,AfricanAmerican,Female,[40-50),emergency,to_home,emergency,9,missing,...,No,Yes,>30,0,1,0,22,1,0.0,0.0
5,35754,82637451,Caucasian,Male,[50-60),urgent,to_home,referral,3,missing,...,No,Yes,>30,0,1,0,22,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101754,443842016,183087545,Caucasian,Female,[70-80),emergency,to_home,emergency,9,missing,...,Ch,Yes,>30,1,1,0,21,2,0.0,0.5
101755,443842022,188574944,Other,Female,[40-50),emergency,to_home,emergency,14,missing,...,Ch,Yes,>30,1,1,0,21,2,0.0,0.5
101756,443842070,140199494,Other,Female,[60-70),emergency,to_home,emergency,2,missing,...,No,Yes,>30,0,1,0,22,1,0.0,0.0
101758,443842340,120975314,Caucasian,Female,[80-90),emergency,to_home,emergency,5,missing,...,Ch,Yes,NO,1,0,0,22,1,0.0,1.0


For the patient without any active treatment, we set the two rates to 0.

In [107]:
dataset_without_infrequent_categories_4[['rate_down_treatments','rate_up_treatments']] = dataset_without_infrequent_categories_4[['rate_down_treatments','rate_up_treatments']].fillna(0)

In [108]:
dataset_without_infrequent_categories_4.rate_up_treatments.value_counts()

rate_up_treatments
0.000000    49842
1.000000     3730
0.500000     2071
0.333333     1082
0.250000      230
0.666667      158
0.200000       17
0.750000        7
0.400000        6
0.166667        1
Name: count, dtype: int64

In [109]:
dataset_without_infrequent_categories_4.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'number_diagnoses',
       'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted', 'n_up',
       'n_steady', 'n_down', 'n_no', 'n_treatments', 'rate_down_treatments',
       'rate_up_treatments'],
      dtype='object')

Now let's remove all the variables that are no longer needed at this stage.

In [110]:
drugs_col_tot = drugs_col
drugs_col = list(drugs_col)
drugs_col.remove('insulin')

In [112]:
dataset_without_infrequent_categories_5 = dataset_without_infrequent_categories_4.drop(drugs_col, axis = 1)

In [113]:
dataset_without_infrequent_categories_6 = dataset_without_infrequent_categories_5.drop(['patient_nbr','encounter_id','change','diabetesMed'], axis = 1)

In [114]:
dataset_without_infrequent_categories_5.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'number_diagnoses',
       'A1Cresult', 'insulin', 'change', 'diabetesMed', 'readmitted', 'n_up',
       'n_steady', 'n_down', 'n_no', 'n_treatments', 'rate_down_treatments',
       'rate_up_treatments'],
      dtype='object')

Let's remove the auxiliary variables used.

In [115]:
dataset_without_infrequent_categories_7 = dataset_without_infrequent_categories_6.drop(['n_no','n_down','n_steady','n_up'], axis = 1)

In [116]:
dataset_without_infrequent_categories_7.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'number_diagnoses', 'A1Cresult',
       'insulin', 'readmitted', 'n_treatments', 'rate_down_treatments',
       'rate_up_treatments'],
      dtype='object')

In [117]:
dataset_without_infrequent_categories_7.gender.value_counts()

gender
Female             30585
Male               26558
Unknown/Invalid        1
Name: count, dtype: int64

In [118]:
dataset_without_infrequent_categories_8 = dataset_without_infrequent_categories_7[~(dataset_without_infrequent_categories_7.gender == 'Unknown/Invalid')]

In [119]:
dataset_without_infrequent_categories_7.shape

(57144, 22)

In [120]:
dataset_without_infrequent_categories_8.shape

(57143, 22)

In [121]:
dataset_without_infrequent_categories_8.to_csv("final_dataset_1",index=False )

In [122]:
df = pd.read_csv("final_dataset_1")

In [123]:
df

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,number_emergency,number_inpatient,diag_1,number_diagnoses,A1Cresult,insulin,readmitted,n_treatments,rate_down_treatments,rate_up_treatments
0,Caucasian,Female,[80-90),urgent,to_home,another_medical_facility/other,13,missing,68,2,...,0,0,circulatory,8,missing,Steady,NO,2,0.0,0.0
1,Caucasian,Female,[90-100),elective,another_medical_facility,another_medical_facility/other,12,InternalMedicine,33,3,...,0,0,circulatory,8,missing,Steady,NO,2,0.0,0.0
2,Caucasian,Male,[40-50),emergency,to_home,emergency,1,missing,51,0,...,0,0,neoplasms,5,missing,Steady,NO,2,0.0,0.0
3,AfricanAmerican,Female,[40-50),emergency,to_home,emergency,9,missing,47,2,...,0,0,diabetes,9,missing,Steady,>30,1,0.0,0.0
4,Caucasian,Male,[50-60),urgent,to_home,referral,3,missing,31,6,...,0,0,circulatory,9,missing,Steady,>30,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57138,Caucasian,Female,[70-80),emergency,to_home,emergency,9,missing,50,2,...,0,0,digestive,9,>7,Steady,>30,2,0.0,0.5
57139,Other,Female,[40-50),emergency,to_home,emergency,14,missing,73,6,...,1,0,genitourinary,9,>8,Up,>30,2,0.0,0.5
57140,Other,Female,[60-70),emergency,to_home,emergency,2,missing,46,6,...,1,1,injury,9,missing,Steady,>30,1,0.0,0.0
57141,Caucasian,Female,[80-90),emergency,to_home,emergency,5,missing,76,1,...,1,0,other,9,missing,Up,NO,1,0.0,1.0
