# Data Preprocessing

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [51]:
# Read the hospital readmissions CSV from the working directory into `df`,
# then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='hospital_readmissions.csv')
df.head(n=5)

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [52]:
# Print a structural summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                25000 non-null  object
 1   time_in_hospital   25000 non-null  int64 
 2   n_lab_procedures   25000 non-null  int64 
 3   n_procedures       25000 non-null  int64 
 4   n_medications      25000 non-null  int64 
 5   n_outpatient       25000 non-null  int64 
 6   n_inpatient        25000 non-null  int64 
 7   n_emergency        25000 non-null  int64 
 8   medical_specialty  25000 non-null  object
 9   diag_1             25000 non-null  object
 10  diag_2             25000 non-null  object
 11  diag_3             25000 non-null  object
 12  glucose_test       25000 non-null  object
 13  A1Ctest            25000 non-null  object
 14  change             25000 non-null  object
 15  diabetes_med       25000 non-null  object
 16  readmitted         25000 non-null  object

In [53]:
# Summary statistics for the numeric columns. A notebook cell only renders its
# last expression, so this intermediate result has to be handed to display()
# explicitly; as a bare expression it would be computed and silently discarded.
# (`display` is provided by the IPython kernel without an import.)
display(df.describe())

# Frequency of each distinct full row (all columns combined); rendered as the
# cell's output because it is the final expression.
df.value_counts()

age       time_in_hospital  n_lab_procedures  n_procedures  n_medications  n_outpatient  n_inpatient  n_emergency  medical_specialty       diag_1           diag_2       diag_3       glucose_test  A1Ctest  change  diabetes_med  readmitted
[40-50)   1                 1                 0             1              0             1            0            Missing                 Other            Other        Digestive    no            no       no      no            no            1
[70-80)   4                 54                3             15             0             0            0            Missing                 Circulatory      Circulatory  Circulatory  no            no       yes     yes           no            1
                            55                3             15             0             3            0            Family/GeneralPractice  Musculoskeletal  Diabetes     Other        no            no       no      yes           yes           1
                                 

Encode Categorical Values:

In [54]:
# One-hot encode the listed categorical columns: each one is replaced by one
# indicator column per category, named "<column>_<category>".
# `age` and `readmitted` are not listed here, so they keep their original
# string values.
df = pd.get_dummies(
    df,
    columns=[
        'medical_specialty',
        'diag_1',
        'diag_2',
        'diag_3',
        'glucose_test',
        'A1Ctest',
        'change',
        'diabetes_med',
    ],
)

In [55]:
# Inspect the per-column dtypes after one-hot encoding.
df.dtypes

age                                         object
time_in_hospital                             int64
n_lab_procedures                             int64
n_procedures                                 int64
n_medications                                int64
n_outpatient                                 int64
n_inpatient                                  int64
n_emergency                                  int64
readmitted                                  object
medical_specialty_Cardiology                  bool
medical_specialty_Emergency/Trauma            bool
medical_specialty_Family/GeneralPractice      bool
medical_specialty_InternalMedicine            bool
medical_specialty_Missing                     bool
medical_specialty_Other                       bool
medical_specialty_Surgery                     bool
diag_1_Circulatory                            bool
diag_1_Diabetes                               bool
diag_1_Digestive                              bool
diag_1_Injury                  

In [56]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,readmitted,medical_specialty_Cardiology,...,glucose_test_high,glucose_test_no,glucose_test_normal,A1Ctest_high,A1Ctest_no,A1Ctest_normal,change_no,change_yes,diabetes_med_no,diabetes_med_yes
0,[70-80),8,72,1,18,2,0,0,no,False,...,False,True,False,False,True,False,True,False,False,True
1,[70-80),3,34,2,13,0,0,0,no,False,...,False,True,False,False,True,False,True,False,False,True
2,[50-60),5,45,0,18,0,0,0,yes,False,...,False,True,False,False,True,False,False,True,False,True
3,[70-80),2,36,0,12,1,0,0,yes,False,...,False,True,False,False,True,False,False,True,False,True
4,[60-70),1,42,0,7,0,0,0,no,False,...,False,True,False,False,True,False,True,False,False,True


In [57]:
# NOTE(review): this repeats the preview from the previous cell; `df` is not
# modified in between, so the output is identical. Candidate for removal.
df.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,readmitted,medical_specialty_Cardiology,...,glucose_test_high,glucose_test_no,glucose_test_normal,A1Ctest_high,A1Ctest_no,A1Ctest_normal,change_no,change_yes,diabetes_med_no,diabetes_med_yes
0,[70-80),8,72,1,18,2,0,0,no,False,...,False,True,False,False,True,False,True,False,False,True
1,[70-80),3,34,2,13,0,0,0,no,False,...,False,True,False,False,True,False,True,False,False,True
2,[50-60),5,45,0,18,0,0,0,yes,False,...,False,True,False,False,True,False,False,True,False,True
3,[70-80),2,36,0,12,1,0,0,yes,False,...,False,True,False,False,True,False,False,True,False,True
4,[60-70),1,42,0,7,0,0,0,no,False,...,False,True,False,False,True,False,True,False,False,True


In [63]:
# NOTE(review): the saved output of this cell shows min-max scaled values even
# though the scaling cell appears further down the notebook, i.e. the cells were
# executed out of order. On a fresh top-to-bottom run this preview would show
# the unscaled values. Consider moving it below the scaling cell.
df.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,readmitted,medical_specialty_Cardiology,...,glucose_test_high,glucose_test_no,glucose_test_normal,A1Ctest_high,A1Ctest_no,A1Ctest_normal,change_no,change_yes,diabetes_med_no,diabetes_med_yes
0,[70-80),0.538462,0.633929,0.166667,0.217949,0.060606,0.0,0.0,no,False,...,False,True,False,False,True,False,True,False,False,True
1,[70-80),0.153846,0.294643,0.333333,0.153846,0.0,0.0,0.0,no,False,...,False,True,False,False,True,False,True,False,False,True
2,[50-60),0.307692,0.392857,0.0,0.217949,0.0,0.0,0.0,yes,False,...,False,True,False,False,True,False,False,True,False,True
3,[70-80),0.076923,0.3125,0.0,0.141026,0.030303,0.0,0.0,yes,False,...,False,True,False,False,True,False,False,True,False,True
4,[60-70),0.0,0.366071,0.0,0.076923,0.0,0.0,0.0,no,False,...,False,True,False,False,True,False,True,False,False,True


In [64]:
# Numeric count/duration features that get rescaled.
quantitative_columns = [
    'time_in_hospital',
    'n_lab_procedures',
    'n_procedures',
    'n_medications',
    'n_outpatient',
    'n_inpatient',
    'n_emergency',
]

# Fit a min-max scaler on these columns and overwrite them in `df` with the
# rescaled values (each column mapped onto the [0, 1] range). `scaler` keeps
# the fitted per-column parameters.
scaler = MinMaxScaler()
df[quantitative_columns] = scaler.fit(df[quantitative_columns]).transform(df[quantitative_columns])

In [62]:
# Preview the frame after min-max scaling of the quantitative columns.
# NOTE(review): the recorded execution count (62) is lower than that of the
# scaling cell above (64), so this saved output comes from an earlier,
# out-of-order run; re-run the notebook top to bottom to refresh it.
df.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,readmitted,medical_specialty_Cardiology,...,glucose_test_high,glucose_test_no,glucose_test_normal,A1Ctest_high,A1Ctest_no,A1Ctest_normal,change_no,change_yes,diabetes_med_no,diabetes_med_yes
0,[70-80),0.538462,0.633929,0.166667,0.217949,0.060606,0.0,0.0,no,False,...,False,True,False,False,True,False,True,False,False,True
1,[70-80),0.153846,0.294643,0.333333,0.153846,0.0,0.0,0.0,no,False,...,False,True,False,False,True,False,True,False,False,True
2,[50-60),0.307692,0.392857,0.0,0.217949,0.0,0.0,0.0,yes,False,...,False,True,False,False,True,False,False,True,False,True
3,[70-80),0.076923,0.3125,0.0,0.141026,0.030303,0.0,0.0,yes,False,...,False,True,False,False,True,False,False,True,False,True
4,[60-70),0.0,0.366071,0.0,0.076923,0.0,0.0,0.0,no,False,...,False,True,False,False,True,False,True,False,False,True


In [None]:
quantitative_columns = ['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency']

# These columns are already min-max scaled by the earlier scaling cell.
# Calling fit_transform on them a second time would leave the values unchanged
# but replace `scaler` with one fitted on [0, 1] data, discarding the raw
# per-column min/max needed to transform new data or to call
# scaler.inverse_transform. So this cell only verifies the result instead of
# re-fitting. A small tolerance absorbs floating-point rounding.
assert (
    (df[quantitative_columns] >= -1e-9) & (df[quantitative_columns] <= 1 + 1e-9)
).all().all(), "quantitative columns are not in [0, 1]; run the scaling cell first"

# Show the per-column range as confirmation.
df[quantitative_columns].describe()