In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Preprocessing the 3-Class Diabetes Data

In [2]:
# Load the 3-class BRFSS 2015 health-indicator table and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer='Dataset/diabetes_012_health_indicators_BRFSS2015.csv',
)
data.head(n=5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
# Separate the label column (y_data) from the remaining predictor columns (X_data).
y_data = data['Diabetes_012']
X_data = data.drop(columns=['Diabetes_012'])

In [4]:
# Show how many rows fall into each label value before splitting.
y_data.value_counts()

Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64

In [5]:
# Hold out 20% of the rows as a test split. The split is stratified on the
# label so both parts keep the same class proportions, and it is seeded so it
# can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

In [6]:
# Label counts in the training split (the split above was stratified on y_data).
y_train.value_counts()

Diabetes_012
0.0    170962
2.0     28277
1.0      3705
Name: count, dtype: int64

In [7]:
# Label counts in the held-out test split.
y_test.value_counts()

Diabetes_012
0.0    42741
2.0     7069
1.0      926
Name: count, dtype: int64

In [8]:
# Column groups, one per scaler used in the cells that follow.
# NOTE: 'continous' is a misspelling of 'continuous'; the name is kept as-is
# because later cells reference it.
continous_features = [
    'BMI',
]
ordinal_features = [
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'Age',
    'Education',
    'Income',
]

In [9]:
# Standardise the continuous column(s). The scaler's statistics are learned
# from the training split only and then reused unchanged on the test split.
standard_scaler = StandardScaler()
X_train[continous_features] = standard_scaler.fit_transform(X_train[continous_features])
X_test[continous_features] = standard_scaler.transform(X_test[continous_features])

In [10]:
# Min-max scale the ordinal columns. The per-column minimum and maximum come
# from the training split only; the test split is transformed with those same
# fitted values.
minmax_scaler = MinMaxScaler()
X_train[ordinal_features] = minmax_scaler.fit_transform(X_train[ordinal_features])
X_test[ordinal_features] = minmax_scaler.transform(X_test[ordinal_features])

In [11]:
# Re-attach the labels to the scaled training features, discard the shuffled
# original row index, write the result to CSV, and show summary statistics.
train_data = pd.concat((X_train, y_train), axis='columns')
train_data = train_data.reset_index(drop=True)
train_data.to_csv('Dataset/train_012.csv', index=False)
train_data.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_012
count,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,...,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0
mean,0.429187,0.42426,0.962463,-4.3974780000000005e-17,0.444029,0.040578,0.094282,0.756672,0.634436,0.811505,...,0.083728,0.377485,0.105913,0.141079,0.168436,0.439427,0.586284,0.810019,0.72178,0.296924
std,0.494961,0.494231,0.190075,1.000002,0.496859,0.19731,0.292222,0.429093,0.481589,0.391108,...,0.276979,0.266897,0.246717,0.29031,0.374254,0.496319,0.254668,0.197219,0.295976,0.698162
min,0.0,0.0,0.0,-2.477359,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,-0.6636915,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.416667,0.6,0.571429,0.0
50%,0.0,0.0,1.0,-0.2102745,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.583333,0.8,0.857143,0.0
75%,1.0,1.0,1.0,0.3942815,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.5,0.066667,0.1,0.0,1.0,0.75,1.0,1.0,0.0
max,1.0,1.0,1.0,10.52059,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [12]:
# Re-attach the labels to the scaled test features, discard the shuffled
# original row index, write the result to CSV, and show summary statistics.
test_data = pd.concat((X_test, y_test), axis='columns')
test_data = test_data.reset_index(drop=True)
test_data.to_csv('Dataset/test_012.csv', index=False)
test_data.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_012
count,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,...,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0
mean,0.428256,0.423565,0.963497,-0.006727,0.439727,0.040543,0.093799,0.756031,0.633534,0.811081,...,0.085974,0.379302,0.107144,0.142696,0.167376,0.444004,0.584913,0.81036,0.722792,0.296909
std,0.494831,0.494128,0.187539,0.994125,0.496359,0.197231,0.291552,0.429478,0.481843,0.391448,...,0.280329,0.268003,0.2486,0.291746,0.373315,0.496859,0.253918,0.196899,0.295489,0.698156
min,0.0,0.0,0.0,-2.477359,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,-0.663691,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.416667,0.6,0.571429,0.0
50%,0.0,0.0,1.0,-0.210274,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.583333,0.8,0.857143,0.0
75%,1.0,1.0,1.0,0.394282,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.5,0.066667,0.1,0.0,1.0,0.75,1.0,1.0,0.0
max,1.0,1.0,1.0,10.520594,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


# Preprocessing the 2-Class Diabetes Data (50-50 Class Split)

In [13]:
# Load the binary-label, 50/50 class-split BRFSS 2015 table and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer='Dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv',
)
data.head(n=5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [14]:
# Separate the binary label column (y_data) from the remaining predictor columns (X_data).
y_data = data['Diabetes_binary']
X_data = data.drop(columns=['Diabetes_binary'])

In [15]:
# Show how many rows fall into each label value before splitting.
y_data.value_counts()

Diabetes_binary
0.0    35346
1.0    35346
Name: count, dtype: int64

In [16]:
# Hold out 20% of the rows as a test split. The split is stratified on the
# label so both parts keep the same class proportions, and it is seeded so it
# can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

In [17]:
# Label counts in the training split (the split above was stratified on y_data).
y_train.value_counts()

Diabetes_binary
1.0    28277
0.0    28276
Name: count, dtype: int64

In [18]:
# Label counts in the held-out test split.
y_test.value_counts()

Diabetes_binary
0.0    7070
1.0    7069
Name: count, dtype: int64

In [19]:
# Column groups, one per scaler used in the cells that follow.
# NOTE: 'continous' is a misspelling of 'continuous'; the name is kept as-is
# because later cells reference it.
continous_features = [
    'BMI',
]
ordinal_features = [
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'Age',
    'Education',
    'Income',
]

In [20]:
# Standardise the continuous column(s). The scaler's statistics are learned
# from the training split only and then reused unchanged on the test split.
standard_scaler = StandardScaler()
X_train[continous_features] = standard_scaler.fit_transform(X_train[continous_features])
X_test[continous_features] = standard_scaler.transform(X_test[continous_features])

In [21]:
# Min-max scale the ordinal columns. The per-column minimum and maximum come
# from the training split only; the test split is transformed with those same
# fitted values.
minmax_scaler = MinMaxScaler()
X_train[ordinal_features] = minmax_scaler.fit_transform(X_train[ordinal_features])
X_test[ordinal_features] = minmax_scaler.transform(X_test[ordinal_features])

In [22]:
# Re-attach the labels to the scaled training features, discard the shuffled
# original row index, write the result to CSV, and show summary statistics.
train_data = pd.concat((X_train, y_train), axis='columns')
train_data = train_data.reset_index(drop=True)
train_data.to_csv('Dataset/train_binary_split.csv', index=False)
train_data.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
count,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,...,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0
mean,0.562923,0.525525,0.975527,1.030264e-16,0.47543,0.061553,0.147331,0.702651,0.612222,0.788252,...,0.093664,0.458521,0.124491,0.192296,0.251905,0.456952,0.632037,0.783431,0.6716,0.500009
std,0.496029,0.499352,0.154513,1.000009,0.4994,0.240344,0.354439,0.457096,0.487248,0.408551,...,0.291364,0.278201,0.270794,0.334063,0.434111,0.498148,0.237639,0.205991,0.310585,0.500004
min,0.0,0.0,0.0,-2.367982,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,-0.681918,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.5,0.6,0.428571,0.0
50%,1.0,1.0,1.0,-0.1198965,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.666667,0.8,0.714286,1.0
75%,1.0,1.0,1.0,0.442125,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.75,0.066667,0.166667,1.0,1.0,0.833333,1.0,1.0,1.0
max,1.0,1.0,1.0,9.574974,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
# Re-attach the labels to the scaled test features, discard the shuffled
# original row index, write the result to CSV, and show summary statistics.
test_data = pd.concat((X_test, y_test), axis='columns')
test_data = test_data.reset_index(drop=True)
test_data.to_csv('Dataset/test_binary_split.csv', index=False)
test_data.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
count,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,...,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0
mean,0.565599,0.526416,0.974185,0.002572,0.474645,0.064644,0.149728,0.704576,0.610086,0.790862,...,0.094915,0.462267,0.127373,0.19922,0.256029,0.457175,0.631875,0.787227,0.669536,0.499965
std,0.495696,0.499319,0.158589,0.997739,0.499374,0.245905,0.356817,0.456249,0.487748,0.406707,...,0.293108,0.27914,0.276051,0.340693,0.436454,0.49818,0.23785,0.205094,0.311378,0.500018
min,0.0,0.0,0.0,-2.508488,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,-0.681918,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.5,0.6,0.428571,0.0
50%,1.0,1.0,1.0,-0.119897,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.666667,0.8,0.714286,0.0
75%,1.0,1.0,1.0,0.442125,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.75,0.066667,0.233333,1.0,1.0,0.833333,1.0,1.0,1.0
max,1.0,1.0,1.0,9.574974,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
