In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Preprocessing the 3-Class Diabetes Data

In [2]:
# Load the raw 3-class diabetes health-indicators CSV and preview its first rows.
raw_csv_path = 'Dataset/diabetes_012_health_indicators_BRFSS2015.csv'
data = pd.read_csv(raw_csv_path)
data.head(n=5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
# Separate the target column from the predictor columns.
y_data = data['Diabetes_012']
X_data = data.drop(columns=['Diabetes_012'])

In [4]:
# Class distribution of the full target column (before splitting).
y_data.value_counts()

Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64

In [5]:
# Stratified 80/20 hold-out split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

In [6]:
# Class distribution of the training labels (should mirror the full data because the split is stratified).
y_train.value_counts()

Diabetes_012
0.0    170962
2.0     28277
1.0      3705
Name: count, dtype: int64

In [7]:
# Class distribution of the test labels.
y_test.value_counts()

Diabetes_012
0.0    42741
2.0     7069
1.0      926
Name: count, dtype: int64

In [8]:
# Column groups, one per preprocessing treatment applied in the cells below.
# NOTE(review): 'DiffWalk' appears in the data preview but in none of these
# lists, so it passes through unchanged — confirm that is intended.

# Standardised with StandardScaler.
continous_features = ['BMI']

# Rescaled with MinMaxScaler.
ordinal_features = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']

# One-hot encoded into a "<name>_0" / "<name>_1" pair each.
binary_categorical_features = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'Sex',
]

# Output column names for the one-hot encoding, in feature order then level order.
binary_categorical_features_onehot = [
    f'{feature}_{level}'
    for feature in binary_categorical_features
    for level in (0, 1)
]

In [9]:
# Standardise the continuous columns. The scaler is fitted on the training
# split only and then applied unchanged to both splits.
standard_scaler = StandardScaler().fit(X_train[continous_features])
for split_frame in (X_train, X_test):
    split_frame[continous_features] = standard_scaler.transform(split_frame[continous_features])

In [10]:
# Rescale the ordinal columns with a min-max scaler fitted on the training
# split only; the same fitted scaler is then applied to both splits.
minmax_scaler = MinMaxScaler().fit(X_train[ordinal_features])
for split_frame in (X_train, X_test):
    split_frame[ordinal_features] = minmax_scaler.transform(split_frame[ordinal_features])

In [11]:
# Fit the one-hot encoder on the training split's binary columns; categories
# not seen during fitting are ignored at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore').fit(
    X_train[binary_categorical_features]
)
one_hot_encoder

In [12]:
# One-hot encode the binary columns of the training split with the fitted encoder.
output_train = one_hot_encoder.transform(X_train[binary_categorical_features]).toarray()

# Name every encoded column from the categories the encoder actually learned
# (feature order, then category order — the same order as the encoder output),
# so the labels cannot drift out of step with the data if a feature ends up
# with a different number of levels than a hand-written name list expects.
# 'g' formatting renders 0.0 / 1.0 as "0" / "1"; assumes numeric categories.
onehot_columns = [
    f"{feature}_{format(category, 'g')}"
    for feature, categories in zip(binary_categorical_features, one_hot_encoder.categories_)
    for category in categories
]

# Build the encoded frame on the split's own index so the concat aligns row-for-row,
# then swap the original binary columns for their encoded counterparts.
encoded_train = pd.DataFrame(output_train, columns=onehot_columns, index=X_train.index)
X_train = pd.concat([X_train.drop(columns=binary_categorical_features), encoded_train], axis=1)

In [13]:
# Re-attach the labels, discard the shuffled row index, write the processed
# training set to disk and show its summary statistics.
train_data = pd.concat([X_train, y_train], axis=1)
train_data = train_data.reset_index(drop=True)
train_data.to_csv('Dataset/train_012.csv', index=False)
train_data.describe()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,DiffWalk,Age,Education,Income,HighBP_0,HighBP_1,...,Veggies_1,HvyAlcoholConsump_0,HvyAlcoholConsump_1,AnyHealthcare_0,AnyHealthcare_1,NoDocbcCost_0,NoDocbcCost_1,Sex_0,Sex_1,Diabetes_012
count,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,...,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0
mean,-4.3974780000000005e-17,0.377485,0.105913,0.141079,0.168436,0.586284,0.810019,0.72178,0.570813,0.429187,...,0.811505,0.944167,0.055833,0.049166,0.950834,0.916272,0.083728,0.560573,0.439427,0.296924
std,1.000002,0.266897,0.246717,0.29031,0.374254,0.254668,0.197219,0.295976,0.494961,0.494961,...,0.391108,0.2296,0.2296,0.216216,0.216216,0.276979,0.276979,0.496319,0.496319,0.698162
min,-2.477359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6636915,0.25,0.0,0.0,0.0,0.416667,0.6,0.571429,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,-0.2102745,0.25,0.0,0.0,0.0,0.583333,0.8,0.857143,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
75%,0.3942815,0.5,0.066667,0.1,0.0,0.75,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
max,10.52059,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [14]:
# One-hot encode the binary columns of the test split with the encoder fitted on the training split.
output_test = one_hot_encoder.transform(X_test[binary_categorical_features]).toarray()

# Name every encoded column from the categories the encoder actually learned
# (feature order, then category order — the same order as the encoder output),
# so the labels cannot drift out of step with the data if a feature ends up
# with a different number of levels than a hand-written name list expects.
# 'g' formatting renders 0.0 / 1.0 as "0" / "1"; assumes numeric categories.
onehot_columns = [
    f"{feature}_{format(category, 'g')}"
    for feature, categories in zip(binary_categorical_features, one_hot_encoder.categories_)
    for category in categories
]

# Build the encoded frame on the split's own index so the concat aligns row-for-row,
# then swap the original binary columns for their encoded counterparts.
encoded_test = pd.DataFrame(output_test, columns=onehot_columns, index=X_test.index)
X_test = pd.concat([X_test.drop(columns=binary_categorical_features), encoded_test], axis=1)

In [15]:
# Re-attach the labels, discard the shuffled row index, write the processed
# test set to disk and show its summary statistics.
test_data = pd.concat([X_test, y_test], axis=1)
test_data = test_data.reset_index(drop=True)
test_data.to_csv('Dataset/test_012.csv', index=False)
test_data.describe()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,DiffWalk,Age,Education,Income,HighBP_0,HighBP_1,...,Veggies_1,HvyAlcoholConsump_0,HvyAlcoholConsump_1,AnyHealthcare_0,AnyHealthcare_1,NoDocbcCost_0,NoDocbcCost_1,Sex_0,Sex_1,Diabetes_012
count,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,...,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0
mean,-0.006727,0.379302,0.107144,0.142696,0.167376,0.584913,0.81036,0.722792,0.571744,0.428256,...,0.811081,0.942349,0.057651,0.048072,0.951928,0.914026,0.085974,0.555996,0.444004,0.296909
std,0.994125,0.268003,0.2486,0.291746,0.373315,0.253918,0.196899,0.295489,0.494831,0.494831,...,0.391448,0.233085,0.233085,0.213921,0.213921,0.280329,0.280329,0.496859,0.496859,0.698156
min,-2.477359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.663691,0.25,0.0,0.0,0.0,0.416667,0.6,0.571429,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,-0.210274,0.25,0.0,0.0,0.0,0.583333,0.8,0.857143,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
75%,0.394282,0.5,0.066667,0.1,0.0,0.75,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
max,10.520594,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


# Preprocessing the 2-Class Diabetes Data (50-50 Balanced Split)

In [16]:
# Load the raw balanced (50-50) binary diabetes health-indicators CSV and preview its first rows.
raw_csv_path = 'Dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv'
data = pd.read_csv(raw_csv_path)
data.head(n=5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [17]:
# Separate the target column from the predictor columns.
y_data = data['Diabetes_binary']
X_data = data.drop(columns=['Diabetes_binary'])

In [18]:
# Class distribution of the full target column (before splitting).
y_data.value_counts()

Diabetes_binary
0.0    35346
1.0    35346
Name: count, dtype: int64

In [19]:
# Stratified 80/20 hold-out split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

In [20]:
# Class distribution of the training labels (should mirror the full data because the split is stratified).
y_train.value_counts()

Diabetes_binary
1.0    28277
0.0    28276
Name: count, dtype: int64

In [21]:
# Class distribution of the test labels.
y_test.value_counts()

Diabetes_binary
0.0    7070
1.0    7069
Name: count, dtype: int64

In [22]:
# Column groups, one per preprocessing treatment applied in the cells below.
# NOTE(review): 'DiffWalk' appears in the data preview but in none of these
# lists, so it passes through unchanged — confirm that is intended.

# Standardised with StandardScaler.
continous_features = ['BMI']

# Rescaled with MinMaxScaler.
ordinal_features = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']

# One-hot encoded into a "<name>_0" / "<name>_1" pair each.
binary_categorical_features = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'Sex',
]

# Output column names for the one-hot encoding, in feature order then level order.
binary_categorical_features_onehot = [
    f'{feature}_{level}'
    for feature in binary_categorical_features
    for level in (0, 1)
]

In [23]:
# Standardise the continuous columns. The scaler is fitted on the training
# split only and then applied unchanged to both splits.
standard_scaler = StandardScaler().fit(X_train[continous_features])
for split_frame in (X_train, X_test):
    split_frame[continous_features] = standard_scaler.transform(split_frame[continous_features])

In [24]:
# Rescale the ordinal columns with a min-max scaler fitted on the training
# split only; the same fitted scaler is then applied to both splits.
minmax_scaler = MinMaxScaler().fit(X_train[ordinal_features])
for split_frame in (X_train, X_test):
    split_frame[ordinal_features] = minmax_scaler.transform(split_frame[ordinal_features])

In [25]:
# Fit the one-hot encoder on the training split's binary columns; categories
# not seen during fitting are ignored at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore').fit(
    X_train[binary_categorical_features]
)
one_hot_encoder

In [26]:
# One-hot encode the binary columns of the training split with the fitted encoder.
output_train = one_hot_encoder.transform(X_train[binary_categorical_features]).toarray()

# Name every encoded column from the categories the encoder actually learned
# (feature order, then category order — the same order as the encoder output),
# so the labels cannot drift out of step with the data if a feature ends up
# with a different number of levels than a hand-written name list expects.
# 'g' formatting renders 0.0 / 1.0 as "0" / "1"; assumes numeric categories.
onehot_columns = [
    f"{feature}_{format(category, 'g')}"
    for feature, categories in zip(binary_categorical_features, one_hot_encoder.categories_)
    for category in categories
]

# Build the encoded frame on the split's own index so the concat aligns row-for-row,
# then swap the original binary columns for their encoded counterparts.
encoded_train = pd.DataFrame(output_train, columns=onehot_columns, index=X_train.index)
X_train = pd.concat([X_train.drop(columns=binary_categorical_features), encoded_train], axis=1)

In [27]:
# Re-attach the labels, discard the shuffled row index, write the processed
# training set to disk and show its summary statistics.
train_data = pd.concat([X_train, y_train], axis=1)
train_data = train_data.reset_index(drop=True)
train_data.to_csv('Dataset/train_binary_split.csv', index=False)
train_data.describe()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,DiffWalk,Age,Education,Income,HighBP_0,HighBP_1,...,Veggies_1,HvyAlcoholConsump_0,HvyAlcoholConsump_1,AnyHealthcare_0,AnyHealthcare_1,NoDocbcCost_0,NoDocbcCost_1,Sex_0,Sex_1,Diabetes_binary
count,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,...,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0,56553.0
mean,1.030264e-16,0.458521,0.124491,0.192296,0.251905,0.632037,0.783431,0.6716,0.437077,0.562923,...,0.788252,0.957933,0.042067,0.045426,0.954574,0.906336,0.093664,0.543048,0.456952,0.500009
std,1.000009,0.278201,0.270794,0.334063,0.434111,0.237639,0.205991,0.310585,0.496029,0.496029,...,0.408551,0.200743,0.200743,0.208239,0.208239,0.291364,0.291364,0.498148,0.498148,0.500004
min,-2.367982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.681918,0.25,0.0,0.0,0.0,0.5,0.6,0.428571,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,-0.1198965,0.5,0.0,0.0,0.0,0.666667,0.8,0.714286,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
75%,0.442125,0.75,0.066667,0.166667,1.0,0.833333,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
max,9.574974,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
# One-hot encode the binary columns of the test split with the encoder fitted on the training split.
output_test = one_hot_encoder.transform(X_test[binary_categorical_features]).toarray()

# Name every encoded column from the categories the encoder actually learned
# (feature order, then category order — the same order as the encoder output),
# so the labels cannot drift out of step with the data if a feature ends up
# with a different number of levels than a hand-written name list expects.
# 'g' formatting renders 0.0 / 1.0 as "0" / "1"; assumes numeric categories.
onehot_columns = [
    f"{feature}_{format(category, 'g')}"
    for feature, categories in zip(binary_categorical_features, one_hot_encoder.categories_)
    for category in categories
]

# Build the encoded frame on the split's own index so the concat aligns row-for-row,
# then swap the original binary columns for their encoded counterparts.
encoded_test = pd.DataFrame(output_test, columns=onehot_columns, index=X_test.index)
X_test = pd.concat([X_test.drop(columns=binary_categorical_features), encoded_test], axis=1)

In [29]:
# Re-attach the labels, discard the shuffled row index, write the processed
# test set to disk and show its summary statistics.
test_data = pd.concat([X_test, y_test], axis=1)
test_data = test_data.reset_index(drop=True)
test_data.to_csv('Dataset/test_binary_split.csv', index=False)
test_data.describe()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,DiffWalk,Age,Education,Income,HighBP_0,HighBP_1,...,Veggies_1,HvyAlcoholConsump_0,HvyAlcoholConsump_1,AnyHealthcare_0,AnyHealthcare_1,NoDocbcCost_0,NoDocbcCost_1,Sex_0,Sex_1,Diabetes_binary
count,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,...,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0,14139.0
mean,0.002572,0.462267,0.127373,0.19922,0.256029,0.631875,0.787227,0.669536,0.434401,0.565599,...,0.790862,0.954664,0.045336,0.043497,0.956503,0.905085,0.094915,0.542825,0.457175,0.499965
std,0.997739,0.27914,0.276051,0.340693,0.436454,0.23785,0.205094,0.311378,0.495696,0.495696,...,0.406707,0.208046,0.208046,0.20398,0.20398,0.293108,0.293108,0.49818,0.49818,0.500018
min,-2.508488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.681918,0.25,0.0,0.0,0.0,0.5,0.6,0.428571,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,-0.119897,0.5,0.0,0.0,0.0,0.666667,0.8,0.714286,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
75%,0.442125,0.75,0.066667,0.233333,1.0,0.833333,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
max,9.574974,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Preprocessing the 2-Class Diabetes Data (Imbalanced Split)

In [30]:
# Load the raw imbalanced binary diabetes health-indicators CSV and preview its first rows.
raw_csv_path = 'Dataset/diabetes_binary_health_indicators_BRFSS2015.csv'
data = pd.read_csv(raw_csv_path)
data.head(n=5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [31]:
# Separate the target column from the predictor columns.
y_data = data['Diabetes_binary']
X_data = data.drop(columns=['Diabetes_binary'])

In [32]:
# Class distribution of the full target column (before splitting).
y_data.value_counts()

Diabetes_binary
0.0    218334
1.0     35346
Name: count, dtype: int64

In [33]:
# Stratified 80/20 hold-out split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

In [34]:
# Class distribution of the training labels (should mirror the full data because the split is stratified).
y_train.value_counts()

Diabetes_binary
0.0    174667
1.0     28277
Name: count, dtype: int64

In [35]:
# Class distribution of the test labels.
y_test.value_counts()

Diabetes_binary
0.0    43667
1.0     7069
Name: count, dtype: int64

In [36]:
# Column groups, one per preprocessing treatment applied in the cells below.
# NOTE(review): 'DiffWalk' appears in the data preview but in none of these
# lists, so it passes through unchanged — confirm that is intended.

# Standardised with StandardScaler.
continous_features = ['BMI']

# Rescaled with MinMaxScaler.
ordinal_features = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']

# One-hot encoded into a "<name>_0" / "<name>_1" pair each.
binary_categorical_features = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'Sex',
]

# Output column names for the one-hot encoding, in feature order then level order.
binary_categorical_features_onehot = [
    f'{feature}_{level}'
    for feature in binary_categorical_features
    for level in (0, 1)
]

In [37]:
# Standardise the continuous columns. The scaler is fitted on the training
# split only and then applied unchanged to both splits.
standard_scaler = StandardScaler().fit(X_train[continous_features])
for split_frame in (X_train, X_test):
    split_frame[continous_features] = standard_scaler.transform(split_frame[continous_features])

In [38]:
# Rescale the ordinal columns with a min-max scaler fitted on the training
# split only; the same fitted scaler is then applied to both splits.
minmax_scaler = MinMaxScaler().fit(X_train[ordinal_features])
for split_frame in (X_train, X_test):
    split_frame[ordinal_features] = minmax_scaler.transform(split_frame[ordinal_features])

In [39]:
# Fit the one-hot encoder on the training split's binary columns; categories
# not seen during fitting are ignored at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore').fit(
    X_train[binary_categorical_features]
)
one_hot_encoder

In [40]:
# One-hot encode the binary columns of the training split with the fitted encoder.
output_train = one_hot_encoder.transform(X_train[binary_categorical_features]).toarray()

# Name every encoded column from the categories the encoder actually learned
# (feature order, then category order — the same order as the encoder output),
# so the labels cannot drift out of step with the data if a feature ends up
# with a different number of levels than a hand-written name list expects.
# 'g' formatting renders 0.0 / 1.0 as "0" / "1"; assumes numeric categories.
onehot_columns = [
    f"{feature}_{format(category, 'g')}"
    for feature, categories in zip(binary_categorical_features, one_hot_encoder.categories_)
    for category in categories
]

# Build the encoded frame on the split's own index so the concat aligns row-for-row,
# then swap the original binary columns for their encoded counterparts.
encoded_train = pd.DataFrame(output_train, columns=onehot_columns, index=X_train.index)
X_train = pd.concat([X_train.drop(columns=binary_categorical_features), encoded_train], axis=1)

In [41]:
# Re-attach the labels, discard the shuffled row index, write the processed
# training set to disk and show its summary statistics.
train_data = pd.concat([X_train, y_train], axis=1)
train_data = train_data.reset_index(drop=True)
train_data.to_csv('Dataset/train_binary_imbalanced_split.csv', index=False)
train_data.describe()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,DiffWalk,Age,Education,Income,HighBP_0,HighBP_1,...,Veggies_1,HvyAlcoholConsump_0,HvyAlcoholConsump_1,AnyHealthcare_0,AnyHealthcare_1,NoDocbcCost_0,NoDocbcCost_1,Sex_0,Sex_1,Diabetes_binary
count,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,...,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0,202944.0
mean,1.627347e-16,0.377959,0.106327,0.141694,0.16816,0.586069,0.809985,0.721547,0.571005,0.428995,...,0.811426,0.94427,0.05573,0.048851,0.951149,0.915376,0.084624,0.558987,0.441013,0.139334
std,1.000002,0.267095,0.247254,0.290855,0.374009,0.254282,0.197277,0.296094,0.494934,0.494934,...,0.391171,0.229399,0.229399,0.215557,0.215557,0.278323,0.278323,0.49651,0.49651,0.346296
min,-2.482157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6635007,0.25,0.0,0.0,0.0,0.416667,0.6,0.571429,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,-0.2088366,0.25,0.0,0.0,0.0,0.583333,0.8,0.857143,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
75%,0.3973823,0.5,0.066667,0.1,0.0,0.75,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
max,10.55155,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
# One-hot encode the binary columns of the test split with the encoder fitted on the training split.
output_test = one_hot_encoder.transform(X_test[binary_categorical_features]).toarray()

# Name every encoded column from the categories the encoder actually learned
# (feature order, then category order — the same order as the encoder output),
# so the labels cannot drift out of step with the data if a feature ends up
# with a different number of levels than a hand-written name list expects.
# 'g' formatting renders 0.0 / 1.0 as "0" / "1"; assumes numeric categories.
onehot_columns = [
    f"{feature}_{format(category, 'g')}"
    for feature, categories in zip(binary_categorical_features, one_hot_encoder.categories_)
    for category in categories
]

# Build the encoded frame on the split's own index so the concat aligns row-for-row,
# then swap the original binary columns for their encoded counterparts.
encoded_test = pd.DataFrame(output_test, columns=onehot_columns, index=X_test.index)
X_test = pd.concat([X_test.drop(columns=binary_categorical_features), encoded_test], axis=1)

In [43]:
# Re-attach the labels, discard the shuffled row index, write the processed
# test set to disk and show its summary statistics.
test_data = pd.concat([X_test, y_test], axis=1)
test_data = test_data.reset_index(drop=True)
test_data.to_csv('Dataset/test_binary_imbalanced_split.csv', index=False)
test_data.describe()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,DiffWalk,Age,Education,Income,HighBP_0,HighBP_1,...,Veggies_1,HvyAlcoholConsump_0,HvyAlcoholConsump_1,AnyHealthcare_0,AnyHealthcare_1,NoDocbcCost_0,NoDocbcCost_1,Sex_0,Sex_1,Diabetes_binary
count,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,...,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0,50736.0
mean,0.003336,0.377405,0.105489,0.140238,0.16848,0.585774,0.810494,0.723721,0.570975,0.429025,...,0.811396,0.941935,0.058065,0.049334,0.950666,0.917613,0.082387,0.562342,0.437658,0.139329
std,1.007865,0.267221,0.24646,0.28957,0.374296,0.255463,0.196667,0.295008,0.494942,0.494942,...,0.391197,0.233869,0.233869,0.216566,0.216566,0.274957,0.274957,0.496103,0.496103,0.346293
min,-2.482157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.663501,0.25,0.0,0.0,0.0,0.416667,0.6,0.571429,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,-0.208837,0.25,0.0,0.0,0.0,0.583333,0.8,0.857143,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
75%,0.397382,0.5,0.066667,0.1,0.0,0.75,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
max,10.551548,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
