## **Feature engineering**

### **Import necessary libraries**

In [7]:

import pandas as pd
import numpy as np
import os
import sys
# Add the absolute path of the parent of the current working directory to the
# module search path (presumably where the project-local `utils` package lives).
sys.path.append(os.path.abspath(os.path.join('..')))

# NOTE(review): wildcard imports hide which names come from `utils`; none of the
# cells visible here appear to use them — confirm, then import names explicitly.
from utils.function import *
from utils.constants import *

import warnings
# Silences every warning for the rest of the session.
# NOTE(review): consider narrowing this filter to specific categories so that
# deprecation and dtype warnings from pandas / scikit-learn remain visible.
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [8]:
# Read the preprocessed diabetes dataset into the working frame `df`.
preprocessed_csv_path = '../data/preprocessed_diabetes.csv'
df = pd.read_csv(preprocessed_csv_path)

In [9]:
# Print the column names, non-null counts, and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99968 entries, 0 to 99967
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gender                99968 non-null  object 
 1   age                   99968 non-null  float64
 2   location              99968 non-null  object 
 3   race:AfricanAmerican  99968 non-null  int64  
 4   race:Asian            99968 non-null  int64  
 5   race:Caucasian        99968 non-null  int64  
 6   race:Hispanic         99968 non-null  int64  
 7   race:Other            99968 non-null  int64  
 8   hypertension          99968 non-null  int64  
 9   heart_disease         99968 non-null  int64  
 10  bmi                   99968 non-null  float64
 11  hbA1c_level           99968 non-null  float64
 12  blood_glucose_level   99968 non-null  int64  
 13  diabetes              99968 non-null  int64  
 14  bmi_class             99968 non-null  object 
dtypes: float64(3), int64(9), object(3)

In [10]:
# Preview the first five rows before any transformation is applied.
df.head(n=5)

Unnamed: 0,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,bmi,hbA1c_level,blood_glucose_level,diabetes,bmi_class
0,Female,32.0,Alabama,0,0,0,0,1,0,0,27.32,5.0,100,0,Overweight
1,Female,29.0,Alabama,0,1,0,0,0,0,0,19.95,5.0,90,0,Normal weight
2,Male,18.0,Alabama,0,0,0,0,1,0,0,23.76,4.8,160,0,Normal weight
3,Male,41.0,Alabama,0,0,1,0,0,0,0,27.32,4.0,159,0,Overweight
4,Female,52.0,Alabama,1,0,0,0,0,0,0,23.75,6.5,90,0,Normal weight


### **Apply `StandardScaler`**

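We standardize the continuous numerical columns (`age`, `bmi`, `hbA1c_level`, and `blood_glucose_level`) with `StandardScaler`, so that each has zero mean and unit variance. The binary indicator columns are left unchanged.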

In [11]:
# Standardizer that centers each feature on its mean and scales it to unit
# variance (both switches are the scikit-learn defaults, spelled out here).
scaler = StandardScaler(with_mean=True, with_std=True)

In [12]:
# Continuous features to standardize; the binary indicator columns are left as-is.
cols_to_scale = ["age", "bmi", "hbA1c_level", "blood_glucose_level"]

# Fit the scaler on these columns and overwrite them in `df` with the scaled values.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# happens downstream, fit on the training portion only to avoid leakage.
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

In [14]:
# Preview the first five rows again to confirm the four numeric columns are now standardized.
df.head(n=5)

Unnamed: 0,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,bmi,hbA1c_level,blood_glucose_level,diabetes,bmi_class
0,Female,-0.439148,Alabama,0,0,0,0,1,0,0,-0.000114,-0.492723,-0.934892,0,Overweight
1,Female,-0.572383,Alabama,0,1,0,0,0,0,0,-1.110508,-0.492723,-1.180533,0,Normal weight
2,Male,-1.06091,Alabama,0,0,0,0,1,0,0,-0.536478,-0.679523,0.538955,0,Normal weight
3,Male,-0.039444,Alabama,0,0,1,0,0,0,0,-0.000114,-1.426725,0.51439,0,Overweight
4,Female,0.449083,Alabama,1,0,0,0,0,0,0,-0.537985,0.908281,-1.180533,0,Normal weight


### **Apply `OneHotEncoder`**

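We one-hot encode the categorical columns `gender`, `location`, and `bmi_class` with `OneHotEncoder`, keeping every category as its own indicator column.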

In [23]:
# String-valued (object dtype) columns that will be one-hot encoded.
categorical_cols = [
    "gender",
    "location",
    "bmi_class",
]

In [24]:
# One-hot encoder configured to return a dense array and to keep every category
# (no reference level is dropped).
encoder = OneHotEncoder(
    drop=None,
    sparse_output=False,
)

In [25]:
# Fit the encoder on the categorical columns and obtain the indicator matrix.
encoded = encoder.fit_transform(df[categorical_cols])

# Column labels generated by the encoder for each (column, category) pair.
feature_names = encoder.get_feature_names_out(categorical_cols)

# Wrap the matrix in a frame that shares df's index so the later column-wise
# concat lines up row for row, then cast the indicators to integers.
encoded_df = pd.DataFrame(encoded, index=df.index, columns=feature_names).astype(int)

In [26]:
# Replace the original categorical columns with their one-hot indicator columns.
non_categorical_df = df.drop(columns=categorical_cols)
df_encoded = pd.concat([non_categorical_df, encoded_df], axis=1)

# Sanity checks: a column-wise concat aligns on the index and silently pads any
# mismatch with NaN rows (e.g. if `df` was reloaded after `encoded_df` was built),
# so fail loudly if the row count drifted or column names collided.
assert len(df_encoded) == len(df), "row count changed during concat"
assert df_encoded.columns.is_unique, "duplicate column names after encoding"

In [29]:
# Show the (rows, columns) shape of the encoded frame.
df_encoded.shape

(99968, 71)

### **Store final data**

In [30]:
# Persist the feature-engineered frame as CSV, omitting the index column.
feature_engineered_csv_path = '../data/feature_engineered_diabetes.csv'
df_encoded.to_csv(feature_engineered_csv_path, index=False)