# Feature Engineering

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

In [5]:
# Load the dataset
df = pd.read_csv("dataset_new.csv")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1000 non-null   int64  
 1   income                         1000 non-null   int64  
 2   daily_hours_physical_activity  1000 non-null   int64  
 3   servings_fruits_veggies        1000 non-null   int64  
 4   BMI                            1000 non-null   float64
 5   gender_male                    1000 non-null   float64
 6   encoded_health_status          1000 non-null   int64  
 7   bmi_category                   1000 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 62.6+ KB


## Feature Engineering Create BMI category feature ---------

In [8]:
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Normal Weight'
    elif 25 <= bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

df['bmi_category'] = df['BMI'].apply(categorize_bmi)

In [9]:
# Label encode BMI category for modeling
label_encoder = LabelEncoder()
df['encoded_bmi_category'] = label_encoder.fit_transform(df['bmi_category'])

## Feature Selection Select top k features using ANOVA F-value ------------

In [16]:
selector = SelectKBest(score_func=f_classif, k=5)
X = df.drop(columns=['encoded_health_status', 'bmi_category'])
y = df['encoded_health_status']
selected_features = selector.fit_transform(X, y)

## Principal Component Analysis (PCA) for dimensionality reduction -----------

In [17]:
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)

In [18]:
# Print selected features
print("Selected features after feature selection:")
print(selected_features)

Selected features after feature selection:
[[5.6000e+01 4.9241e+04 2.0000e+00 1.0000e+00 1.0000e+00]
 [4.6000e+01 6.4569e+04 2.0000e+00 0.0000e+00 1.0000e+00]
 [3.2000e+01 3.1745e+04 1.0000e+00 0.0000e+00 1.0000e+00]
 ...
 [2.7000e+01 4.0644e+04 0.0000e+00 0.0000e+00 1.0000e+00]
 [6.1000e+01 3.8745e+04 2.0000e+00 1.0000e+00 0.0000e+00]
 [1.9000e+01 4.8228e+04 2.0000e+00 1.0000e+00 2.0000e+00]]


In [21]:
# Print principal components after PCA
print("Principal components after PCA:")
print(X_pca)

Principal components after PCA:
[[-1.06313920e+04  1.50585840e+01  1.06594309e+01  4.18845473e+00
   1.01852694e+00]
 [ 4.69660795e+03  4.88441585e+00  7.20113393e+00 -2.71026998e+00
   1.04385945e+00]
 [-2.81273922e+04 -8.79147146e+00  7.36986576e+00 -4.51119547e+00
   5.64016818e-02]
 ...
 [-1.92283923e+04 -1.38625400e+01  9.49600290e+00  1.43156678e+00
  -9.58662353e-01]
 [-2.11273917e+04  2.02858924e+01 -6.82145262e+00 -1.37423008e+00
   1.05905621e+00]
 [-1.16443922e+04 -2.18617491e+01 -5.41003470e-01  7.16599207e-01
   9.39119806e-01]]
