In [10]:
import pandas as pd

# Read the raw obesity CSV into the notebook's working frame.
data = pd.read_csv("../data/raw/obesity_dataset.csv")

# Schema overview: info() writes its own report (columns, non-null counts,
# dtypes) to stdout, so it is called bare rather than wrapped in print().
print("Basic Info:")
data.info()

# Summary statistics from describe() with its default arguments.
print("\nStatistical Summary (Numerical Features):", data.describe(), sep="\n")

# Per-column null counts, narrowed to the columns that contain at least one null.
# An empty Series is printed when no column has missing values.
missing_values = data.isna().sum()
print("\nMissing Values:", missing_values.loc[missing_values.gt(0)], sep="\n")

Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2086 entries, 0 to 2085
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Height                          2086 non-null   float64
 1   Weight                          2086 non-null   float64
 2   family_history_with_overweight  2086 non-null   int64  
 3   SCC                             2086 non-null   int64  
 4   MTRANS_Walking                  2086 non-null   int64  
 5   FAVC_z                          2086 non-null   float64
 6   FCVC_minmax                     2086 non-null   float64
 7   NCP_z                           2086 non-null   float64
 8   CAEC_minmax                     2086 non-null   float64
 9   CH2O_minmax                     2086 non-null   float64
 10  FAF_minmax                      2086 non-null   float64
 11  TUE_z                           2086 non-null   float64
 12  CALC_z                

In [24]:
from sklearn.preprocessing import StandardScaler

# Split columns by storage dtype: float64 columns are the ones that get scaled,
# int64 columns are left untouched.
# In the saved output of this cell that works out to:
#   float64: Height, Weight, FAVC_z, FCVC_minmax, NCP_z, CAEC_minmax,
#            CH2O_minmax, FAF_minmax, TUE_z, CALC_z, Age_bin_minmax
#   int64:   family_history_with_overweight, SCC, MTRANS_Walking, NObeyesdad
# NOTE(review): NObeyesdad looks like the target label rather than a feature; it
# lands in the "categorical" group only because of its dtype - confirm intent.
numerical_columns = data.select_dtypes(include="float64").columns
categorical_columns = data.select_dtypes(include="int64").columns

# Fit the scaler on the float64 columns, then overwrite those same columns of
# `data` with the standardized (zero mean, unit variance) values.
scaler = StandardScaler().fit(data[numerical_columns])
data[numerical_columns] = scaler.transform(data[numerical_columns])

# Preview the first rows of each column group.
print("Standardized Numerical Features:", data[numerical_columns].head(), sep="\n")
print("\nCategorical Features (Unchanged):", data[categorical_columns].head(), sep="\n")

# Last expression of the cell, so it is rendered as the cell's output.
data.describe()


Standardized Numerical Features:
     Height    Weight    FAVC_z  FCVC_minmax     NCP_z  CAEC_minmax  \
0 -0.878459 -0.861830  2.766876    -0.784574 -0.556013    -0.298236   
1 -1.949167 -1.166592  2.766876     1.085081 -0.556013    -0.298236   
2  1.048815 -0.366591  2.766876    -0.784574 -0.556013    -0.298236   
3  1.048815  0.014363  2.766876     1.085081 -0.556013    -0.298236   
4  0.834673  0.121029  2.766876    -0.784574  2.169193    -0.298236   

   CH2O_minmax  FAF_minmax     TUE_z    CALC_z  Age_bin_minmax  
0    -0.014227   -1.191292 -0.543561  1.126802       -0.706260  
1     1.617069    2.334567  0.464143 -0.654257       -0.706260  
2    -0.014227    1.159281 -0.543561  3.121444        0.000678  
3    -0.014227    1.159281  0.464143  3.121444        0.707615  
4    -0.014227   -1.191292  0.464143 -0.654257        0.000678  

Categorical Features (Unchanged):
   family_history_with_overweight  SCC  MTRANS_Walking  NObeyesdad
0                               1    0          

Unnamed: 0,Height,Weight,family_history_with_overweight,SCC,MTRANS_Walking,FAVC_z,FCVC_minmax,NCP_z,CAEC_minmax,CH2O_minmax,FAF_minmax,TUE_z,CALC_z,Age_bin_minmax,NObeyesdad
count,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0
mean,0.0,0.0,0.817354,0.045062,0.026366,-2.7249960000000003e-17,-1.3624980000000002e-17,-5.4499920000000006e-17,0.0,0.0,0.0,0.0,-2.7249960000000003e-17,0.0,3.110259
std,1.00024,1.00024,0.386469,0.207491,0.16026,1.00024,1.00024,1.00024,1.00024,1.00024,1.00024,1.00024,1.00024,1.00024,1.993832
min,-2.698662,-1.814212,0.0,0.0,0.0,-0.3614185,-2.654228,-1.179193,-2.436109,-1.645522,-1.191292,-1.568194,-0.654257,-1.413197,0.0
25%,-0.771388,-0.818759,1.0,0.0,0.0,-0.3614185,-0.7845735,-0.5560135,-0.298236,-0.696092,-1.04003,-0.700488,-0.654257,-0.70626,1.0
50%,-0.00709,-0.138019,1.0,0.0,0.0,-0.3614185,-0.05508389,-0.5560135,-0.298236,-0.014227,-0.016005,-0.05437,-0.654257,0.000678,3.0
75%,0.714433,0.814724,1.0,0.0,0.0,-0.3614185,1.085081,0.4151867,-0.298236,0.769587,0.773868,0.464143,1.126802,0.707615,5.0
max,2.976089,2.987978,1.0,1.0,1.0,2.766876,1.085081,2.169193,3.97751,1.617069,2.334567,2.513951,6.897145,1.414553,6.0


In [28]:
# Reload the processed CSV from disk and summarize it.
# NOTE(review): the saved output of this cell lists 14 columns and has no
# FCVC_minmax, while the describe() output of the in-memory `data` frame has 15.
# The step that writes this file is not visible here - confirm the export.
standardized_data = pd.read_csv(
    filepath_or_buffer="../data/processed/obesity_standardized.csv"
)
standardized_data.describe()

Unnamed: 0,Height,Weight,family_history_with_overweight,SCC,MTRANS_Walking,FAVC_z,NCP_z,CAEC_minmax,CH2O_minmax,FAF_minmax,TUE_z,CALC_z,Age_bin_minmax,NObeyesdad
count,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0,2086.0
mean,-1.008249e-15,-1.089998e-16,0.817354,0.045062,0.026366,-2.452496e-16,-1.089998e-16,1.089998e-16,1.294373e-16,-1.226248e-16,-2.179997e-16,5.4499920000000006e-17,0.0,3.110259
std,1.00024,1.00024,0.386469,0.207491,0.16026,1.00024,1.00024,1.00024,1.00024,1.00024,1.00024,1.00024,1.00024,1.993832
min,-2.698662,-1.814212,0.0,0.0,0.0,-0.3614185,-1.179193,-2.436109,-1.645522,-1.191292,-1.568194,-0.654257,-1.413197,0.0
25%,-0.7713883,-0.8187593,1.0,0.0,0.0,-0.3614185,-0.5560135,-0.2982364,-0.6960922,-1.04003,-0.700488,-0.654257,-0.70626,1.0
50%,-0.0070904,-0.1380187,1.0,0.0,0.0,-0.3614185,-0.5560135,-0.2982364,-0.01422653,-0.01600543,-0.05436993,-0.654257,0.000678,3.0
75%,0.7144328,0.8147241,1.0,0.0,0.0,-0.3614185,0.4151867,-0.2982364,0.7695874,0.7738683,0.4641429,1.126802,0.707615,5.0
max,2.976089,2.987978,1.0,1.0,1.0,2.766876,2.169193,3.97751,1.617069,2.334567,2.513951,6.897145,1.414553,6.0
