In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
import joblib

In [2]:
data=pd.read_csv("obesity_data_cleaned.csv")

In [3]:
data.head()

Unnamed: 0,gender,age,height_m,weight_kg,family_overweight_history,high_calorie_food,vegetable_intake_freq,main_meals_per_day,snack_frequency,smokes,water_intake_liters,calorie_tracking,physical_activity_hours,screentime_hours,alcohol_consumption,travel_mode,obesity_level,bmi
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight,24.386526
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,24.238227
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,23.765432
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I,26.851852
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,28.342381


In [4]:
cat_cols=data.select_dtypes(include='object').columns

In [5]:
cat_cols

Index(['gender', 'family_overweight_history', 'high_calorie_food',
       'snack_frequency', 'smokes', 'calorie_tracking', 'alcohol_consumption',
       'travel_mode', 'obesity_level'],
      dtype='object')

In [6]:
for col in cat_cols:
    print(f"{col}:{data[col].unique()}")

gender:['Female' 'Male']
family_overweight_history:['yes' 'no']
high_calorie_food:['no' 'yes']
snack_frequency:['Sometimes' 'Frequently' 'Always' 'no']
smokes:['no' 'yes']
calorie_tracking:['no' 'yes']
alcohol_consumption:['no' 'Sometimes' 'Frequently' 'Always']
travel_mode:['Public_Transportation' 'Walking' 'Automobile' 'Motorbike' 'Bike']
obesity_level:['Normal_Weight' 'Overweight_Level_I' 'Overweight_Level_II'
 'Obesity_Type_I' 'Insufficient_Weight' 'Obesity_Type_II'
 'Obesity_Type_III']


In [7]:
binary_cols=[]
multi_cols=[]
for col in cat_cols:
    if data[col].nunique()==2:
        binary_cols.append(col)
    else:
        multi_cols.append(col)

In [8]:
encoded_data=data.copy()

In [9]:
le=LabelEncoder()

In [10]:
label_encoders={}

In [11]:
for col in binary_cols:
    le = LabelEncoder()   # create new encoder each loop
    encoded_data[col] = le.fit_transform(encoded_data[col])
    label_encoders[col] = le

joblib.dump(label_encoders, "models/label_encoders.pkl")


['models/label_encoders.pkl']

In [12]:
encoded_data.head()

Unnamed: 0,gender,age,height_m,weight_kg,family_overweight_history,high_calorie_food,vegetable_intake_freq,main_meals_per_day,snack_frequency,smokes,water_intake_liters,calorie_tracking,physical_activity_hours,screentime_hours,alcohol_consumption,travel_mode,obesity_level,bmi
0,0,21.0,1.62,64.0,1,0,2.0,3.0,Sometimes,0,2.0,0,0.0,1.0,no,Public_Transportation,Normal_Weight,24.386526
1,0,21.0,1.52,56.0,1,0,3.0,3.0,Sometimes,1,3.0,1,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,24.238227
2,1,23.0,1.8,77.0,1,0,2.0,3.0,Sometimes,0,2.0,0,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,23.765432
3,1,27.0,1.8,87.0,0,0,3.0,3.0,Sometimes,0,2.0,0,2.0,0.0,Frequently,Walking,Overweight_Level_I,26.851852
4,1,22.0,1.78,89.8,0,0,2.0,1.0,Sometimes,0,2.0,0,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,28.342381


In [13]:
for col in binary_cols:
    print(f"{col}: {encoded_data[col].unique()}")

gender: [0 1]
family_overweight_history: [1 0]
high_calorie_food: [0 1]
smokes: [0 1]
calorie_tracking: [0 1]


In [14]:
multi_cols

['snack_frequency', 'alcohol_consumption', 'travel_mode', 'obesity_level']

In [15]:
multi_cat_col=['snack_frequency', 'alcohol_consumption', 'travel_mode']

In [16]:
encoded_data=pd.get_dummies(encoded_data, columns=multi_cat_col, drop_first=True)

In [17]:
# Confirm the new encoded columns
encoded_cols = [col for col in encoded_data.columns if 
                any(prefix in col for prefix in ['snack_frequency_', 'alcohol_consumption_', 'travel_mode_'])]
print("New one-hot encoded columns:\n", encoded_cols)


New one-hot encoded columns:
 ['snack_frequency_Frequently', 'snack_frequency_Sometimes', 'snack_frequency_no', 'alcohol_consumption_Frequently', 'alcohol_consumption_Sometimes', 'alcohol_consumption_no', 'travel_mode_Bike', 'travel_mode_Motorbike', 'travel_mode_Public_Transportation', 'travel_mode_Walking']


In [18]:
le_target=LabelEncoder()

In [19]:
encoded_data['obesity_level']=le_target.fit_transform(encoded_data['obesity_level'])

In [20]:
joblib.dump(le_target, 'models/target_label_encoder.pkl')

['models/target_label_encoder.pkl']

In [21]:
num_cols = [
    'age', 'height_m', 'weight_kg', 'bmi',
    'vegetable_intake_freq', 'main_meals_per_day',
    'water_intake_liters', 'physical_activity_hours',
    'screentime_hours'
]

In [22]:
scaler=RobustScaler()

In [23]:
encoded_data[num_cols] = scaler.fit_transform(encoded_data[num_cols])

In [24]:
encoded_data.head()

Unnamed: 0,gender,age,height_m,weight_kg,family_overweight_history,high_calorie_food,vegetable_intake_freq,main_meals_per_day,smokes,water_intake_liters,...,snack_frequency_Frequently,snack_frequency_Sometimes,snack_frequency_no,alcohol_consumption_Frequently,alcohol_consumption_Sometimes,alcohol_consumption_no,travel_mode_Bike,travel_mode_Motorbike,travel_mode_Public_Transportation,travel_mode_Walking
0,0,-0.303682,-0.585617,-0.454616,1,0,-0.396265,0.0,0,0.0,...,False,True,False,False,False,True,False,False,True,False
1,0,-0.303682,-1.303425,-0.64502,1,0,0.603735,0.0,1,1.142503,...,False,True,False,False,True,False,False,False,True,False
2,1,0.025046,0.706438,-0.145209,1,0,-0.396265,0.0,0,0.0,...,False,True,False,True,False,False,False,False,True,False
3,1,0.682502,0.706438,0.092796,0,0,0.603735,0.0,0,0.0,...,False,True,False,True,False,False,False,False,False,True
4,1,-0.139318,0.562876,0.159437,0,0,-0.396265,-6.610849,0,0.0,...,False,True,False,False,True,False,False,False,True,False


In [25]:
joblib.dump(scaler, 'models/robust_scaler.pkl')

['models/robust_scaler.pkl']

In [26]:
encoded_data[num_cols].describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,2087.0,0.247445,1.046801,-1.454229,-0.481862,0.0,0.518138,6.270873
height_m,2087.0,0.007825,0.668896,-1.80589,-0.512554,0.0,0.487446,1.998493
weight_kg,2087.0,0.089433,0.623356,-1.049629,-0.407015,0.0,0.592985,2.13964
bmi,2087.0,0.07415,0.684334,-1.355677,-0.386072,0.0,0.613928,1.868867
vegetable_intake_freq,2087.0,0.025201,0.534737,-1.396265,-0.396265,0.0,0.603735,0.603735
main_meals_per_day,2087.0,-0.98773,2.527375,-6.610849,-1.0,0.0,0.0,3.305425
water_intake_liters,2087.0,0.005426,0.694967,-1.142503,-0.467374,0.0,0.532626,1.142503
physical_activity_hours,2087.0,0.008247,0.549354,-0.643668,-0.563528,0.0,0.436472,1.287335
screentime_hours,2087.0,0.032169,0.608153,-0.630866,-0.630866,0.0,0.369134,1.369134


In [27]:
encoded_data.head()

Unnamed: 0,gender,age,height_m,weight_kg,family_overweight_history,high_calorie_food,vegetable_intake_freq,main_meals_per_day,smokes,water_intake_liters,...,snack_frequency_Frequently,snack_frequency_Sometimes,snack_frequency_no,alcohol_consumption_Frequently,alcohol_consumption_Sometimes,alcohol_consumption_no,travel_mode_Bike,travel_mode_Motorbike,travel_mode_Public_Transportation,travel_mode_Walking
0,0,-0.303682,-0.585617,-0.454616,1,0,-0.396265,0.0,0,0.0,...,False,True,False,False,False,True,False,False,True,False
1,0,-0.303682,-1.303425,-0.64502,1,0,0.603735,0.0,1,1.142503,...,False,True,False,False,True,False,False,False,True,False
2,1,0.025046,0.706438,-0.145209,1,0,-0.396265,0.0,0,0.0,...,False,True,False,True,False,False,False,False,True,False
3,1,0.682502,0.706438,0.092796,0,0,0.603735,0.0,0,0.0,...,False,True,False,True,False,False,False,False,False,True
4,1,-0.139318,0.562876,0.159437,0,0,-0.396265,-6.610849,0,0.0,...,False,True,False,False,True,False,False,False,True,False


In [28]:
encoded_data.columns.tolist()

['gender',
 'age',
 'height_m',
 'weight_kg',
 'family_overweight_history',
 'high_calorie_food',
 'vegetable_intake_freq',
 'main_meals_per_day',
 'smokes',
 'water_intake_liters',
 'calorie_tracking',
 'physical_activity_hours',
 'screentime_hours',
 'obesity_level',
 'bmi',
 'snack_frequency_Frequently',
 'snack_frequency_Sometimes',
 'snack_frequency_no',
 'alcohol_consumption_Frequently',
 'alcohol_consumption_Sometimes',
 'alcohol_consumption_no',
 'travel_mode_Bike',
 'travel_mode_Motorbike',
 'travel_mode_Public_Transportation',
 'travel_mode_Walking']

In [29]:
y=encoded_data['obesity_level']
X=encoded_data.drop('obesity_level', axis=1)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [31]:
y_test.value_counts(normalize=True).round(3)

obesity_level
2    0.167
4    0.156
3    0.144
6    0.139
1    0.136
5    0.132
0    0.127
Name: proportion, dtype: float64

In [32]:
y_train.value_counts(normalize=True).round(3)

obesity_level
2    0.168
4    0.155
3    0.142
6    0.139
1    0.135
5    0.132
0    0.128
Name: proportion, dtype: float64

In [33]:
joblib.dump(X_train, 'models/X_train.pkl')
joblib.dump(X_test, 'models/X_test.pkl')
joblib.dump(y_train, 'models/y_train.pkl')
joblib.dump(y_test, 'models/y_test.pkl')

['models/y_test.pkl']

In [34]:
encoded_data.to_csv("obesity_data_preprocessed.csv", index=False)