In [4]:
import pandas as pd
import numpy as np
import seaborn as sns

In [5]:
# Read the raw obesity dataset CSV into a DataFrame (path is relative to this notebook)
data = pd.read_csv(filepath_or_buffer='../data/ObesityDataSet_raw.csv')

In [6]:
# Carve out the hold-out test set *before* any EDA so that nothing observed in
# the test rows can leak into later modelling decisions.
from sklearn.model_selection import train_test_split

# 20% of the rows go to the test set; stratifying on the 'NObeyesdad' target keeps
# the class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['NObeyesdad'],
)

In [7]:
# Split each partition into predictors (X_*) and the 'NObeyesdad' label (y_*).
# DataFrame.drop returns a new frame, so train_data / test_data are left intact.
X_train, y_train = train_data.drop(columns='NObeyesdad'), train_data['NObeyesdad']
X_test, y_test = test_data.drop(columns='NObeyesdad'), test_data['NObeyesdad']

In [8]:
# Build the ColumnTransformer that scales / encodes every group of input columns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder

# --- 1. Column groups (chosen from the EDA) ----------------------------------
# Numeric columns, scaled with RobustScaler.
# NOTE: 'BMI' is expected to be added upstream of this transformer (the pipeline
# has a 'feature_engineering' step in front of it) -- confirm it exists in the
# frame that reaches this step.
num_features = [
    'Age', 'Height', 'Weight', 'BMI',
    'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
]

# Unordered categorical column -> one-hot encoded
nom_features = ['MTRANS']

# Ordered categorical columns -> integer codes that follow the rank given below
ord_features = ['CAEC', 'CALC']

# Two-level columns: one-hot encoded with drop='if_binary', i.e. one 0/1 column each
binary_cols = ['Gender', 'FAVC', 'SMOKE', 'SCC', 'family_history_with_overweight']

# --- 2. Explicit rank for the ordinal columns (lowest -> highest) --------------
# Listed in the same order as ord_features: first CAEC, then CALC.
caec_order = ['no', 'Sometimes', 'Frequently', 'Always']
calc_order = ['no', 'Sometimes', 'Frequently', 'Always']

# --- 3. The ColumnTransformer ------------------------------------------------
# Columns not named in any group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        # robust scaling of the numeric features
        ('num', RobustScaler(), num_features),
        # categories unseen during fit are encoded as all-zeros instead of raising
        ('nom', OneHotEncoder(handle_unknown='ignore'), nom_features),
        # ordinal codes follow caec_order / calc_order
        ('ord', OrdinalEncoder(categories=[caec_order, calc_order]), ord_features),
        # a single indicator column per binary feature
        ('bin', OneHotEncoder(drop='if_binary'), binary_cols),
    ]
)

In [9]:
# Assemble and fit the end-to-end pipeline:
#   BMI feature engineering -> preprocessing (scale / encode) -> logistic regression

import sys
import os

# Make the project root (one level above the notebook) importable so `src` resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Now you can import from src
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.preprocessing import bmi_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# 1. Create the pipeline
# random_state is ignored by the default 'lbfgs' solver (so this baseline fit is
# unchanged) but it seeds the data shuffling of the stochastic 'saga' solver,
# which the hyperparameter grid later in this notebook also tries. Without it
# the search results are not reproducible. 42 matches the seed used for the
# train/test split.
full_pipeline = Pipeline(steps=[
    ('feature_engineering', bmi_transformer),  # Adds BMI column first
    ('preprocessor', preprocessor),           # Scales and encodes (including new BMI)
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))   # Trains the model
])

# 2. Fit the pipeline
# Every step is fitted on the training data only, then the classifier is trained.
full_pipeline.fit(X_train, y_train)

# Learned coefficients of the logistic regression: one row per class, one column
# per feature produced by the preprocessor.
weights = full_pipeline.named_steps['classifier'].coef_
print(weights)

[[-8.37171497e-01  1.74332093e+00 -7.49921736e+00 -1.00097650e+01
   2.57830929e-01 -1.15924455e-02 -4.98465208e-03  3.06001541e-01
   4.98579778e-02  4.75502098e-01 -4.13271880e-01 -8.53246597e-02
   1.29580698e-01  8.82275545e-02  7.18441732e-01 -2.25251992e-01
  -4.63645743e-01 -7.02997628e-01 -5.44894154e-01 -2.37438896e-01
  -9.03796305e-01]
 [-3.47423059e-01  5.11083018e-01 -4.15634853e+00 -5.33938464e+00
  -3.45008655e-01 -6.43334560e-02 -2.49254931e-01  5.84498232e-01
  -1.57407366e-01 -3.88996799e-01  3.54584784e-01  3.38631229e-01
  -8.70902003e-01  6.02896970e-01  1.03926780e+00 -1.29157626e-01
   3.87996084e-01 -4.74059886e-01  5.89899180e-01  1.86878363e-02
  -8.86463214e-01]
 [-1.17595120e-01 -9.78435476e-01  2.64611340e+00  3.85873119e+00
  -8.33751328e-01 -1.37884131e-01  2.91409042e-01  3.94508985e-01
   1.37207446e-01  2.62714156e-01 -1.82948260e-02  6.83598058e-01
  -4.58943448e-01 -5.32008941e-01  2.05831689e-02 -3.90992623e-01
  -4.00744237e-01  1.19871302e+00  7.7

In [10]:
# Persist the fitted baseline pipeline to disk
import os
import joblib

# joblib.dump does not create missing parent folders; make sure ../models exists
# (e.g. on a fresh checkout) instead of failing with FileNotFoundError.
os.makedirs('../models', exist_ok=True)

# NOTE(review): the pickle references the project's bmi_transformer, so loading it
# presumably requires `src` to be importable in the loading process -- confirm.
joblib.dump(full_pipeline, '../models/obesity_classifier_pipeline.pkl')

['../models/obesity_classifier_pipeline.pkl']

In [None]:
# Hyperparameter search space for the logistic-regression step of the pipeline.
# Each key is '<pipeline step name>__<parameter name>', so 'classifier__C' targets
# the C parameter of the 'classifier' step. Every combination of the listed values
# is tried (4 x 2 x 2 = 16 candidates) and the best one is selected.
param_grid = dict(
    # C: regularization parameter of logistic regression (inverse strength --
    # smaller values regularize more)
    classifier__C=[0.1, 1.0, 10.0, 100.0],
    # upper bound on optimizer iterations
    classifier__max_iter=[1000, 2000],
    # solver: the algorithm used to optimize the model
    # ('saga' is often better for large datasets)
    classifier__solver=['lbfgs', 'saga'],
)

In [12]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid: every candidate is evaluated with 5-fold
# cross-validation on the training data and ranked by mean accuracy.
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

# Only the training split is used here; the test set stays untouched.
grid_search.fit(X_train, y_train)

# Mean cross-validated accuracy of the winning candidate, and its settings
print(f"Best Score: {grid_search.best_score_:.4f}")
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best Score: 0.9550
Best Parameters: {'classifier__C': 100.0, 'classifier__max_iter': 1000, 'classifier__solver': 'lbfgs'}


In [13]:
# The pipeline refitted on the whole training set with the best parameters found
best_model = grid_search.best_estimator_

# Evaluate once on the hold-out test set created at the top of the notebook.
# For a classifier, .score() reports accuracy.
test_accuracy = best_model.score(X=X_test, y=y_test)
print(f"Final Test Accuracy: {test_accuracy:.4f}")

Final Test Accuracy: 0.9622


In [14]:
# Persist the tuned pipeline (best estimator from the grid search) to disk
import os
import joblib

# joblib.dump does not create missing parent folders; make sure ../models exists
# (e.g. on a fresh checkout) instead of failing with FileNotFoundError.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/obesity_classifier_v2_optimized.pkl')

['../models/obesity_classifier_v2_optimized.pkl']