# 1. Import Libraries and Load Environment

In [7]:
import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from pipeline_utils import CustomRowTransformer
from dotenv import load_dotenv
from benchmarks import ml_benchmarks, custom_ml_benchmarks, optimize_xgboost
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import os
# Load variables from a .env file (python-dotenv) into the process environment,
# then read the settings this notebook declares. Any variable that is not set
# comes back as None.
# NOTE(review): CSV_FOLDER and CSV_TRAIN_FILENAME are not referenced by the
# cells visible in this notebook - confirm whether they are still needed.
load_dotenv()
ML_PREPROC_FILENAME, CSV_FOLDER, CSV_TRAIN_FILENAME = (
    os.environ.get(env_key)
    for env_key in ("ML_PREPROC_FILENAME", "CSV_FOLDER", "CSV_TRAIN_FILENAME")
)

# 2. Load Data and Preprocess PKL

In [8]:
# Paths
# Fail fast with an actionable message: without this check an unset variable
# reaches os.path.join as None and surfaces as an opaque TypeError.
if not ML_PREPROC_FILENAME:
    raise RuntimeError(
        "ML_PREPROC_FILENAME is not set; define it in the environment or in the .env file."
    )

# Inputs are resolved relative to the parent of the current working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# NOTE(review): the CSV location is hard-coded here although CSV_FOLDER and
# CSV_TRAIN_FILENAME are read from the environment - confirm which is intended.
csv_path = os.path.join(parent_dir, ".kaggle", "train.csv")
print(f'CSV path: {csv_path}')
pre_pkl_path = os.path.join(parent_dir, "modeling", "pkl", ML_PREPROC_FILENAME)
print(f'Preprocessing pipeline path: {pre_pkl_path}')

# Load raw data and preprocessing pipeline
df = pd.read_csv(csv_path)
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load pipeline files produced by this project.
preprocessing_pipeline = joblib.load(pre_pkl_path)


CSV path: c:\Users\Fernando\VSC\python\dev\p7-1\multi-class_prediction_obesity_risk\data\.kaggle\train.csv
Preprocessing pipeline path: c:\Users\Fernando\VSC\python\dev\p7-1\multi-class_prediction_obesity_risk\data\modeling\pkl\preprocessing_pipeline.pkl


# 3. APPLY PIPELINE AND SPLIT FEATURES AND TARGET

In [9]:
# Run the raw frame through the previously loaded preprocessing pipeline.
df_prepared = preprocessing_pipeline.transform(df)

# The target column, plus three columns that are excluded from the feature set.
target_column = "NObeyesdad"
excluded_columns = [target_column, "SMOKE", "MTRANS", "id"]

# Target vector first, then the feature matrix without the excluded columns.
y = df_prepared[target_column]
X = df_prepared.drop(columns=excluded_columns)



# 4. OBTAIN MODELS BENCHMARK

In [10]:
# Run the benchmark helper imported from the local `benchmarks` module on the
# prepared features and target. The call is kept as the cell's last expression
# so that its return value is displayed below the cell.
ml_benchmarks(X, y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1120
[LightGBM] [Info] Number of data points in the train set: 16606, number of used features: 15
[LightGBM] [Info] Start training from score -2.107657
[LightGBM] [Info] Start training from score -1.907572
[LightGBM] [Info] Start training from score -1.964755
[LightGBM] [Info] Start training from score -1.855022
[LightGBM] [Info] Start training from score -1.635117
[LightGBM] [Info] Start training from score -2.146046
[LightGBM] [Info] Start training from score -2.107657




+---------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+
|        Model        |      Accuracy      |     Precision      |       Recall       |      F1-Score      | Train Time (s) | Overfitting |
+---------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+
|       XGBoost       | 0.9072736030828517 | 0.9070431550288031 | 0.9072736030828517 | 0.9070520800655331 |     2.4346     |   0.0769    |
|      CatBoost       | 0.9060693641618497 | 0.9055571752237769 | 0.9060693641618497 | 0.9057671655185464 |    35.4323     |   0.0483    |
|      LightGBM       | 0.9051059730250481 | 0.904767060264294  | 0.9051059730250481 | 0.904858615859683  |     4.7564     |   0.0711    |
|    Random Forest    | 0.9036608863198459 | 0.9029543954604532 | 0.9036608863198459 | 0.9031310719897029 |     3.8831     |   0.0962    |
|      SVM (RBF)      | 0.8

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Train Time (s),Overfitting
7,XGBoost,0.907274,0.907043,0.907274,0.907052,2.4346,0.0769
9,CatBoost,0.906069,0.905557,0.906069,0.905767,35.4323,0.0483
8,LightGBM,0.905106,0.904767,0.905106,0.904859,4.7564,0.0711
2,Random Forest,0.903661,0.902954,0.903661,0.903131,3.8831,0.0962
4,SVM (RBF),0.883189,0.88223,0.883189,0.882603,5.1034,0.0137
3,SVM (Linear),0.869701,0.868421,0.869701,0.868777,5.1625,-0.0029
0,Logistic Regression,0.863921,0.862367,0.863921,0.862864,0.7557,-0.0013
1,Decision Tree,0.8408,0.840408,0.8408,0.84038,0.1659,0.1591
6,KNN,0.793593,0.790631,0.793593,0.790766,0.2451,0.0574
5,Naive Bayes,0.77553,0.770976,0.77553,0.76914,0.026,-0.0024


# 5. TRY CUSTOM MANUAL PIPELINES
As seen in the benchmark, XGBoost is the most accurate model, while Logistic Regression is a fast linear model with almost no overfitting.
We will now build a custom pipeline for each of them to see whether we can get better results.

In [11]:
# Hand-tuned candidate pipelines, keyed by the display name used in the
# benchmark table. Both standardize the features before the classifier.
custom_pipelines = {
    'XGBoost (Ajustado)': Pipeline([
        ('scaler', StandardScaler()),
        # `use_label_encoder` is intentionally not passed: XGBoost reports it as
        # an unused parameter and emits a warning on every fit.
        ('clf', XGBClassifier(
            n_estimators=150,
            max_depth=4,
            learning_rate=0.07,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_alpha=0.5,
            reg_lambda=0.7,
            eval_metric='mlogloss',
            random_state=42
        ))
    ]),

    'Logistic Regression (Ajustado)': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            C=20.0,  # Raised from 1.0 to weaken the regularization
            penalty='l2',
            class_weight='balanced',
            max_iter=1000,
            solver='lbfgs',
            random_state=42
        ))
    ])
}

In [12]:
# Benchmark only the hand-tuned pipelines held in `custom_pipelines`, keeping
# the helper's return value in `results_df`.
results_df = custom_ml_benchmarks(X, y, models_dict=custom_pipelines)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


+--------------------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+
|             Model              |      Accuracy      |     Precision      |       Recall       |      F1-Score      | Train Time (s) | Overfitting |
+--------------------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+
|       XGBoost (Ajustado)       | 0.9053468208092486 | 0.9048472242502588 | 0.9053468208092486 | 0.9049623394462658 |     5.4682     |   0.0097    |
| Logistic Regression (Ajustado) | 0.8639210019267822 | 0.8639142688637425 | 0.8639210019267822 | 0.8633476011203041 |     0.7708     |   0.0002    |
+--------------------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+


XGBoost remains the best-performing model, so next we tune its hyperparameters with Optuna.

# 6. XGBOOST CUSTOM PIPELINE

In [16]:
# Run the `optimize_xgboost` helper (local `benchmarks` module) for 30 trials.
# It returns a pair, unpacked here as the resulting pipeline and the study.
# NOTE(review): no seed is passed at this call site - confirm whether the helper
# fixes one internally, otherwise the selected hyperparameters may differ
# between runs.
optuna_pipeline, optuna_study = optimize_xgboost(X, y, n_trials=30)

[I 2025-05-21 13:59:19,853] A new study created in memory with name: no-name-82b8634a-f0c0-460e-a241-fe3cbd1d352a
[I 2025-05-21 13:59:29,511] Trial 0 finished with value: 0.9007049882912807 and parameters: {'n_estimators': 211, 'max_depth': 9, 'learning_rate': 0.07399121925637306, 'subsample': 0.9058122997069449, 'colsample_bytree': 0.8100488489552895, 'gamma': 1.8003221785109669, 'min_child_weight': 9}. Best is trial 0 with value: 0.9007049882912807.
[I 2025-05-21 13:59:39,094] Trial 1 finished with value: 0.9053019281303364 and parameters: {'n_estimators': 295, 'max_depth': 3, 'learning_rate': 0.18983693315245503, 'subsample': 0.7935140258818527, 'colsample_bytree': 0.7360484533439278, 'gamma': 0.6132051992890641, 'min_child_weight': 9}. Best is trial 1 with value: 0.9053019281303364.
[I 2025-05-21 13:59:41,337] Trial 2 finished with value: 0.9008526024331038 and parameters: {'n_estimators': 67, 'max_depth': 11, 'learning_rate': 0.2244016875576874, 'subsample': 0.6282429965597576, 'c

+------------------+-------------------+-------------------+-------------------+--------------------+----------------+-------------+
|      Model       |     Accuracy      |     Precision     |      Recall       |      F1-Score      | Train Time (s) | Overfitting |
+------------------+-------------------+-------------------+-------------------+--------------------+----------------+-------------+
| XGBoost + Optuna | 0.911849710982659 | 0.911707052980141 | 0.911849710982659 | 0.9115821436962183 |     4.1328     |   0.0185    |
+------------------+-------------------+-------------------+-------------------+--------------------+----------------+-------------+


# 7. PREDICT WITH THE TUNED MODEL

In [18]:
# Class predictions and class probabilities from the tuned pipeline.
# NOTE(review): both are computed on the full X, which presumably includes the
# rows the pipeline was fitted on, so they are in-sample - confirm before using
# them for evaluation. Neither result is used by the cells visible here.
y_pred = optuna_pipeline.predict(X)
y_proba = optuna_pipeline.predict_proba(X)


# 8. EXPORT MODEL

In [19]:

# Persist the tuned pipeline to ./pkl/xgboost_optuna_pipeline.pkl, resolved
# against the current working directory.
current_dir = os.getcwd()
model_pkl_filename = "xgboost_optuna_pipeline.pkl"
model_pkl_path = os.path.join(current_dir, "pkl",model_pkl_filename)

# joblib.dump does not create missing directories; make sure the target folder
# exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(model_pkl_path), exist_ok=True)

joblib.dump(optuna_pipeline, model_pkl_path)

print(f"✔️ Modelo guardado en: {model_pkl_path}")

✔️ Modelo guardado en: c:\Users\Fernando\VSC\python\dev\p7-1\multi-class_prediction_obesity_risk\data\modeling\pkl\xgboost_optuna_pipeline.pkl
