# 1. Importaciones iniciales

In [1]:
import os

import joblib
import pandas as pd
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from pipeline_utils import CustomRowTransformer
from benchmarks import ml_benchmarks

import os
# Load variables from a local .env file (python-dotenv) into the environment.
load_dotenv()

# Names read from the environment; os.getenv returns None when a variable is unset.
ML_PREPROC_FILENAME = os.getenv("ML_PREPROC_FILENAME")
CSV_FOLDER = os.getenv("CSV_FOLDER")
CSV_TRAIN_FILENAME = os.getenv("CSV_TRAIN_FILENAME")

# ML_PREPROC_FILENAME is joined into a filesystem path in the loading cell.
# Fail here with a clear message instead of an opaque TypeError from
# os.path.join when the variable is missing or empty.
if not ML_PREPROC_FILENAME:
    raise RuntimeError(
        "Environment variable ML_PREPROC_FILENAME is not set; "
        "define it in the environment or in the .env file."
    )

✅ Preprocessing pipeline saved to: c:\Users\Fernando\VSC\python\dev\p7-1\multi-class_prediction_obesity_risk\data\modeling\pkl\preprocessing_pipeline.pkl


# 2. Cargar datos y pipeline

In [2]:
# Input locations are resolved relative to the parent of the current working
# directory, so the result depends on where the kernel was started.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Raw training CSV.
csv_path = os.path.join(parent_dir, ".kaggle", "train.csv")
print(f"CSV path: {csv_path}")

# Pickled preprocessing pipeline; its file name comes from the environment.
pre_pkl_path = os.path.join(parent_dir, "modeling", "pkl", ML_PREPROC_FILENAME)
print(f"Preprocessing pipeline path: {pre_pkl_path}")

# Read the raw data, then restore the preprocessing pipeline.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
df = pd.read_csv(csv_path)
preprocessing_pipeline = joblib.load(pre_pkl_path)


CSV path: c:\Users\Fernando\VSC\python\dev\p7-1\multi-class_prediction_obesity_risk\data\.kaggle\train.csv
Preprocessing pipeline path: c:\Users\Fernando\VSC\python\dev\p7-1\multi-class_prediction_obesity_risk\data\modeling\pkl\preprocessing_pipeline.pkl


# 3. Aplicar el pipeline y separar variables

In [3]:
# Run the raw frame through the loaded preprocessing pipeline.
df_prepared = preprocessing_pipeline.transform(df)

# Target vector first, then the feature matrix: the target and the other
# excluded columns are removed from the prepared frame.
y = df_prepared["NObeyesdad"]
X = df_prepared.drop(["NObeyesdad", "SMOKE", "MTRANS", "id"], axis="columns")

# Benchmark the candidate models on the prepared data.
ml_benchmarks(X, y)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001999 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1120
[LightGBM] [Info] Number of data points in the train set: 16606, number of used features: 15
[LightGBM] [Info] Start training from score -2.107657
[LightGBM] [Info] Start training from score -1.907572
[LightGBM] [Info] Start training from score -1.964755
[LightGBM] [Info] Start training from score -1.855022
[LightGBM] [Info] Start training from score -1.635117
[LightGBM] [Info] Start training from score -2.146046
[LightGBM] [Info] Start training from score -2.107657




+---------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+
|        Model        |      Accuracy      |     Precision      |       Recall       |      F1-Score      | Train Time (s) | Overfitting |
+---------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------------+
|       XGBoost       | 0.9072736030828517 | 0.9070431550288031 | 0.9072736030828517 | 0.9070520800655331 |     2.8927     |   0.0769    |
|      CatBoost       | 0.9060693641618497 | 0.9055571752237769 | 0.9060693641618497 | 0.9057671655185464 |     46.65      |   0.0483    |
|      LightGBM       | 0.9051059730250481 | 0.904767060264294  | 0.9051059730250481 | 0.904858615859683  |     7.4812     |   0.0711    |
|    Random Forest    | 0.9034200385356455 | 0.9027256941629342 | 0.9034200385356455 | 0.9028778844140601 |     4.7996     |   0.0965    |
|      SVM (RBF)      | 0.8

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Train Time (s),Overfitting
7,XGBoost,0.907274,0.907043,0.907274,0.907052,2.8927,0.0769
9,CatBoost,0.906069,0.905557,0.906069,0.905767,46.65,0.0483
8,LightGBM,0.905106,0.904767,0.905106,0.904859,7.4812,0.0711
2,Random Forest,0.90342,0.902726,0.90342,0.902878,4.7996,0.0965
4,SVM (RBF),0.883189,0.88223,0.883189,0.882603,14.7101,0.0137
3,SVM (Linear),0.869701,0.868421,0.869701,0.868777,9.1787,-0.0029
0,Logistic Regression,0.863921,0.862367,0.863921,0.862864,0.5756,-0.0013
1,Decision Tree,0.840318,0.840522,0.840318,0.840271,0.212,0.1596
6,KNN,0.793593,0.790631,0.793593,0.790766,0.2354,0.0574
5,Naive Bayes,0.77553,0.770976,0.77553,0.76914,0.0309,-0.0024


# 6. Evaluar el modelo

In [None]:
# `model`, `X_test` and `y_test` are not defined by any earlier cell, so this
# cell builds them itself and can run on a fresh kernel (Restart & Run All).

# Hold out 20% of the prepared data for evaluation. A fixed seed keeps the
# split reproducible, and stratifying on y keeps the class proportions the
# same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Same estimator configuration as the full pipeline at the end of the notebook.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out partition.
y_pred = model.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# 7. Guardar el modelo

In [None]:
# Persist the trained model in the modeling/pkl folder under the parent directory.
pkl_dir = os.path.join(parent_dir, "modeling", "pkl")
model_path = os.path.join(pkl_dir, "trained_model.pkl")

joblib.dump(model, model_path)
print(f"✅ Model saved to: {model_path}")


# 8. Pipeline completo: preprocesamiento más modelo

In [None]:
# Columns produced by the preprocessing step that must not reach the
# classifier: the target itself plus the columns that the feature matrix X
# excludes earlier in this notebook.
NON_FEATURE_COLUMNS = ["NObeyesdad", "SMOKE", "MTRANS", "id"]

full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing_pipeline),
    # Without this step the classifier would be fitted on the preprocessing
    # output as-is, which still contains the target column (label leakage)
    # and would not match the feature set used for the benchmarks.
    ("select_features", ColumnTransformer(
        transformers=[("non_features", "drop", NON_FEATURE_COLUMNS)],
        remainder="passthrough",
    )),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Train directly on the raw frame; the target is passed separately as y.
# NOTE(review): Pipeline.fit re-fits `preprocessing_pipeline` in place, and y
# here is the raw column from `df` rather than the one in `df_prepared` —
# confirm both are intended.
full_pipeline.fit(df, df["NObeyesdad"])

# Save the whole pipeline (preprocessing + model) as a single artifact, which
# is the convenient form for production use. The path is relative to the
# current working directory.
joblib.dump(full_pipeline, "full_pipeline_model.pkl")
