## Entrenamiento de modelos base con parámetros por defecto

In [2]:
import pandas as pd
import mlflow
from pathlib import Path

# funciones personalizadas
from package_ml.modeling.baseline import train_baselines_with_mlflow

In [3]:
# Load the training table from the interim data directory (parquet format).
# The path is relative to the notebook's working directory.
df = pd.read_parquet('../data/interim/train_fe.parquet')

In [4]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,Age_x_Height,...,Body_Temp_Deviation,Duration_div_Body_Temp,Duration_log,Heart_Rate_log,Body_Temp_log,Weight_log,Duration_x_Heart_Rate_log,is_temp_high,is_overweight,feno_var
0,301957,False,21.0,188.0,84.0,16.0,100.0,40.4,80.0,3948.0,...,3.4,0.39604,2.833213,4.615121,3.723281,4.442651,7.378384,True,False,583.696
1,407676,False,48.0,165.0,68.0,11.0,91.0,40.2,54.0,7920.0,...,3.2,0.273632,2.484907,4.521789,3.718438,4.234107,6.909753,True,False,356.7674
2,244464,True,28.0,178.0,77.0,23.0,103.0,40.5,125.0,4984.0,...,3.5,0.567901,3.178054,4.644391,3.725693,4.356709,7.770645,True,False,709.343
3,38748,False,32.0,156.0,54.0,29.0,105.0,41.1,179.0,4992.0,...,4.1,0.705596,3.401197,4.663439,3.740048,4.007333,8.021585,True,False,1036.518
4,297351,True,24.0,172.0,75.0,6.0,77.0,39.4,9.0,4128.0,...,2.4,0.152284,1.94591,4.356709,3.69883,4.330733,6.137727,True,False,79.3992


In [5]:
# Summarize row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 65 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   User_ID                            200000 non-null  int64  
 1   Gender                             200000 non-null  bool   
 2   Age                                200000 non-null  float64
 3   Height                             200000 non-null  float64
 4   Weight                             200000 non-null  float64
 5   Duration                           200000 non-null  float64
 6   Heart_Rate                         200000 non-null  float64
 7   Body_Temp                          200000 non-null  float64
 8   Calories                           200000 non-null  float64
 9   Age_x_Height                       200000 non-null  float64
 10  Age_x_Weight                       200000 non-null  float64
 11  Age_x_Duration                     2000

In [5]:
# Collect the names of all signed-integer columns.
# All four signed widths are listed so that an int16 column (e.g. produced by
# downcasting) is not silently left out of the selection.
integer_columns = df.select_dtypes(include=['int8', 'int16', 'int32', 'int64']).columns

In [6]:
# Display the selected integer column names.
integer_columns

Index(['User_ID'], dtype='object')

In [7]:
# Column roles: the regression target and the row identifier.
TARGET = "Calories"
ID_COLUMN = "User_ID"

# Every remaining column is used as a model input, in its original order.
_excluded = (TARGET, ID_COLUMN)
FEATURES = [name for name in df.columns if name not in _excluded]

# Feature matrix and target vector.
X = df[FEATURES]
y = df[TARGET]

In [8]:
# Confirm the feature matrix dimensions as (rows, columns).
X.shape

(200000, 63)

## Pasos para los modelos base

El objetivo es entrenar rápidamente varios modelos con sus configuraciones por defecto para obtener un rendimiento base, siguiendo estos pasos:

- **Modelos a probar:** Ridge, Lasso, Random Forest, HistGradientBoostingRegressor, XGBoost, LightGBM y KNN.
- **Validación:** Usaremos una **Validación Cruzada de 5-Folds** (`cv_splits=5`) para asegurar que nuestras métricas sean estables y no dependan de una única partición de datos.
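- **Métricas:** Evaluaremos con RMSE, MAE y MedAE (error absoluto mediano). Los valores se muestran en negativo (al parecer por la convención `neg_*` de los scorers de scikit-learn); lo relevante es su magnitud.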
- **Seguimiento de Experimentos:** Utilizaremos **MLflow** para registrar los parámetros, métricas y artefactos de cada modelo, lo que nos permitirá comparar los resultados de manera organizada.

In [9]:
# Train every baseline model through the project helper and collect the
# resulting per-model metrics table.
metrics_df = train_baselines_with_mlflow(
    X,
    y,
    transform_target=True,
    cv_splits=5,  # number of cross-validation splits passed to the helper
)

--- Entrenando modelo: ridge ---
Registrando métricas para ridge: {'val_rmse': np.float64(-5.2712), 'val_mae': np.float64(-3.0989), 'val_medae': np.float64(-1.7727)}
--- Entrenando modelo: lasso ---
Registrando métricas para lasso: {'val_rmse': np.float64(-67.8641), 'val_mae': np.float64(-53.4478), 'val_medae': np.float64(-43.683)}
--- Entrenando modelo: random_forest ---
Registrando métricas para random_forest: {'val_rmse': np.float64(-3.7917), 'val_mae': np.float64(-2.2305), 'val_medae': np.float64(-1.3844)}
--- Entrenando modelo: hgb ---
Registrando métricas para hgb: {'val_rmse': np.float64(-3.7527), 'val_mae': np.float64(-2.2229), 'val_medae': np.float64(-1.3892)}
--- Entrenando modelo: xgb ---
Registrando métricas para xgb: {'val_rmse': np.float64(-3.8231), 'val_mae': np.float64(-2.3119), 'val_medae': np.float64(-1.4728)}
--- Entrenando modelo: lgbm ---
Registrando métricas para lgbm: {'val_rmse': np.float64(-3.7514), 'val_mae': np.float64(-2.2212), 'val_medae': np.float64(-1.393

In [10]:
# Show the per-model metrics table (columns: model, rmse, mae, medae).
# NOTE(review): the recorded output shows every metric as a negative number,
# presumably scikit-learn's neg_* scorer convention passed through unchanged;
# confirm in train_baselines_with_mlflow and compare models by magnitude.
metrics_df

Unnamed: 0,model,rmse,mae,medae
0,ridge,-5.2712,-3.0989,-1.7727
1,lasso,-67.8641,-53.4478,-43.683
2,random_forest,-3.7917,-2.2305,-1.3844
3,hgb,-3.7527,-2.2229,-1.3892
4,xgb,-3.8231,-2.3119,-1.4728
5,lgbm,-3.7514,-2.2212,-1.3934
6,knn,-4.5787,-2.9435,-2.0055


In [11]:
# Persist the baseline metrics table as CSV, without the row index.
# The reports directory is created first, because to_csv raises OSError when the
# parent folder does not exist (e.g. on a fresh checkout).
report_path = Path("../reports/baseline_metrics.csv")
report_path.parent.mkdir(parents=True, exist_ok=True)
metrics_df.to_csv(report_path, index=False)