In [21]:
import pandas as pd
import numpy as np
import mlflow

# uv add xgboost
from xgboost import XGBClassifier
# uv add lightgbm
from lightgbm import LGBMClassifier
# uv add catboost
from catboost import CatBoostClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score, f1_score

2025/07/22 13:39:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/07/22 13:39:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.


In [None]:
# Importa la librería dagshub para integrar el seguimiento de experimentos con DagsHub
import dagshub

# Inicializa la integración con DagsHub, especificando el propietario y nombre del repositorio,
# y habilita la integración con MLflow para registrar experimentos en DagsHub
dagshub.init(
  repo_owner='edynsoncoronado',
  repo_name='ml_supervisado_avanzado',
  mlflow=True
)

# Importa la librería mlflow para el seguimiento de experimentos de machine learning
import mlflow

# Inicia una nueva ejecución de MLflow
with mlflow.start_run():
  # Registra un parámetro personalizado en el experimento de MLflow
  mlflow.log_param('parameter name', 'value')
  # Registra una métrica personalizada en el experimento de MLflow
  mlflow.log_metric('metric name', 1)

🏃 View run wise-calf-996 at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/0/runs/1abf4ae6f9ca4e3787b2785892bbeaec
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/0


In [5]:
df = pd.read_csv("../data/raw/hotel_bookings.csv")
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [7]:
FEATURES = ["lead_time", "stays_in_week_nights", "children", "adr", "booking_changes" ]
X, y = df[FEATURES], df["is_canceled"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

In [None]:
# Establece la URI de seguimiento de MLflow para que apunte al servidor remoto de DagsHub,
# permitiendo así registrar y visualizar experimentos de MLflow en esa plataforma.
mlflow.set_tracking_uri("https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow")

In [None]:
# Crea un nuevo experimento en MLflow llamado "hotel_bookings_experiment"
mlflow.create_experiment("hotel_bookings_experiment")

'1'

In [None]:
# Establece el experimento actual de MLflow en "hotel_bookings_experiment"
mlflow.set_experiment("hotel_bookings_experiment")

<Experiment: artifact_location='mlflow-artifacts:/b4797be1712a4b8a913e111c78516646', creation_time=1753205160284, experiment_id='1', last_update_time=1753205160284, lifecycle_stage='active', name='hotel_bookings_experiment', tags={}>

# Baseline

In [None]:
# Activa el registro automático de parámetros, métricas y artefactos de modelos compatibles con MLflow
mlflow.autolog()

# Inicia una nueva ejecución de MLflow con el nombre especificado
with mlflow.start_run(run_name="Baseline - Dummy Classifier - Con métricas") as run:

    # Crea una instancia del clasificador Dummy usando la estrategia "most_frequent"
    algorithm = DummyClassifier(strategy="most_frequent")
    # Ajusta el clasificador Dummy con los datos de entrenamiento
    algorithm.fit(X_train, y_train)

    # Realiza predicciones sobre el conjunto de prueba
    predictions = algorithm.predict(X_test)

    # Calcula la métrica de exactitud (accuracy) usando las etiquetas verdaderas y las predichas
    _accuracy_score = accuracy_score(y_test, predictions)
    # Calcula la métrica F1 usando las etiquetas verdaderas y las predichas
    _f1_score = f1_score(y_test, predictions)
    
    # Registra las métricas calculadas en MLflow
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
    )

2025/07/22 12:29:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


🏃 View run Baseline - Dummy Classifier - Con métricas at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1/runs/258b84a471be4b6ea4ca6a278c2bf835
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1


# Regresión Logística

In [None]:
# Inicia una nueva ejecución de MLflow con el nombre "Regresión logistica"
with mlflow.start_run(run_name="Regresión logistica") as run:

    # Crea una instancia del modelo de Regresión Logística
    algorithm = LogisticRegression()
    # Define un pipeline que primero imputa valores faltantes y luego aplica la regresión logística
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),  # Imputa valores faltantes usando la media
            ("reg_logistica", algorithm)                  # Aplica el modelo de regresión logística
        ]
    )
    # Ajusta el pipeline con los datos de entrenamiento
    pipeline.fit(X_train, y_train)

    # Realiza predicciones sobre el conjunto de prueba
    predictions = pipeline.predict(X_test)

    # Calcula la métrica de exactitud (accuracy) usando las etiquetas verdaderas y las predichas
    _accuracy_score = accuracy_score(y_test, predictions)
    # Calcula la métrica F1 usando las etiquetas verdaderas y las predichas
    _f1_score = f1_score(y_test, predictions)
    
    # Registra las métricas calculadas en MLflow
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
    )





🏃 View run Regresión logistica at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1/runs/d1e91a41508c4c0aa8f9fc8c01d2f120
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1


# Ensamble 1: Bagging

In [None]:
# Inicia una nueva ejecución de MLflow con el nombre "Bagging"
with mlflow.start_run(run_name="Bagging") as run:

    # Crea una instancia del clasificador Bagging
    algorithm = BaggingClassifier()
    # Define un pipeline que primero imputa valores faltantes y luego aplica el clasificador Bagging
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),  # Imputa valores faltantes usando la media
            ("bagging", algorithm)                        # Aplica el modelo Bagging
        ]
    )
    # Ajusta el pipeline con los datos de entrenamiento
    pipeline.fit(X_train, y_train)

    # Realiza predicciones sobre el conjunto de prueba
    predictions = pipeline.predict(X_test)

    # Calcula la métrica de exactitud (accuracy) usando las etiquetas verdaderas y las predichas
    _accuracy_score = accuracy_score(y_test, predictions)
    # Calcula la métrica F1 usando las etiquetas verdaderas y las predichas
    _f1_score = f1_score(y_test, predictions)
    
    # Registra las métricas calculadas en MLflow
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
    )



🏃 View run Bagging at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1/runs/7b2a52e56b4a498a89919a3d5ced0f2e
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1


# Ensamble 2: Random Forest

In [None]:
# Inicia una nueva ejecución de MLflow con el nombre "Random Forest"
with mlflow.start_run(run_name="Random Forest") as run:

    # Crea una instancia del clasificador Random Forest
    algorithm = RandomForestClassifier()
    # Define un pipeline que primero imputa valores faltantes y luego aplica el clasificador Random Forest
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),  # Imputa valores faltantes usando la media
            ("rf", algorithm)                             # Aplica el modelo Random Forest
        ]
    )
    # Ajusta el pipeline con los datos de entrenamiento
    pipeline.fit(X_train, y_train)

    # Realiza predicciones sobre el conjunto de prueba
    predictions = pipeline.predict(X_test)

    # Calcula la métrica de exactitud (accuracy) usando las etiquetas verdaderas y las predichas
    _accuracy_score = accuracy_score(y_test, predictions)
    # Calcula la métrica F1 usando las etiquetas verdaderas y las predichas
    _f1_score = f1_score(y_test, predictions)
    
    # Registra las métricas calculadas en MLflow
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
    )




🏃 View run Random Forest at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1/runs/ec5ca5e9a2f24d9dad64ff6cb86acec8
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1


# Ensamble 3: XGboost

In [None]:
# Inicia una nueva ejecución de MLflow con el nombre "XGboost"
with mlflow.start_run(run_name="XGboost") as run:

    # Crea una instancia del clasificador XGBoost con profundidad máxima de 5 y 101 árboles
    algorithm = XGBClassifier(
        max_depth=5,         # Establece la profundidad máxima de cada árbol en 5
        n_estimators=101     # Establece el número de árboles en el modelo en 101
    )
    # Define un pipeline que primero imputa valores faltantes y luego aplica el clasificador XGBoost
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),  # Imputa valores faltantes usando la media
            ("xgb", algorithm)                            # Aplica el modelo XGBoost
        ]
    )
    # Ajusta el pipeline con los datos de entrenamiento
    pipeline.fit(X_train, y_train)

    # Realiza predicciones sobre el conjunto de prueba
    predictions = pipeline.predict(X_test)

    # Calcula la métrica de exactitud (accuracy) usando las etiquetas verdaderas y las predichas
    _accuracy_score = accuracy_score(y_test, predictions)
    # Calcula la métrica F1 usando las etiquetas verdaderas y las predichas
    _f1_score = f1_score(y_test, predictions)
    
    # Registra las métricas calculadas en MLflow
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
    )



🏃 View run XGboost at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1/runs/1a971708734d4528aad107a92a44725f
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1


# Ensamble 4: LGBM

In [None]:
# Inicia una nueva ejecución de MLflow con el nombre "LGBM"
with mlflow.start_run(run_name="LGBM") as run:
    
    # Crea una instancia del clasificador LightGBM
    algorithm = LGBMClassifier()
    
    # Define un pipeline que primero imputa valores faltantes y luego aplica el clasificador LightGBM
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),  # Imputa valores faltantes usando la media
            ("lgbm", algorithm)                           # Aplica el modelo LightGBM
        ]
    )
    
    # Ajusta el pipeline con los datos de entrenamiento
    pipeline.fit(X_train, y_train)
    
    # Realiza predicciones sobre el conjunto de prueba
    predictions = pipeline.predict(X_test)
    
    # Calcula la métrica de exactitud (accuracy) usando las etiquetas verdaderas y las predichas
    _accuracy_score = accuracy_score(y_test, predictions)
    # Calcula la métrica F1 usando las etiquetas verdaderas y las predichas
    _f1_score = f1_score(y_test, predictions)
    
    # Registra las métricas calculadas en MLflow
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }   
    )



[LightGBM] [Info] Number of positive: 33167, number of negative: 56375
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 555
[LightGBM] [Info] Number of data points in the train set: 89542, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370407 -> initscore=-0.530470
[LightGBM] [Info] Start training from score -0.530470




🏃 View run LGBM at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1/runs/96d82616a85e4fb798c0e885e8ad5695
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1


# Ensamble 5: Catboost

In [None]:
# Inicia una nueva ejecución de MLflow con el nombre "CatBoost"
with mlflow.start_run(run_name="CatBoost") as run:

    # Crea una instancia del clasificador CatBoost
    algorithm = CatBoostClassifier()
    
    # Define un pipeline que primero imputa valores faltantes y luego aplica el clasificador CatBoost
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),  # Imputa valores faltantes usando la media
            ("catboost", algorithm)                       # Aplica el modelo CatBoost
        ]
    )
    
    # Ajusta el pipeline con los datos de entrenamiento
    pipeline.fit(X_train, y_train)

    # Realiza predicciones sobre el conjunto de prueba
    predictions = pipeline.predict(X_test)

    # Calcula la métrica de exactitud (accuracy) usando las etiquetas verdaderas y las predichas
    _accuracy_score = accuracy_score(y_test, predictions)
    # Calcula la métrica F1 usando las etiquetas verdaderas y las predichas
    _f1_score = f1_score(y_test, predictions)
    
    # Registra las métricas calculadas en MLflow
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }        
    )



Learning rate set to 0.070218
0:	learn: 0.6738091	total: 52.5ms	remaining: 52.5s
1:	learn: 0.6577196	total: 57.3ms	remaining: 28.6s
2:	learn: 0.6435688	total: 63.7ms	remaining: 21.2s
3:	learn: 0.6321461	total: 71.1ms	remaining: 17.7s
4:	learn: 0.6222567	total: 76.5ms	remaining: 15.2s
5:	learn: 0.6132893	total: 83ms	remaining: 13.7s
6:	learn: 0.6055982	total: 89.5ms	remaining: 12.7s
7:	learn: 0.5995170	total: 96.2ms	remaining: 11.9s
8:	learn: 0.5937286	total: 105ms	remaining: 11.5s
9:	learn: 0.5890019	total: 116ms	remaining: 11.4s
10:	learn: 0.5846239	total: 121ms	remaining: 10.9s
11:	learn: 0.5811667	total: 128ms	remaining: 10.5s
12:	learn: 0.5778450	total: 135ms	remaining: 10.3s
13:	learn: 0.5752667	total: 142ms	remaining: 9.98s
14:	learn: 0.5728444	total: 149ms	remaining: 9.81s
15:	learn: 0.5706692	total: 155ms	remaining: 9.56s
16:	learn: 0.5687932	total: 163ms	remaining: 9.4s
17:	learn: 0.5673718	total: 168ms	remaining: 9.18s
18:	learn: 0.5658373	total: 175ms	remaining: 9.03s
19:	le



🏃 View run CatBoost at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1/runs/8b546e07ce8b4964acefe81bfb27e228
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1


# Ensamble 7: Stacking

In [None]:
# Inicia una nueva ejecución de MLflow con el nombre "Ensamble de Pilas"
with mlflow.start_run(run_name="Ensamble de Pilas") as run:
    
    # Crea una instancia del clasificador CatBoost
    algorithm1 = CatBoostClassifier()
    # Crea una instancia del clasificador XGBoost
    algorithm2 = XGBClassifier()
    # Crea una instancia del clasificador LightGBM
    algorithm3 = LGBMClassifier()

    # Define el clasificador de ensamble Stacking, usando los tres modelos anteriores como base
    # y RandomForestClassifier como estimador final
    stacking_clf = StackingClassifier(
        estimators=[
            ("catboost", algorithm1),  # Primer modelo base: CatBoost
            ("xgb", algorithm2),       # Segundo modelo base: XGBoost
            ("lgbm", algorithm3),      # Tercer modelo base: LightGBM
        ],
        final_estimator=RandomForestClassifier()  # Modelo final: Random Forest
    )
    # Define un pipeline que primero imputa valores faltantes y luego aplica el clasificador Stacking
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),  # Imputa valores faltantes usando la media
            ("staking", stacking_clf)                     # Aplica el modelo Stacking
        ]
    )
    # Ajusta el pipeline con los datos de entrenamiento
    pipeline.fit(X_train, y_train)

    # Realiza predicciones sobre el conjunto de prueba
    predictions = pipeline.predict(X_test)

    # Calcula la métrica de exactitud (accuracy) usando las etiquetas verdaderas y las predichas
    _accuracy_score = accuracy_score(y_test, predictions)
    # Calcula la métrica F1 usando las etiquetas verdaderas y las predichas
    _f1_score = f1_score(y_test, predictions)
    
    # Registra las métricas calculadas en MLflow
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }   
    )



Learning rate set to 0.070218
0:	learn: 0.6738091	total: 7.09ms	remaining: 7.08s
1:	learn: 0.6577196	total: 11.8ms	remaining: 5.88s
2:	learn: 0.6435688	total: 18.6ms	remaining: 6.17s
3:	learn: 0.6321461	total: 24.2ms	remaining: 6.02s
4:	learn: 0.6222567	total: 30.4ms	remaining: 6.04s
5:	learn: 0.6132893	total: 38ms	remaining: 6.29s
6:	learn: 0.6055982	total: 43.3ms	remaining: 6.14s
7:	learn: 0.5995170	total: 50.1ms	remaining: 6.22s
8:	learn: 0.5937286	total: 55.7ms	remaining: 6.13s
9:	learn: 0.5890019	total: 61.9ms	remaining: 6.13s
10:	learn: 0.5846239	total: 66.9ms	remaining: 6.02s
11:	learn: 0.5811667	total: 72.4ms	remaining: 5.96s
12:	learn: 0.5778450	total: 78.6ms	remaining: 5.97s
13:	learn: 0.5752667	total: 86.6ms	remaining: 6.1s
14:	learn: 0.5728444	total: 94ms	remaining: 6.17s
15:	learn: 0.5706692	total: 102ms	remaining: 6.26s
16:	learn: 0.5687932	total: 108ms	remaining: 6.26s
17:	learn: 0.5673718	total: 115ms	remaining: 6.27s
18:	learn: 0.5658373	total: 121ms	remaining: 6.25s
1



[LightGBM] [Info] Number of positive: 26534, number of negative: 45100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 554
[LightGBM] [Info] Number of data points in the train set: 71634, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370411 -> initscore=-0.530455
[LightGBM] [Info] Start training from score -0.530455
[LightGBM] [Info] Number of positive: 26534, number of negative: 45100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 555
[LightGBM] [Info] Number of data points in the train set: 71634, number of used features: 5
[LightGBM] [Info] [bin



[LightGBM] [Info] Number of positive: 26534, number of negative: 45100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 554
[LightGBM] [Info] Number of data points in the train set: 71634, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370411 -> initscore=-0.530455
[LightGBM] [Info] Start training from score -0.530455




🏃 View run Ensamble de Pilas at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1/runs/5bb522c498db429ca1c517a3780b7733
🧪 View experiment at: https://dagshub.com/edynsoncoronado/ml_supervisado_avanzado.mlflow/#/experiments/1
