<style>
#title {
    text-align: center;
    font-family: Helvetica, Sans-serif;
}
#assign_name {
    text-align: center;
    font-size: 30px;
    font-weight: 200
}
#info {
    text-aling: left;
    font-size: 20px;
    font-weight: 200
}

</style>

<h1 id='title'>Trabajo en Clase 4</h1> 
<p id='assign_name'>DataBricks</p>
<p id='info'>Por Sofía Maldonado García</p>
<p id='info'>Curso: Proyecto de Ciencia de Datos</p>
<p id='info'>Profesor: Cristian Zapata</p>

***

In [13]:
# Imports

# Generales
import os, pickle
import pandas as pd
from dotenv import load_dotenv
import math
import pathlib

# Modelado
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
import optuna
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature

# Registro
import mlflow
from mlflow.tracking import MlflowClient

In [2]:
# Experiment Setup
load_dotenv(override=True)
EXPERIMENT_NAME = "/Users/alfonso.maldonado@iteso.mx/tarea4"

mlflow.set_tracking_uri('databricks')
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

In [3]:
# Helper functions

def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

In [None]:
# Reading datasets
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-02.parquet')

In [None]:
# Feature Engineering
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

X_val = preprocess(df_val, dv)

In [6]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

y_train = y_train.astype(int)
y_val = y_val.astype(int)

In [None]:
# Train Test Split for MLFlow register
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

In [None]:
# DMatrix, specific for XGBoost
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

# XGBoost

In [33]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna
#    - Recibe un `trial`, que se usa para proponer hiperparámetros.
#    - Entrena un modelo con esos hiperparámetros.
#    - Calcula la métrica de validación (RMSE) y la retorna (Optuna la minimizará).
#    - Abrimos un run anidado de MLflow para registrar cada trial.
# ------------------------------------------------------------
def objective(trial: optuna.trial.Trial):
    # Hiperparámetros MUESTREADOS por Optuna en CADA trial.
    # Nota: usamos log=True para emular rangos log-uniformes (similar a loguniform).
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 100),
        "learning_rate": trial.suggest_float("learning_rate", math.exp(-3), 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha",   math.exp(-5), math.exp(-1), log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", math.exp(-6), math.exp(-1), log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", math.exp(-1), math.exp(3), log=True),
        "objective": "reg:squarederror",  
        "seed": 42,                      
    }

    # Run anidado para dejar rastro de cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "xgboost")  # etiqueta informativa
        mlflow.log_params(params)                  # registra hiperparámetros del trial

        # Entrenamiento con early stopping en el conjunto de validación
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=10,
        )

        # Predicción y métrica en validación
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)

        # La "signature" describe la estructura esperada de entrada y salida del modelo:
        # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
        # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
        signature = infer_signature(X_val, y_pred)

        # Guardar el modelo del trial como artefacto en MLflow.
        mlflow.xgboost.log_model(
            booster,
            name="model",
            input_example=X_val[:5],
            signature=signature
        )

    # Optuna minimiza el valor retornado
    return rmse

In [34]:
mlflow.xgboost.autolog(log_models=False)

# ------------------------------------------------------------
# Crear el estudio de Optuna
#    - Usamos TPE (Tree-structured Parzen Estimator) como sampler.
#    - direction="minimize" porque queremos minimizar el RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar la optimización (n_trials = número de intentos)
#    - Cada trial ejecuta la función objetivo con un set distinto de hiperparámetros.
#    - Abrimos un run "padre" para agrupar toda la búsqueda.
# ------------------------------------------------------------
with mlflow.start_run(run_name="XGBoost Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    # --------------------------------------------------------
    # Recuperar y registrar los mejores hiperparámetros
    # --------------------------------------------------------
    best_params = study.best_params
    # Asegurar tipos/campos fijos (por claridad y consistencia)
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Etiquetas del run "padre" (metadatos del experimento)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "xgboost",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Entrenar un modelo FINAL con los mejores hiperparámetros
    #    (normalmente se haría sobre train+val o con CV; aquí mantenemos el patrón original)
    # --------------------------------------------------------
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=10,
    )

    # Evaluar y registrar la métrica final en validación
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Guardar artefactos adicionales (p. ej. el preprocesador)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # La "signature" describe la estructura esperada de entrada y salida del modelo:
    # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
    # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
    # Si X_val es la matriz dispersa (scipy.sparse) salida de DictVectorizer:
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)

    # Para que las longitudes coincidan, usa el mismo slice en y_pred
    signature = infer_signature(input_example, y_val[:5])

    # Guardar el modelo del trial como artefacto en MLflow.
    mlflow.xgboost.log_model(
        booster,
        name="model",
        input_example=input_example,
        signature=signature
    )

[I 2025-10-23 21:24:46,819] A new study created in memory with name: no-name-3ca1134f-c4e5-442e-8fed-3a52a1beb573


[0]	validation-rmse:5.72656
[1]	validation-rmse:5.58439
[2]	validation-rmse:5.57299
[3]	validation-rmse:5.56482
[4]	validation-rmse:5.56776
[5]	validation-rmse:5.55068
[6]	validation-rmse:5.55101
[7]	validation-rmse:5.54669
[8]	validation-rmse:5.54169
[9]	validation-rmse:5.54048
[10]	validation-rmse:5.53883
[11]	validation-rmse:5.53449
[12]	validation-rmse:5.52788
[13]	validation-rmse:5.52992
[14]	validation-rmse:5.52920
[15]	validation-rmse:5.52853
[16]	validation-rmse:5.53104
[17]	validation-rmse:5.52973
[18]	validation-rmse:5.53397
[19]	validation-rmse:5.53149
[20]	validation-rmse:5.53267
[21]	validation-rmse:5.53209
[22]	validation-rmse:5.53033


  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
2025/10/23 21:25:06 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-23 21:25:08,880] Trial 0 finished with value: 5.530333518981934 and parameters: {'max_depth': 40, 'learning_rate': 0.8625543817410922, 'reg_alpha': 0.12593061066249622, 'reg_lambda': 0.049454235173237264, 'min_child_weight': 0.6866535292359801}. Best is trial 0 with value: 5.530333518981934.


🏃 View run thoughtful-gnu-807 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/a7fe6117457746a594c31cba2d39ae13
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:8.78545
[1]	validation-rmse:8.48231
[2]	validation-rmse:8.20381
[3]	validation-rmse:7.94875
[4]	validation-rmse:7.71546
[5]	validation-rmse:7.50285
[6]	validation-rmse:7.30884
[7]	validation-rmse:7.13210
[8]	validation-rmse:6.97156
[9]	validation-rmse:6.82518
[10]	validation-rmse:6.69288
[11]	validation-rmse:6.57310
[12]	validation-rmse:6.46433
[13]	validation-rmse:6.36676
[14]	validation-rmse:6.27804
[15]	validation-rmse:6.19811
[16]	validation-rmse:6.12594
[17]	validation-rmse:6.06081
[18]	validation-rmse:6.00146
[19]	validation-rmse:5.94837
[20]	validation-rmse:5.90055
[21]	validation-rmse:5.85766
[22]	validation-rmse:5.81854
[23]	validation-rmse:5.78373
[24]	validation-rmse:5.75230
[25]	validation-rmse:5.72422
[26]

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-23 21:25:32,292] Trial 1 finished with value: 5.411676406860352 and parameters: {'max_depth': 19, 'learning_rate': 0.059264241587996896, 'reg_alpha': 0.21539205131792016, 'reg_lambda': 0.05006540936006931, 'min_child_weight': 6.248180561354165}. Best is trial 1 with value: 5.411676406860352.


🏃 View run clumsy-shad-911 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/aaf202cc21664684b2e8a4edef2b2ca7
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:5.85284
[1]	validation-rmse:5.74153
[2]	validation-rmse:5.72191
[3]	validation-rmse:5.71815
[4]	validation-rmse:5.70875
[5]	validation-rmse:5.69156
[6]	validation-rmse:5.69268
[7]	validation-rmse:5.68229
[8]	validation-rmse:5.68117
[9]	validation-rmse:5.67906
[10]	validation-rmse:5.68228
[11]	validation-rmse:5.67907
[12]	validation-rmse:5.67993
[13]	validation-rmse:5.66435
[14]	validation-rmse:5.65793
[15]	validation-rmse:5.65795
[16]	validation-rmse:5.65432
[17]	validation-rmse:5.65379
[18]	validation-rmse:5.65003
[19]	validation-rmse:5.65146
[20]	validation-rmse:5.64805
[21]	validation-rmse:5.64348
[22]	validation-rmse:5.64541
[23]	validation-rmse:5.64543
[24]	validation-rmse:5.63990
[25]	validation-rmse:5.64013
[26]	va

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-23 21:25:45,083] Trial 2 finished with value: 5.627442359924316 and parameters: {'max_depth': 5, 'learning_rate': 0.9136840519292247, 'reg_alpha': 0.18820387978911576, 'reg_lambda': 0.007166739666045858, 'min_child_weight': 0.7613210498541186}. Best is trial 1 with value: 5.411676406860352.


🏃 View run orderly-fawn-976 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/9aab21c533f94b59a48e9b24dbe4641c
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:8.43062
[1]	validation-rmse:7.86480
[2]	validation-rmse:7.40080
[3]	validation-rmse:7.01964
[4]	validation-rmse:6.71097
[5]	validation-rmse:6.45949
[6]	validation-rmse:6.25872
[7]	validation-rmse:6.09855
[8]	validation-rmse:5.96924
[9]	validation-rmse:5.86577
[10]	validation-rmse:5.78101
[11]	validation-rmse:5.71375
[12]	validation-rmse:5.66051
[13]	validation-rmse:5.61409
[14]	validation-rmse:5.57951
[15]	validation-rmse:5.55150
[16]	validation-rmse:5.52699
[17]	validation-rmse:5.50770
[18]	validation-rmse:5.48937
[19]	validation-rmse:5.47562
[20]	validation-rmse:5.46385
[21]	validation-rmse:5.45381
[22]	validation-rmse:5.44558
[23]	validation-rmse:5.43793
[24]	validation-rmse:5.43060
[25]	validation-rmse:5.42614
[26]	v

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-23 21:26:08,058] Trial 3 finished with value: 5.368428707122803 and parameters: {'max_depth': 21, 'learning_rate': 0.12402485733085497, 'reg_alpha': 0.054969638498598095, 'reg_lambda': 0.02148769342025257, 'min_child_weight': 1.1792947151892554}. Best is trial 3 with value: 5.368428707122803.


🏃 View run ambitious-rook-131 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/07e44b127dc44e5e92a0a45365453232
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:8.67773
[1]	validation-rmse:8.28559
[2]	validation-rmse:7.93525
[3]	validation-rmse:7.62306
[4]	validation-rmse:7.34544
[5]	validation-rmse:7.09909
[6]	validation-rmse:6.88057
[7]	validation-rmse:6.68879
[8]	validation-rmse:6.52039
[9]	validation-rmse:6.37274
[10]	validation-rmse:6.24296
[11]	validation-rmse:6.12967
[12]	validation-rmse:6.03055
[13]	validation-rmse:5.94407
[14]	validation-rmse:5.86815
[15]	validation-rmse:5.80252
[16]	validation-rmse:5.74580
[17]	validation-rmse:5.69646
[18]	validation-rmse:5.65399
[19]	validation-rmse:5.61702
[20]	validation-rmse:5.58592
[21]	validation-rmse:5.55833
[22]	validation-rmse:5.53410
[23]	validation-rmse:5.51314
[24]	validation-rmse:5.49465
[25]	validation-rmse:5.47910
[26]

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-23 21:27:27,180] Trial 4 finished with value: 5.354063510894775 and parameters: {'max_depth': 63, 'learning_rate': 0.07565903471570516, 'reg_alpha': 0.021678779375600917, 'reg_lambda': 0.015480241912324163, 'min_child_weight': 2.2802382585441565}. Best is trial 4 with value: 5.354063510894775.


🏃 View run carefree-auk-702 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/7b53d5f6b81f4bb1841b4ff04bf3757c
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:8.59758
[1]	validation-rmse:8.14813
[2]	validation-rmse:7.76409
[3]	validation-rmse:7.42131
[4]	validation-rmse:7.13737
[5]	validation-rmse:6.87898
[6]	validation-rmse:6.67518
[7]	validation-rmse:6.49377
[8]	validation-rmse:6.34135
[9]	validation-rmse:6.21602
[10]	validation-rmse:6.09792
[11]	validation-rmse:6.00785
[12]	validation-rmse:5.93015
[13]	validation-rmse:5.85688
[14]	validation-rmse:5.80358
[15]	validation-rmse:5.75459
[16]	validation-rmse:5.71567
[17]	validation-rmse:5.67958
[18]	validation-rmse:5.65087
[19]	validation-rmse:5.63134
[20]	validation-rmse:5.60976
[21]	validation-rmse:5.59042
[22]	validation-rmse:5.57561
[23]	validation-rmse:5.56327
[24]	validation-rmse:5.55452
[25]	validation-rmse:5.54581
[26]	v

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-23 21:29:21,698] Trial 5 finished with value: 5.478622913360596 and parameters: {'max_depth': 80, 'learning_rate': 0.0906292152736207, 'reg_alpha': 0.05270408847118816, 'reg_lambda': 0.04793414660944966, 'min_child_weight': 0.4429943118354462}. Best is trial 4 with value: 5.354063510894775.


🏃 View run charming-elk-264 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/40396b4612e549858361b8a63dd44427
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:8.66710
[1]	validation-rmse:8.27068
[2]	validation-rmse:7.92063
[3]	validation-rmse:7.61250
[4]	validation-rmse:7.34223
[5]	validation-rmse:7.10618
[6]	validation-rmse:6.90005
[7]	validation-rmse:6.72107
[8]	validation-rmse:6.56568
[9]	validation-rmse:6.43133
[10]	validation-rmse:6.31471
[11]	validation-rmse:6.21426
[12]	validation-rmse:6.12804
[13]	validation-rmse:6.05348
[14]	validation-rmse:5.98960
[15]	validation-rmse:5.93435
[16]	validation-rmse:5.88727
[17]	validation-rmse:5.84625
[18]	validation-rmse:5.81111
[19]	validation-rmse:5.78005
[20]	validation-rmse:5.75243
[21]	validation-rmse:5.72872
[22]	validation-rmse:5.70817
[23]	validation-rmse:5.69071
[24]	validation-rmse:5.67579
[25]	validation-rmse:5.66220
[26]	v

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)


🏃 View run lyrical-ram-27 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/6592dde55d454abab1d5e86716162fcf
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092


[I 2025-10-23 21:30:01,166] Trial 6 finished with value: 5.492410659790039 and parameters: {'max_depth': 62, 'learning_rate': 0.08304043435235499, 'reg_alpha': 0.008740449782948887, 'reg_lambda': 0.28491274207986833, 'min_child_weight': 17.505727836123448}. Best is trial 4 with value: 5.354063510894775.


[0]	validation-rmse:8.40400
[1]	validation-rmse:7.81407
[2]	validation-rmse:7.32968
[3]	validation-rmse:6.93360
[4]	validation-rmse:6.61360
[5]	validation-rmse:6.35591
[6]	validation-rmse:6.14981
[7]	validation-rmse:5.98496
[8]	validation-rmse:5.85496
[9]	validation-rmse:5.75267
[10]	validation-rmse:5.67211
[11]	validation-rmse:5.61064
[12]	validation-rmse:5.56253
[13]	validation-rmse:5.52496
[14]	validation-rmse:5.49490
[15]	validation-rmse:5.47065
[16]	validation-rmse:5.45372
[17]	validation-rmse:5.43980
[18]	validation-rmse:5.42974
[19]	validation-rmse:5.42061
[20]	validation-rmse:5.41280
[21]	validation-rmse:5.40667
[22]	validation-rmse:5.40317
[23]	validation-rmse:5.39994
[24]	validation-rmse:5.39552
[25]	validation-rmse:5.39185
[26]	validation-rmse:5.38799
[27]	validation-rmse:5.38586
[28]	validation-rmse:5.38381
[29]	validation-rmse:5.38123
[30]	validation-rmse:5.38016
[31]	validation-rmse:5.37826
[32]	validation-rmse:5.37796
[33]	validation-rmse:5.37719
[34]	validation-rmse:5.3

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-23 21:30:56,025] Trial 7 finished with value: 5.363897800445557 and parameters: {'max_depth': 82, 'learning_rate': 0.12416316985362412, 'reg_alpha': 0.009958672056108932, 'reg_lambda': 0.0758623422350637, 'min_child_weight': 2.1395809133199974}. Best is trial 4 with value: 5.354063510894775.


🏃 View run shivering-hawk-251 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/7d3b0bba11f94614a30aeb4ddfd91567
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:7.94171
[1]	validation-rmse:7.13023
[2]	validation-rmse:6.57708
[3]	validation-rmse:6.21578
[4]	validation-rmse:5.96666
[5]	validation-rmse:5.81041
[6]	validation-rmse:5.70225
[7]	validation-rmse:5.63239
[8]	validation-rmse:5.58031
[9]	validation-rmse:5.54201
[10]	validation-rmse:5.51907
[11]	validation-rmse:5.49963
[12]	validation-rmse:5.48439
[13]	validation-rmse:5.47310
[14]	validation-rmse:5.46843
[15]	validation-rmse:5.46460
[16]	validation-rmse:5.46138
[17]	validation-rmse:5.45751
[18]	validation-rmse:5.45421
[19]	validation-rmse:5.45231
[20]	validation-rmse:5.45025
[21]	validation-rmse:5.44851
[22]	validation-rmse:5.44642
[23]	validation-rmse:5.44442
[24]	validation-rmse:5.44292
[25]	validation-rmse:5.44204
[26]

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-23 21:31:13,035] Trial 8 finished with value: 5.373172760009766 and parameters: {'max_depth': 15, 'learning_rate': 0.21992487468175848, 'reg_alpha': 0.007731550026907306, 'reg_lambda': 0.23377457337376373, 'min_child_weight': 1.0357439143907545}. Best is trial 4 with value: 5.354063510894775.


🏃 View run bemused-cod-928 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/484928a826214f3da282445c7d62198f
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:8.40239
[1]	validation-rmse:7.82514
[2]	validation-rmse:7.34145
[3]	validation-rmse:6.96367
[4]	validation-rmse:6.64958
[5]	validation-rmse:6.40909
[6]	validation-rmse:6.21376
[7]	validation-rmse:6.06256
[8]	validation-rmse:5.93165
[9]	validation-rmse:5.83895
[10]	validation-rmse:5.76462
[11]	validation-rmse:5.70713
[12]	validation-rmse:5.66507
[13]	validation-rmse:5.61864
[14]	validation-rmse:5.59231
[15]	validation-rmse:5.56479
[16]	validation-rmse:5.54825
[17]	validation-rmse:5.53445
[18]	validation-rmse:5.52012
[19]	validation-rmse:5.50930
[20]	validation-rmse:5.50311
[21]	validation-rmse:5.49321
[22]	validation-rmse:5.49187
[23]	validation-rmse:5.48592
[24]	validation-rmse:5.48312
[25]	validation-rmse:5.47968
[26]	va

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-23 21:31:54,677] Trial 9 finished with value: 5.453731536865234 and parameters: {'max_depth': 68, 'learning_rate': 0.1268351874747755, 'reg_alpha': 0.05394836382863035, 'reg_lambda': 0.03814164293595655, 'min_child_weight': 0.7706028272535065}. Best is trial 4 with value: 5.354063510894775.


🏃 View run overjoyed-cod-946 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/6288ab5e6d484f99993bad9b1b4e1693
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092
[0]	validation-rmse:8.67773
[1]	validation-rmse:8.28559
[2]	validation-rmse:7.93525
[3]	validation-rmse:7.62306
[4]	validation-rmse:7.34544
[5]	validation-rmse:7.09909
[6]	validation-rmse:6.88057
[7]	validation-rmse:6.68879
[8]	validation-rmse:6.52039
[9]	validation-rmse:6.37274
[10]	validation-rmse:6.24296
[11]	validation-rmse:6.12967
[12]	validation-rmse:6.03055
[13]	validation-rmse:5.94407
[14]	validation-rmse:5.86815
[15]	validation-rmse:5.80252
[16]	validation-rmse:5.74580
[17]	validation-rmse:5.69646
[18]	validation-rmse:5.65399
[19]	validation-rmse:5.61702
[20]	validation-rmse:5.58592
[21]	validation-rmse:5.55833
[22]	validation-rmse:5.53410
[23]	validation-rmse:5.51314
[24]	validation-rmse:5.49465
[25]	validation-rmse:5.47910
[26]	

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)


🏃 View run XGBoost Hyperparameter Optimization (Optuna) at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/f14976ba44d143aba188baa05b50fec9
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092


# Random Forest Classifier

In [10]:
# Función Objective
def objective(trial: optuna.trial.Trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 4, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "random_state": 42,
    }

    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "random_forest")
        mlflow.log_params(params)

        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        mlflow.log_metric("rmse", rmse)

        signature = infer_signature(X_val, y_pred)
        mlflow.sklearn.log_model(model, "model", input_example=X_val[:5], signature=signature)

    return rmse


# Optuna
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar optimización
# ------------------------------------------------------------
with mlflow.start_run(run_name="Random Forest Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    best_params = study.best_params
    best_params["random_state"] = 42

    mlflow.log_params(best_params)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "random_forest",
        "feature_set_version": 1,
    })

    model = RandomForestRegressor(**best_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)
    signature = infer_signature(input_example, y_val[:5])

    mlflow.sklearn.log_model(model, "model", input_example=input_example, signature=signature)


[I 2025-10-28 09:16:17,802] A new study created in memory with name: no-name-fcf3729c-7cb5-4e44-9a52-d58135e1664e


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 09:16:41 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 09:17:07,624] Trial 0 finished with value: 6.831953886442933 and parameters: {'n_estimators': 144, 'max_depth': 48, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 6.831953886442933.


🏃 View run sassy-steed-883 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/565178b1fa8b4bc2ba9f02ab38fa398a
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:17:15,754] Trial 1 finished with value: 8.715106688964338 and parameters: {'n_estimators': 227, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 6.831953886442933.


🏃 View run smiling-perch-634 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/b4f52c75f97f4083808ba59a62d21efe
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:18:11,219] Trial 2 finished with value: 5.550846624369101 and parameters: {'n_estimators': 158, 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': True}. Best is trial 2 with value: 5.550846624369101.


🏃 View run bustling-wolf-126 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/ebd4083bb4d243a8bb8ad3b7da90b40c
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:20:41,098] Trial 3 finished with value: 5.481507532979925 and parameters: {'n_estimators': 179, 'max_depth': 31, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': None, 'bootstrap': True}. Best is trial 3 with value: 5.481507532979925.


🏃 View run hilarious-finch-231 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/fd4f243227e745d4b591d06f0f6de8e5
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:20:49,434] Trial 4 finished with value: 8.959464902409657 and parameters: {'n_estimators': 126, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': True}. Best is trial 3 with value: 5.481507532979925.


🏃 View run trusting-snail-103 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/d1c493bfbf714a49872714bb4bfe50f3
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:21:04,670] Trial 5 finished with value: 8.87897957794121 and parameters: {'n_estimators': 216, 'max_depth': 18, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': True}. Best is trial 3 with value: 5.481507532979925.


🏃 View run upset-moose-132 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/9a1d7d5474b0494f937ea62a1a333c25
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:27:50,200] Trial 6 finished with value: 5.7830727676521345 and parameters: {'n_estimators': 200, 'max_depth': 47, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': False}. Best is trial 3 with value: 5.481507532979925.


🏃 View run omniscient-wasp-105 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/97aee3e17443460fabba40f26c4e25fc
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:28:41,953] Trial 7 finished with value: 5.552422845045161 and parameters: {'n_estimators': 139, 'max_depth': 17, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': True}. Best is trial 3 with value: 5.481507532979925.


🏃 View run peaceful-horse-74 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/49535242340b4be8aac9177f66669b8f
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:29:04,114] Trial 8 finished with value: 7.188647654709836 and parameters: {'n_estimators': 51, 'max_depth': 42, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 3 with value: 5.481507532979925.


🏃 View run industrious-cow-365 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/128fdf61e8e44259973b3bbd119da7ec
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-10-28 09:29:20,789] Trial 9 finished with value: 8.809825871700163 and parameters: {'n_estimators': 206, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': True}. Best is trial 3 with value: 5.481507532979925.


🏃 View run orderly-bee-376 at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/56074b71569f44d0a97abb5a52d3bbbb
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



🏃 View run Random Forest Hyperparameter Optimization (Optuna) at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092/runs/b19bda000f31485eaad828da61d0cabe
🧪 View experiment at: https://dbc-03802296-e2f6.cloud.databricks.com/ml/experiments/3962753005528092


In [None]:
# Encontrando el Champion y Challenger
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    order_by=["metrics.rmse ASC"],
    output_format="list"
)

# Obtener el mejor run
if len(runs) > 0:
    best_run = runs[0]
    challenger = runs[1]
    print("🏆 Champion Run encontrado:")
    print(f"Run ID: {best_run.info.run_id}")
    print(f"RMSE: {best_run.data.metrics['rmse']}")
    print(f"Params: {best_run.data.params}")
    print()
    print("🥈 Challenger Encontrado")
    print(f"Run ID: {challenger.info.run_id}")
    print(f"RMSE: {challenger.data.metrics['rmse']}")
    print(f"Params: {challenger.data.params}")
else:
    print("⚠️ No se encontraron runs con métrica RMSE.")


🏆 Champion Run encontrado:
Run ID: 7b53d5f6b81f4bb1841b4ff04bf3757c
RMSE: 5.354063510894775
Params: {'custom_metric': 'None', 'early_stopping_rounds': '10', 'learning_rate': '0.07565903471570516', 'max_depth': '63', 'maximize': 'None', 'min_child_weight': '2.2802382585441565', 'num_boost_round': '100', 'objective': 'reg:squarederror', 'reg_alpha': '0.021678779375600917', 'reg_lambda': '0.015480241912324163', 'seed': '42', 'verbose_eval': 'True'}

🥈 Challenger Encontrado
Run ID: f14976ba44d143aba188baa05b50fec9
RMSE: 5.354063510894775
Params: {'custom_metric': 'None', 'early_stopping_rounds': '10', 'learning_rate': '0.07565903471570516', 'max_depth': '63', 'maximize': 'None', 'min_child_weight': '2.2802382585441565', 'num_boost_round': '100', 'objective': 'reg:squarederror', 'reg_alpha': '0.021678779375600917', 'reg_lambda': '0.015480241912324163', 'seed': '42', 'verbose_eval': 'True'}


# Probando con nueva data

In [None]:
CHANPION_RUN = '7b53d5f6b81f4bb1841b4ff04bf3757c'
CHALLENGER_RUN = 'fd4f243227e745d4b591d06f0f6de8e5' # Diferente al de arriba porque el challenger "verdadero" es otro XGBoost, pero queremos probar con Random Forest tambiéns

In [16]:
champ_uri = f"runs:/{CHANPION_RUN}/model"
chall_uri = f'runs:/{CHALLENGER_RUN}/model'

champ_model = mlflow.pyfunc.load_model(champ_uri)
chall_model = mlflow.pyfunc.load_model(chall_uri)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [17]:
df_reval = read_dataframe('../data/green_tripdata_2025-03.parquet')

In [29]:
df_reval['PU_DO'] = df_reval['PULocationID'] + '_' + df_reval['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']

with open("preprocessor/preprocessor.b", "rb") as f:
    dv = pickle.load(f)

reval_dicts = df_reval[categorical + numerical].to_dict(orient='records')
X_reval = preprocess(df_reval, dv)

In [19]:
target = 'duration'
y_reval = df_reval[target].values

y_train = y_train.astype(int)

In [20]:
reval_dataset = mlflow.data.from_numpy(X_reval.data, targets=y_reval, name="green_tripdata_2025-03")

In [26]:
reval = xgb.DMatrix(X_reval, label=y_reval)

In [30]:
y_champ_preds = champ_model.predict(X_reval)
y_chall_preds = chall_model.predict(X_reval)

In [None]:
rmse_champ = root_mean_squared_error(y_reval, y_champ_preds)
rmse_chall = root_mean_squared_error(y_reval, y_chall_preds)

print(rmse_champ, rmse_chall)

5.970067173875601 6.111736225419588


El modelo challenger sigue siendo el mejor, incluso con la nueva data