# **Librerías**

In [1]:
# Cargar librerías
import os, mlflow
from dotenv import load_dotenv
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
import math
import optuna
import pathlib
import xgboost as xgb
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature
from mlflow import MlflowClient
import mlflow

# **Iniciar el experimento**
Vamos a empezar designando el experimento en donde se correrán los procesos:

In [None]:
# Load the variables defined in the .env file; override=True lets them replace
# values that are already present in the environment.
load_dotenv(override=True)
# MLflow experiment under which every run in this notebook is recorded.
EXPERIMENT_NAME = "/Users/pipochatgpt@gmail.com/nyc-taxi-experiments"

# Use Databricks as the tracking backend.
# NOTE(review): credentials are presumably supplied through the .env file loaded above — confirm.
mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

2025/10/28 15:46:06 INFO mlflow.tracking.fluent: Experiment with name '/Users/pipochatgpt@gmail.com/nyc-taxi-experiments2' does not exist. Creating a new experiment.


# **Pre-Procesamiento**
Esta es la parte del código que se encarga de preparar los datos para los modelos y de su pre-procesamiento:

In [3]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (minutes between pickup and drop-off), keeps only
    the trips that last between 1 and 60 minutes (both inclusive) and casts the
    pickup / drop-off location IDs to strings.

    Args:
        filename: Path of the parquet file, passed straight to ``pd.read_parquet``.

    Returns:
        A new DataFrame containing only the filtered trips.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the full frame, so the column
    # assignment below writes to an independent DataFrame rather than to a
    # slice (the chained-assignment / SettingWithCopyWarning situation).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# The 2025-01 file is loaded as the training frame and the 2025-02 file as the validation frame.
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-02.parquet')

In [4]:
def preprocess(df, dv):
    """Turn a trips DataFrame into the feature matrix produced by ``dv``.

    Adds a ``PU_DO`` column to ``df`` in place (pickup and drop-off IDs joined
    with an underscore), converts the ``PU_DO`` and ``trip_distance`` columns
    to a list of per-row dicts and passes them to ``dv.transform``.

    Args:
        df: DataFrame with ``PULocationID``, ``DOLocationID`` and ``trip_distance``.
        dv: Object exposing ``transform``; it is only transformed with, never fitted here.

    Returns:
        Whatever ``dv.transform`` returns for the row records.
    """
    feature_columns = ['PU_DO', 'trip_distance']

    # Note: this writes the new column onto the caller's DataFrame.
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)

# Build the combined pickup/drop-off feature on the training frame
# (the same expression preprocess() applies to the frame it receives).
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

# The vectorizer is fitted on the training records only.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# The validation frame goes through preprocess(), which only calls dv.transform.
X_val = preprocess(df_val, dv)

In [5]:
# Target column: 'duration', computed in minutes by read_dataframe().
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

# **Gradient Boost**
Tanto para Gradient Boosting como para Random Forest utilizaremos scikit-learn para los modelos y Optuna para el tuneo de hiperparámetros, con 3 trials (runs anidados) cada uno:

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

def objective_gb(trial: optuna.trial.Trial):
    """Optuna objective: train one GradientBoostingRegressor and return its validation RMSE.

    Samples a hyperparameter set from ``trial``, fits the model on the notebook-level
    ``X_train`` / ``y_train``, scores it on ``X_val`` / ``y_val`` and records the
    parameters, the metric and the fitted model in a nested MLflow run.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RMSE on the validation data (the value the study minimizes).
    """
    # Hyperparameters sampled by Optuna on every trial.
    # learning_rate uses log=True, i.e. it is sampled from a log-uniform range.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "random_state": 42
    }

    # Nested run so that every trial leaves its own trace in MLflow.
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "gradient_boosting")
        mlflow.log_params(params)

        # Train and evaluate.
        model = GradientBoostingRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Main metric of the trial.
        mlflow.log_metric("rmse", rmse)

        # Build the input example once and infer the signature from that same
        # DataFrame, so the signature carries the feature names and agrees with
        # the example that is logged (the final-model cell does the same).
        input_example = pd.DataFrame(X_val[:5].toarray(), columns=dv.get_feature_names_out())
        signature = infer_signature(input_example, y_pred[:5])

        # Store the trial's model as an MLflow artifact.
        mlflow.sklearn.log_model(
            model,
            name="model",
            input_example=input_example,
            signature=signature
        )

    # Optuna minimizes the returned value.
    return rmse

In [7]:
# scikit-learn autologging with automatic model logging switched off;
# the models are logged explicitly through mlflow.sklearn.log_model.
mlflow.sklearn.autolog(log_models=False)

# ------------------------------------------------------------
# Create the Optuna study
#    - TPE (Tree-structured Parzen Estimator) sampler with a fixed seed.
#    - direction="minimize" because the objective returns the RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study_gb = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Run the optimization (n_trials = number of attempts)
#    - Each trial calls the objective with a different hyperparameter set.
#    - A parent run groups the whole search. It is opened WITHOUT nested=True:
#      with nested=True a run left active by an earlier cell would silently
#      become this run's parent, and the registry cell (which keeps only runs
#      without a parentRunId tag) would then skip it.
# ------------------------------------------------------------
with mlflow.start_run(run_name="Gradient Boosting Hyperparameter Optimization (Optuna)"):
    study_gb.optimize(objective_gb, n_trials=3)

    # Recover and log the best hyperparameters.
    best_params = study_gb.best_params
    best_params["random_state"] = 42

    mlflow.log_params(best_params)
    # Tags of the parent run (experiment metadata).
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "gradient_boosting",
        "feature_set_version": 1,
    })

    # Train a FINAL model with the best hyperparameters
    # (this would normally use train+val or CV; the original pattern is kept).
    model = GradientBoostingRegressor(**best_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Store the fitted DictVectorizer as an additional artifact.
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # X_val is the sparse matrix produced by DictVectorizer, so a few rows are
    # densified into a named-column DataFrame that serves both as the input
    # example and as the basis for the model signature.
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=dv.get_feature_names_out())
    signature = infer_signature(input_example, y_val[:5])

    # Log the final model as an MLflow artifact.
    mlflow.sklearn.log_model(model, name="model", input_example=input_example, signature=signature)

[I 2025-10-28 15:46:36,356] A new study created in memory with name: no-name-5f16c7c2-f852-4717-920e-8350f3ce4422


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 15:48:06 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 15:48:14,004] Trial 0 finished with value: 5.423033869318215 and parameters: {'n_estimators': 69, 'learning_rate': 0.2536999076681772, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 0 with value: 5.423033869318215.


🏃 View run fortunate-carp-712 at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1624726551223670/runs/3af391bab4fd45d9bc2d30e338677760
🧪 View experiment at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1624726551223670




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 15:49:13 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 15:49:20,983] Trial 1 finished with value: 6.667930901959873 and parameters: {'n_estimators': 57, 'learning_rate': 0.012184186502221764, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 0 with value: 5.423033869318215.


🏃 View run capricious-cat-106 at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1624726551223670/runs/20392144c48b420f8c8a92d790b04492
🧪 View experiment at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1624726551223670




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 15:49:58 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 15:50:04,710] Trial 2 finished with value: 5.418235873807679 and parameters: {'n_estimators': 51, 'learning_rate': 0.2708160864249968, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 2 with value: 5.418235873807679.


🏃 View run salty-sheep-743 at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1624726551223670/runs/f85b4b65d57b4fe391806d48859f50af
🧪 View experiment at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1624726551223670




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 15:50:45 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run Gradient Boosting Hyperparameter Optimization (Optuna) at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1624726551223670/runs/f16f4fb3eac84a8f9625b40a512cff89
🧪 View experiment at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1624726551223670


# **Random Forest**

In [25]:
from sklearn.ensemble import RandomForestRegressor

# scikit-learn autologging with automatic model logging switched off; the models
# in this section are logged explicitly through mlflow.sklearn.log_model.
mlflow.sklearn.autolog(log_models=False)

def objective_rf(trial: optuna.trial.Trial):
    """Optuna objective: train one RandomForestRegressor and return its validation RMSE.

    Samples a hyperparameter set from ``trial``, fits the model on the notebook-level
    ``X_train`` / ``y_train``, scores it on ``X_val`` / ``y_val`` and records the
    parameters, the metric and the fitted model in a nested MLflow run.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RMSE on the validation data (the value the study minimizes).
    """
    # Hyperparameters sampled by Optuna on every trial (all integer ranges);
    # random_state and n_jobs are fixed.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "random_state": 42,
        "n_jobs": -1
    }

    # Nested run so that every trial leaves its own trace in MLflow.
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "random_forest")
        mlflow.log_params(params)

        # Train and evaluate.
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Main metric of the trial.
        mlflow.log_metric("rmse", rmse)

        # Build the input example once and infer the signature from that same
        # DataFrame, so the signature carries the feature names and agrees with
        # the example that is logged (the final-model cell does the same).
        input_example = pd.DataFrame(X_val[:5].toarray(), columns=dv.get_feature_names_out())
        signature = infer_signature(input_example, y_pred[:5])

        # Store the trial's model as an MLflow artifact.
        mlflow.sklearn.log_model(
            model,
            name="model",
            input_example=input_example,
            signature=signature
        )

    # Optuna minimizes the returned value.
    return rmse

In [26]:
# ------------------------------------------------------------
# Create the Optuna study
#    - TPE (Tree-structured Parzen Estimator) sampler with a fixed seed.
#    - direction="minimize" because the objective returns the RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study_rf = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Run the optimization (n_trials = number of attempts)
#    - Each trial calls the objective with a different hyperparameter set.
#    - A parent run groups the whole search. It is opened WITHOUT nested=True:
#      with nested=True a run left active by an earlier cell would silently
#      become this run's parent, and any later selection that keeps only runs
#      without a parentRunId tag would then skip it.
# ------------------------------------------------------------
with mlflow.start_run(run_name="Random Forest Hyperparameter Optimization (Optuna)"):
    study_rf.optimize(objective_rf, n_trials=3)

    # Recover the best hyperparameters and add the fixed settings the trials
    # were trained with, so the final model replicates the winning trial.
    best_params = study_rf.best_params
    best_params["random_state"] = 42
    best_params["n_jobs"] = -1

    mlflow.log_params(best_params)
    # Tags of the parent run (experiment metadata).
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "random_forest",
        "feature_set_version": 1,
    })

    # Train a FINAL model with the best hyperparameters
    # (this would normally use train+val or CV; the original pattern is kept).
    model = RandomForestRegressor(**best_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Store the fitted DictVectorizer as an additional artifact.
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # X_val is the sparse matrix produced by DictVectorizer, so a few rows are
    # densified into a named-column DataFrame that serves both as the input
    # example and as the basis for the model signature.
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=dv.get_feature_names_out())
    signature = infer_signature(input_example, y_val[:5])

    # Log the final model as an MLflow artifact.
    mlflow.sklearn.log_model(model, name="model", input_example=input_example, signature=signature)

[I 2025-10-23 21:22:46,916] A new study created in memory with name: no-name-3a0f6175-d319-4e39-855d-1ad09b455b75


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/23 21:23:19 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-23 21:23:40,068] Trial 0 finished with value: 5.535684811605542 and parameters: {'n_estimators': 69, 'max_depth': 29, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: 5.535684811605542.


🏃 View run lyrical-shark-666 at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1588665197670394/runs/d21a2e2c4e6d47dab035cf2557479509
🧪 View experiment at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1588665197670394




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/23 21:24:00 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-23 21:24:06,106] Trial 1 finished with value: 5.591476239254621 and parameters: {'n_estimators': 57, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 9}. Best is trial 0 with value: 5.535684811605542.


🏃 View run silent-crab-139 at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1588665197670394/runs/c00ad22a37864374b189c88a43f791c1
🧪 View experiment at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1588665197670394




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/23 21:24:31 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run fearless-bee-844 at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1588665197670394/runs/1223c4ffa70b48919920d9f11a795c93
🧪 View experiment at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1588665197670394


[I 2025-10-23 21:24:48,014] Trial 2 finished with value: 5.600874046133536 and parameters: {'n_estimators': 80, 'max_depth': 23, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 0 with value: 5.535684811605542.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/23 21:25:43 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run Random Forest Hyperparameter Optimization (Optuna) at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1588665197670394/runs/3df89e10d31541e096c1c069122f918a
🧪 View experiment at: https://dbc-c600c0c2-acad.cloud.databricks.com/ml/experiments/1588665197670394


# **Registrar modelo en Model Registry**
El mejor de los dos modelos fue el de Gradient Boosting, por lo que usaremos ese como challenger:

In [16]:
# Name under which the model is registered in the Model Registry.
# NOTE(review): looks like a three-level <catalog>.<schema>.<model> name — confirm.
model_name = "workspace.default.nyc-taxi-model"

In [None]:
# NOTE(review): the cell that follows repeats these exact statements and then goes
# on to pick the best parent run, so this cell looks redundant.
client = MlflowClient()

MODEL_FAMILY = "gradient_boosting"

# Finished runs of the experiment tagged with this model family, lowest RMSE first.
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    filter_string=f"tags.model_family = '{MODEL_FAMILY}' and attribute.status = 'FINISHED'",
    order_by=["metrics.rmse ASC"]
)

In [9]:
client = MlflowClient()

MODEL_FAMILY = "gradient_boosting"

# Finished runs of the experiment tagged with this model family, lowest RMSE first.
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    filter_string=f"tags.model_family = '{MODEL_FAMILY}' and attribute.status = 'FINISHED'",
    order_by=["metrics.rmse ASC"]
)

# Keep only the parent runs (those without a parentRunId tag). The tag column is
# only present in the result when at least one returned run carries that tag, so
# fall back to "every run is a parent" instead of raising KeyError when it is absent.
parent_tag_column = "tags.mlflow.parentRunId"
if parent_tag_column in runs.columns:
    padre = runs[runs[parent_tag_column].isnull()]
else:
    padre = runs

# Reset first so that a best_run left over from a previous execution of this cell
# can never be picked up by the registration cell when nothing matches now.
best_run = None

# Show the best parent run.
if len(padre) > 0:
    best_run = padre.iloc[0]
    print(f"🏆 Mejor run padre para {MODEL_FAMILY}:")
    print(f"Run Name: {best_run['tags.mlflow.runName']}")
    print(f"Run ID: {best_run['run_id']}")
    print(f"RMSE: {best_run['metrics.rmse']:.4f}")
else:
    print(f"No se encontraron runs padres para {MODEL_FAMILY}.")


🏆 Mejor run padre para gradient_boosting:
Run Name: Gradient Boosting Hyperparameter Optimization (Optuna)
Run ID: f16f4fb3eac84a8f9625b40a512cff89
RMSE: 5.4182


Registramos el modelo en el Model Registry:

In [10]:
# Register the "model" artifact of the selected run under model_name.
result = mlflow.register_model(
    model_uri=f"runs:/{best_run['run_id']}/model",
    name=model_name
)

Successfully registered model 'workspace.default.nyc-taxi-model2'.


Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Created version '1' of model 'workspace.default.nyc-taxi-model2'.


Le asignamos el alias de Challenger:

In [11]:
# Point the "Challenger" alias at the version that was just registered.
new_alias = "Challenger"
model_version = result.version

client.set_registered_model_alias(name=model_name, alias=new_alias, version=model_version)

# **Comparar Modelos**
Para decidir qué modelo realiza mejores predicciones, debemos cargar ambos y ponerlos a prueba con los datos de marzo:

In [28]:
import mlflow.pyfunc
# Registry URIs that resolve the model through its alias instead of a version number.
champion_uri = f"models:/{model_name}@Champion"
challenger_uri = f"models:/{model_name}@Challenger"

# NOTE(review): despite the *_version names, these variables hold the loaded pyfunc models.
# NOTE(review): loading the Champion presumably requires that alias to exist already — confirm.
champion_version = mlflow.pyfunc.load_model(champion_uri)
challenger_version = mlflow.pyfunc.load_model(challenger_uri)

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Preparamos la data de test:

In [13]:
# The 2025-03 file is the held-out test month. It is stored under its own *_test
# names instead of overwriting df_val / X_val / y_val: the tuning objectives read
# those globals, so clobbering them would make any tuning cell re-run after this
# one silently evaluate against the test month.
df_test = read_dataframe('../data/green_tripdata_2025-03.parquet')
X_test = preprocess(df_test, dv)
target = 'duration'
y_test = df_test[target].values

Por problemas con el procesador no fue posible realizar las predicciones; sin embargo, supongamos que el ganador fue el modelo Challenger porque obtuvo un mejor RMSE. Hay que cambiar los alias para que sepamos qué modelo es el mejor:

In [None]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Look up the model version that the 'Challenger' alias points to.
# (This rebinds challenger_version, which an earlier cell used for the loaded model.)
challenger_version = client.get_model_version_by_alias(model_name, "Challenger")
version_number = challenger_version.version

# Point the 'Champion' alias at that same version.
client.set_registered_model_alias(name=model_name, alias="Champion", version=version_number)

print(f"El modelo '{model_name}' versión {version_number} ahora es el campeón.")


El modelo 'workspace.default.nyc-taxi-model' versión 2 ahora es el campeón).
