## Experimentos con Gradient Boosting y Random Forest

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import mlflow
from mlflow import MlflowClient
from scipy.stats import randint, uniform

In [2]:
# Configurar el tracking URI de MLflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location=('file:///c:/Users/diego/OneDrive - ITESO/ITESO/5 semestre/Proyecto de Ciencia '
 'de datos/nyc-taxi-time-prediction/experiments/mlruns/1'), creation_time=1726709290057, experiment_id='1', last_update_time=1726709290057, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
# Cargar los datos de enero y febrero de 2024
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = df['duration'].dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

df_train = read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2024-02.parquet')

In [4]:
# Feature Engineering
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts).toarray()
y_train = df_train['duration'].values

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts).toarray()
y_val = df_val['duration'].values

In [5]:
# Guardar el preprocesador
with open("preprocessor.b", "wb") as f_out:
    pickle.dump(dv, f_out)

# Definir las búsquedas de hiperparámetros
gb_params = {
    'n_estimators': randint(50, 200),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10)
}

# rf_params = {
#     'n_estimators': randint(50, 200),
#     'max_depth': randint(3, 20),
#     'max_features': ['auto', 'sqrt', 'log2']
# }

rf_params = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 20),
    'max_features': ['sqrt', 'log2', None]  # Elimina 'auto'
}

mlflow.sklearn.autolog()



In [6]:
# Experimento con Gradient Boosting Regressor
with mlflow.start_run(run_name="GradientBoostingRegressor") as parent_run:
    mlflow.set_tag("model", "GradientBoostingRegressor")
    gbr = GradientBoostingRegressor()
    random_search_gbr = RandomizedSearchCV(
        gbr, gb_params, n_iter=10, scoring='neg_root_mean_squared_error', cv=3, random_state=42
    )

    random_search_gbr.fit(X_train, y_train)
    best_model_gbr = random_search_gbr.best_estimator_
    rmse_gbr = mean_squared_error(y_val, best_model_gbr.predict(X_val), squared=False)
    mlflow.log_metric("rmse", rmse_gbr)
    # Tomar una muestra de los datos de validación
    input_example = X_val[0:5]

    mlflow.sklearn.log_model(
    best_model_gbr,
    artifact_path="model",
    input_example=input_example
    )

    # Loguear el preprocesador
    mlflow.log_artifact("preprocessor.b", artifact_path="preprocessor")

2024/09/18 21:34:03 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


In [7]:
# Experimento con Random Forest Regressor
with mlflow.start_run(run_name="RandomForestRegressor") as parent_run:
    mlflow.set_tag("model", "RandomForestRegressor")
    rfr = RandomForestRegressor()
    random_search_rfr = RandomizedSearchCV(
        rfr, rf_params, n_iter=10, scoring='neg_root_mean_squared_error', cv=3, random_state=42
    )

    random_search_rfr.fit(X_train, y_train)
    best_model_rfr = random_search_rfr.best_estimator_
    rmse_rfr = mean_squared_error(y_val, best_model_rfr.predict(X_val), squared=False)
    mlflow.log_metric("rmse", rmse_rfr)
    mlflow.sklearn.log_model(best_model_rfr, artifact_path="model")
    # Tomar una muestra de los datos de validación
    input_example = X_val[0:5]

    mlflow.sklearn.log_model(
    best_model_rfr,
    artifact_path="model",
    input_example=input_example
    )

    # Loguear el preprocesador
    mlflow.log_artifact("preprocessor.b", artifact_path="preprocessor")

2024/09/18 21:39:32 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


In [8]:
print(f"RMSE GradientBoostingRegressor: {rmse_gbr}")
print(f"RMSE RandomForestRegressor: {rmse_rfr}")

RMSE GradientBoostingRegressor: 5.335807890583282
RMSE RandomForestRegressor: 5.426790695571489


In [9]:
# Determinar el mejor modelo
if rmse_gbr < rmse_rfr:
    best_model = best_model_gbr
    best_rmse = rmse_gbr
    best_model_name = "GradientBoostingRegressor"
    best_run_id = mlflow.active_run().info.run_id
else:
    best_model = best_model_rfr
    best_rmse = rmse_rfr
    best_model_name = "RandomForestRegressor"
    best_run_id = mlflow.active_run().info.run_id

print(f"Mejor modelo: {best_model_name} con RMSE: {best_rmse}")

AttributeError: 'NoneType' object has no attribute 'info'

In [None]:
# (2) Determinar el mejor modelo

In [8]:
# Determinar el mejor modelo
if rmse_gbr < rmse_rfr:
    best_model = best_model_gbr
    best_rmse = rmse_gbr
    best_model_name = "GradientBoostingRegressor"
    best_run_id = random_search_gbr.best_estimator_.random_state
else:
    best_model = best_model_rfr
    best_rmse = rmse_rfr
    best_model_name = "RandomForestRegressor"
    best_run_id = random_search_rfr.best_estimator_.random_state

print(f"Mejor modelo: {best_model_name} con RMSE: {best_rmse}")

Mejor modelo: GradientBoostingRegressor con RMSE: 5.335802943433224


### Registrar el modelo con mejor metrica en el model registry elde `nyc-taxi-model`

In [9]:
# Obtener el run_id del mejor modelo
client = MlflowClient()
experiment = client.get_experiment_by_name("nyc-taxi-experiment")
runs = client.search_runs(experiment_ids=experiment.experiment_id, order_by=["metrics.rmse ASC"])

best_run = runs[0]
run_id = best_run.info.run_id

# Registrar el modelo
result = mlflow.register_model(
    model_uri=f"runs:/{run_id}/model",
    name="nyc-taxi-model"
)

Successfully registered model 'nyc-taxi-model'.
Created version '1' of model 'nyc-taxi-model'.


### Asignarle el alias de CHALLENGER

In [10]:
# Asignar alias 'challenger' a la nueva versión del modelo
new_model_version = result.version

client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias="challenger",
    version=new_model_version
)

#### Descargar en la carpeta data los `datos de marzo del 2024`

In [11]:
# Desde tu terminal o en una celda de código
!curl -o ../data/green_tripdata_2024-03.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1340k  100 1340k    0     0  2074k      0 --:--:-- --:--:-- --:--:-- 2094k


Guardarla en el storage disponible de MLflow el dataset como un artifact

In [12]:
with mlflow.start_run(run_name="Data Storage") as run:
    mlflow.log_artifact('../data/green_tripdata_2024-03.parquet', artifact_path="data")

Usar esos datos para probarlo sobre los modelos con el alias champion y challenger

In [13]:
df_test = read_dataframe('../data/green_tripdata_2024-03.parquet')

df_test['PU_DO'] = df_test['PULocationID'] + '_' + df_test['DOLocationID']

test_dicts = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(test_dicts)
y_test = df_test['duration'].values

cargar los modelos

In [21]:
# Cargar preprocesador
with open("preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

# Cargar modelos
champion_model = mlflow.pyfunc.load_model(model_uri="models:/nyc-taxi-model@champion")
challenger_model = mlflow.pyfunc.load_model(model_uri="models:/nyc-taxi-model@challenger")

In [18]:
client = MlflowClient()

# Nombre del modelo
model_name = "nyc-taxi-model"

# Obtener todas las versiones del modelo
versions = client.get_latest_versions(name=model_name, stages=["None", "Staging", "Production", "Archived"])

# Imprimir detalles de cada versión
for version in versions:
    print(f"Version: {version.version}, Aliases: {version.aliases}")

Version: 1, Aliases: []


  versions = client.get_latest_versions(name=model_name, stages=["None", "Staging", "Production", "Archived"])


In [19]:
# # Asignar el alias 'champion' a la versión 1
# client.set_registered_model_alias(
#     name="nyc-taxi-model",
#     alias="champion",
#     version=1  # Cambiar esto por la versión correcta CUANDO SE NECESITE
# )

In [20]:
# # Verificar las versiones y sus alias nuevamente
# versions = client.get_latest_versions(name=model_name, stages=["None", "Staging", "Production", "Archived"])

# for version in versions:
#     print(f"Version: {version.version}, Aliases: {version.aliases}")

Version: 1, Aliases: []


  versions = client.get_latest_versions(name=model_name, stages=["None", "Staging", "Production", "Archived"])


Obtener la métrica de cada modelo

In [22]:
# Predecir y calcular RMSE para el modelo champion
y_pred_champion = champion_model.predict(X_test)
rmse_champion = mean_squared_error(y_test, y_pred_champion, squared=False)

# Predecir y calcular RMSE para el modelo challenger
y_pred_challenger = challenger_model.predict(X_test)
rmse_challenger = mean_squared_error(y_test, y_pred_challenger, squared=False)

print(f"RMSE Champion: {rmse_champion}")
print(f"RMSE Challenger: {rmse_challenger}")



RMSE Champion: 5.369724023809842
RMSE Challenger: 5.369724023809842




Decidir si el nuevo modelo challenger debe ser promovido a champion

Paso 11: Decidir si el nuevo modelo **challenger** debe ser promovido a **champion**

Análisis:
- **Rendimiento**: Si `rmse_challenger` es menor que `rmse_champion`, el modelo challenger tiene un mejor rendimiento en el conjunto de prueba.
- **Consistencia**: Considera si el rendimiento es consistente a través de diferentes métricas o segmentos de datos.
- **Complejidad**: Un modelo más complejo puede requerir más recursos computacionales. ¿Es aceptable?
- **Tiempo de inferencia**: ¿El tiempo de predicción es adecuado para las necesidades del negocio?


In [23]:
if rmse_challenger < rmse_champion:
    # Promover el modelo challenger a champion
    client.set_registered_model_alias(
        name="nyc-taxi-model",
        alias="champion",
        version=new_model_version
    )
    print("El modelo challenger ha sido promovido a champion.")
else:
    print("El modelo challenger no supera al champion actual.")

El modelo challenger no supera al champion actual.
