In [5]:
import pandas as pd
import pickle
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer

Cargar el dataset

In [6]:
# Create the directory if it doesn't exist
!mkdir -p ../data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1330k  100 1330k    0     0  1381k      0 --:--:-- --:--:-- --:--:-- 1381k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1253k  100 1253k    0     0  1400k      0 --:--:-- --:--:-- --:--:-- 1400k


In [7]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column holding the trip length in minutes
    (``lpep_dropoff_datetime - lpep_pickup_datetime``), keeps only trips
    lasting between 1 and 60 minutes (both inclusive), and casts
    ``PULocationID`` / ``DOLocationID`` to ``str``.

    Parameters
    ----------
    filename : str or path-like
        Anything accepted by ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips as an independent frame, so callers can add
        columns to it without touching (or being warned about) the
        unfiltered data.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame; without it
    # the column assignments below (and in callers) operate on a slice and
    # trigger pandas' SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
# January is loaded into df_train and February into df_val, in that order.
df_train, df_val = map(
    read_dataframe,
    (
        '../data/green_tripdata_2024-01.parquet',
        '../data/green_tripdata_2024-02.parquet',
    ),
)

In [9]:
# Combine pickup and dropoff zone IDs into one "<pickup>_<dropoff>" feature.
df_train['PU_DO'] = df_train['PULocationID'].str.cat(df_train['DOLocationID'], sep='_')
df_val['PU_DO'] = df_val['PULocationID'].str.cat(df_val['DOLocationID'], sep='_')

In [10]:
# Model inputs: the combined pickup/dropoff pair (categorical) and the trip
# distance (numerical).
categorical = ['PU_DO']
numerical = ['trip_distance']

# One {column: value} record per trip, for each split.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# The vectorizer is fitted on the training records only; validation records
# are encoded with that same fitted mapping.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [11]:
# Regression target: the trip duration column, as NumPy arrays per split.
target = 'duration'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

Conectarse a dagshub

In [12]:
import dagshub
import mlflow


# Initialise the DagsHub integration for this repository with MLflow enabled.
# The URI printed below shows MLflow then targets the repo's `.mlflow` endpoint.
dagshub.init(url="https://dagshub.com/daduke1/nyc-taxi-time-prediction", mlflow=True)

# Keep the active tracking URI in a constant; a later cell passes it to MlflowClient.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

# Re-applies the URI that was just read back, then selects the experiment
# that the following runs are recorded under.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow


<Experiment: artifact_location='mlflow-artifacts:/655077ef21ea4d658c388a4f45673d3f', creation_time=1726875513781, experiment_id='0', last_update_time=1726875513781, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [13]:
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

# Wrap each split as an MLflow dataset object.
# The datasets are built from the DataFrame columns rather than from
# `X_train.data` / `X_val.data`: X_train and X_val come from DictVectorizer
# as sparse matrices, whose `.data` attribute is only the flat array of
# stored values, not the feature matrix, so it does not line up with the targets.
dataset_columns = categorical + numerical + [target]
training_dataset = mlflow.data.from_pandas(df_train[dataset_columns], targets=target, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_pandas(df_val[dataset_columns], targets=target, name="green_tripdata_2024-02")

Definir los datasets como objetos de MLflow para poder rastrearlos

In [15]:
# Wrap each split as an MLflow dataset object.
# The datasets are built from the DataFrame columns rather than from
# `X_train.data` / `X_val.data`: X_train and X_val come from DictVectorizer
# as sparse matrices, whose `.data` attribute is only the flat array of
# stored values, not the feature matrix, so it does not line up with the targets.
dataset_columns = categorical + numerical + [target]
training_dataset = mlflow.data.from_pandas(df_train[dataset_columns], targets=target, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_pandas(df_val[dataset_columns], targets=target, name="green_tripdata_2024-02")

In [20]:
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pathlib
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from hyperopt.pyll.base import scope

Entrenamiento del modelo Gradient Boosting

In [23]:
# RMSE helper. Defining it here rebinds the name that was also imported from
# sklearn.metrics earlier in the notebook.
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Hyperopt objective: each call trains and scores one Gradient Boosting model
def objective(params):
    """Evaluate one GradientBoostingRegressor configuration.

    Inside a nested MLflow run this tags the run with the model family, logs
    ``params``, fits the regressor on ``X_train`` / ``y_train``, logs the
    fitted model under the ``"model"`` artifact path and logs the RMSE
    obtained on ``X_val`` / ``y_val`` as the ``rmse`` metric.

    Returns a dict with ``loss`` set to that RMSE and ``status`` set to
    ``STATUS_OK``, which is the result format passed back to ``fmin``.
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "GradientBoosting")
        mlflow.log_params(params)

        # Fit, then store the fitted estimator with the run.
        regressor = GradientBoostingRegressor(**params).fit(X_train, y_train)
        mlflow.sklearn.log_model(regressor, artifact_path="model")

        # Score on the validation split.
        rmse = root_mean_squared_error(y_val, regressor.predict(X_val))
        mlflow.log_metric("rmse", rmse)

    return dict(loss=rmse, status=STATUS_OK)

# Enable MLflow autologging for scikit-learn estimators
mlflow.sklearn.autolog()

# Search space for the Gradient Boosting regressor. quniform draws floats,
# so integer-valued hyperparameters are wrapped in scope.int.
gb_search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 300, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'random_state': 42
}

# Parent run that groups the nested per-trial runs opened by `objective`
with mlflow.start_run(run_name="GradientBoosting Hyperparameter Optimization", nested=True):
    gb_trials = Trials()
    best_gb_params = fmin(
        fn=objective,
        space=gb_search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=gb_trials,
        # Seed the TPE sampler so re-running the notebook proposes the same
        # trials. NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator
        # here; older releases expect numpy.random.RandomState - confirm version.
        rstate=np.random.default_rng(42),
    )

    # fmin returns the raw sampled values (floats); cast the integer-valued
    # hyperparameters back to int before logging them.
    best_gb_params["n_estimators"] = int(best_gb_params["n_estimators"])
    best_gb_params["max_depth"] = int(best_gb_params["max_depth"])

    mlflow.log_params(best_gb_params)
    # Record the best validation RMSE on the parent run too, so the summary
    # run carries the score that belongs to the logged parameters.
    mlflow.log_metric("rmse", gb_trials.best_trial["result"]["loss"])

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 18:55:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run big-ant-494 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/49121f9897e444f6887ee11549b945c4.

2024/09/20 18:55:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:18<02:50, 18.95s/trial, best loss: 5.368089137590123]



2024/09/20 18:55:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run gregarious-fox-500 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/68be3601771a4e3a800865a87ef964b4.

2024/09/20 18:55:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 20%|██        | 2/10 [00:33<02:10, 16.35s/trial, best loss: 5.368089137590123]



2024/09/20 18:55:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run gaudy-koi-433 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/fbd744c9c10f42e5b6a2d01243ddb407.

2024/09/20 18:55:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 30%|███       | 3/10 [00:49<01:53, 16.18s/trial, best loss: 5.3679468091329765]



2024/09/20 18:56:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run monumental-fawn-69 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/f19a2de4d83b458285171b8214aaa120.

2024/09/20 18:56:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 40%|████      | 4/10 [01:04<01:34, 15.81s/trial, best loss: 5.3679468091329765]



2024/09/20 18:56:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run debonair-lamb-437 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/5aad34c0bfcf4bf19389533455dd0a75.

2024/09/20 18:56:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 50%|█████     | 5/10 [01:19<01:16, 15.39s/trial, best loss: 5.3679468091329765]



2024/09/20 18:56:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run casual-ram-743 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/7edfee2aabd943d09d2b09a2b634d776.

2024/09/20 18:56:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 60%|██████    | 6/10 [01:38<01:06, 16.75s/trial, best loss: 5.265174169509532] 



2024/09/20 18:56:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run capricious-wren-597 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/e555d55ef05f4beaa6c4818175da0dd3.

2024/09/20 18:56:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 70%|███████   | 7/10 [01:52<00:47, 15.84s/trial, best loss: 5.265174169509532]



2024/09/20 18:57:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run gregarious-robin-524 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/e91634fc690342719a2571565726c03b.

2024/09/20 18:57:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 80%|████████  | 8/10 [02:09<00:32, 16.31s/trial, best loss: 5.265174169509532]



2024/09/20 18:57:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run dashing-mouse-210 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/89e82ad8566243a3a522a58425d12b22.

2024/09/20 18:57:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [02:29<00:17, 17.19s/trial, best loss: 5.265174169509532]



2024/09/20 18:57:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run skillful-crane-944 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/2378159609454cf18bac068b8fbeb308.

2024/09/20 18:57:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [02:43<00:00, 16.35s/trial, best loss: 5.265174169509532]


2024/09/20 18:57:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run GradientBoosting Hyperparameter Optimization at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/b8cf710cafde47b8b656764333e2eb7b.
2024/09/20 18:57:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.


Random Forest

In [24]:
# Hyperopt objective: each call trains and scores one Random Forest model.
# This rebinds the name `objective` used by the Gradient Boosting cell.
def objective(params):
    """Evaluate one RandomForestRegressor configuration.

    Inside a nested MLflow run this tags the run with the model family, logs
    ``params``, fits the regressor on ``X_train`` / ``y_train``, logs the
    fitted model under the ``"model"`` artifact path and logs the RMSE
    obtained on ``X_val`` / ``y_val`` as the ``rmse`` metric.

    Returns a dict with ``loss`` set to that RMSE and ``status`` set to
    ``STATUS_OK``, which is the result format passed back to ``fmin``.
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "RandomForest")
        mlflow.log_params(params)

        # Fit, then store the fitted estimator with the run.
        regressor = RandomForestRegressor(**params).fit(X_train, y_train)
        mlflow.sklearn.log_model(regressor, artifact_path="model")

        # Score on the validation split.
        rmse = root_mean_squared_error(y_val, regressor.predict(X_val))
        mlflow.log_metric("rmse", rmse)

    return dict(loss=rmse, status=STATUS_OK)

# Enable MLflow autologging for scikit-learn estimators
mlflow.sklearn.autolog()

# Search space for the Random Forest regressor. quniform draws floats,
# so integer-valued hyperparameters are wrapped in scope.int.
rf_search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 300, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 20, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 5, 1)),
    'random_state': 42
}

# Parent run that groups the nested per-trial runs opened by `objective`
with mlflow.start_run(run_name="RandomForest Hyperparameter Optimization", nested=True):
    rf_trials = Trials()
    best_rf_params = fmin(
        fn=objective,
        space=rf_search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=rf_trials,
        # Seed the TPE sampler so re-running the notebook proposes the same
        # trials. NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator
        # here; older releases expect numpy.random.RandomState - confirm version.
        rstate=np.random.default_rng(42),
    )

    # fmin returns the raw sampled values (floats); cast the integer-valued
    # hyperparameters back to int before logging them.
    best_rf_params["n_estimators"] = int(best_rf_params["n_estimators"])
    best_rf_params["max_depth"] = int(best_rf_params["max_depth"])
    best_rf_params["min_samples_split"] = int(best_rf_params["min_samples_split"])
    best_rf_params["min_samples_leaf"] = int(best_rf_params["min_samples_leaf"])

    mlflow.log_params(best_rf_params)
    # Record the best validation RMSE on the parent run too, so the summary
    # run carries the score that belongs to the logged parameters.
    mlflow.log_metric("rmse", rf_trials.best_trial["result"]["loss"])

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 18:58:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run brawny-roo-912 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/817b768a1fe24cb198cf222d644ec6bd.

2024/09/20 18:58:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:15<02:19, 15.50s/trial, best loss: 5.510058937507135]



2024/09/20 18:58:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run trusting-owl-502 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/b9fe5312f4c940c3b6a919677146af0d.

2024/09/20 18:58:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 20%|██        | 2/10 [00:29<01:57, 14.68s/trial, best loss: 5.492740980429902]



2024/09/20 18:58:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run capable-shrike-265 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/38c9dce12cd34885b8cd6182e32e43ab.

2024/09/20 18:58:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 30%|███       | 3/10 [00:53<02:12, 18.87s/trial, best loss: 5.385856874093268]



2024/09/20 18:59:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run adorable-mouse-419 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/1c9eebd6b2e6458c8223ebb67a3e0b83.

2024/09/20 18:59:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 40%|████      | 4/10 [01:26<02:27, 24.63s/trial, best loss: 5.373145479605618]



2024/09/20 19:00:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run powerful-wren-751 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/78d82fb311aa4d869b7dd83ffca3cf72.

2024/09/20 19:00:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 50%|█████     | 5/10 [01:58<02:15, 27.05s/trial, best loss: 5.373145479605618]



2024/09/20 19:01:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run incongruous-loon-728 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a75a8bcd5d6245dbba58d845e3da77e9.

2024/09/20 19:01:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 60%|██████    | 6/10 [03:06<02:43, 40.92s/trial, best loss: 5.333793209457883]



2024/09/20 19:01:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run popular-wren-927 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a2b3edd4fc0f46b58b693f302152f55b.

2024/09/20 19:01:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 70%|███████   | 7/10 [03:22<01:38, 32.99s/trial, best loss: 5.333793209457883]



2024/09/20 19:02:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run gifted-croc-654 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/8b07246c3f6f4258aad99ae024c9256e.

2024/09/20 19:02:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 80%|████████  | 8/10 [04:13<01:17, 38.53s/trial, best loss: 5.333793209457883]



2024/09/20 19:03:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run clean-flea-537 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/08c0c7db2b344234abbe231795891e79.

2024/09/20 19:03:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [05:28<00:50, 50.01s/trial, best loss: 5.333793209457883]



2024/09/20 19:04:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run bold-fowl-217 at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/2b68cf40ee624b43992fc38ea422c735.

2024/09/20 19:04:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [06:09<00:00, 36.98s/trial, best loss: 5.333793209457883]


2024/09/20 19:04:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest Hyperparameter Optimization at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/8d50e2af26764056a3ca715fac2887d8.
2024/09/20 19:04:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/daduke1/nyc-taxi-time-prediction.mlflow/#/experiments/0.


Ahora vamos a registrar el mejor modelo en el model registry y usarlo para hacer predicciones

In [25]:
# Ask for the ID of the run whose logged model should be registered.
# Pasted IDs often carry stray whitespace, which would end up inside the
# runs:/ URI, so strip it and reject an empty answer up front.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")
run_uri = f"runs:/{run_id}/model"

# Register the run's "model" artifact under the shared registry name.
result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Ingrese el run_id 1dff56afbeca42d9920acff77490716e


Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 19:05:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 2
Created version '2' of model 'nyc-taxi-model'.


Ahora vamos a asignarle el alias de challenger

In [27]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "challenger"
date = datetime.today()
# Use the version that mlflow.register_model just created instead of a
# hard-coded number, so re-running the notebook tags the new version rather
# than whichever version happened to be "2".
model_version = str(result.version)

# Point the "challenger" alias of "nyc-taxi-model" at that version
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

# Record on the version itself when it received the alias
client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['challenger'], creation_timestamp=1726880754021, current_stage='None', description=('The model version 2 was transitioned to challenger on 2024-09-20 '
 '19:07:45.540020'), last_updated_timestamp=1726880865919, name='nyc-taxi-model', run_id='1dff56afbeca42d9920acff77490716e', run_link='', source='mlflow-artifacts:/655077ef21ea4d658c388a4f45673d3f/1dff56afbeca42d9920acff77490716e/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

Descargar los datos de marzo

In [34]:
!curl -o ../data/green_tripdata_2024-03.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1340k  100 1340k    0     0  4224k      0 --:--:-- --:--:-- --:--:-- 4227k


In [35]:
# Upload the March file to the repository's storage bucket
from dagshub import get_repo_bucket_client

# boto3 client object for the repo bucket
s3 = get_repo_bucket_client("daduke1/nyc-taxi-time-prediction")

# Send the local March parquet file to the bucket
s3.upload_file(
    Filename="../data/green_tripdata_2024-03.parquet",  # local file to upload
    Bucket="nyc-taxi-time-prediction",  # bucket is named after the repo
    Key="val_data.parquet",  # destination path inside the bucket
)

Usar el champion y el challenger en los datos de marzo

In [36]:
# Load the March file with the same read_dataframe preprocessing used for
# the training and validation months.
df_marzo = read_dataframe('../data/green_tripdata_2024-03.parquet')

In [37]:
# Same "<pickup>_<dropoff>" feature that was built for the train/validation frames.
df_marzo['PU_DO'] = df_marzo['PULocationID'].str.cat(df_marzo['DOLocationID'], sep='_')

In [41]:
# Encode the March trips with the DictVectorizer that was already fitted on
# the training data (transform only, no refit).
val_dicts = df_marzo[categorical + numerical].to_dict(orient='records')
X_marzo = dv.transform(val_dicts)

# Ground-truth durations for March (the original assigned this twice).
target = 'duration'
y_marzo_val = df_marzo[target].values

In [43]:
import mlflow.pyfunc

# Load the registered model version that currently holds the "champion" alias
model_name = "nyc-taxi-model"
alias = "champion"
model_uri = f"models:/{model_name}@{alias}"
champion_version = mlflow.pyfunc.load_model(model_uri=model_uri)

# Predict the March trips and score the predictions against the true durations
y_pred_marzo = champion_version.predict(X_marzo)
rmse = root_mean_squared_error(y_marzo_val, y_pred_marzo)

print(f"RMSE: {rmse}")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

RMSE: 5.2124355219162455


In [45]:
import mlflow.pyfunc

# Load the registered model version that currently holds the "challenger" alias
model_name = "nyc-taxi-model"
alias = "challenger"
model_uri = f"models:/{model_name}@{alias}"
challenger_model = mlflow.pyfunc.load_model(model_uri=model_uri)

# Predict the March trips and score the predictions against the true durations
y_pred_marzo = challenger_model.predict(X_marzo)
rmse = root_mean_squared_error(y_marzo_val, y_pred_marzo)

print(f"RMSE: {rmse}")


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

RMSE: 5.288766959881196


Como podemos observar, sobre los datos de marzo el challenger obtuvo un RMSE de 5.289, mientras que el champion obtuvo un RMSE de 5.212. Como un RMSE menor indica un menor error de predicción, el champion sigue siendo el mejor modelo, así que lo dejaremos como champion. Hay que tomar en cuenta que ninguno de los dos modelos había visto los datos de marzo durante el entrenamiento, por lo que estos resultados sugieren que ambos modelos se entrenaron correctamente y generalizan bien. También significa que el champion es un buen modelo, ya que sigue dando buenos resultados incluso con datos nuevos, nunca antes vistos.