In [1]:
import os
import requests

# Create the data directory if it does not exist yet
os.makedirs('../data', exist_ok=True)

def download_file(url, save_path, timeout=60):
    """Download ``url`` and write the response body to ``save_path``.

    Args:
        url: Address of the file to fetch.
        save_path: Local path the bytes are written to (opened in binary mode).
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check happens before the output file is opened, so an error page
            is never saved under ``save_path``.
    """
    # stream=True avoids holding the whole file in memory at once.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:  # skip keep-alive chunks, which are empty
                    f.write(chunk)

# Fetch the January and February 2024 green-taxi trip files.
downloads = [
    (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet',
        '../data/green_tripdata_2024-01.parquet',
    ),
    (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet',
        '../data/green_tripdata_2024-02.parquet',
    ),
]
for source_url, local_path in downloads:
    download_file(source_url, local_path)


In [2]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [3]:
def read_dataframe(filename):
    """Load a trip parquet file and prepare it for duration modelling.

    Adds a ``duration`` column (minutes between ``lpep_pickup_datetime`` and
    ``lpep_dropoff_datetime``), keeps only trips lasting between 1 and 60
    minutes inclusive, and casts ``PULocationID`` / ``DOLocationID`` to
    strings.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of the file.

    Returns:
        A new DataFrame containing the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() makes the filtered result an independent frame, so the column
    # assignment below does not write into a slice of the original
    # (which raises SettingWithCopyWarning on pandas 2.x).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [6]:
# January 2024 is the training split; February 2024 is the validation split.
train_path = '../data/green_tripdata_2024-01.parquet'
val_path = '../data/green_tripdata_2024-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [7]:
# Combine pickup and drop-off zone ids into a single "<PU>_<DO>" feature
# on both splits.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [8]:
# Model inputs: the combined pickup/drop-off pair plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

dv = DictVectorizer()

# Fit the vectorizer on the training records only, then reuse the fitted
# mapping for the validation records.
train_dicts = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [9]:
# Regression target: trip duration in minutes, as computed by read_dataframe.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [11]:
import dagshub
import mlflow


# Connect this session to the DagsHub repository with MLflow integration on.
dagshub.init(
    url="https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction",
    mlflow=True,
)

# Keep the active tracking URI in a constant so later cells can reuse it.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(MLFLOW_TRACKING_URI)

mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=ec14fab5-d0fc-46cc-a0d9-20457aa8f997&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=f992f330d64bf4efc99945cdf35d795031e03ce7e3ccccc9032b517bc09780a8




https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow


2024/09/17 21:28:49 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/4a2f89adc0ff477ead9bc8f38ff1a8ba', creation_time=1726630128698, experiment_id='0', last_update_time=1726630128698, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [12]:
# Wrap the train/validation arrays as MLflow datasets named after their source files.
# NOTE(review): X_train / X_val come from DictVectorizer, which returns a SciPy
# sparse matrix by default; `.data` would then be the flat array of stored values
# rather than a (rows, features) matrix — confirm this is the intended features
# argument.
training_dataset = mlflow.data.from_numpy(
    X_train.data,
    targets=y_train,
    name="green_tripdata_2024-01",
)
validation_dataset = mlflow.data.from_numpy(
    X_val.data,
    targets=y_val,
    name="green_tripdata_2024-02",
)

In [13]:
from dagshub import get_repo_bucket_client
# Client for the repository's storage bucket.
s3 = get_repo_bucket_client("diego-mercadoc/nyc-taxi-time-prediction")

# Local file -> remote key, uploaded in this order.
uploads = {
    "../data/green_tripdata_2024-01.parquet": "train_data.parquet",
    "../data/green_tripdata_2024-02.parquet": "eval_data.parquet",
}
for local_file, remote_key in uploads.items():
    s3.upload_file(
        Bucket="nyc-taxi-time-prediction",  # name of the repo
        Filename=local_file,
        Key=remote_key,
    )

In [14]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

In [15]:
# XGBoost's native training API takes DMatrix objects: features plus labels.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [16]:
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE.

    Runs inside its own nested MLflow run, where it tags the run, logs
    ``params``, the trained booster (artifact path ``"model"``) and the
    ``rmse`` metric. Uses the module-level ``train`` / ``valid`` DMatrix
    objects and ``y_val``.

    Args:
        params: XGBoost parameter dict for this trial.

    Returns:
        A dict with ``loss`` (validation RMSE) and ``status`` (``STATUS_OK``).
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "xgboost")
        mlflow.log_params(params)

        # Up to 100 rounds, stopping once the validation score has not
        # improved for 10 consecutive rounds.
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10,
        )
        mlflow.xgboost.log_model(model, artifact_path="model")

        # Score the candidate on the validation split and record the result.
        val_rmse = root_mean_squared_error(y_val, model.predict(valid))
        mlflow.log_metric("rmse", val_rmse)

    return dict(loss=val_rmse, status=STATUS_OK)

In [17]:
# Enable MLflow autologging for the XGBoost training calls below.
mlflow.xgboost.autolog()

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    # Search space sampled by hyperopt; loguniform bounds are exponents of e.
    search_space = dict(
        max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
        learning_rate=hp.loguniform('learning_rate', -3, 0),
        reg_alpha=hp.loguniform('reg_alpha', -5, -1),
        reg_lambda=hp.loguniform('reg_lambda', -6, -1),
        min_child_weight=hp.loguniform('min_child_weight', -1, 3),
        objective='reg:squarederror',
        seed=42,
    )

    # Ten TPE trials; each trial is logged by `objective` as a nested run.
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials(),
    )

    # Cast max_depth back to int and re-attach the fixed settings before
    # logging and refitting with the result.
    best_params.update(
        {
            "max_depth": int(best_params["max_depth"]),
            "seed": 42,
            "objective": "reg:squarederror",
        }
    )
    mlflow.log_params(best_params)

    # Descriptive tags for the parent run.
    run_tags = {
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "hyper-opt",
        "model_family": "xgboost",
        "feature_set_version": 1,
    }
    mlflow.set_tags(tags=run_tags)

    # Refit with the best parameters found, using the same training budget
    # as the individual trials.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10,
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer and attach it to the run as an artifact.
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:7.88285                                                                                                                               
[1]	validation-rmse:7.02368                                                                                                                               
[2]	validation-rmse:6.43411                                                                                                                               
[3]	validation-rmse:6.06322                                                                                                                               
[4]	validation-rmse:5.77666                                                                                                                               
[5]	validation-rmse:5.62864                                                                                                                               
[6]	validation-rmse:5.52521                                           






2024/09/17 21:40:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run classy-dog-881 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/5a65510fcd4e4bfcb27248c14bff46ed.

2024/09/17 21:40:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:6.17120                                                                                                                               
[1]	validation-rmse:5.60789                                                                                                                               
[2]	validation-rmse:5.50790                                                                                                                               
[3]	validation-rmse:5.48088                                                                                                                               
[4]	validation-rmse:5.45581                                                                                                                               
[5]	validation-rmse:5.45020                                                                                                                               
[6]	validation-rmse:5.44613                                           






2024/09/17 21:40:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run gaudy-dove-381 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/22dabeb0ad5b47bcbe8a86c76c8e4996.

2024/09/17 21:40:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:6.70352                                                                                                                               
[1]	validation-rmse:5.84604                                                                                                                               
[2]	validation-rmse:5.58006                                                                                                                               
[3]	validation-rmse:5.48988                                                                                                                               
[4]	validation-rmse:5.45316                                                                                                                               
[5]	validation-rmse:5.43521                                                                                                                               
[6]	validation-rmse:5.42484                                           






2024/09/17 21:40:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run victorious-eel-180 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/babe8125df5747659ea34830ee361b82.

2024/09/17 21:40:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:6.90361                                                                                                                               
[1]	validation-rmse:6.03902                                                                                                                               
[2]	validation-rmse:5.72013                                                                                                                               
[3]	validation-rmse:5.59562                                                                                                                               
[4]	validation-rmse:5.53288                                                                                                                               
[5]	validation-rmse:5.51099                                                                                                                               
[6]	validation-rmse:5.49920                                           






2024/09/17 21:40:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run crawling-bear-262 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/5f477a76902e4969af2fbda783116e11.

2024/09/17 21:40:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.62842                                                                                                                               
[1]	validation-rmse:8.20231                                                                                                                               
[2]	validation-rmse:7.82648                                                                                                                               
[3]	validation-rmse:7.49597                                                                                                                               
[4]	validation-rmse:7.20882                                                                                                                               
[5]	validation-rmse:6.95938                                                                                                                               
[6]	validation-rmse:6.74034                                           






2024/09/17 21:41:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run righteous-finch-98 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/930fdc89874a4a91b761611811f1d2d1.

2024/09/17 21:41:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:5.59142                                                                                                                               
[1]	validation-rmse:5.32343                                                                                                                               
[2]	validation-rmse:5.28194                                                                                                                               
[3]	validation-rmse:5.26850                                                                                                                               
[4]	validation-rmse:5.26246                                                                                                                               
[5]	validation-rmse:5.25909                                                                                                                               
[6]	validation-rmse:5.25586                                           






2024/09/17 21:41:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run gaudy-zebra-821 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/6cd47a02436546a480eb328acf7b42b5.

2024/09/17 21:41:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:7.49792                                                                                                                               
[1]	validation-rmse:6.53178                                                                                                                               
[2]	validation-rmse:5.98162                                                                                                                               
[3]	validation-rmse:5.67757                                                                                                                               
[4]	validation-rmse:5.51306                                                                                                                               
[5]	validation-rmse:5.42307                                                                                                                               
[6]	validation-rmse:5.37313                                           






2024/09/17 21:42:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run casual-rook-229 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/71b09c57479943c7b244f7603f50ed31.

2024/09/17 21:42:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:7.99622                                                                                                                               
[1]	validation-rmse:7.17716                                                                                                                               
[2]	validation-rmse:6.58565                                                                                                                               
[3]	validation-rmse:6.16765                                                                                                                               
[4]	validation-rmse:5.87458                                                                                                                               
[5]	validation-rmse:5.67329                                                                                                                               
[6]	validation-rmse:5.53747                                           






2024/09/17 21:42:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run rogue-moose-838 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/debee74804e849efa17146dba3483920.

2024/09/17 21:42:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:7.69052                                                                                                                               
[1]	validation-rmse:6.78380                                                                                                                               
[2]	validation-rmse:6.21754                                                                                                                               
[3]	validation-rmse:5.87064                                                                                                                               
[4]	validation-rmse:5.66557                                                                                                                               
[5]	validation-rmse:5.54280                                                                                                                               
[6]	validation-rmse:5.46996                                           






2024/09/17 21:42:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run nervous-sheep-664 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/3f481b08cd5c4e66924705dd0dd95600.

2024/09/17 21:42:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:6.80139                                                                                                                               
[1]	validation-rmse:5.91747                                                                                                                               
[2]	validation-rmse:5.61133                                                                                                                               
[3]	validation-rmse:5.50631                                                                                                                               
[4]	validation-rmse:5.45299                                                                                                                               
[5]	validation-rmse:5.42968                                                                                                                               
[6]	validation-rmse:5.41734                                           






2024/09/17 21:43:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run lyrical-slug-919 at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/015fa78c13244c61b1679f0febb94ac8.

2024/09/17 21:43:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|████████████████████████████████████████████████████████████████████████████████████| 10/10 [04:04<00:00, 24.41s/trial, best loss: 5.160976224940984]
[0]	validation-rmse:7.99622
[1]	validation-rmse:7.17716
[2]	validation-rmse:6.58565
[3]	validation-rmse:6.16765
[4]	validation-rmse:5.87458
[5]	validation-rmse:5.67329
[6]	validation-rmse:5.53747
[7]	validation-rmse:5.43895
[8]	validation-rmse:5.37163
[9]	validation-rmse:5.32768
[10]	validation-rmse:5.28922
[11]	validation-rmse:5.26246
[12]	validation-rmse:5.24231
[13]	validation-rmse:5.22682
[14]	validation-rmse:5.21526
[15]	validation-rmse:5.20712
[16]	validation-rmse:5.19982
[17]	validation-rmse:5.19415
[18]	validation-rmse:5.19060
[19]	validation-rmse:5.18758
[20]	validation-rmse:5.18474
[21]	validation-rmse:5.18422
[22]	validation-rmse:5.18433
[23]	validation-rmse:5.18209
[24]	validation-rmse:5.18217
[25]	validation-rmse:5.18087
[26]	validation-rmse:5.18117
[27]	validation-rmse:5.18023
[28]	validation-rmse:5.18044
[29]	validatio

2024/09/17 21:43:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/e2e66d7be851466da7f92b56e1add8f1.
2024/09/17 21:43:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/diego-mercadoc/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [18]:
# Ask for the MLflow run whose logged "model" artifact should be registered.
# Whitespace picked up when pasting the id is stripped, and an empty answer is
# rejected up front instead of producing an invalid "runs:/" URI.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Ingrese el run_id debee74804e849efa17146dba3483920


Successfully registered model 'nyc-taxi-model'.
2024/09/17 21:50:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 1
Created version '1' of model 'nyc-taxi-model'.


In [19]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
# Use the version that was just registered (`result` from mlflow.register_model)
# instead of a hard-coded "1", so re-running the notebook promotes the new
# version rather than silently re-tagging the first one.
model_version = str(result.version)

# Point the "champion" alias at that version of model "nyc-taxi-model".
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1726631447429, current_stage='None', description='The model version 1 was transitioned to champion on 2024-09-17 21:51:16.881680', last_updated_timestamp=1726631476589, name='nyc-taxi-model', run_id='debee74804e849efa17146dba3483920', run_link='', source='mlflow-artifacts:/4a2f89adc0ff477ead9bc8f38ff1a8ba/debee74804e849efa17146dba3483920/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [20]:
import mlflow.pyfunc

# Resolve the registered model through its alias rather than a fixed version.
model_name = "nyc-taxi-model"
alias = "champion"
model_uri = "models:/{}@{}".format(model_name, alias)

champion_version = mlflow.pyfunc.load_model(model_uri=model_uri)

# Check that the loaded model can score the validation features.
champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([19.053276, 28.44469 ,  9.331161, ..., 45.867966, 13.883814,
       19.686247], dtype=float32)