In [1]:
# Create the directory if it doesn't exist
!mkdir -p ../data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1330k  100 1330k    0     0  2092k      0 --:--:-- --:--:-- --:--:-- 2095k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1253k  100 1253k    0     0  2290k      0 --:--:-- --:--:-- --:--:-- 2291k


In [2]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [3]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (trip length in minutes, computed from the
    ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime`` columns), keeps only
    trips lasting between 1 and 60 minutes inclusive, and casts the
    ``PULocationID`` / ``DOLocationID`` columns to strings.

    Args:
        filename: Location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new ``pandas.DataFrame`` containing only the retained trips.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so that the column assignment
    # below writes to an independent frame rather than to a slice of ``df``
    # (which is what triggers pandas' SettingWithCopyWarning).
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df.loc[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# January 2024 is the training split, February 2024 the validation split.
df_train, df_val = (
    read_dataframe('../data/green_tripdata_2024-01.parquet'),
    read_dataframe('../data/green_tripdata_2024-02.parquet'),
)

In [5]:
# Combine pickup and drop-off IDs into a single "<pickup>_<dropoff>" feature
# on both splits.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [6]:
# Model inputs: the combined pickup/drop-off ID plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

# Turn each row into a {feature: value} record for the vectorizer.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training records only; the validation records are
# transformed with the already-fitted mapping.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [7]:
# Regression target: the `duration` column of each split, as plain arrays.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [33]:
import dagshub
import mlflow

# Initialise the DagsHub integration for this repository with MLflow enabled.
dagshub.init(
    repo_owner='JuanPab2009',
    repo_name='nyc-taxi-time-prediction',
    mlflow=True,
)

# Keep the active tracking URI in a constant (reused later when creating the
# MlflowClient) and show it for reference.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print("MLFLOW_TRACKING_URI:", MLFLOW_TRACKING_URI)

# Select the experiment used by every run below; left as the cell's last
# expression so the Experiment object is displayed.
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

MLFLOW_TRACKING_URI: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow


2024/10/23 23:36:35 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/7d7e7f9ebf5a4fa09de8217839da801c', creation_time=1729748195362, experiment_id='2', last_update_time=1729748195362, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [34]:
# Describe the train/validation splits as MLflow datasets.
# They are built from the DataFrames rather than from `X_train.data`:
# X_train / X_val are produced by a default DictVectorizer, and for a sparse
# matrix `.data` is only the flat array of stored non-zero values. That is
# neither the feature matrix nor row-aligned with the targets.
dataset_columns = categorical + numerical + [target]
training_dataset = mlflow.data.from_pandas(
    df_train[dataset_columns], targets=target, name="green_tripdata_2024-01"
)
validation_dataset = mlflow.data.from_pandas(
    df_val[dataset_columns], targets=target, name="green_tripdata_2024-02"
)

In [35]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object for the repository's bucket
s3 = get_repo_bucket_client("JuanPab2009/nyc-taxi-time-prediction")

# Upload each local parquet file under its remote key
for local_path, remote_key in (
    ("../data/green_tripdata_2024-01.parquet", "train_data.parquet"),
    ("../data/green_tripdata_2024-02.parquet", "eval_data.parquet"),
):
    s3.upload_file(
        Bucket="nyc-taxi-time-prediction",  # name of the repo
        Filename=local_path,  # local path of file to upload
        Key=remote_key,  # remote path where to upload the file
    )

In [36]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

In [37]:
# Wrap the feature matrices and their labels in xgboost DMatrix containers.
train, valid = (
    xgb.DMatrix(X_train, label=y_train),
    xgb.DMatrix(X_val, label=y_val),
)

In [38]:
def objective(params):
    """Hyperopt objective: train one XGBoost model and report validation RMSE.

    The work happens inside its own nested MLflow run, which records the
    candidate ``params``, the trained booster (artifact path ``"model"``) and
    the ``rmse`` metric computed on the validation data.

    Args:
        params: Parameter dict handed unchanged to ``xgb.train``.

    Returns:
        dict with ``loss`` (the validation RMSE) and ``status`` (``STATUS_OK``).
    """
    with mlflow.start_run(nested=True):
        # Identify the run and record the candidate configuration.
        mlflow.set_tag("model_family", "xgboost")
        mlflow.log_params(params)

        # Train for up to 100 rounds, evaluating on the validation set with
        # early stopping configured at 10 rounds.
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10,
        )

        # Store the trained booster with the run.
        mlflow.xgboost.log_model(model, artifact_path="model")

        # Score the model on the validation set and record the result.
        val_rmse = root_mean_squared_error(y_val, model.predict(valid))
        mlflow.log_metric("rmse", val_rmse)

        return {'loss': val_rmse, 'status': STATUS_OK}

In [39]:
mlflow.xgboost.autolog()

# Hyperopt search space; 'objective' and 'seed' are fixed values, the other
# entries are sampled.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    # Evaluate `objective` 10 times using the TPE suggestion algorithm.
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials(),
    )

    # Turn the search result into a full xgboost parameter dict: integer
    # max_depth plus the fixed seed and objective.
    best_params.update(
        max_depth=int(best_params["max_depth"]),
        seed=42,
        objective="reg:squarederror",
    )
    mlflow.log_params(best_params)

    # Describe the parent run.
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Refit a model with the selected parameters and score it on validation.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10,
    )
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Pickle the fitted DictVectorizer and attach it to the run, so the same
    # preprocessing is available alongside the model.
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:6.81742                           
[1]	validation-rmse:5.90665                           
[2]	validation-rmse:5.58350                           
[3]	validation-rmse:5.46275                           
[4]	validation-rmse:5.41627                           
[5]	validation-rmse:5.39170                           
[6]	validation-rmse:5.37496                           
[7]	validation-rmse:5.36733                           
[8]	validation-rmse:5.35713                           
[9]	validation-rmse:5.34845                           
[10]	validation-rmse:5.34473                          
[11]	validation-rmse:5.34145                          
[12]	validation-rmse:5.33968                          
[13]	validation-rmse:5.33772                          
[14]	validation-rmse:5.33524                          
[15]	validation-rmse:5.33325                          
[16]	validation-rmse:5.33100                          
[17]	validation-rmse:5.33025                          
[18]	valid






2024/10/23 23:44:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run unruly-fish-750 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/e9943dc0fdda4300bdf440595fbe49d2.

2024/10/23 23:44:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:8.73635                                                    
[1]	validation-rmse:8.39394                                                    
[2]	validation-rmse:8.08100                                                    
[3]	validation-rmse:7.79681                                                    
[4]	validation-rmse:7.53813                                                    
[5]	validation-rmse:7.30365                                                    
[6]	validation-rmse:7.09094                                                    
[7]	validation-rmse:6.89799                                                    
[8]	validation-rmse:6.72505                                                    
[9]	validation-rmse:6.56871                                                    
[10]	validation-rmse:6.42753                                                   
[11]	validation-rmse:6.30048                                                   
[12]	validation-rmse:6.18735            






2024/10/23 23:44:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run rambunctious-yak-459 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/b7c04cacf7704f9e8a3085668f3bdbed.

2024/10/23 23:44:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:7.36171                                                    
[1]	validation-rmse:6.40033                                                   
[2]	validation-rmse:5.89042                                                   
[3]	validation-rmse:5.61683                                                   
[4]	validation-rmse:5.47372                                                   
[5]	validation-rmse:5.39636                                                   
[6]	validation-rmse:5.34523                                                   
[7]	validation-rmse:5.32764                                                   
[8]	validation-rmse:5.31875                                                   
[9]	validation-rmse:5.31177                                                   
[10]	validation-rmse:5.29990                                                  
[11]	validation-rmse:5.29416                                                  
[12]	validation-rmse:5.29208                       






2024/10/23 23:45:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run vaunted-fox-314 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/67dedba9200d4170a0a28175c4020902.

2024/10/23 23:45:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:5.57486                                                   
[1]	validation-rmse:5.49221                                                   
[2]	validation-rmse:5.47926                                                   
[3]	validation-rmse:5.46571                                                   
[4]	validation-rmse:5.46030                                                   
[5]	validation-rmse:5.44932                                                   
[6]	validation-rmse:5.44399                                                   
[7]	validation-rmse:5.44103                                                   
[8]	validation-rmse:5.43623                                                   
[9]	validation-rmse:5.43366                                                   
[10]	validation-rmse:5.42894                                                  
[11]	validation-rmse:5.42439                                                  
[12]	validation-rmse:5.41893                        






2024/10/23 23:45:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run classy-fawn-200 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/0a570507b19b4490bbe16ce1d0ce6101.

2024/10/23 23:45:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:8.49512                                                   
[1]	validation-rmse:7.97822                                                   
[2]	validation-rmse:7.54568                                                   
[3]	validation-rmse:7.18503                                                   
[4]	validation-rmse:6.88651                                                   
[5]	validation-rmse:6.63994                                                   
[6]	validation-rmse:6.43902                                                   
[7]	validation-rmse:6.27404                                                   
[8]	validation-rmse:6.14212                                                   
[9]	validation-rmse:6.03453                                                   
[10]	validation-rmse:5.94874                                                  
[11]	validation-rmse:5.88041                                                  
[12]	validation-rmse:5.82331                        






2024/10/23 23:45:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run unequaled-slug-9 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/639d8a1f94804b7bb1625d61cbfcd61a.

2024/10/23 23:45:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:6.57103                                                   
[1]	validation-rmse:5.77047                                                   
[2]	validation-rmse:5.54491                                                   
[3]	validation-rmse:5.47172                                                   
[4]	validation-rmse:5.43346                                                   
[5]	validation-rmse:5.41938                                                   
[6]	validation-rmse:5.41155                                                   
[7]	validation-rmse:5.39361                                                   
[8]	validation-rmse:5.38271                                                   
[9]	validation-rmse:5.37893                                                   
[10]	validation-rmse:5.36149                                                  
[11]	validation-rmse:5.35850                                                  
[12]	validation-rmse:5.35198                        






2024/10/23 23:46:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run lyrical-cub-107 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/edb293b2daf048fd953a763a992d2b24.

2024/10/23 23:46:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:8.36726                                                   
[1]	validation-rmse:7.75669                                                   
[2]	validation-rmse:7.25949                                                   
[3]	validation-rmse:6.85856                                                   
[4]	validation-rmse:6.53828                                                   
[5]	validation-rmse:6.28396                                                   
[6]	validation-rmse:6.08310                                                   
[7]	validation-rmse:5.92500                                                   
[8]	validation-rmse:5.80150                                                   
[9]	validation-rmse:5.70496                                                   
[10]	validation-rmse:5.62880                                                  
[11]	validation-rmse:5.56945                                                  
[12]	validation-rmse:5.52298                        






2024/10/23 23:59:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run abrasive-roo-681 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/42a3791316d94a8685f6a094a5fdd4f7.

2024/10/23 23:59:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:8.26735                                                    
[1]	validation-rmse:7.58881                                                    
[2]	validation-rmse:7.05001                                                    
[3]	validation-rmse:6.62393                                                    
[4]	validation-rmse:6.29418                                                    
[5]	validation-rmse:6.03839                                                    
[6]	validation-rmse:5.84084                                                    
[7]	validation-rmse:5.69014                                                    
[8]	validation-rmse:5.57545                                                    
[9]	validation-rmse:5.48868                                                    
[10]	validation-rmse:5.42303                                                   
[11]	validation-rmse:5.37357                                                   
[12]	validation-rmse:5.33676            






2024/10/23 23:59:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run stylish-sheep-1 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/185f6aaece484810ac65ea253f1b0f5c.

2024/10/23 23:59:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:5.62715                                                     
[1]	validation-rmse:5.47472                                                     
[2]	validation-rmse:5.43901                                                     
[3]	validation-rmse:5.40158                                                     
[4]	validation-rmse:5.39108                                                     
[5]	validation-rmse:5.38280                                                     
[6]	validation-rmse:5.37741                                                     
[7]	validation-rmse:5.37438                                                     
[8]	validation-rmse:5.35392                                                     
[9]	validation-rmse:5.34762                                                     
[10]	validation-rmse:5.34574                                                    
[11]	validation-rmse:5.34132                                                    
[12]	validation-rmse:5.33651






2024/10/24 00:00:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run wise-sponge-407 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/5556d81ba5db4b77a4602ce469280d39.

2024/10/24 00:00:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



[0]	validation-rmse:8.51103                                                     
[1]	validation-rmse:7.99363                                                     
[2]	validation-rmse:7.55908                                                     
[3]	validation-rmse:7.18234                                                     
[4]	validation-rmse:6.87065                                                     
[5]	validation-rmse:6.60327                                                     
[6]	validation-rmse:6.37402                                                     
[7]	validation-rmse:6.19101                                                     
[8]	validation-rmse:6.03957                                                     
[9]	validation-rmse:5.90507                                                     
[10]	validation-rmse:5.80709                                                    
[11]	validation-rmse:5.72635                                                    
[12]	validation-rmse:5.65646






2024/10/24 00:00:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run trusting-panda-814 at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/cbcc97b1de7a4ae0ac38db6740a1c1df.

2024/10/24 00:00:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.



100%|██████████| 10/10 [17:36<00:00, 105.67s/trial, best loss: 5.171803119607092]
[0]	validation-rmse:8.26735
[1]	validation-rmse:7.58881
[2]	validation-rmse:7.05001
[3]	validation-rmse:6.62393
[4]	validation-rmse:6.29418
[5]	validation-rmse:6.03839
[6]	validation-rmse:5.84084
[7]	validation-rmse:5.69014
[8]	validation-rmse:5.57545
[9]	validation-rmse:5.48868
[10]	validation-rmse:5.42303
[11]	validation-rmse:5.37357
[12]	validation-rmse:5.33676
[13]	validation-rmse:5.30857
[14]	validation-rmse:5.28675
[15]	validation-rmse:5.27136
[16]	validation-rmse:5.25791
[17]	validation-rmse:5.24786
[18]	validation-rmse:5.23792
[19]	validation-rmse:5.23051
[20]	validation-rmse:5.22404
[21]	validation-rmse:5.21867
[22]	validation-rmse:5.21405
[23]	validation-rmse:5.20999
[24]	validation-rmse:5.20651
[25]	validation-rmse:5.20467
[26]	validation-rmse:5.20228
[27]	validation-rmse:5.20081
[28]	validation-rmse:5.19986
[29]	validation-rmse:5.19865
[30]	validation-rmse:5.19678
[31]	validation-rmse:5.19632


2024/10/24 00:00:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/bfc86a11a706470d981cb8a74a98e6a1.
2024/10/24 00:00:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/JuanPab2009/nyc-taxi-time-prediction.mlflow/#/experiments/2.


In [44]:
# Ask for the run whose logged "model" artifact should be registered.
# The answer is stripped because a pasted ID with surrounding whitespace
# would otherwise end up inside the model URI; an empty answer is rejected
# up front instead of producing an invalid "runs://model" URI.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/10/24 00:12:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 1
Created version '1' of model 'nyc-taxi-model'.


In [45]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
# Use the version created by the register_model call above rather than a
# hard-coded "1": registering again creates a new version, and a fixed number
# would leave the alias on the old one.
model_version = str(result.version)

# point the "champion" alias at that version of model "nyc-taxi-model"
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

# Record when the alias was assigned; kept as the cell's last expression so
# the updated ModelVersion is displayed.
client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1729750350786, current_stage='None', description='The model version 1 was transitioned to champion on 2024-10-24 00:12:36.021737', last_updated_timestamp=1729750356498, name='nyc-taxi-model', run_id='185f6aaece484810ac65ea253f1b0f5c', run_link='', source='mlflow-artifacts:/7d7e7f9ebf5a4fa09de8217839da801c/185f6aaece484810ac65ea253f1b0f5c/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [46]:
import mlflow.pyfunc

# Address the registered model through its alias rather than a version number.
model_name, alias = "nyc-taxi-model", "champion"
model_uri = f"models:/{model_name}@{alias}"

# Load the aliased version as a generic pyfunc model.
champion_version = mlflow.pyfunc.load_model(model_uri=model_uri)

# Predict on the validation features; the result is displayed as cell output.
champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([19.11525 , 28.429977,  9.304751, ..., 40.515846, 13.561398,
       19.851883], dtype=float32)