In [1]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [2]:
def read_dataframe(filename):
    """Load one month of green-taxi trips and prepare it for modelling.

    Reads the parquet file, derives the trip ``duration`` in minutes from the
    pickup and dropoff timestamps, keeps only trips lasting between 1 and 60
    minutes (both inclusive) and casts the pickup/dropoff location IDs to
    strings.

    Args:
        filename: Path of the parquet file; passed straight to ``pd.read_parquet``.

    Returns:
        pandas.DataFrame: The filtered trips, including the added ``duration``
        column and string-typed ``PULocationID`` / ``DOLocationID`` columns.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so that the column assignments
    # below (and any the caller makes later) write to an independent frame
    # rather than to a slice of the raw one.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# The 2024-01 file feeds the training frame, the 2024-02 file the validation frame.
df_train, df_val = map(
    read_dataframe,
    (
        '../data/green_tripdata_2024-01.parquet',
        '../data/green_tripdata_2024-02.parquet',
    ),
)

In [4]:
# Join pickup and dropoff location IDs into one 'PU_DO' pair column on both frames.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [5]:
# Model inputs: the combined pickup/dropoff pair plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']

# One dict per trip, which is the input format DictVectorizer consumes.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# The vectorizer is fitted on the training records only; validation reuses that fit.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [6]:
# Regression target (the 'duration' column) extracted from each frame via .values.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [7]:
import dagshub
import mlflow


# Connect this session to the DagsHub repository with MLflow integration enabled.
# The tracking URI printed below is the repository's `.mlflow` endpoint, so this
# call is what points MLflow at the remote server.
dagshub.init(url="https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction", mlflow=True)

# Read back the tracking URI that is now active so it can be shown and reused.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

# Re-apply the same URI explicitly, then select the experiment that all runs in
# this notebook are logged under. set_experiment is the last expression, so its
# returned Experiment object is what the cell displays.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow


<Experiment: artifact_location='mlflow-artifacts:/5b5314a67f894d4d9aa944625869aa1c', creation_time=1726630094011, experiment_id='0', last_update_time=1726630094011, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [8]:
# Wrap the training and validation arrays as named MLflow dataset objects.
# NOTE(review): X_train / X_val come from DictVectorizer(), which presumably returns
# SciPy sparse matrices by default; if so, `.data` is only the flat array of stored
# values (not one row per trip) and will not line up with the targets - confirm.
# NOTE(review): neither dataset is passed to mlflow.log_input in the cells shown here,
# so they are not attached to any run unless that happens elsewhere.
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")


In [9]:
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np

## Random forest

In [10]:
# Turn on MLflow's scikit-learn autologging before any estimator in this cell is fitted.
mlflow.sklearn.autolog()
 
def objective_rf(params):
    """Hyperopt objective: train one random forest and score it on validation data.

    Opens a nested MLflow run, tags it with ``model_family=random_forest``, fits a
    ``RandomForestRegressor`` on the notebook-level ``X_train`` / ``y_train`` and
    logs the validation RMSE computed on ``X_val`` / ``y_val``.

    Args:
        params: Mapping with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``. Values may be floats
            (as sampled by ``hp.quniform``); they are cast to ``int``.

    Returns:
        dict: ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    # Cast once, up front, so the values that are logged are exactly the values
    # the estimator receives (e.g. 112 rather than 112.0).
    # NOTE(review): autologging presumably records the estimator's own parameters
    # under the same keys; logging the cast values keeps both consistent - confirm.
    model_params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_leaf']),
    }

    with mlflow.start_run(nested=True):
        # The tag makes the trials of this model family easy to filter later.
        mlflow.set_tag("model_family", "random_forest")
        mlflow.log_params(model_params)

        # Fixed random_state keeps a given parameter set reproducible.
        rf_model = RandomForestRegressor(**model_params, random_state=42)
        rf_model.fit(X_train, y_train)

        # Score on the held-out validation month.
        y_pred = rf_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}
 
# Search space for RandomForest: every hyperparameter is sampled with
# hp.quniform in steps of 1 between the (low, high) bounds listed here.
search_space_rf = {
    label: hp.quniform(label, low, high, 1)
    for label, (low, high) in {
        'n_estimators': (50, 200),
        'max_depth': (5, 10),
        'min_samples_split': (2, 8),
        'min_samples_leaf': (1, 3),
    }.items()
}
 
# Run hyperparameter optimization for RandomForest under one parent run;
# every trial opens its own nested child run inside objective_rf.
# NOTE(review): nested=True is kept from the original; with no enclosing active
# run this presumably just starts a top-level run - confirm that is intended.
with mlflow.start_run(run_name="Random Forest", nested=True):
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials(),
        # Seed the sampler so Restart & Run All explores the same trials.
        rstate=np.random.default_rng(42),
    )

    # The sampled values come back as floats; log them as the integers the
    # model is actually built with (objective_rf casts them the same way).
    mlflow.log_params({name: int(value) for name, value in best_params_rf.items()})



  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 15:45:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run welcoming-stork-952 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/d6c0ce2c1f414c7980e59b932355b1c8.

2024/09/20 15:45:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:18<02:46, 18.48s/trial, best loss: 5.421237858812696]



2024/09/20 15:45:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run adorable-fox-777 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a9c71dc53e2841749ece654deb960d5d.

2024/09/20 15:45:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 20%|██        | 2/10 [00:26<01:40, 12.54s/trial, best loss: 5.421237858812696]



2024/09/20 15:45:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run indecisive-crab-140 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/f17a0a97254c492b93fe6f84e1d67c33.

2024/09/20 15:45:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 30%|███       | 3/10 [00:46<01:50, 15.81s/trial, best loss: 5.421237858812696]



2024/09/20 15:45:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run wistful-midge-240 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/599acd3b370d4905bb814244699eb544.

2024/09/20 15:45:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 40%|████      | 4/10 [00:59<01:27, 14.64s/trial, best loss: 5.421237858812696]



2024/09/20 15:45:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run clean-quail-489 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/e8ebb2ef3f2c43ecb202224f1bf565be.

2024/09/20 15:45:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 50%|█████     | 5/10 [01:10<01:06, 13.25s/trial, best loss: 5.421237858812696]



2024/09/20 15:46:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run brawny-sponge-349 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/bbda32dd8422478ab103185822fc2b96.

2024/09/20 15:46:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 60%|██████    | 6/10 [01:22<00:52, 13.07s/trial, best loss: 5.421237858812696]



2024/09/20 15:46:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run useful-sow-562 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/56128bfefced44f394b15b792ff89d01.

2024/09/20 15:46:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 70%|███████   | 7/10 [01:33<00:37, 12.35s/trial, best loss: 5.421237858812696]



2024/09/20 15:46:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run able-squirrel-426 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/6143a55380aa41c28ab57cd7e7dc3927.

2024/09/20 15:46:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 80%|████████  | 8/10 [01:47<00:25, 12.87s/trial, best loss: 5.421237858812696]



2024/09/20 15:46:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run gifted-pig-173 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/9d9e5b00d70c4628a0407520e6d377e5.

2024/09/20 15:46:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [02:03<00:13, 13.81s/trial, best loss: 5.421237858812696]



2024/09/20 15:46:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run chill-ox-44 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/8f2a5f100b3e4876ac82f4258dcfb44d.

2024/09/20 15:46:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [02:14<00:00, 13.41s/trial, best loss: 5.421237858812696]


2024/09/20 15:47:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/95850359032a4159812854ee3465e5c9.
2024/09/20 15:47:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [12]:
# Look up the best random-forest trial by validation RMSE instead of pasting a
# run ID: run IDs change on every execution, so a hard-coded ID goes stale after
# Restart & Run All and can silently point at a trial that is not the best one.
best_rf_runs = mlflow.search_runs(
    filter_string="tags.model_family = 'random_forest' and attributes.status = 'FINISHED'",
    order_by=["metrics.rmse ASC"],
    max_results=1,
)
if best_rf_runs.empty:
    raise RuntimeError(
        "No finished run tagged model_family='random_forest' was found in the active experiment."
    )

run_id = best_rf_runs.iloc[0]["run_id"]
# Same artifact path as before; assumes autologging stored the fitted estimator
# under 'model' in each trial run - confirm in the tracking UI.
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 15:49:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 2
Created version '2' of model 'nyc-taxi-model'.


## Gradient Boosting

In [13]:
# Turn on MLflow's scikit-learn autologging before any estimator in this cell is fitted.
mlflow.sklearn.autolog()
# GradientBoostingRegressor is first needed here; RandomForestRegressor is re-imported alongside it.
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
 
def objective_gb(params):
    """Hyperopt objective: train one gradient-boosting model and score it.

    Opens a nested MLflow run, tags it with ``model_family=gradient_boosting``,
    fits a ``GradientBoostingRegressor`` on the notebook-level ``X_train`` /
    ``y_train`` and logs the validation RMSE computed on ``X_val`` / ``y_val``.

    Args:
        params: Mapping with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_split``, ``min_samples_leaf`` (cast to ``int``) and
            ``learning_rate`` (cast to ``float``).

    Returns:
        dict: ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    # Cast once, up front, so the values that are logged are exactly the values
    # the estimator receives (e.g. 112 rather than 112.0).
    # NOTE(review): autologging presumably records the estimator's own parameters
    # under the same keys; logging the cast values keeps both consistent - confirm.
    model_params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'learning_rate': float(params['learning_rate']),
    }

    with mlflow.start_run(nested=True):
        # The tag makes the trials of this model family easy to filter later.
        mlflow.set_tag("model_family", "gradient_boosting")
        mlflow.log_params(model_params)

        # Fixed random_state keeps a given parameter set reproducible.
        gb_model = GradientBoostingRegressor(**model_params, random_state=42)
        gb_model.fit(X_train, y_train)

        # Score on the held-out validation month.
        y_pred = gb_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}
 
# Search space for GradientBoosting: the tree-shape hyperparameters are sampled
# with hp.quniform in steps of 1 between the listed (low, high) bounds, and the
# learning rate is sampled with hp.uniform.
search_space_gb = {
    **{
        label: hp.quniform(label, low, high, 1)
        for label, (low, high) in {
            'n_estimators': (50, 200),
            'max_depth': (3, 8),
            'min_samples_split': (2, 6),
            'min_samples_leaf': (1, 3),
        }.items()
    },
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
}
 
# Run hyperparameter optimization for GradientBoosting under one parent run;
# every trial opens its own nested child run inside objective_gb.
# NOTE(review): nested=True is kept from the original; with no enclosing active
# run this presumably just starts a top-level run - confirm that is intended.
with mlflow.start_run(run_name="Gradient Boosting", nested=True):
    best_params_gb = fmin(
        fn=objective_gb,
        space=search_space_gb,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials(),
        # Seed the sampler so Restart & Run All explores the same trials.
        rstate=np.random.default_rng(42),
    )

    # Log the winning parameters in the form the model is built with: integers
    # for the tree-shape settings, the learning rate left as a float.
    mlflow.log_params({
        name: (value if name == 'learning_rate' else int(value))
        for name, value in best_params_gb.items()
    })



  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 15:59:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run fun-dove-795 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/8cfc39b505394ec893b3b0792f0bafde.

2024/09/20 15:59:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:12<01:54, 12.75s/trial, best loss: 5.375466523736079]



2024/09/20 15:59:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run skittish-duck-865 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/0454d93bbbdb4ca999b942904dfa8d3e.

2024/09/20 15:59:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 20%|██        | 2/10 [00:20<01:20, 10.07s/trial, best loss: 5.375466523736079]



2024/09/20 15:59:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run enthused-turtle-812 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/42f4959032b54798a10245c8aab06a74.

2024/09/20 15:59:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 30%|███       | 3/10 [00:29<01:06,  9.46s/trial, best loss: 5.3178234853073185]



2024/09/20 15:59:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run delicate-lark-685 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/eefcc7ee9d4f4166829f7b5d06a9c1a5.

2024/09/20 15:59:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 40%|████      | 4/10 [00:41<01:02, 10.34s/trial, best loss: 5.271957658663778] 



2024/09/20 15:59:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run flawless-toad-737 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/5f933dbcdb504a5db258776badec7ca7.

2024/09/20 15:59:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 50%|█████     | 5/10 [00:48<00:46,  9.24s/trial, best loss: 5.271957658663778]



2024/09/20 16:00:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run enthused-auk-408 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/f893787eece04c5598e53eaa1646b6bf.

2024/09/20 16:00:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 60%|██████    | 6/10 [00:57<00:36,  9.07s/trial, best loss: 5.271957658663778]



2024/09/20 16:00:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run funny-asp-922 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/3bc69f36688b446cb1ad191ee9d237ba.

2024/09/20 16:00:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 70%|███████   | 7/10 [01:06<00:27,  9.24s/trial, best loss: 5.271957658663778]



2024/09/20 16:00:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run useful-crab-633 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/2b661d5f37684ce39d3ed95ebb271324.

2024/09/20 16:00:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 80%|████████  | 8/10 [01:18<00:20, 10.06s/trial, best loss: 5.271957658663778]



2024/09/20 16:00:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run charming-snipe-935 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/7cd8b6530cc241398278de2c0ba0a0d5.

2024/09/20 16:00:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [01:25<00:09,  9.04s/trial, best loss: 5.271957658663778]



2024/09/20 16:00:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run salty-ape-266 at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/200b7076ebba4ecdb08fc140e507f807.

2024/09/20 16:00:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [01:32<00:00,  9.23s/trial, best loss: 5.271957658663778]


2024/09/20 16:00:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run Gradient Boosting at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/3eb039be095d424da79f770e90362b16.
2024/09/20 16:00:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dafnetamayo/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [14]:

# Look up the best gradient-boosting trial by validation RMSE instead of pasting
# a run ID: run IDs change on every execution, so a hard-coded ID goes stale
# after Restart & Run All and can silently point at a trial that is not the best
# one (the tuning log shows the loss still improving after the early trials).
best_gb_runs = mlflow.search_runs(
    filter_string="tags.model_family = 'gradient_boosting' and attributes.status = 'FINISHED'",
    order_by=["metrics.rmse ASC"],
    max_results=1,
)
if best_gb_runs.empty:
    raise RuntimeError(
        "No finished run tagged model_family='gradient_boosting' was found in the active experiment."
    )

run_id = best_gb_runs.iloc[0]["run_id"]
# Same artifact path as before; assumes autologging stored the fitted estimator
# under 'model' in each trial run - confirm in the tracking UI.
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 16:02:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 3
Created version '3' of model 'nyc-taxi-model'.
