In [1]:
!python -V

Python 3.10.14


In [9]:
import pandas as pd

In [34]:
import mlflow


mlflow.set_tracking_uri("sqlite:////Users/user/notebooks/mlflow.db")

In [53]:
import os
import pickle
import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename: str):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)
    
def run_optimization(data_path: str, num_trials: int):

    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(params):

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

        return {'loss': rmse, 'status': STATUS_OK}

    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }


    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )
    mlflow.end_run()

run_optimization("./output", 15)

  0%|                                                                                                                                                              | 0/15 [00:00<?, ?trial/s, best loss=?]

2024/09/13 19:55:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '006ad0786c7a43099da932c869d498ec', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


2024/09/13 19:55:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run nebulous-bear-991 at: http://127.0.0.1:5000/#/experiments/2/runs/006ad0786c7a43099da932c869d498ec.

2024/09/13 19:55:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



  7%|████████▊                                                                                                                            | 1/15 [00:08<02:02,  8.76s/trial, best loss: 5.370086069268862]




 13%|█████████████████▋                                                                                                                   | 2/15 [00:13<01:20,  6.21s/trial, best loss: 5.370086069268862]





 20%|██████████████████████████▌                                                                                                          | 3/15 [00:16<00:58,  4.86s/trial, best loss: 5.370086069268862]





 27%|███████████████████████████████████▍                                                                                                 | 4/15 [00:21<00:55,  5.07s/trial, best loss: 5.357490752366866]





 33%|████████████████████████████████████████████▎                                                                                        | 5/15 [00:26<00:47,  4.80s/trial, best loss: 5.357490752366866]





 40%|█████████████████████████████████████████████████████▏                                                                               | 6/15 [00:33<00:51,  5.67s/trial, best loss: 5.354700855292386]





 47%|██████████████████████████████████████████████████████████████                                                                       | 7/15 [00:40<00:48,  6.12s/trial, best loss: 5.354700855292386]





 53%|██████████████████████████████████████████████████████████████████████▉                                                              | 8/15 [00:43<00:36,  5.26s/trial, best loss: 5.354700855292386]





 60%|███████████████████████████████████████████████████████████████████████████████▊                                                     | 9/15 [00:49<00:32,  5.41s/trial, best loss: 5.354700855292386]





 67%|████████████████████████████████████████████████████████████████████████████████████████                                            | 10/15 [00:54<00:26,  5.31s/trial, best loss: 5.354700855292386]





 73%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 11/15 [00:59<00:20,  5.06s/trial, best loss: 5.335419588556921]





 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 12/15 [01:03<00:14,  4.90s/trial, best loss: 5.335419588556921]





 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 13/15 [01:07<00:08,  4.47s/trial, best loss: 5.335419588556921]





 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 14/15 [01:11<00:04,  4.48s/trial, best loss: 5.335419588556921]





100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [01:17<00:00,  5.14s/trial, best loss: 5.335419588556921]

2024/09/13 19:56:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run serious-hawk-682 at: http://127.0.0.1:5000/#/experiments/2/runs/8bd31999f91741bb993d93aa561f2af2.
2024/09/13 19:56:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.





In [6]:
from mlflow.tracking import MlflowClient
MLFLOW_TRACKING_URI = "sqlite:////Users/user/notebooks/mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [63]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='2',
    filter_string="metrics.rmse < 7",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: 8bd31999f91741bb993d93aa561f2af2, rmse: 5.3550
run id: 6ff90da576d942ae8b21b644bf1aaae9, rmse: 5.3550
run id: b16b2fe23c5244b8b8a93b5a075c138a, rmse: 5.3550


In [88]:
import os
import pickle
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()


def load_pickle(filename):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def train_and_log_model(data_path, params):
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        new_params = {}
        for param in RF_PARAMS:
            new_params[param] = int(params[param])

        rf = RandomForestRegressor(**new_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
        mlflow.log_metric("test_rmse", test_rmse)



In [91]:
def run_register_model(data_path: str, top_n: int):

    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )
    for run in runs:
        train_and_log_model(data_path=data_path, params=run.data.params)

    # Select the model with the lowest test RMSE
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.test_rmse ASC"]
    )[0]
    # Register the best model
    run_id = best_run.info.run_id
    print(f"{run_id}")
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri, name="rf-best-model")

In [92]:
run_register_model("./output", 5)

2024/09/13 21:07:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run inquisitive-bass-362 at: http://127.0.0.1:5000/#/experiments/3/runs/664319d3ec2b47a689b4c1c2425669fd.
2024/09/13 21:07:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3.
2024/09/13 21:08:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run shivering-cat-390 at: http://127.0.0.1:5000/#/experiments/3/runs/70adf3b5b9f04846a28457db24e7a4c8.
2024/09/13 21:08:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3.
2024/09/13 21:08:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run delicate-stork-50 at: http://127.0.0.1:5000/#/experiments/3/runs/a0c250206f294e6fa1eb0dc271ff5729.
2024/09/13 21:08:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3.
2024/09/13 21:08:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run uniq

a0c250206f294e6fa1eb0dc271ff5729


Created version '5' of model 'rf-best-model'.
