In [11]:
import os
import pickle
import click
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [12]:
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']
LOCAL_TRACKING_SERVER = "http://127.0.0.1:5000"

mlflow.set_tracking_uri(LOCAL_TRACKING_SERVER)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()


In [14]:
def load_pickle(filename):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def train_and_log_model(data_path, params):
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        for param in RF_PARAMS:
            params[param] = int(params[param])

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        # val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
        val_rmse = root_mean_squared_error(y_val, rf.predict(X_val))
        mlflow.log_metric("val_rmse", val_rmse)
        # test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
        test_rmse = root_mean_squared_error(y_test, rf.predict(X_test))
        mlflow.log_metric("test_rmse", test_rmse)


In [17]:
def run_register_model(data_path: str, top_n: int):

    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )
    for run in runs:
        train_and_log_model(data_path=data_path, params=run.data.params)

    # Select the model with the lowest test RMSE
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )[0]

    mlflow.register_model(model_uri=f"runs:/{best_run.info.run_id}/models", 
                          name="best_taxi_experiment_model"
                         )

    return 



In [18]:
e, r = run_register_model("./output", 5)



In [20]:
r[0]

<Run: data=<RunData: metrics={'rmse': 5.311234357303558}, params={'max_depth': 20,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 24,
 'random_state': 42}, tags={'mlflow.runName': 'placid-fawn-521',
 'mlflow.source.name': '/opt/miniconda3/envs/mlops-dtc/lib/python3.12/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'cj',
 'model': 'RandomForestRegressor'}>, info=<RunInfo: artifact_uri='/Users/cj/Documents/Projects/zoomcamp-homework/mlops/mlflow_hw_2/homework/artifacts_local/2/3509c057ffe34942924f3315e15326ef/artifacts', end_time=1716755106743, experiment_id='2', lifecycle_stage='active', run_id='3509c057ffe34942924f3315e15326ef', run_name='placid-fawn-521', run_uuid='3509c057ffe34942924f3315e15326ef', start_time=1716755099905, status='FINISHED', user_id='cj'>, inputs=<RunInputs: dataset_inputs=[]>>

In [25]:
run_id = r[0].info.run_id
mlflow.register_model(model_uri=f"runs:/{run_id}/models", name="best_taxi_experiment_model")


Successfully registered model 'best_taxi_experiment_model'.
2024/05/26 19:01:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best_taxi_experiment_model, version 1
Created version '1' of model 'best_taxi_experiment_model'.


<ModelVersion: aliases=[], creation_timestamp=1716775279774, current_stage='None', description='', last_updated_timestamp=1716775279774, name='best_taxi_experiment_model', run_id='3509c057ffe34942924f3315e15326ef', run_link='', source='/Users/cj/Documents/Projects/zoomcamp-homework/mlops/mlflow_hw_2/homework/artifacts_local/2/3509c057ffe34942924f3315e15326ef/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>

In [21]:
e

<Experiment: artifact_location='/Users/cj/Documents/Projects/zoomcamp-homework/mlops/mlflow_hw_2/homework/artifacts_local/3', creation_time=1716772971613, experiment_id='3', last_update_time=1716772971613, lifecycle_stage='active', name='random-forest-best-models', tags={}>

In [None]:
runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )