In [1]:
import warnings

import mlflow
from mlflow import MlflowClient
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Load Kedro's IPython extension.
# NOTE(review): the next cell uses `catalog` without defining it, so that name
# is presumably injected into the namespace by this extension — confirm.
%load_ext kedro.ipython

In [3]:
# Load the train/test feature and target datasets from the Kedro data catalog.
# Each dataset is bound to exactly one name: the former chained `= df =` alias
# was overwritten on every line (ending up as a second name for y_test) and is
# not read by any cell in this notebook.
X_train = catalog.load("X_train_data")
X_test = catalog.load("X_test_data")
y_train = catalog.load("y_train_data")
y_test = catalog.load("y_test_data")

In [6]:
# Point MLflow at the tracking server running on this machine.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

# Description that is attached as metadata to the runs started below.
description = "starting_example"

In [11]:
# First tracked run: fit a small random forest inside an MLflow run.
# NOTE(review): no experiment has been selected yet at this point in the
# notebook, so this run presumably lands in MLflow's default experiment — confirm.
with mlflow.start_run(run_name="tracking experiment_1", description=description) as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)
    # Marker printed once training has finished (matches the recorded cell output).
    print("1")
# No explicit mlflow.end_run() here: leaving the `with` block already ends the run.

1


In [12]:
# Make "mlflow_first_example" the active experiment for the runs that follow.
# As the recorded log output shows, MLflow creates it when it does not exist yet.
mlflow.set_experiment(experiment_name="mlflow_first_example")

2024/06/22 16:14:56 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_first_example' does not exist. Creating a new experiment.


[1m<[0m[1;95mExperiment:[0m[39m [0m[33martifact_location[0m[39m=[0m[32m'mlflow-artifacts:/669815948000371949'[0m[39m, [0m[33mcreation_time[0m[39m=[0m[1;36m1719069296361[0m[39m, [0m[33mexperiment_id[0m[39m=[0m[32m'669815948000371949'[0m[39m, [0m[33mlast_update_time[0m[39m=[0m[1;36m1719069296361[0m[39m, [0m[33mlifecycle_stage[0m[39m=[0m[32m'active'[0m[39m, [0m[33mname[0m[39m=[0m[32m'mlflow_first_example'[0m[39m, [0m[33mtags[0m[39m=[0m[1;39m{[0m[1;39m}[0m[1m>[0m

In [13]:
# Same training as the first run, now started after an experiment was selected.
with mlflow.start_run(run_name="tracking experiment_2", description=description) as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)
# No explicit mlflow.end_run() here: leaving the `with` block already ends the run.

In [14]:
# Manually log parameters, metrics, a tag and the fitted model for one run.
mlflow.set_experiment("mlflow_first_example")

with mlflow.start_run(run_name="params_artifacts_logged") as run:
    # Keep the hyperparameters in one dict so the values used to build the
    # model are exactly the values logged to MLflow.
    params = {"n_estimators": 100, "max_depth": 6, "max_features": 3}

    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Metrics computed on the test data.
    explained_variance = explained_variance_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Log the relevant objects on the active run.
    mlflow.log_params(params)
    mlflow.log_param("test", "test")
    mlflow.log_metric("explained_variance", explained_variance)
    mlflow.log_metric("mse", mse)
    mlflow.set_tag("tag", "logged run")

    # Save the fitted model as a run artifact.
    mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="random_forest_regressor",
    )
# No explicit mlflow.end_run() here: leaving the `with` block already ends the run.

In [15]:
# Switch to autologging, with model signatures and input examples turned off,
# so the run below is started without any explicit log_* calls.
mlflow.autolog(log_model_signatures=False, log_input_examples=False)

with mlflow.start_run(run_name="run_2") as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)
# No explicit mlflow.end_run() here: leaving the `with` block already ends the run.

2024/06/22 16:17:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


In [16]:
# Switch to a separate experiment for the hyperparameter search runs.
# As the recorded log output shows, MLflow creates it when it does not exist yet.
mlflow.set_experiment(experiment_name="mlflow_parameters_search")

2024/06/22 16:17:47 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_parameters_search' does not exist. Creating a new experiment.


[1m<[0m[1;95mExperiment:[0m[39m [0m[33martifact_location[0m[39m=[0m[32m'mlflow-artifacts:/861406686854474306'[0m[39m, [0m[33mcreation_time[0m[39m=[0m[1;36m1719069467355[0m[39m, [0m[33mexperiment_id[0m[39m=[0m[32m'861406686854474306'[0m[39m, [0m[33mlast_update_time[0m[39m=[0m[1;36m1719069467355[0m[39m, [0m[33mlifecycle_stage[0m[39m=[0m[32m'active'[0m[39m, [0m[33mname[0m[39m=[0m[32m'mlflow_parameters_search'[0m[39m, [0m[33mtags[0m[39m=[0m[1;39m{[0m[1;39m}[0m[1m>[0m

In [17]:
# Re-enable autologging, this time with model signatures and input examples on.
mlflow.autolog(log_model_signatures=True, log_input_examples=True)

# Hyperparameter grid for the search: 3 x 3 x 3 = 27 combinations.
params = {
    "n_estimators": [33, 66, 200],
    "max_depth": [2, 4, 6],
    "max_features": [3, 4, 5],
}

rf = RandomForestRegressor()
searcher = GridSearchCV(estimator=rf, param_grid=params)

with mlflow.start_run(run_name="autolog_with_grid_search") as run:
    searcher.fit(X_train, y_train)
# No explicit mlflow.end_run() here: leaving the `with` block already ends the run.

# Build the "runs:/<run_id>/model" URI for this run; later cells use it to
# register and load the model.
run_id = run.info.run_id
model_path = f"runs:/{run_id}/model"
print(f"Loading model from: {model_path}")

2024/06/22 16:18:01 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


Loading model from: runs:/d3b13de3b9024980b12f9f162e2c2666/model


In [18]:
# Register the model stored at `model_path` in the MLflow Model Registry under
# the name "sk-learn-random-forest-reg"; `result` holds what the call returns.
result = mlflow.register_model(model_uri=model_path, name="sk-learn-random-forest-reg")

Successfully registered model 'sk-learn-random-forest-reg'.
2024/06/22 16:20:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-random-forest-reg, version 1
Created version '1' of model 'sk-learn-random-forest-reg'.


In [19]:
# Client object used by the following cells to manage registry tags and aliases.
client = MlflowClient()

In [20]:
# Set a tag on the registered model. The registered estimator is a
# RandomForestRegressor, so the task is tagged as regression (the previous
# value "classification" mislabelled the model).
client.set_registered_model_tag("sk-learn-random-forest-reg", "task", "regression")

In [21]:
# Set a tag on the model version created by the register_model cell above.
# Using result.version instead of a hard-coded "1" keeps the tag on the version
# that was just registered when the notebook is re-run and a newer version exists.
client.set_model_version_tag(
    "sk-learn-random-forest-reg", result.version, "validation_status", "approved"
)

In [22]:
# Create the "Champion" alias for the model version created by the
# register_model cell above. Using result.version instead of a hard-coded "1"
# keeps the alias on the freshly registered version when the notebook is re-run.
client.set_registered_model_alias("sk-learn-random-forest-reg", "Champion", result.version)

In [23]:
# Look up the model version that the "Champion" alias currently points to.
model_champion = client.get_model_version_by_alias(
    name="sk-learn-random-forest-reg",
    alias="Champion",
)

In [24]:
# Remove the "Champion" alias from the registered model again.
client.delete_registered_model_alias(
    name="sk-learn-random-forest-reg",
    alias="Champion",
)

In [25]:
# Get the model path from the run id.
# This repeats the computation done at the end of the grid-search cell and
# depends on `run` still referring to that run (the recorded output shows the
# same run id as before).
run_id = run.info.run_id
model_path = f"runs:/{run_id}/model"
print(f"Loading model from: {model_path}")

Loading model from: runs:/d3b13de3b9024980b12f9f162e2c2666/model


In [26]:
# Load the model stored at `model_path` through MLflow's sklearn flavor.
loaded_model = mlflow.sklearn.load_model(model_uri=model_path)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Predict on the test features with the model loaded in the previous cell.
# The model is not loaded a second time here: repeating the call only repeated
# the artifact download without changing `loaded_model`.
print("Showing predictions")
print(loaded_model.predict(X_test))