# Model tracking with MLflow

MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It provides four primary functions:

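* Tracking experiments to record and compare parameters and results (MLflow Tracking).
* Versioning models in a central model store (MLflow Model Registry).
* Packaging models so they can be served in different environments (MLflow Models).
* Packaging ML code in a reusable, reproducible form (MLflow Projects).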

## Documentation
* Documentation: https://mlflow.org/docs/latest/index.html
* Python API reference (includes the tracking functions used below): https://mlflow.org/docs/latest/python_api/index.html


In [1]:
import os
import mlflow
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [1]:
# Start an MLflow run; it stays active until mlflow.end_run() is called further down,
# so the parameters, metrics and model logged in the following cells belong to it.
# The import cell must be executed first, otherwise this call fails with
# "NameError: name 'mlflow' is not defined".
mlflow.start_run()

NameError: name 'mlflow' is not defined

In [12]:
# Hyperparameters of the random forest; later cells read these two names.
n_estimators = 100
max_depth = 5

# Record both hyperparameters on the active MLflow run in a single call.
mlflow.log_params({"n_estimators": n_estimators, "max_depth": max_depth})

In [13]:
# Load the scikit-learn diabetes dataset and hold out part of it for testing.
db = load_diabetes()
features, target = db.data, db.target

# NOTE(review): the split is not seeded, so the selected rows (and therefore the
# metrics computed later) differ between executions; consider passing random_state.
X_train, X_test, y_train, y_test = train_test_split(features, target)

# Sanity check: shapes of the training split, then its first sample and label.
print(X_train.shape, y_train.shape)
print(X_train[0])
print(y_train[0])

(331, 10) (331,)
[ 0.03081083  0.05068012 -0.03422907  0.0436772   0.05759701  0.06883138
 -0.03235593  0.05755657  0.03546194  0.08590655]
120.0


In [14]:
# Fit a random forest with the hyperparameters defined (and logged to MLflow) above.
# random_state is fixed so that, for a given training split, the fit is repeatable.
model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=5, random_state=42)

In [15]:
# Predict on the held-out test split; printing the shape confirms that there is
# one prediction per test sample.
pred = model.predict(X_test)
print(pred.shape)

(111,)


In [16]:
# Evaluate on the test split: root mean squared error and mean absolute error.
rmse = np.sqrt(mean_squared_error(y_test, pred))
mae = mean_absolute_error(y_test, pred)

# Show both scores (one per line) and record them on the active MLflow run.
print(rmse, mae, sep="\n")
mlflow.log_metrics({"rmse": rmse, "mae": mae})

58.63887175520547
47.65957438545899


In [17]:
# Log the fitted estimator to the active run under the artifact path "model",
# using MLflow's scikit-learn flavor. The built-in flavors are listed at:
# https://mlflow.org/docs/latest/models.html#built-in-model-flavors
mlflow.sklearn.log_model(model, "model")

ModelInfo(artifact_path='model', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.8.13', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.0.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/daf30341dd984025a09ceb75da879111/model', model_uuid='52e0359d057d485dba19ac59ea050920', run_id='daf30341dd984025a09ceb75da879111', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-04-30 11:16:40.293223', mlflow_version='1.25.1')

In [18]:
# Close the run opened with mlflow.start_run() so nothing further is logged to it.
mlflow.end_run()

In [19]:
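# Uncomment to launch the MLflow tracking UI (served at http://127.0.0.1:5000, see the log below).
# The command blocks this cell until it is interrupted, which is why it is left commented out.
#!mlflow ui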

[2022-04-30 13:16:48 +0200] [999] [INFO] Starting gunicorn 20.1.0
[2022-04-30 13:16:48 +0200] [999] [INFO] Listening at: http://127.0.0.1:5000 (999)
[2022-04-30 13:16:48 +0200] [999] [INFO] Using worker: sync
[2022-04-30 13:16:48 +0200] [1001] [INFO] Booting worker with pid: 1001
^C
[2022-04-30 13:17:52 +0200] [999] [INFO] Handling signal: int
[2022-04-30 13:17:52 +0200] [1001] [INFO] Worker exiting (pid: 1001)


In [20]:
# Build the model URI from the run logged in this session rather than relying only on
# a hard-coded run ID, which goes stale as soon as the notebook is executed again.
# mlflow.last_active_run() (MLflow >= 1.25.0) returns None when no run has been made
# in this Python process; in that case fall back to the run ID recorded when this
# notebook was originally executed.
last_run = mlflow.last_active_run()
run_id = last_run.info.run_id if last_run is not None else 'daf30341dd984025a09ceb75da879111'
logged_model = f'runs:/{run_id}/model'

In [21]:
# Load the logged model back from MLflow and score it on the same test split;
# the two numbers should match the metrics computed for the in-memory model.
model2 = mlflow.sklearn.load_model(logged_model)
pred2 = model2.predict(X_test)

rmse2 = np.sqrt(mean_squared_error(y_test, pred2))
mae2 = mean_absolute_error(y_test, pred2)
print(rmse2, mae2, sep="\n")

58.63887175520547
47.65957438545899
