# Mlflow model registry

## Interacting with the MLflow tracking server

In [1]:
from mlflow.tracking import MlflowClient

In [2]:
# get access to the models stored
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [3]:
# Now using 'client' we can interact with models and experiments
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='my-cool-experiment', tags={}>]

In [4]:
# We can create a new experiment
client.create_experiment(name="my-cool-experiment")

MlflowException: Experiment(name=my-cool-experiment) already exists. Error: (raised as a result of Query-invoked autoflush; consider using a session.no_autoflush block if this flush is occurring prematurely)
(sqlite3.IntegrityError) UNIQUE constraint failed: experiments.name
[SQL: INSERT INTO experiments (name, artifact_location, lifecycle_stage) VALUES (?, ?, ?)]
[parameters: ('my-cool-experiment', None, 'active')]
(Background on this error at: https://sqlalche.me/e/14/gkpj)

In [None]:
# find best runs for a experiment
from mlflow.entities import ViewType

runs = client.search_runs(experiment_ids='0',
                          filter_string="metrics.rmse < 6.8",
                          run_view_type=ViewType.ACTIVE_ONLY,
                          max_results=5,
                          order_by=["metrics.rmse ASC"]
                         )

In [None]:
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

## Interacting with the Model Registry

In [None]:
import mlflow

In [None]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
# register a new version of the model
model_uri=f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri,)

In [None]:
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name) 

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

In [None]:
# change the stage of a model
model_version = 4
new_stage = "Staging"

client.transition_model_version_stage(
    name=model_name, 
    version=model_version, 
    stage=new_stage, 
    archive_existing_versions=False)

In [None]:
from datetime import datetime
date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=new_stage,
    description=f"The model version {version} was transitioned to stage {new_stage}")

## Comparing versions and selecting the new "Production" model


In [10]:
!pip install pyarrow



In [11]:
# prepared code adaptd from from git: https://github.com/froukje/mlops-zoomcamp/blob/main/02-experiment-tracking/model-registry.ipynb

from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df


def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)


def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

The last function is totest the model, we will use the data from March for the ncy-taxi data for that

In [18]:
!wget https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-03.parquet

--2022-05-25 09:43:00--  https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-03.parquet
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.133.128
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.133.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1474538 (1,4M) [binary/octet-stream]
Saving to: ‘green_tripdata_2021-03.parquet.4’


2022-05-25 09:43:02 (1,55 MB/s) - ‘green_tripdata_2021-03.parquet.4’ saved [1474538/1474538]



In [19]:
!mv green_tripdata_2021-03.parquet ../data/

In [20]:
df = read_dataframe("../data/green_tripdata_2021-03.parquet")

In [None]:
# download the preprocessor from mlflow
client.download_artifacts(run_id=run_id, path="preprocessor", dst_path=".")

In [21]:
# load the preprocessor
import pickle
with open("models/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [22]:
X_test = preprocess(df, dv)

In [23]:
target = "duration"
y_test = df[target].values

In [None]:
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

In [None]:
%time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)

To transition the model to production (only if it is better than the current model). The previous model will then be archived.

In [None]:
client.transition_model_version_stage(
    name=model_name,
    version=4,
    stage="Production",
    archive_existing_versions=True
)