In [1]:
#here we are trying to choose the best model from staging
from mlflow.tracking import MlflowClient
from datetime import datetime
from mlflow.entities import ViewType
import mlflow
import pickle
import pandas as pd
from sklearn.metrics import mean_squared_error

In [2]:
# SQLite database file used as the MLflow tracking URI, both for the
# MlflowClient and for the module-level mlflow API further down.
MLFLOW_TRACKING_URI = "sqlite:///backend.db"

In [3]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# The client gives programmatic access to what the tracking store holds
# (experiments, runs, registered models).
# Listing the experiments shows each one's id, name and artifact location.
# NOTE(review): list_experiments is not available in MLflow 2.x, where
# search_experiments replaces it; confirm the installed version.
client.list_experiments()

[<Experiment: artifact_location='artifacts_local/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='artifacts_local/1', experiment_id='1', lifecycle_stage='active', name='my-experiment-1', tags={}>]

In [4]:

# Five best active runs of experiment '1': only runs whose rmse is below the
# threshold are considered, and they come back sorted by rmse, lowest first.
runs = client.search_runs(
    experiment_ids='1',
    run_view_type=ViewType.ACTIVE_ONLY,
    filter_string="metrics.rmse < 4137269",
    order_by=["metrics.rmse ASC"],
    max_results=5,
)

In [5]:
# Show the runs we selected; the order should match the MLflow UI
# when its runs table is sorted by rmse in ascending order.
for candidate in runs:
    rmse = candidate.data.metrics['rmse']
    print(f"run id: {candidate.info.run_id}, rmse: {rmse:.4f}")

run id: dc822e6d5d95417a913fa8199489ff05, rmse: 4137268.6811
run id: b326b2fcd303436090279dc538f91aa4, rmse: 4137268.6811
run id: aee2f51452724b39b92e60fcc0bfd5d0, rmse: 4137268.6811
run id: 13147b85bffa483ab090ca007ca85003, rmse: 4137268.6811
run id: 20d8ec28affa47b4b646e19f22d729b8, rmse: 4137268.6811


In [6]:
# Point the module-level mlflow API at the same tracking store as `client`,
# so that the mlflow.register_model call below writes to this registry.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [7]:
#here we log all of the top 5 models to the registry, with version 1 being the best
#this should only be run once
#it was just used to make the logging faster

#for run in runs:
 #   run_id = f"{run.info.run_id}"
  #  model_uri = f"runs:/{run_id}/model"
   # mlflow.register_model(model_uri=model_uri, name="paris-model-registry")


Successfully registered model 'paris-model-registry'.
2022/08/09 22:21:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: paris-model-registry, version 1
Created version '1' of model 'paris-model-registry'.
Registered model 'paris-model-registry' already exists. Creating a new version of this model...
2022/08/09 22:21:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: paris-model-registry, version 2
Created version '2' of model 'paris-model-registry'.
Registered model 'paris-model-registry' already exists. Creating a new version of this model...
2022/08/09 22:21:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: paris-model-registry, version 3
Created version '3' of model 'paris-model-registry'.
Registered model

In [None]:
# Run id of the best run chosen (first row of the search_runs output above).
run_id = "dc822e6d5d95417a913fa8199489ff05"
# "runs:/<run_id>/model" addresses the artifact stored under "model" in that run.
model_uri = f"runs:/{run_id}/model"
# Registers the artifact under "paris-model-registry". If the name already
# exists a new version is created, so re-running this cell adds a version.
mlflow.register_model(model_uri=model_uri, name="paris-model-registry")

In [8]:
# List the registered models known to the client, each with its latest versions.
# NOTE(review): list_registered_models is not available in MLflow 2.x, where
# search_registered_models replaces it; confirm the installed version.
client.list_registered_models()

[<RegisteredModel: creation_timestamp=1660076494059, description=None, last_updated_timestamp=1660076494142, latest_versions=[<ModelVersion: creation_timestamp=1660076494142, current_stage='None', description=None, last_updated_timestamp=1660076494142, name='paris-model-registry', run_id='20d8ec28affa47b4b646e19f22d729b8', run_link=None, source='artifacts_local/1/20d8ec28affa47b4b646e19f22d729b8/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>], name='paris-model-registry', tags={}>]

In [9]:
# Name of the registered model used by the cells that follow.
model_name = "paris-model-registry"
latest_versions = client.get_latest_versions(name=model_name)

# Report the version number and current stage of each entry returned.
for model_version_info in latest_versions:
    print(f"version: {model_version_info.version}, stage: {model_version_info.current_stage}")

version: 5, stage: None


In [20]:
# Move version 2 of the registered model to the "Staging" stage.
# `date` is only used in the confirmation message printed at the end of this
# cell and of the two transition cells that follow.
date = datetime.today().date()
model_version = 2
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    # do not archive a version that may already be in the target stage
    archive_existing_versions=False,
)
print(f"The model version {model_version} was transitioned to {new_stage} on {date}")


The model version 2 was transitioned to Staging on 2022-08-09


In [11]:
# Move version 1 of the registered model to the "Production" stage.
# `model_name` and `date` are the values set in earlier cells.
model_version = 1
new_stage = "Production"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    # do not archive a version that may already be in the target stage
    archive_existing_versions=False
)
print(f"The model version {model_version} was transitioned to {new_stage} on {date}")


The model version 1 was transitioned to Production on 2022-08-09


In [12]:
# Move version 3 of the registered model to the "Archived" stage.
# `model_name` and `date` are the values set in earlier cells.
model_version = 3
new_stage = "Archived"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    # do not archive a version that may already be in the target stage
    archive_existing_versions=False
)
print(f"The model version {model_version} was transitioned to {new_stage} on {date}")


The model version 3 was transitioned to Archived on 2022-08-09


In [13]:
#we check again to make sure the stage changes were applied
model_name = "paris-model-registry"
latest_versions = client.get_latest_versions(name=model_name)

# Report the version number and current stage of each entry returned.
for model_version_info in latest_versions:
    print(f"version: {model_version_info.version}, stage: {model_version_info.current_stage}")

version: 1, stage: Production
version: 5, stage: None
version: 3, stage: Archived


In [14]:
#we can also create a new model and transition it in the same way

In [15]:
#testing the model in production
#we load the input
def read_dataframe(filename):
    """Load a parquet file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path or URL of the file to read; it must end with ``.parquet``.

    Returns
    -------
    pandas.DataFrame
        The file content as returned by ``pd.read_parquet``.

    Raises
    ------
    ValueError
        If ``filename`` does not end with ``.parquet``. Previously this case
        reached ``return df`` with ``df`` never assigned and failed with an
        unrelated ``UnboundLocalError``.
    """
    if not filename.endswith('.parquet'):
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected a '.parquet' file"
        )
    return pd.read_parquet(filename)

In [16]:
#preparation step for a new input file
#we turn the feature columns into a list of dicts (one per row) and pass them to the vectorizer
def preparing_features(input_file, dv):
    """Build the model input and the target values from a parquet file.

    The file is read with ``read_dataframe``; the ``squareMeters`` column is
    converted to one dict per row and those dicts are passed to
    ``dv.transform``.

    Parameters
    ----------
    input_file : str
        Location of the parquet file to read.
    dv : object
        Fitted vectorizer exposing ``transform``
        (presumably the DictVectorizer pickled with the model; confirm).

    Returns
    -------
    tuple
        ``(X_test, y_test)``: the transformed features and the values of the
        ``price`` column.
    """
    feature_columns = ['squareMeters']
    target_column = 'price'

    frame = read_dataframe(input_file)
    records = frame[feature_columns].to_dict(orient='records')
    X_test = dv.transform(records)
    y_test = frame[target_column].values
    return X_test, y_test

In [17]:
def test_model(name, stage, X_test, y_test):
    """Score a registered model from a given stage on the supplied data.

    Parameters
    ----------
    name : str
        Name of the registered model.
    stage : str
        Registry stage to load the model from (e.g. ``"Production"``).
    X_test
        Features passed as-is to the loaded model's ``predict``.
    y_test
        True target values compared against the predictions.

    Returns
    -------
    dict
        ``{"rmse": <root mean squared error>}``.
    """
    import math

    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # Take the square root of the MSE ourselves rather than passing
    # squared=False: that keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, while the plain call works on every version.
    # For a single target column the result is the same value.
    rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    return {"rmse": rmse}

In [18]:

#we load the dict vectorizer and the model from the local pickle file
# The context manager closes the file handle once loading is done
# (a bare open() inside pickle.load leaves it open).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("models/linear-reg", 'rb') as f_in:
    dv, model = pickle.load(f_in)

# note that 'i' is the period, therefore the test period is period 3
i = 3
input_file = f"s3://mlops-project-dataset-deen/paris-housing-dataset/ParisHousing_period_{i:02d}.parquet"

X_test, y_test = preparing_features(input_file, dv)



In [19]:

#to check the model in production
# Scores the version currently in the "Production" stage on the period-3 data
# prepared above; %time reports how long the load + predict + scoring call takes.
model_name = "paris-model-registry"
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

CPU times: user 228 ms, sys: 16.2 ms, total: 245 ms
Wall time: 247 ms


{'rmse': 3273.190601446966}