In [56]:
import mlflow

In [57]:
#here we are trying to choose the best model from staging
from mlflow.tracking import MlflowClient
from datetime import datetime
from mlflow.entities import ViewType
import mlflow
import pickle
import pandas as pd
from sklearn.metrics import mean_squared_error

In [58]:
# Tracking/registry backend used by both the MlflowClient and the fluent
# mlflow API in this notebook (a SQLite database file named backend.db).
MLFLOW_TRACKING_URI = "sqlite:///backend.db"

In [59]:
# Low-level client bound to the tracking store configured in MLFLOW_TRACKING_URI;
# it is used below to query runs and to manage registered model versions.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# List the experiments stored in this tracking backend; the output shows each
# experiment's id, name, lifecycle stage and artifact location.
# NOTE(review): list_experiments() looks to be unavailable in MLflow 2.x
# (search_experiments() is the replacement) - confirm before upgrading MLflow.
client.list_experiments()

[<Experiment: artifact_location='artifacts_local/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='artifacts_local/1', experiment_id='1', lifecycle_stage='active', name='my-experiment-1', tags={}>]

In [60]:

# Fetch the five active runs of experiment "1" with the lowest rmse
# (only runs whose rmse is below the 4137269 cut-off are considered).
# experiment_ids is passed as a list, matching the documented signature,
# instead of relying on a bare string being accepted.
runs = client.search_runs(
    experiment_ids=["1"],
    filter_string="metrics.rmse < 4137269",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [61]:
# Print the runs returned by the search; the order should match the MLflow UI
# when its run table is sorted by rmse in ascending order.
for candidate in runs:
    rmse = candidate.data.metrics['rmse']
    print(f"run id: {candidate.info.run_id}, rmse: {rmse:.4f}")

run id: 88245f27c10d4e0e8e87db1c64f9a060, rmse: 3205.5839
run id: 565aa5e1e52240c8af3076792392c0eb, rmse: 3205.5839
run id: 9572f13a1e66443ba732808ecd3f8c53, rmse: 3205.5839
run id: e8a9d2a541164be8b27d47b9e94d49ff, rmse: 3205.5839
run id: a4faa2b38bb34086a775c042b70b3333, rmse: 3205.5839


In [62]:
# Point the fluent mlflow API at the same tracking store as `client`, so that
# the mlflow.register_model call further down targets this backend.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [63]:
#you don't need this cell; it was only for an experiment :)
#it registers all of the top 5 runs in the model registry, with version 1 being the best
#it should only be run once
#it was just used to make the logging faster

#for run in runs:
 #   run_id = f"{run.info.run_id}"
  #  model_uri = f"runs:/{run_id}/model"
   # mlflow.register_model(model_uri=model_uri, name="paris-model-registry")


In [64]:
# Register the model artifact of the chosen run under "paris-model-registry".
# The run id is hard-coded; it is one of the five tied runs printed above.
# Every execution of this cell creates another version of the registered model
# (the captured output below shows version 4 being created).
run_id = "565aa5e1e52240c8af3076792392c0eb"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="paris-model-registry")

Registered model 'paris-model-registry' already exists. Creating a new version of this model...
2022/08/15 12:33:21 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: paris-model-registry, version 4
Created version '4' of model 'paris-model-registry'.


<ModelVersion: creation_timestamp=1660559601051, current_stage='None', description=None, last_updated_timestamp=1660559601051, name='paris-model-registry', run_id='565aa5e1e52240c8af3076792392c0eb', run_link=None, source='artifacts_local/1/565aa5e1e52240c8af3076792392c0eb/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [65]:
# List the registered models in this backend; the output includes each model's
# latest_versions together with their current stage.
# NOTE(review): list_registered_models() looks to be unavailable in MLflow 2.x
# (search_registered_models() is the replacement) - confirm before upgrading MLflow.
client.list_registered_models()

[<RegisteredModel: creation_timestamp=1660554846242, description=None, last_updated_timestamp=1660559601051, latest_versions=[<ModelVersion: creation_timestamp=1660554846280, current_stage='Staging', description=None, last_updated_timestamp=1660555167988, name='paris-model-registry', run_id='9572f13a1e66443ba732808ecd3f8c53', run_link=None, source='artifacts_local/1/9572f13a1e66443ba732808ecd3f8c53/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>,
  <ModelVersion: creation_timestamp=1660559601051, current_stage='None', description=None, last_updated_timestamp=1660559601051, name='paris-model-registry', run_id='565aa5e1e52240c8af3076792392c0eb', run_link=None, source='artifacts_local/1/565aa5e1e52240c8af3076792392c0eb/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>,
  <ModelVersion: creation_timestamp=1660555113571, current_stage='Production', description=None, last_updated_timestamp=1660555166626, name='pa

In [66]:
# Show which version of the registered model is the latest one in each stage.
model_name = "paris-model-registry"
latest_versions = client.get_latest_versions(name=model_name)

for mv in latest_versions:
    number, stage = mv.version, mv.current_stage
    print(f"version: {number}, stage: {stage}")

version: 1, stage: Staging
version: 4, stage: None
version: 3, stage: Production


In [67]:
# Move version 3 of the registered model to "Production". Versions already in
# that stage are left untouched (archive_existing_versions=False).
model_version, new_stage = 3, "Production"
date = datetime.today().date()  # also used by the message of the following transition cell
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False,
)
print(f"The model version {model_version} was transitioned to {new_stage} on {date}")


The model version 3 was transitioned to Production on 2022-08-15


In [68]:
# Move version 1 of the registered model to the Staging stage, leaving other
# versions in that stage untouched (archive_existing_versions=False).
# The stage name uses the same capitalised spelling the registry reports
# ("Staging"), so the printed message matches the registry listing and the
# "Production" spelling used in the sibling cell.
# `model_name` and `date` come from the preceding cells.
model_version = 1
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False,
)
print(f"The model version {model_version} was transitioned to {new_stage} on {date}")


The model version 1 was transitioned to staging on 2022-08-15


In [69]:
# Query the registry again to confirm the stage assignments after the transitions.
model_name = "paris-model-registry"
latest_versions = client.get_latest_versions(name=model_name)

for entry in latest_versions:
    stage_label = entry.current_stage
    print(f"version: {entry.version}, stage: {stage_label}")

version: 1, stage: Staging
version: 4, stage: None
version: 3, stage: Production


In [70]:
#we can create a new model and transition it

In [71]:
#testing the model in production
#we load the input
def read_dataframe(filename):
    """Load a parquet file into a pandas DataFrame.

    Args:
        filename: Path or URL of the input file; must end with ``.parquet``.

    Returns:
        The DataFrame returned by ``pd.read_parquet(filename)``.

    Raises:
        ValueError: If ``filename`` does not end with ``.parquet``. The previous
            version fell through to ``return df`` with ``df`` never assigned,
            which surfaced as a confusing UnboundLocalError.
    """
    if not filename.endswith('.parquet'):
        raise ValueError(f"Unsupported file type (expected .parquet): {filename!r}")
    return pd.read_parquet(filename)

In [72]:
#feature-preparation step for a new input file
#we turn the selected dataframe columns into a list of dictionaries and vectorize them with dv
def preparing_features(input_file, dv):
    """Build the feature matrix and target vector for one input file.

    Args:
        input_file: Location of the data file, forwarded to ``read_dataframe``.
        dv: Already-fitted vectorizer; only its ``transform`` method is called
            on the list of per-row feature dictionaries.

    Returns:
        A tuple ``(X_test, y_test)``: the output of ``dv.transform`` for the
        ``squareMeters`` column and the values of the ``price`` column.
    """
    frame = read_dataframe(input_file)

    # One dict per row, restricted to the feature column(s) fed to the vectorizer.
    feature_columns = ['squareMeters']
    records = frame[feature_columns].to_dict(orient='records')

    X_test = dv.transform(records)
    y_test = frame['price'].values
    return X_test, y_test

In [79]:
def test_model(name, stage, X_test, y_test):
    """Score the registered model that currently sits in a given stage.

    Args:
        name: Name of the registered model in the MLflow model registry.
        stage: Registry stage to load from; it is interpolated into the
            ``models:/{name}/{stage}`` URI.
        X_test: Features passed to the loaded model's ``predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        A dict ``{"rmse": value}`` with the root mean squared error of the
        predictions.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # Predictions are printed so they can be compared by eye with y_test.
    print(y_pred)
    # Take the square root of the plain MSE rather than passing squared=False:
    # that keyword is deprecated in newer scikit-learn releases and later removed,
    # whereas this form gives the same RMSE for a single target on any version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    return {"rmse": rmse}

In [89]:

#we load the dict vectorizer and the model from the local pickle file
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust.
with open("models/linear-reg", 'rb') as f_in:
    dv, model = pickle.load(f_in)

# 'i' is the data period; it is formatted into the file name as a two-digit suffix.
# NOTE(review): the earlier comment called this "period 3", but i = 2 selects the
# file ending in _02 - confirm which period is intended as the test set.
i = 2
input_file = f"s3://mlops-project-dataset-deen/paris-housing-dataset/ParisHousing_period_{i:02d}.parquet"

X_test , y_test = preparing_features(input_file, dv)



In [90]:

#evaluate the registry version currently in the "Production" stage on X_test / y_test
#%time reports how long the test_model call (model loading + prediction + scoring) takes
model_name = "paris-model-registry"
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

[8427821.9946263  7324106.60799486 6672197.51988665 ... 5231577.43654765
  765115.17066865 1706028.2877127 ]
CPU times: user 25.1 ms, sys: 3.65 ms, total: 28.7 ms
Wall time: 27.8 ms


{'rmse': 3205.5838721409696}

In [91]:
# Show the true target values for a visual comparison with the predictions printed by test_model
y_test

array([8429482.7, 7320329.7, 6668307.7, ..., 5231126.5,  762915.5,
       1708754.5])