# MLFLOW CLIENT USING PYTHON API

Check the [docs](https://www.mlflow.org/docs/latest/model-registry.html#adding-an-mlflow-model-to-the-model-registry) for more information.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
from pydantic import BaseModel, ValidationError
import yaml

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Built-in library
import itertools
import re
import json
import typing as tp
import logging

import warnings

warnings.filterwarnings("error")

# for saving the pipeline
import joblib

# MLFlow
import mlflow

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics, set_config

# Pipeline Display
set_config(display="text")

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures

# Custom Imports
from data_manager import load_data, validate_input
import feat_engineering as fe
from schema import (
    InputSchema,
    ValidateTrainingData,
    ModelConfig,
    MLFlowConfig,
    ConfigVars,
)
import utilities as util

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Load Data
# `load_data` is the helper imported from the project-local `data_manager`
# module. The two parquet files are the 2022-01 and 2022-02 yellow-trip
# snapshots (presumably January is used for training and February as the
# held-out test month — the variable names suggest so).
train_data = load_data("data/yellow_tripdata_2022-01.parquet")
test_data = load_data("data/yellow_tripdata_2022-02.parquet")

# Quick size check of both frames
print(f"Shape of: \ntrain_data: {train_data.shape}\ntest_data: {test_data.shape}\n")

# Last expression of the cell: preview the first rows of the training frame
train_data.head()

Shape of: 
train_data: (2406155, 20)
test_data: (2901257, 20)



Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,trip_duration
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,2.93492
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,2.24071
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,2.299581
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,2.400619
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,3.651437


### Load Config

In [3]:
# Path of the YAML file that holds the model and MLflow settings
fp = "config.yml"

# Read the file with an explicit UTF-8 encoding instead of relying on the
# platform default, which is not UTF-8 on every operating system.
with open(fp, "r", encoding="utf-8") as file:
    # safe_load parses with yaml's SafeLoader, i.e. it only builds plain
    # Python objects (dicts, lists, scalars) from the document.
    config_file = yaml.safe_load(file)

# Both schema sections are built from the same parsed mapping.
config = ConfigVars(
    model_config=ModelConfig(**config_file),
    mlflow_config=MLFlowConfig(**config_file),
)

## Fetching an MLflow Model from the Model Registry

After you have registered an MLflow model, you can fetch that model using `mlflow.<model_flavor>.load_model()`, or more generally, `mlflow.pyfunc.load_model()`.

Fetch a specific model version

To fetch a specific model version, just supply that version number as part of the model URI.

In [4]:
# Use the local SQLite database file `mlflow.db` as the tracking backend
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Make "sample_experiment" the active experiment; being the last expression,
# the returned Experiment object is displayed as the cell output
mlflow.set_experiment("sample_experiment")

<Experiment: artifact_location='./mlruns/1', creation_time=1671287433449, experiment_id='1', last_update_time=1671287433449, lifecycle_stage='active', name='sample_experiment', tags={}>

In [5]:
# Two hand-written trips used to try out the registered model.
# The first row deliberately has a missing RatecodeID.
new_data = {
    "DOLocationID": [82, 72],
    "payment_type": [1, 2],
    "PULocationID": [5, 99],
    "RatecodeID": [np.nan, 2],
    "tpep_pickup_datetime": ["2022-12-16 14:33:43", "2022-12-18 09:18:03"],
    "trip_distance": [5.5, 3.1],
    "VendorID": [2, 1],
    "total_amount": [12, 9.5],
}

# Build the frame and convert the pickup timestamps to datetime in one chain;
# errors="coerce" turns unparseable strings into NaT instead of raising.
df = pd.DataFrame(new_data).assign(
    tpep_pickup_datetime=lambda frame: pd.to_datetime(
        frame["tpep_pickup_datetime"], errors="coerce"
    )
)
df

Unnamed: 0,DOLocationID,payment_type,PULocationID,RatecodeID,tpep_pickup_datetime,trip_distance,VendorID,total_amount
0,82,1,5,,2022-12-16 14:33:43,5.5,2,12.0
1,72,2,99,2.0,2022-12-18 09:18:03,3.1,1,9.5


In [6]:
from mlflow import MlflowClient

# Registry coordinates of the model to fetch (a specific, pinned version)
model_name = "random_forest_pipeline"
model_version = 5

# Registry client, reused by the cells that follow
client = MlflowClient()

# "models:/<name>/<version>" addresses one exact version in the registry
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.pyfunc.load_model(model_uri=model_uri)

# Score the sample frame built in the previous cell
model.predict(df)

array([2.25417234, 2.15199791])

### Adding or Updating an MLflow Model Description
At any point in a model’s lifecycle, you can update a model version’s description using:

```python
update_model_version()
```

In [7]:
# Set the free-text description of one registered model version.
# `model_name` / `model_version` are the values assigned in the cell that
# fetched the model; the returned ModelVersion is shown as the cell output.
client.update_model_version(
    name=model_name,
    version=model_version,
    description="This model version is a scikit-learn model",
)

<ModelVersion: creation_timestamp=1671311239721, current_stage='None', description='This model version is a scikit-learn model', last_updated_timestamp=1671399844279, name='random_forest_pipeline', run_id='2f4f11a37ec541eab8976fac4ce3537b', run_link=None, source='./mlruns/1/2f4f11a37ec541eab8976fac4ce3537b/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

### Renaming an MLflow Model

As well as adding or updating a description of a specific version of the model, you can rename an existing registered model using:

```python
rename_registered_model()
```

In [8]:
client = MlflowClient()
# Example call only — left commented out so the registered model is not
# actually renamed (presumably because later cells still look the model up
# by `model_name`; confirm before enabling).
# client.rename_registered_model(name=model_name, new_name=f"{model_name}_updated")

### Transitioning an MLflow Model’s Stage

Over the course of the model’s lifecycle, a model evolves—from development to staging to production. You can transition a registered model to one of the stages: **`Staging`**, **`Production`** or **`Archived`**:

**Note:** The accepted values for `<stage>` are: `Staging|Archived|Production|None`.



```python
transition_model_version_stage()
```

### Listing and Searching MLflow Models
You can fetch a list of registered models in the registry with a simple method.

In [9]:
from pprint import pprint

client = MlflowClient()

# Dump every registered model (with its latest versions) as a plain dict
registered_models = client.search_registered_models()
for registered_model in registered_models:
    pprint(dict(registered_model), indent=4)

{   'creation_timestamp': 1671287864310,
    'description': None,
    'last_updated_timestamp': 1671311154312,
    'latest_versions': [   <ModelVersion: creation_timestamp=1671287864324, current_stage='Archived', description='This is the baseline model.', last_updated_timestamp=1671288530900, name='linear_reg_pipeline', run_id='0dbe5f223eaf44c3bb3229fdd65167a6', run_link=None, source='./mlruns/1/0dbe5f223eaf44c3bb3229fdd65167a6/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>,
                           <ModelVersion: creation_timestamp=1671311154312, current_stage='None', description=None, last_updated_timestamp=1671311154312, name='linear_reg_pipeline', run_id='470a38ffc2794e969618858d745b6099', run_link=None, source='./mlruns/1/470a38ffc2794e969618858d745b6099/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>],
    'name': 'linear_reg_pipeline',
    'tags': {}}
{   'creation_timestamp': 1671288216010,
   

### A More Effective Approach

In [10]:
client = MlflowClient()

# Restrict the search to the versions of a single registered model
version_filter = f"name='{model_name}'"
for model_version_info in client.search_model_versions(version_filter):
    pprint(dict(model_version_info), indent=4)

{   'creation_timestamp': 1671311239721,
    'current_stage': 'None',
    'description': 'This model version is a scikit-learn model',
    'last_updated_timestamp': 1671399844279,
    'name': 'random_forest_pipeline',
    'run_id': '2f4f11a37ec541eab8976fac4ce3537b',
    'run_link': None,
    'source': './mlruns/1/2f4f11a37ec541eab8976fac4ce3537b/artifacts/model',
    'status': 'READY',
    'status_message': None,
    'tags': {},
    'user_id': None,
    'version': 5}
{   'creation_timestamp': 1671288216022,
    'current_stage': 'Staging',
    'description': 'A Random Forest Ensemble Model.',
    'last_updated_timestamp': 1671358222849,
    'name': 'random_forest_pipeline',
    'run_id': '3a5f34f7b15c466e9f8e8a81f9b6301d',
    'run_link': None,
    'source': './mlruns/1/3a5f34f7b15c466e9f8e8a81f9b6301d/artifacts/model',
    'status': 'READY',
    'status_message': None,
    'tags': {},
    'user_id': None,
    'version': 1}
{   'creation_timestamp': 1671288776058,
    'current_stage': 

# Configure A Remote Tracking Server On AWS

The guide can be found [here](https://github.com/chineidu/mlops-zoomcamp/blob/main/02-experiment-tracking/mlflow_on_aws.md)

Postgres DB Config:
  - db_instance_identifier: enter the value
  - db_username: enter the value
  - db_password: enter the value
  - initial_db_name: enter the value
  - db_endpoint: enter the db endpoint

S3 Bucket:
  - bucket-name
  
### Install the requirements (on the remote server using ssh)

```console
$ pip3 install mlflow boto3 psycopg2-binary
```
  
Run the server:

```console
$ mlflow server -h 0.0.0.0 -p 5000 \
--backend-store-uri postgresql://$DB_USER:$DB_PASSWORD@$DB_ENDPOINT:5432/$DB_NAME \
--default-artifact-root s3://$S3_BUCKET_NAME
```

### Save The Environment Variables
```console
$ export DB_USER=""
$ export DB_PASSWORD=""
$ export DB_ENDPOINT=""
$ export DB_NAME=""
$ export S3_BUCKET_NAME=""
```

 ### Connect Notebook To A Remote Tracking Server
 
 For the profile config, check [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html)
 
 ```python
 os.environ["AWS_PROFILE"] = "" # Fill with your AWS profile:  Check docs linked above.
 
 TRACKING_SERVER_HOST = "ec2-3-93-15-244.compute-1.amazonaws.com" # Fill in with public DNS of the EC2 instance
 port = 5000
 mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{port}")
 ```

In [11]:
import os

# Name of the AWS profile to use (see the profile documentation linked in the
# markdown above this cell)
os.environ["AWS_PROFILE"] = "default"
TRACKING_SERVER_HOST = "ec2-3-80-180-139.compute-1.amazonaws.com"  # Fill in with public DNS of the EC2 instance
# Same port the `mlflow server` command above is started with (-p 5000)
port = 5000
# From here on MLflow talks to the remote tracking server rather than the
# local SQLite store configured earlier in the notebook
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{port}")

In [12]:
# Split the data
# Separate the target column from the predictors
target_col = config.model_config.TARGET
X, y = train_data.drop(columns=[target_col]), train_data[target_col]

# Hold-out size and seed both come from the configuration file
split_options = {
    "test_size": config.model_config.TEST_SIZE,
    "random_state": config.model_config.RANDOM_STATE,
}
X_train, X_validate, y_train, y_validate = train_test_split(X, y, **split_options)

# Display the resulting shapes as the cell output
X_train.shape, X_validate.shape

((2165539, 19), (240616, 19))

In [13]:
# Short alias for the model section of the loaded configuration
model_cfg = config.model_config

# Ordered (name, step) pairs; the last entry is the estimator
pipeline_steps = [
    # ===== Select input features =====
    ("input vars", fe.SelectFeatures(features=model_cfg.INPUT_FEATURES)),
    # ===== Add NaN flags =====
    (
        "add na_flag",
        AddMissingIndicator(missing_only=True, variables=model_cfg.NUM_VARS_WF_NA),
    ),
    # ===== Impute NaNs =====
    (
        "impute num_vars",
        MeanMedianImputer(
            imputation_method="median", variables=model_cfg.NUM_VARS_WF_NA
        ),
    ),
    # ===== Create new features =====
    ("cal day_of_week", fe.CalculateDayOfWeek(feature=model_cfg.TEMPORAL_VAR)),
    ("cal hour_of_day", fe.CalculateHourOfDay(feature=model_cfg.TEMPORAL_VAR)),
    # ===== Select features =====
    ("important vars", fe.SelectFeatures(features=model_cfg.IMPORTANT_FEATURES)),
    # ===== Drop features =====
    ("drop features", DropFeatures(features_to_drop=model_cfg.VARS_TO_DROP)),
    # ===== Transform features =====
    (
        "log transformation",
        LogTransformer(variables=model_cfg.VARS_TO_LOG_TRANSFORM, base="e"),
    ),
    # ===== Scale features =====
    ("scale data", StandardScaler()),
    # ===== Linear model =====
    ("linear model", LinearRegression()),
]

pipe = Pipeline(steps=pipeline_steps)
pipe

Pipeline(steps=[('input vars',
                 SelectFeatures(features=['DOLocationID', 'payment_type',
                                          'PULocationID', 'RatecodeID',
                                          'total_amount',
                                          'tpep_pickup_datetime',
                                          'trip_distance', 'VendorID'])),
                ('add na_flag', AddMissingIndicator(variables=['RatecodeID'])),
                ('impute num_vars',
                 MeanMedianImputer(variables=['RatecodeID'])),
                ('cal day_of_week',
                 CalculateDayOfWeek(feature='...
                                          'hour_of_day', 'payment_type',
                                          'PULocationID', 'RatecodeID',
                                          'RatecodeID_na', 'total_amount',
                                          'tpep_pickup_datetime',
                                          'trip_distance', 'VendorID'])),
 

### Track An Experiment

In [14]:
# Required: the import cell installed warnings.filterwarnings("error"), which
# raises on every warning; switch to "ignore" before training and logging.
warnings.filterwarnings("ignore")  # Required

# Log records are rendered as "<LEVEL> :: <timestamp> :: <message>"
delim = "::"
format_ = f"%(levelname)s {delim} %(asctime)s {delim} %(message)s"
logging.basicConfig(level=logging.INFO, format=format_)
# Logger used by the training cells below
logger = logging.getLogger(__name__)


def eval_metrics(
    actual: np.ndarray, pred: np.ndarray
) -> tp.Tuple[float, float, float, float]:
    """Compute the regression metrics used to evaluate the model.

    The MSE is computed once and the RMSE is derived from it with a square
    root. This avoids the `squared=` keyword of `mean_squared_error`, which
    is deprecated in scikit-learn 1.4 and removed in 1.6, so the function
    works across scikit-learn versions. For a single target (as used in this
    notebook) sqrt(MSE) equals what `squared=False` returned.

    Params:
        actual: The ground-truth target values.
        pred: The predicted target values.

    Returns:
        A tuple `(rmse, mse, mae, r2)`.
    """
    mse = metrics.mean_squared_error(actual, pred)
    rmse = float(np.sqrt(mse))
    mae = metrics.mean_absolute_error(actual, pred)
    r2 = metrics.r2_score(actual, pred)

    return (rmse, mse, mae, r2)


# Point MLflow at the remote tracking server and select the experiment
# (the cell output shows MLflow creating it when it does not exist yet)
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{port}")
mlflow.set_experiment("Demo_experiment")

# Everything logged inside this block belongs to a single MLflow run
with mlflow.start_run():
    logger.info("========= Training The Model =========")
    pipe.fit(X_train, y_train)

    # Evaluate on the validation split held out earlier
    y_pred = pipe.predict(X_validate)

    (rmse, mse, mae, r2) = eval_metrics(y_validate, y_pred)

    print(f"  RMSE: {rmse}")
    print(f"  MSE: {mse}")
    print(f"  MAE: {mae}")
    print(f"  R2: {r2}")

    # Store the same four metrics on the run
    mlflow.log_metrics({"RMSE": rmse, "MSE": mse, "MAE": mae, "R2": r2})

    # Log the fitted pipeline under the "model" artifact path and register it
    # in the model registry as "LinearModel" (a new version per run)
    mlflow.sklearn.log_model(pipe, "model", registered_model_name="LinearModel")
    logger.info("========= Training Done! =========")

2022/12/18 22:46:41 INFO mlflow.tracking.fluent: Experiment with name 'Demo_experiment' does not exist. Creating a new experiment.


  RMSE: 0.2896738945525108
  MSE: 0.08391096518521914
  MAE: 0.21940251433736507
  R2: 0.7795462515192009


INFO :: 2022-12-18 22:46:56,254 :: Found credentials in shared credentials file: ~/.aws/credentials
Successfully registered model 'LinearModel'.
2022/12/18 22:47:01 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: LinearModel, version 1
Created version '1' of model 'LinearModel'.


## Run The Server Locally With A Remote Model Registry (S3)

### Requirements:
Tracking URI:
  - tracking_uri
  
S3 Bucket:
  - bucket-name
  
### Install the requirements (on the local machine that will run the server)

```console
$ pip3 install mlflow boto3
```

### Save The Environment Variables
```console
$ export DB_NAME=mlflow.db
$ export S3_BUCKET_NAME=my-unique-bucket
```

#### Run the server:
```console
$ mlflow server --backend-store-uri 'db_type:///path_to_db' \
--default-artifact-root s3://$S3_BUCKET_NAME

# E.g
$ mlflow server --backend-store-uri sqlite:///$DB_NAME \
--default-artifact-root s3://$S3_BUCKET_NAME
```
<br>

```python
# Note: Ensure you run the script and the server in the same directory.
TRACKING_SERVER_HOST = "http://127.0.0.1"  # localhost
PORT = 5000  # Default
TRACKING_URI = f"{TRACKING_SERVER_HOST}:{PORT}"

# OR (Preferred approach)
TRACKING_URI = "sqlite:///mlflow.db"

mlflow.set_tracking_uri(TRACKING_URI)
```
\
The other steps are the same. Continue with the experimentation.


```python
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment("Experiment_name")

with mlflow.start_run():
    logger.info("========= Training The Model =========")
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_validate)

    (rmse, mse, mae, r2) = eval_metrics(y_validate, y_pred)

    print(f"  RMSE: {rmse}")
    print(f"  MSE: {mse}")
    print(f"  MAE: {mae}")
    print(f"  R2: {r2}")

    mlflow.log_metrics({"RMSE": rmse, "MSE": mse, "MAE": mae, "R2": r2})

    mlflow.sklearn.log_model(pipe, artifact_path="model", registered_model_name="LinearModel")
    logger.info("========= Training Done! =========")
```