# Govern ML artifacts

We will use MLflow to get a first look at what a model registry is and how to use it.

MLflow URI: [https://ml-registry.course.aiengineer.codex-platform.com/](https://ml-registry.course.aiengineer.codex-platform.com/)

## 0.0 Install and import dependencies

In [None]:
%pip install mlflow boto boto3 xgboost==1.1.1 # mlflow is the model registry client, boto a s3 api client and xgboost the Machine learning framework

In [None]:
import mlflow.xgboost
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
from mlflow import MlflowClient
from sklearn.ensemble import RandomForestRegressor
import pandas
import xgboost
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
import pyarrow
import os
import pandas as pd
from feast import (
    FeatureStore
)

## 0.1 MLflow related configuration

In [None]:
# Credentials and endpoint of the S3-compatible storage used for the MLflow artifacts.
# NOTE(review): these look like shared course sandbox credentials; never hard-code real secrets.
os.environ.update({
    "AWS_ACCESS_KEY_ID": "mlflow-storage",
    "AWS_SECRET_ACCESS_KEY": "mlflow-storage",
    "MLFLOW_S3_ENDPOINT_URL": "https://storage-api.course.aiengineer.codex-platform.com",
})

# Point the MLflow client at the tracking server
mlflow.set_tracking_uri('http://mlflow.mlflow.svc.cluster.local:5000')

## 1. Train and track a model

### 1.0 Minimal function to train an xgboost model

In [None]:
def xgboost_train(
    training_data,
    booster_params,
    label_column='tips',
    num_iterations: int = 10
):
    """Train an xgboost booster on the numeric columns of a DataFrame.

    Args:
        training_data: pandas DataFrame holding both the features and the label.
        booster_params: dict of booster parameters forwarded to ``xgboost.train``.
        label_column: name of the target column; it has to be among the
            int/float columns kept by the cleaning step below.
        num_iterations: number of boosting rounds.

    Returns:
        The booster returned by ``xgboost.train``.
    """
    # Auto-clean: keep only the int/float columns, then discard every row
    # that still contains a missing value.
    numeric_kinds = ['int', 'float']
    cleaned = training_data.select_dtypes(include=numeric_kinds).dropna()

    # Separate the target from the features.
    features = cleaned.drop(label_column, axis=1)
    target = cleaned[[label_column]]

    return xgboost.train(
        params=booster_params,
        dtrain=xgboost.DMatrix(features, label=target),
        num_boost_round=num_iterations,
    )
    

### 1.1 model parameters initialization

In [None]:
# Booster configuration forwarded to xgboost through xgboost_train()
params = {
    # regression trained with a squared-error objective
    "objective": 'reg:squarederror',
    # tree-based booster
    "booster": 'gbtree',
    "learning_rate": 0.3,
    "min_split_loss": 0,
    # maximum depth of each tree
    "max_depth": 6,
}

### 1.2 Model tracking set up

To use the MLflow model registry correctly, we will track our training in MLflow.

#### 1.2.1 Create a new experiment (identified by its "experiment_id") wired to our MinIO bucket

In [None]:
# Your personal identifier, written as firstname-lastname (for example 'john-doe').
# Fill it in before running the next cells.
username = ''

# Experiment name derived from the username
experiment_name = f"{username} experiments"

In [None]:
# Create the experiment with its name, a storage location and some metadata.
# mlflow.create_experiment fails when the name is already taken, so look the
# experiment up first: this keeps the cell re-runnable after a kernel restart.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(
        experiment_name,
        artifact_location=f's3://{username}/mlflowmodels/experiments-tp4',
        tags={"version": "TP4"}
    )
else:
    # Reuse the experiment created by a previous run of this cell.
    experiment_id = existing_experiment.experiment_id

In [None]:
# List all MLflow experiments (the result is displayed as the cell output)
mlflow.search_experiments()

In [None]:
# Fetch the experiment by name and print its main properties
experiment = mlflow.get_experiment_by_name(experiment_name)

experiment_summary = (
    ("Name: {}", experiment.name),
    ("Experiment_id: {}", experiment.experiment_id),
    ("Artifact Location: {}", experiment.artifact_location),
    ("Tags: {}", experiment.tags),
    ("Lifecycle_stage: {}", experiment.lifecycle_stage),
)
for template, value in experiment_summary:
    print(template.format(value))

### 1.3 Train the model

#### 1.3.1 get the data from the feature store

In [None]:
# Open the Feast feature store whose repository lives in ./feature_repo
fs = FeatureStore(repo_path="./feature_repo")

In [None]:
# Feature service that defines which features are queried
feature_service = fs.get_feature_service("taxi_trip_service_v2")

# Scope of the retrieval: the first 1000 indexes (1 to 1000 inclusive),
# each stamped with the current UTC time so that every value recorded
# before now is taken into account.
entity_df = pd.DataFrame({"index": list(range(1, 1001))})
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)

# Fetch the wanted features for the scoped entities, then materialize
# the result as a pandas DataFrame.
historical_job = fs.get_historical_features(
    entity_df=entity_df,
    features=feature_service,
)
training_data = historical_job.to_df()

#### 1.3.2 use MLflow autolog and context manager to track the training

In [None]:
# Turn on autologging so that all the available xgboost parameters are logged into MLflow
mlflow.xgboost.autolog()

# The context manager binds the training below to a run of our experiment
with mlflow.start_run(experiment_id=experiment.experiment_id) as run:
    model = xgboost_train(training_data=training_data, booster_params=params)

#### 1.3.3 inspect results on uri

In [None]:
# Link to the experiment page in the MLflow UI (shown as the cell output)
f'click https://ml-registry.course.aiengineer.codex-platform.com/#/experiments/{experiment.experiment_id}'

![exper](./images/exper.png)

## 2. Register model into registry

### 2.1 define client

In [None]:
# Client used for the registry calls below, pointed at the MLflow server
client = MlflowClient(tracking_uri='http://mlflow.mlflow.svc.cluster.local:5000')


### 2.2 Create the object model

In [None]:
# Name under which the model is registered
# NOTE(review): the name is hard-coded, so it is presumably shared by everyone
# running this notebook against the same registry — confirm.
name="ge_chicago_taxi_tips_with_features"

In [None]:
# Create the associated entry in the registry; model versions are attached to it afterwards
# NOTE(review): presumably fails if a model with this name is already registered —
# confirm before re-running this cell.
client.create_registered_model(name)


### 2.3 Create a model version from training logs

In [None]:
desc = "A new taxi trip model trained with feast features"

# Translate the runs:/ URI of the model logged by the training run
# into the underlying artifact location.
runs_uri = f"runs:/{run.info.run_id}/model"
model_src = RunsArtifactRepository.get_underlying_uri(runs_uri)

# Register that artifact as a new version of the registered model
mv = client.create_model_version(
    name,
    model_src,
    run.info.run_id,
    description=desc,
)

# Print the main properties of the new version
version_summary = (
    ("Name: {}", mv.name),
    ("Version: {}", mv.version),
    ("Description: {}", mv.description),
    ("Status: {}", mv.status),
    ("Stage: {}", mv.current_stage),
)
for template, value in version_summary:
    print(template.format(value))

### Visualize the version

In [None]:
# Link to the registered model page in the MLflow UI (shown as the cell output)
f'click https://ml-registry.course.aiengineer.codex-platform.com/#/models/{mv.name}'

![versions](./images/versions.png)

## 3. Pull the model from the registry to test some sample predictions


### 3.0.1 pull the model using the snippet available in the experiment snippet page

In [None]:
import mlflow.pyfunc

model_name = mv.name
# The stage is part of the model URI, so it must be text. The previous value,
# the Python object None, only worked because the f-string rendered it as the
# text "None". Spelling the stage name out makes that intent explicit.
# NOTE(review): "None" is presumably the stage of a version that has not been
# promoted yet — confirm against the stage printed when the version was created.
stage = "None"

# Load the model as a generic python function model from the registry
modelfromRegistry = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{stage}"
)

### 3.0.2 get some feast offline data to score

In [None]:
# Feature service that defines which features are queried
feature_service = fs.get_feature_service("taxi_trip_service_v2")

# Scope of the retrieval: here just two examples to score (indexes 12 and 78),
# stamped with the current UTC time so that every value recorded before now
# is taken into account.
entity_df = pd.DataFrame({"index": [12, 78]})
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)

# Fetch the wanted features for the scoped entities, then materialize
# the result as a pandas DataFrame.
scoring_job = fs.get_historical_features(
    entity_df=entity_df,
    features=feature_service,
)
feature_vector = scoring_job.to_df()


In [None]:
# Preview the first retrieved rows before scoring them
feature_vector.head()

### 3.0.3 Run a prediction on the Feast offline data using the model pulled from the registry

In [None]:
# Build the scoring input the same way xgboost_train() builds its features:
# keep only the int/float columns, then remove the label. Passing the raw
# frame would hand the model columns it was never trained on.
# NOTE(review): assumes the scoring frame has the same column dtypes as the training frame — confirm.
scoring_features = (
    feature_vector
    .select_dtypes(include=['int', 'float'])
    .drop('tips', axis=1)
)
modelfromRegistry.predict(scoring_features.head())