# Model registry: MLFlow

Example on how to use the MLFlow model during the training of a model

## 1. Install MLFlow

In [7]:
# !pip install mlflow
# !pip freeze | grep mlflow

## 2. Setup MLFlow Tracking Server
Run the following command in a terminal to start the MLFlow Tracking Server

Configure mlflow to store all in a local directory

mlflow server --backend-store-uri ./mlruns --default-artifact-root ./mlruns

## 3. Create a new experiment

In [1]:
# This is a local tracking server. You can also use a remote tracking server. See https://mlflow.org/docs/latest/tracking.html for more details.
# Remote tracking server will be simulated later

# Uncomment if using the docker compose setup:
# import os
# os.environ["AWS_ACCESS_KEY_ID"] = "minio"
# os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123"
# os.environ["MLFLOW_S3_ENDPOINT_URL"] = f"http://localhost:9000"
# os.environ["MYSQL_DATABASE"] = "mlflow_database"
# os.environ["MYSQL_USER"] = "mlflow_user"
# os.environ["MYSQL_PASSWORD"] = "mlflow"
# os.environ["MYSQL_ROOT_PASSWORD"] = "mysql"



import mlflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("mlflow-model-training-iris")

2023/06/18 11:13:46 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-model-training-iris' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/dlri/Documents/TEACHING/mlruns/128870716038014623', creation_time=1687079626468, experiment_id='128870716038014623', last_update_time=1687079626468, lifecycle_stage='active', name='mlflow-model-training-iris', tags={}>

## 4. Log a model experiment result

In [2]:
# Lets first create a simple function to train a model
# The function will receive a set of hyperparameters and return the model and the accuracy
import datetime

def get_data():
    from sklearn import datasets

    # X, y = datasets.make_classification(n_samples=1000, n_features=4, n_informative=4, n_redundant=0, n_classes=3, n_clusters_per_class=1, class_sep=0.5, random_state=40)
    # use iris dataset
    X, y = datasets.load_iris(return_X_y=True)
    return X, y

def train_model(hyperparameters):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    import mlflow.sklearn
    X, y = get_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
    model = RandomForestClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    return model, accuracy

# Now lets train a model with some hyperparameters
hyperparameters = {"n_estimators": 10, "max_depth": 5}
model, accuracy = train_model(hyperparameters)

# Now lets log the model and the accuracy
date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
with mlflow.start_run(run_name=f"mlflow-training-{date}"):
    
    # Hyperparameters are logged as parameters
    for hyperparameter, value in hyperparameters.items():
        mlflow.log_param(hyperparameter, value)
    
    # Accuracy is logged as a metric 
    mlflow.log_metric("accuracy", accuracy)
    
    # Model is logged as an artifact
    mlflow.sklearn.log_model(model, "model")



## 5. Run a hyperparameter search and log the results

In [5]:
!pip install optuna==3.2.0

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting tqdm (from optuna)
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Installing collected packages: tqdm, colorlog, cmaes, optuna
Successfully installed cmaes-0.9.1 colorlog-6.7.0 optuna-3.2.0 tqdm-4.65.0


In [6]:

import optuna
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = get_data()

def optimize_rf(trial):
    
    with mlflow.start_run(run_name=f"optuna-hp-{trial.number}-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"):
        # Set the hyperparameter values that we want to optimize
        n_estimators = trial.suggest_int('n_estimators', 1, 100)
        max_depth = trial.suggest_int('max_depth', 2, 10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
        max_features = trial.suggest_float('max_features', 0.1, 1.0)
        
        
        # Create a random forest classifier using the suggested hyperparameters
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    max_features=max_features)
        
        # Use cross-validation to evaluate the performance of the classifier
        scores = cross_val_score(rf, X, y, cv=5)
        
        # Log the hyperparameters and cross-validation scores to MLflow
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_depth)
        mlflow.log_param('min_samples_split', min_samples_split)
        mlflow.log_param('min_samples_leaf', min_samples_leaf)
        mlflow.log_param('max_features', max_features)
        mlflow.log_metric('mean_cv_score', scores.mean())
    
    # Return the mean of the cross-validation scores as the objective value
    return scores.mean()

# Create an Optuna study
study = optuna.create_study()

# Run the optimization loop
study.optimize(optimize_rf, n_trials=50)

# Get the best hyperparameter values
best_params = study.best_params

  from .autonotebook import tqdm as notebook_tqdm
[I 2023-06-18 11:17:00,931] A new study created in memory with name: no-name-34deb260-3248-4052-9810-007ea5c557e1
[I 2023-06-18 11:17:01,124] Trial 0 finished with value: 0.9533333333333334 and parameters: {'n_estimators': 61, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 0.7382873729535502}. Best is trial 0 with value: 0.9533333333333334.
[I 2023-06-18 11:17:01,381] Trial 1 finished with value: 0.9666666666666668 and parameters: {'n_estimators': 97, 'max_depth': 10, 'min_samples_split': 19, 'min_samples_leaf': 1, 'max_features': 0.6576244310627135}. Best is trial 0 with value: 0.9533333333333334.
[I 2023-06-18 11:17:01,500] Trial 2 finished with value: 0.9533333333333334 and parameters: {'n_estimators': 31, 'max_depth': 5, 'min_samples_split': 11, 'min_samples_leaf': 2, 'max_features': 0.9816136942568103}. Best is trial 0 with value: 0.9533333333333334.
[I 2023-06-18 11:17:01,655] Trial 3 finished with 

## 6. Register a model version using the best hyperparameters

In [13]:
from mlflow.models.signature import infer_signature

with mlflow.start_run(run_name=f"optuna-hp-final"):
    # Create the final model using the best hyperparameters
    final_model = RandomForestClassifier(**best_params)

    # Train the final model on the entire dataset
    final_model.fit(X, y)

    # Infer the model signature
    signature = infer_signature(X, final_model.predict(X))

    # Log the model to the "Models" section
    mlflow.sklearn.log_model(final_model, "random_forest_model", registered_model_name="random_forest_model", signature=signature)

Registered model 'random_forest_model' already exists. Creating a new version of this model...
2023/06/18 11:29:43 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest_model, version 5
Created version '5' of model 'random_forest_model'.


## 7. Retrieve a model version and use it for prediction

In [9]:
import mlflow.pyfunc

model_name = "random_forest_model"
model_version = 1

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{model_version}"
)

model.predict(X[:5])

 - mlflow (current: 2.4.1, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


array([0, 0, 0, 0, 0])