<img src="datamecum_logo.png" align="right" style="float" width="400">
<font color="#CA3532"><h1 align="left">Programa técnico intensivo en data science. Datamecum.</h1></font>
<font color="#6E6E6E"><h2 align="left">Módulo Implementación de Sistemas Data-driven.</h2></font> 
<font color="#6E6E6E"><h2 align="left">Tarea 0. Usando MLFlow como model registry.</h2></font> 

#### Daniel Ruiz Riquelme

# Tasks:
1. Install MLFlow
2. Setup MLFlow Tracking Server
3. Create a new experiment
4. Log a model experiment result
5. Run a hyperparameter search and log the results (OPTIONAL)
6. Register a model version using the best hyperparameters (OPTIONAL - if you did the previous step)
7. Retrieve a model version and use it for prediction

In [12]:
# 1. Install MLFlow
# !pip install mlflow[extras]
# !pip freeze | grep mlflow

In [13]:
# 2. Setup MLFlow Tracking Server
# Run the following command in a terminal to start the MLFlow Tracking Server
# Configure mlflow to store all in a local directory
# !mlflow server --backend-store-uri ./mlruns --default-artifact-root ./mlruns

In [24]:
# 3. Create a new experiment
# This is a local tracking server. You can also use a remote tracking server. See https://mlflow.org/docs/latest/tracking.html for more details.
# Remote tracking server will be simulated later

# Uncomment if using the docker compose setup:
# import os
# os.environ["AWS_ACCESS_KEY_ID"] = "minio"
# os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123"
# os.environ["MLFLOW_S3_ENDPOINT_URL"] = f"http://localhost:9000"
# os.environ["MYSQL_DATABASE"] = "mlflow_database"
# os.environ["MYSQL_USER"] = "mlflow_user"
# os.environ["MYSQL_PASSWORD"] = "mlflow"
# os.environ["MYSQL_ROOT_PASSWORD"] = "mysql"



import mlflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("mlflow-tutorial")

<Experiment: artifact_location='s3://mlflow/1', creation_time=1672164435455, experiment_id='1', last_update_time=1672164435455, lifecycle_stage='active', name='mlflow-tutorial', tags={}>

In [25]:
# 4. Log a model experiment result

# Lets first create a simple function to train a model
# The function will receive a set of hyperparameters and return the model and the accuracy
import datetime

def get_data():
    from sklearn import datasets

    X, y = datasets.make_classification(n_samples=1000, n_features=4, n_informative=4, n_redundant=0, n_classes=3, n_clusters_per_class=1, class_sep=0.5, random_state=40)
    return X, y

def train_model(hyperparameters):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    import mlflow.sklearn
    X, y = get_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
    model = RandomForestClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    return model, accuracy

# Now lets train a model with some hyperparameters
hyperparameters = {"n_estimators": 10, "max_depth": 5}
model, accuracy = train_model(hyperparameters)

# Now lets log the model and the accuracy
date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
with mlflow.start_run(run_name=f"mlflow-tutorial-{date}"):
    
    # Hyperparameters are logged as parameters
    for hyperparameter, value in hyperparameters.items():
        mlflow.log_param(hyperparameter, value)
    
    # Accuracy is logged as a metric 
    mlflow.log_metric("accuracy", accuracy)
    
    # Model is logged as an artifact
    mlflow.sklearn.log_model(model, "model")

In [26]:
# 5. Run a hyperparameter search and log the results
import optuna
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = get_data()

def optimize_rf(trial):
    
    with mlflow.start_run(run_name=f"optuna-hp-{trial.number}-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"):
        # Set the hyperparameter values that we want to optimize
        n_estimators = trial.suggest_int('n_estimators', 1, 100)
        max_depth = trial.suggest_int('max_depth', 2, 10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
        max_features = trial.suggest_float('max_features', 0.1, 1.0)
        
        
        # Create a random forest classifier using the suggested hyperparameters
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    max_features=max_features)
        
        # Use cross-validation to evaluate the performance of the classifier
        scores = cross_val_score(rf, X, y, cv=5)
        
        # Log the hyperparameters and cross-validation scores to MLflow
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_depth)
        mlflow.log_param('min_samples_split', min_samples_split)
        mlflow.log_param('min_samples_leaf', min_samples_leaf)
        mlflow.log_param('max_features', max_features)
        mlflow.log_metric('mean_cv_score', scores.mean())
    
    # Return the mean of the cross-validation scores as the objective value
    return scores.mean()

# Create an Optuna study
study = optuna.create_study()

# Run the optimization loop
study.optimize(optimize_rf, n_trials=100)

# Get the best hyperparameter values
best_params = study.best_params

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2022-12-27 19:17:41,554][0m A new study created in memory with name: no-name-f352d7a9-e59b-47a4-af3d-878b47c56ede[0m
[32m[I 2022-12-27 19:17:44,377][0m Trial 0 finished with value: 0.7030000000000001 and parameters: {'n_estimators': 99, 'max_depth': 2, 'min_samples_split': 12, 'min_samples_leaf': 12, 'max_features': 0.9175075767031363}. Best is trial 0 with value: 0.7030000000000001.[0m
[32m[I 2022-12-27 19:17:46,772][0m Trial 1 finished with value: 0.7989999999999999 and parameters: {'n_estimators': 59, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 20, 'max_features': 0.789342314961703}. Best is trial 0 with value: 0.7030000000000001.[0m
[32m[I 2022-12-27 19:17:48,795][0m Trial 2 finished with value: 0.8160000000000001 and parameters: {'n_estimators': 71, 'max_depth': 8, 'min_samples_split': 9, 'min_samples_leaf': 12, 'max_features': 0.20821254130743697}. Best is trial 0 with value: 0.7030000000000001.[

In [28]:
# 6. Register a model version using the best hyperparameters

with mlflow.start_run(run_name=f"optuna-hp-final"):
    # Create the final model using the best hyperparameters
    final_model = RandomForestClassifier(**best_params)

    # Train the final model on the entire dataset
    final_model.fit(X, y)

    # Log the model to the "Models" section
    mlflow.sklearn.log_model(final_model, "random_forest_model", registered_model_name="random_forest_model")



Registered model 'random_forest_model' already exists. Creating a new version of this model...
2022/12/27 19:22:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random_forest_model, version 2
Created version '2' of model 'random_forest_model'.


In [29]:
# 7. Retrieve a model version and use it for prediction

import mlflow.pyfunc

model_name = "random_forest_model"
model_version = 2

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{model_version}"
)

model.predict(X[:5])

array([2, 1, 0, 2, 2])

In [31]:
# EXTRA: Deploy the model to a REST API endpoint with mlflow models serve
# Set up env variables
# export $(grep -v '^#' .env | xargs)
# export MLFLOW_TRACKING_URI=http://localhost:5000

Usage: mlflow models [OPTIONS] COMMAND [ARGS]...
Try 'mlflow models --help' for help.

Error: No such command 'build'.


In [35]:
X[:5]

array([[ 1.48722421,  1.02475923,  6.03974507, -1.72832624],
       [ 1.66930138,  1.34532094, -0.22424842, -0.56881206],
       [ 0.47854928, -1.15415362, -2.4204069 ,  1.76360495],
       [ 0.19853164,  0.78233069,  2.16172363, -1.78084574],
       [-0.13384718,  0.10490039,  2.89640135, -1.02639753]])

In [34]:
# Using python_function backend
# mlflow models serve --model-uri s3://mlflow/1/e18a1a1695e844258403040ae6e6cb4b/artifacts/random_forest_model --env-manager=local -p 5002
# Try it out with curl

# curl -d '{"inputs":[[ 1.48722421,  1.02475923,  6.03974507, -1.72832624]]}' -H 'Content-Type: application/json'  localhost:5002/invocations

In [None]:
# Using MLServer backend
# mlflow models serve --model-uri s3://mlflow/1/e18a1a1695e844258403040ae6e6cb4b/artifacts/random_forest_model --env-manager=local -p 5002 --enable-mlserver
# Try it out with curl

# curl -d '{"inputs":[[ 1.48722421,  1.02475923,  6.03974507, -1.72832624]]}' -H 'Content-Type: application/json'  localhost:5002/invocations



In [None]:
# Undeploy mlflow docker compose
# docker-compose down