In [None]:
# MLFlow Tutorial
# This is a simple tutorial of using MLFlow to register experiments and model versions.
# 1. Install MLFlow
# 2. Setup MLFlow Tracking Server
# 3. Create a new experiment
# 4. Log a model version
# 5. View the model version in the MLFlow UI
# 6. Register a model version
# 7. View the model version in the MLFlow UI
# 8. Run a hyperparameter search and log the results
# 9. View the hyperparameter search results in the MLFlow UI

In [None]:
# 1. Install MLFlow
# !pip install mlflow[extras]
# !pip freeze | grep mlflow

In [None]:
# 2. Setup MLFlow Tracking Server
# Run the following command in a terminal to start the MLFlow Tracking Server
# Configure mlflow to store all in a local directory
# !mlflow server --backend-store-uri ./mlruns --default-artifact-root ./mlruns

In [1]:
# 3. Create a new experiment
# This is a local tracking server. You can also use a remote tracking server. See https://mlflow.org/docs/latest/tracking.html for more details.
# Remote tracking server will be simulated later
import mlflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("mlflow-tutorial")

<Experiment: artifact_location='./mlruns/136298590476637391', creation_time=1672075851197, experiment_id='136298590476637391', last_update_time=1672075851197, lifecycle_stage='active', name='mlflow-tutorial', tags={}>

In [2]:
# 4. Log a model version

#Lets first create a simple function to train a model
# The function will receive a set of hyperparameters and return the model and the accuracy
import datetime

def get_data():
    from sklearn import datasets

    X, y = datasets.make_classification(n_samples=1000, n_features=4, n_informative=4, n_redundant=0, n_classes=3, n_clusters_per_class=1, class_sep=0.5, random_state=40)
    return X, y

def train_model(hyperparameters):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    import mlflow.sklearn
    X, y = get_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
    model = RandomForestClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    return model, accuracy

# Now lets train a model with some hyperparameters
hyperparameters = {"n_estimators": 10, "max_depth": 5}
model, accuracy = train_model(hyperparameters)

# Now lets log the model and the accuracy
date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
with mlflow.start_run(run_name=f"mlflow-tutorial-{date}"):
    
    # Hyperparameters are logged as parameters
    for hyperparameter, value in hyperparameters.items():
        mlflow.log_param(hyperparameter, value)
    
    # Accuracy is logged as a metric 
    mlflow.log_metric("accuracy", accuracy)
    
    # Model is logged as an artifact
    mlflow.sklearn.log_model(model, "model")



In [6]:
# 8. Run a hyperparameter search and log the results
import optuna
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = get_data()

def optimize_rf(trial):
    
    with mlflow.start_run(run_name=f"optuna-hp-{trial.number}-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"):
        # Set the hyperparameter values that we want to optimize
        n_estimators = trial.suggest_int('n_estimators', 1, 100)
        max_depth = trial.suggest_int('max_depth', 2, 10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
        max_features = trial.suggest_float('max_features', 0.1, 1.0)
        
        
        # Create a random forest classifier using the suggested hyperparameters
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    max_features=max_features)
        
        # Use cross-validation to evaluate the performance of the classifier
        scores = cross_val_score(rf, X, y, cv=5)
        
        # Log the hyperparameters and cross-validation scores to MLflow
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_depth)
        mlflow.log_param('min_samples_split', min_samples_split)
        mlflow.log_param('min_samples_leaf', min_samples_leaf)
        mlflow.log_param('max_features', max_features)
        mlflow.log_metric('mean_cv_score', scores.mean())
    
    # Return the mean of the cross-validation scores as the objective value
    return scores.mean()

# Create an Optuna study
study = optuna.create_study()

# Run the optimization loop
study.optimize(optimize_rf, n_trials=100)

# Get the best hyperparameter values
best_params = study.best_params

[32m[I 2022-12-26 19:29:46,632][0m A new study created in memory with name: no-name-6fa8d55f-47c5-49a0-a98f-930ac6dcb28c[0m
[32m[I 2022-12-26 19:29:48,278][0m Trial 0 finished with value: 0.786 and parameters: {'n_estimators': 11, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 18, 'max_features': 0.13166750514300726}. Best is trial 0 with value: 0.786.[0m
[32m[I 2022-12-26 19:29:50,884][0m Trial 1 finished with value: 0.8309999999999998 and parameters: {'n_estimators': 85, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 9, 'max_features': 0.899413536343465}. Best is trial 0 with value: 0.786.[0m
[32m[I 2022-12-26 19:29:52,977][0m Trial 2 finished with value: 0.827 and parameters: {'n_estimators': 77, 'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 0.5223151270616982}. Best is trial 0 with value: 0.786.[0m
[32m[I 2022-12-26 19:29:54,563][0m Trial 3 finished with value: 0.818 and parameters: {'n_estimators': 69, 'max_

In [12]:
# 10. Create a model using the best model previously logged

with mlflow.start_run(run_name=f"optuna-hp-final"):
    # Create the final model using the best hyperparameters
    final_model = RandomForestClassifier(**best_params)

    # Train the final model on the entire dataset
    final_model.fit(X, y)

    # Log the model to the "Models" section
    mlflow.sklearn.log_model(final_model, "random_forest_model", registered_model_name="random_forest_model")



Registered model 'random_forest_model' already exists. Creating a new version of this model...
2022/12/26 19:51:17 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random_forest_model, version 2
Created version '2' of model 'random_forest_model'.


In [13]:
# Fetch a model

import mlflow.pyfunc

model_name = "random_forest_model"
model_version = 2

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{model_version}"
)

model.predict(X[:5])

array([2, 0, 0, 2, 2])