# Hyperparameter tuning with MLflow and Optuna

Setting up and training an ElasticNet model with hyperparameter tuning using Optuna and MLflow. As seen in this [post]().

## Dependencies

In [1]:
import mlflow
import numpy as np
import optuna
from optuna.integration.mlflow import MLflowCallback
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

## Setup

This dataset is from UCI's [machine learning repository](http://archive.ics.uci.edu/dataset/186/wine+quality). For this example, I'm replicating MLflow's tutorial (you can find it [here](https://mlflow.org/docs/latest/tutorials-and-examples/tutorial.html)).

In [2]:
# Semicolon-separated red-wine quality CSV hosted in the MLflow repository;
# read by load().
CSV_URL = "https://raw.githubusercontent.com/mlflow/mlflow/master/tests/datasets/winequality-red.csv"
# Passed as `test_size` to train_test_split when the data is split.
TEST_SIZE = 0.25
# Passed as `random_state` to train_test_split when the data is split.
RANDOM_STATE = 42
# Handed to mlflow.set_tracking_uri; a loopback address, so presumably an
# MLflow tracking server is expected to be running locally on port 5000.
TRACKING_URI = "http://127.0.0.1:5000"

In [3]:
def load(url: "str | None" = None) -> pd.DataFrame:
    """
    Load the wine-quality csv file.

    Args:
        url: Path or URL of a semicolon-separated CSV file. When omitted,
            the notebook-level ``CSV_URL`` is used.

    Returns:
        pd.DataFrame: The wine-quality dataset.

    Raises:
        RuntimeError: If the file cannot be downloaded or parsed. The
            underlying exception is chained as the cause.
    """
    # Resolve the default at call time so a later change to CSV_URL is honored.
    source = CSV_URL if url is None else url
    try:
        return pd.read_csv(source, sep=";")
    except Exception as e:
        # Fail right here with context. Printing and carrying on would only
        # defer the failure to an UnboundLocalError on the return statement.
        raise RuntimeError(
            f"Unable to download training & test CSV from {source!r}, "
            f"check your internet connection. Error: {e}"
        ) from e

def prepare(
    data: pd.DataFrame, target: str = "quality"
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Split the wine-quality dataset into features and labels.

    The input frame is not modified.

    Args:
        data (pd.DataFrame): The wine-quality dataset.
        target (str): Name of the label column. Defaults to "quality".

    Returns:
        pd.DataFrame: Every column of ``data`` except ``target``.
        pd.DataFrame: A single-column frame holding ``target`` (kept
            two-dimensional, i.e. shape ``(n_rows, 1)``).

    Raises:
        KeyError: If ``target`` is not a column of ``data``.
    """
    if target not in data.columns:
        raise KeyError(
            f"label column {target!r} not found; "
            f"available columns: {list(data.columns)}"
        )

    x = data.drop(columns=[target])
    # Double brackets keep the labels as a one-column DataFrame, not a Series.
    y = data[[target]]

    return x, y



## Load and prepare data

In [4]:
# Fetch the raw frame, then separate features from the label column.
data = load()
x, y = prepare(data)

# Hold out TEST_SIZE of the rows; RANDOM_STATE fixes the shuffle.
# These four names are read as globals by the Optuna objective further down.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)


## Hyperparameter search

### Setup MLFlow

In [5]:
# Point the MLflow client at the configured tracking URI, then read the value
# back from MLflow so later cells use whatever MLflow actually holds.
mlflow.set_tracking_uri(TRACKING_URI)
tracking_uri = mlflow.get_tracking_uri()


In [6]:
# Optuna -> MLflow bridge. It is used twice below: as the decorator on the
# objective (`track_in_mlflow`) and as a callback passed to `study.optimize`.
# The objective returns an RMSE, hence the metric name.
mlflc = MLflowCallback(
    tracking_uri=tracking_uri,
    metric_name="rmse",
)


### Setup Optuna

The objective function below trains an ElasticNet model for each trial and returns its RMSE on the test split, which the study minimizes.

In [7]:
@mlflc.track_in_mlflow()
def objective(trial: optuna.trial.Trial) -> float:
    """
    Score one Optuna trial: fit an ElasticNet and return its test-set RMSE.

    The data is not passed in. The function reads ``x_train``, ``y_train``,
    ``x_test`` and ``y_test`` from the notebook's global namespace, so the
    split cell must have been run first.

    Args:
        trial: An Optuna `Trial` object used to sample ``alpha`` and
            ``l1_ratio``, each from 0.05 to 1.0 in steps of 0.05.

    Returns:
        The root mean squared error (RMSE) of the fitted model's predictions
            on ``x_test`` against ``y_test``.

    Side effects:
        The fitted model is logged through ``mlflow.sklearn.log_model`` under
        the name "model" on every call.
    """
    # Sample in the same order as the parameters are declared to the study.
    alpha = trial.suggest_float("alpha", 0.05, 1.0, step=0.05)
    l1_ratio = trial.suggest_float("l1_ratio", 0.05, 1.0, step=0.05)

    regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    regressor.fit(x_train, y_train)
    mlflow.sklearn.log_model(regressor, "model")

    predictions = regressor.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, predictions))


## Training

In [8]:
# Timestamp the study name so each execution of this cell gets a distinct
# study (the captured log shows MLflow creating an experiment with that name).
date_run = datetime.now().strftime("%Y%m%d_%H%M%S")
study = optuna.create_study(
    direction="minimize",
    study_name=f"elastic_net_{date_run}",
    # Seed the sampler so the sequence of suggested hyperparameters is
    # repeatable on Restart & Run All; without a seed every run explores a
    # different set of trials.
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=100, callbacks=[mlflc])


[I 2023-08-31 16:29:17,886] A new study created in memory with name: elastic_net_20230831_162917
2023/08/31 16:29:17 INFO mlflow.tracking.fluent: Experiment with name 'elastic_net_20230831_162917' does not exist. Creating a new experiment.
[I 2023-08-31 16:29:19,573] Trial 0 finished with value: 0.7383336363712971 and parameters: {'alpha': 0.55, 'l1_ratio': 0.4}. Best is trial 0 with value: 0.7383336363712971.
[I 2023-08-31 16:29:20,819] Trial 1 finished with value: 0.7850026425141758 and parameters: {'alpha': 0.8500000000000001, 'l1_ratio': 0.7500000000000001}. Best is trial 0 with value: 0.7383336363712971.
[I 2023-08-31 16:29:22,011] Trial 2 finished with value: 0.7330321305614628 and parameters: {'alpha': 0.55, 'l1_ratio': 0.35000000000000003}. Best is trial 2 with value: 0.7330321305614628.
[I 2023-08-31 16:29:23,221] Trial 3 finished with value: 0.7849385305823857 and parameters: {'alpha': 0.7500000000000001, 'l1_ratio': 0.6500000000000001}. Best is trial 2 with value: 0.73303213