## Import libraries

In [6]:
import warnings
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import mlflow
import mlflow.sklearn
import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

warnings.filterwarnings("ignore")
np.random.seed(40)

## Import Data

In [7]:
# Read the wine-quality csv file from the URL
csv_url = (
    "https://raw.githubusercontent.com/mlflow/mlflow/master/tests/datasets/winequality-red.csv"
)
try:
    data = pd.read_csv(csv_url, sep=";")
except Exception as e:
    logger.exception(
        "Unable to download training & test CSV, check your internet connection. Error: %s", e
    )

# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(data)
# The predicted column is "quality" which is a scalar from [3, 9]
train_x = train.drop(["quality"], axis=1)
test_x = test.drop(["quality"], axis=1)
train_y = train[["quality"]]
test_y = test[["quality"]]
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1316,5.4,0.74,0.0,1.2,0.041,16.0,46.0,0.99258,4.01,0.59,12.5,6
1507,7.5,0.38,0.57,2.3,0.106,5.0,12.0,0.99605,3.36,0.55,11.4,6
849,6.4,0.63,0.21,1.6,0.08,12.0,32.0,0.99689,3.58,0.66,9.8,5
82,7.4,0.5,0.47,2.0,0.086,21.0,73.0,0.997,3.36,0.57,9.1,5
644,9.9,0.54,0.45,2.3,0.071,16.0,40.0,0.9991,3.39,0.62,9.4,5


## Hyperparameter tuning and track each run

In [8]:
mlflow.set_experiment("Random Forest hyperparameter tuning")

<Experiment: artifact_location='file:///Users/guro/Desktop/My%20Workspace/Medium/Experiment-Tracking/mlruns/406104440280023279', creation_time=1689449402242, experiment_id='406104440280023279', last_update_time=1689449402242, lifecycle_stage='active', name='Random Forest hyperparameter tuning', tags={}>

In [9]:
def objective(params):
    
    # Create a Random Forest Regressor with the given hyperparameters
    model = RandomForestRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf']
    )
    
    # Fit the model on the training data
    model.fit(train_x, train_y)
    
    # Predict on the test data
    y_pred = model.predict(test_x)
    
    # Calculate the mean squared error
    mse = mean_squared_error(test_y, y_pred)
    
    # Log the hyperparameters and evaluation metric to MLflow
    with mlflow.start_run():
        mlflow.set_tag("model", "RandomForest")
        mlflow.log_params(params)
        mlflow.log_metric('mse', mse)
    
    # Return the evaluation metric
    return {'loss': mse, 'status': STATUS_OK}

In [10]:
space = {
    'n_estimators': hp.choice('n_estimators', range(10, 200)),
    'max_depth': hp.choice('max_depth', range(1, 20)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 20)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 20)),
}

# Run the hyperparameter optimization using Hyperopt
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [00:25<00:00,  3.99trial/s, best loss: 0.3371851636033816]
Best Hyperparameters:
{'max_depth': 14, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 65}
