In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import mlflow
import numpy as np
import pandas as pd
import math
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

from lib.reproduction import major_oxides



In [14]:
from lib import full_flow_dataloader
from lib.config import AppConfig

mlflow.set_tracking_uri(AppConfig().mlflow_tracking_uri)

drop_cols = ["ID", "Sample Name"]

from sklearn.preprocessing import MaxAbsScaler, PowerTransformer

train_processed, test_processed = full_flow_dataloader.load_full_flow_data(load_cache_if_exits=True, average_shots=True)
target = major_oxides[0]

drop_cols.extend([oxide for oxide in major_oxides if oxide != target])
train_processed = train_processed.drop(columns=drop_cols)
test_processed = test_processed.drop(columns=drop_cols)

# Applying MaxAbsScaler and PowerTransformer
scaler = MaxAbsScaler()
power_transformer = PowerTransformer()

# Exclude target column from normalization
cols_to_normalize = [col for col in train_processed.columns if col not in drop_cols + [target]]

# Normalize only the columns that are not in drop_cols or the target column
train_processed[cols_to_normalize] = scaler.fit_transform(train_processed[cols_to_normalize])
train_processed[cols_to_normalize] = power_transformer.fit_transform(train_processed[cols_to_normalize])

test_processed[cols_to_normalize] = scaler.transform(test_processed[cols_to_normalize])
test_processed[cols_to_normalize] = power_transformer.transform(test_processed[cols_to_normalize])

# Reconstruct the DataFrame to include the target column
train_processed = pd.DataFrame(train_processed, columns=train_processed.columns)
test_processed = pd.DataFrame(test_processed, columns=test_processed.columns)

X_train = train_processed.drop(columns=[target])
y_train = train_processed[target]

X_test = test_processed.drop(columns=[target])
y_test = test_processed[target]


In [15]:
# https://mlflow.org/docs/latest/traditional-ml/hyperparameter-tuning-with-child-runs/notebooks/hyperparameter-tuning-with-child-runs.html
def get_or_create_experiment(experiment_name: str) -> str:
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)

In [16]:
experiment_id = get_or_create_experiment("optuna_experiment")

experiment_id

'202075117645542195'

In [17]:
mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location='mlflow-artifacts:/202075117645542195', creation_time=1713779016259, experiment_id='202075117645542195', last_update_time=1713779016259, lifecycle_stage='active', name='optuna_experiment', tags={}>

In [18]:
# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)

# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'


# https://mlflow.org/docs/latest/traditional-ml/hyperparameter-tuning-with-child-runs/notebooks/hyperparameter-tuning-with-child-runs.html
def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """

    winner = study.user_attrs.get("winner", None)

    if study.best_value and winner != study.best_value:
        study.set_user_attr("winner", study.best_value)
        if winner:
            improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")

In [19]:
def objective(trial):
    with mlflow.start_run(nested=True):
        # Define hyperparameters
        params = {
            "C": trial.suggest_float("C", 1e-3, 100.0, log=True),
            "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"]),
        }

        if params["kernel"] == "poly":
            params["degree"] = trial.suggest_int("degree", 2, 5)

        # Train SVR model
        model = SVR(C=params["C"], epsilon=params["epsilon"], kernel=params["kernel"], degree=params.get("degree", 3))
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        error = mean_squared_error(y_test, preds)

        # Log to MLflow
        mlflow.log_params(params)
        mlflow.log_metric("mse", float(error))
        mlflow.log_metric("rmse", math.sqrt(error))

    return error

In [20]:
run_name = f"SVR_{target}"

In [21]:
# Initiate the parent run and call the hyperparameter tuning child run logic
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # Initialize the Optuna study
    study = optuna.create_study(direction="minimize")

    # Execute the hyperparameter optimization trials.
    # Note the addition of the `champion_callback` inclusion to control our logging
    study.optimize(objective, n_trials=500, callbacks=[champion_callback])

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_mse", study.best_value)
    mlflow.log_metric("best_rmse", math.sqrt(study.best_value))

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "Stacking SVR",
            "optimizer_engine": "optuna",
            "model_family": "SVR",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    model = SVR(**study.best_params)

    artifact_path = "model"

    mlflow.sklearn.log_model(
        sklearn_model=model,
        artifact_path=artifact_path,
        input_example=train_processed.iloc[[0]],
        model_format="ubj",
        metadata={"model_data_version": 1},
    )

    # Get the logged model uri so that we can load it from the artifact store
    model_uri = mlflow.get_artifact_uri(artifact_path)


Initial trial 0 achieved value: 474.3755621384993
Trial 1 achieved value: 132.25590196816165 with  258.6801% improvement
Trial 2 achieved value: 66.51084247089571 with  98.8486% improvement
Trial 3 achieved value: 13.825796864355281 with  381.0634% improvement
Trial 48 achieved value: 13.82212665508972 with  0.0266% improvement
Trial 49 achieved value: 13.818523543958706 with  0.0261% improvement
