In [19]:
# Basic Libraries
import mlflow, warnings

# Machine Learning
import optuna, sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.model_selection import train_test_split

# Install a blanket "ignore" filter: no category or module is given, so it matches every warning.
# NOTE(review): this also hides deprecation warnings from the ML libraries — consider narrowing it.
warnings.filterwarnings("ignore")

In [10]:
# Make this the active experiment for every run started below;
# set_experiment creates the experiment on first use if it does not exist yet.
mlflow.set_experiment(experiment_name="Hyperparameter Tuning Experiment")

<Experiment: artifact_location='file:///Users/guane/Documentos/GitHub/MLOPS/Notebooks/mlruns/395073105550929572', creation_time=1767105597527, experiment_id='395073105550929572', last_update_time=1767105597527, lifecycle_stage='active', name='Hyperparameter Tuning Experiment', tags={}>

In [18]:
# Load the breast-cancer dataset as pandas objects, then hold out a validation
# split with a fixed seed so the same rows land in each part on every run.
X, y = load_breast_cancer(as_frame=True, return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=0,
)

# Display the shape of each piece of the split.
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((426, 30), (143, 30), (426,), (143,))

In [20]:
def objective(trial):
    """
    Optuna objective: train and score one Random Forest configuration.

    Each call opens a nested MLflow child run, samples hyperparameters from
    ``trial``, fits a ``RandomForestClassifier`` on the notebook-level
    ``X_train``/``y_train`` data, evaluates it on ``X_val``/``y_val``, and logs
    the parameters, the validation metrics and the fitted model to that run.
    The child run's id is stored on the trial as the ``"run_id"`` user
    attribute so the best run can be looked up after the study finishes.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        float: Negative validation accuracy. The study must therefore be
        created with ``direction="minimize"`` to maximize accuracy.
    """
    # Setting nested=True will create a child run under the parent run.
    with mlflow.start_run(nested=True, run_name=f"trial_{trial.number}") as child_run:
        # Hyperparameter suggestions (the "rf_" prefix only names the Optuna
        # parameters; the estimator receives the un-prefixed keys below).
        params = {
            "max_depth": trial.suggest_int("rf_max_depth", 2, 32),
            "n_estimators": trial.suggest_int("rf_n_estimators", 50, 300, step=10),
            "max_features": trial.suggest_float("rf_max_features", 0.2, 1.0),
            "min_samples_split": trial.suggest_int("rf_min_samples_split", 2, 10),
            # Fixed seed: differences between trials come from the
            # hyperparameters, not from the forest's own randomness.
            "random_state": 42,
        }

        # Log current trial's parameters
        mlflow.log_params(params)

        # Train classifier. The class is imported explicitly at the top of the
        # notebook instead of being reached through the bare `sklearn` package,
        # which does not guarantee that the `ensemble` submodule is loaded.
        classifier_obj = RandomForestClassifier(**params)
        classifier_obj.fit(X_train, y_train)

        # Predictions on the validation split
        y_pred = classifier_obj.predict(X_val)
        y_pred_proba = classifier_obj.predict_proba(X_val)

        # Calculate metrics
        accuracy = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average='binary')  # Use 'macro' for multiclass
        # predict_proba columns follow classifier_obj.classes_; passing them as
        # `labels` keeps log_loss aligned even if y_val lacks one of the classes.
        logloss = log_loss(y_val, y_pred_proba, labels=classifier_obj.classes_)

        # Log metrics
        mlflow.log_metrics({
            "accuracy": accuracy,
            "f1_score": f1,
            "log_loss": logloss
        })

        # Log the model with signature
        signature = mlflow.models.infer_signature(X_train, classifier_obj.predict(X_train))
        mlflow.sklearn.log_model(
            classifier_obj,
            artifact_path="model",
            signature=signature,
            input_example=X_train.iloc[:5]
        )

        # Make it easy to retrieve the best-performing child run later
        trial.set_user_attr("run_id", child_run.info.run_id)

        # Optuna minimizes by default, so negate accuracy (higher is better).
        return -accuracy

In [21]:
# Parent MLflow run: every Optuna trial is logged as a nested child run under it.
with mlflow.start_run(run_name="classification_study") as run:
    # Log the experiment settings
    n_trials = 30
    sampler_seed = 42
    mlflow.log_param("n_trials", n_trials)
    mlflow.log_param("task_type", "classification")
    mlflow.log_param("sampler_seed", sampler_seed)

    # objective() returns -accuracy, so the study has to minimize.
    # TPE is Optuna's default sampler; seeding it makes the sequence of
    # suggested hyperparameters (and hence the best trial) repeatable on a
    # fresh Restart & Run All instead of changing on every execution.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=sampler_seed),
    )
    study.optimize(objective, n_trials=n_trials)

    # Log the best trial's hyperparameters on the parent run
    mlflow.log_params(study.best_trial.params)

    # The study value is -accuracy; flip the sign to report a readable accuracy.
    best_accuracy = -study.best_value
    mlflow.log_metrics({
        "best_accuracy": best_accuracy,
        "best_negative_accuracy": study.best_value
    })

    # objective() stores each child run's id under "run_id"; record the best
    # one so the winning model can be located from the parent run.
    if best_run_id := study.best_trial.user_attrs.get("run_id"):
        mlflow.log_param("best_child_run_id", best_run_id)

    print(f"Best accuracy: {best_accuracy:.4f}")
    print(f"Best parameters: {study.best_trial.params}")

[I 2025-12-30 09:59:28,168] A new study created in memory with name: no-name-8d612e0b-698a-4d94-94d5-c544ffcb7e93
[I 2025-12-30 09:59:35,272] Trial 0 finished with value: -0.965034965034965 and parameters: {'rf_max_depth': 30, 'rf_n_estimators': 270, 'rf_max_features': 0.8250166096343861, 'rf_min_samples_split': 10}. Best is trial 0 with value: -0.965034965034965.
[I 2025-12-30 09:59:43,231] Trial 1 finished with value: -0.972027972027972 and parameters: {'rf_max_depth': 16, 'rf_n_estimators': 220, 'rf_max_features': 0.2322013447948213, 'rf_min_samples_split': 4}. Best is trial 1 with value: -0.972027972027972.
[I 2025-12-30 09:59:49,482] Trial 2 finished with value: -0.972027972027972 and parameters: {'rf_max_depth': 25, 'rf_n_estimators': 250, 'rf_max_features': 0.4261283756782451, 'rf_min_samples_split': 3}. Best is trial 1 with value: -0.972027972027972.
[I 2025-12-30 09:59:54,143] Trial 3 finished with value: -0.9790209790209791 and parameters: {'rf_max_depth': 4, 'rf_n_estimators

Best accuracy: 0.9790
Best parameters: {'rf_max_depth': 4, 'rf_n_estimators': 60, 'rf_max_features': 0.36116117487694643, 'rf_min_samples_split': 5}
