In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from lib.experiment_setup import Experiment
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Shared experiment handle; `gbr_experiment` uses it to log the fitted model.
experiment = Experiment(name="GBR", norm=3)
# Accumulators kept for backward compatibility; nothing in this cell fills them.
models = []
rmse_scores = []

# One seed drives both the estimator and the candidate sampling, so a
# Restart-and-Run-All reproduces the same search and the same best model.
RANDOM_SEED = 42

# Search space for GradientBoostingRegressor.
# `random_state` and `warm_start` are deliberately NOT searched: a seed is not a
# hyperparameter (tuning it only fits CV noise and can select an unseeded model),
# and `warm_start` has no effect here because every candidate is a fresh clone
# that is fitted exactly once.
gbr_params = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'min_weight_fraction_leaf': [0.0, 0.1],
    'max_depth': [3, 4, 5],
    'min_impurity_decrease': [0.0, 0.01],
    'init': [None, 'zero'],
    'max_features': [None, 'sqrt', 'log2'],
    # Only consulted by the 'huber' and 'quantile' losses.
    'alpha': [0.9, 0.95],
    'verbose': [0],
    'max_leaf_nodes': [None, 4, 8],
    # Only consulted when `n_iter_no_change` is not None (early stopping).
    'validation_fraction': [0.1],
    'n_iter_no_change': [None, 5],
    'tol': [1e-4],
    'ccp_alpha': [0.0, 0.01]
}

gb_regressor = GradientBoostingRegressor(random_state=RANDOM_SEED)
rand_search = RandomizedSearchCV(
    estimator=gb_regressor,
    param_distributions=gbr_params,
    n_iter=150,
    cv=5,
    n_jobs=-1,
    verbose=3,
    random_state=RANDOM_SEED,  # fixes which 150 candidates get sampled
)

In [None]:
def gbr_experiment(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    target: str,
    norm: int,
):
    """Tune a gradient-boosting regressor for one target and log the outcome to MLflow.

    Fits the module-level ``rand_search`` on the training split, scores the
    refitted best estimator on the test split with RMSE, then logs the metric,
    the winning hyperparameters and the model itself.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets; only the ``target`` column is used.
        y_test: Test targets; only the ``target`` column is used.
        target: Name of the column in ``y_train`` / ``y_test`` to model.
        norm: Accepted as part of the callback signature; not used in this function.

    Returns:
        None. Results are reported through MLflow only.
    """
    # Hyperparameter search on the selected target column.
    rand_search.fit(X_train, y_train[target])
    tuned_model = rand_search.best_estimator_

    # Held-out error of the winning configuration.
    test_predictions = tuned_model.predict(X_test)
    test_mse = mean_squared_error(y_test[target], test_predictions)
    test_rmse = float(np.sqrt(test_mse))

    # Report to MLflow: metric first, then the best parameters found by the
    # randomized search, then the fitted estimator.
    mlflow.log_metric("rmse", test_rmse)
    mlflow.log_params(rand_search.best_params_)
    experiment.mlflow_sklearn_log_model(tuned_model)


# Hand `gbr_experiment` to the experiment runner as a callback.
# NOTE(review): presumably `Experiment.run` supplies the train/test splits, the
# target name and the norm, and manages the MLflow run — confirm in
# lib.experiment_setup.
experiment.run(gbr_experiment)