In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import mlflow
import numpy as np
from sklearn.metrics import mean_squared_error
from lib import full_flow_dataloader
from lib.full_flow_dataloader import load_train_test_data

Loading data: 100%|██████████| 414/414 [02:26<00:00,  2.82it/s]
Transforming samples: 100%|██████████| 308/308 [00:33<00:00,  9.26it/s]
Transforming samples: 100%|██████████| 78/78 [00:08<00:00,  9.39it/s]


In [8]:
# Single source of truth for the `norm` setting: it is passed to the data loader
# here and reused later for the MLflow experiment name, run params and model names.
norm = 3
# Pass the variable (not a second literal) so the loaded data always matches the logged `norm`.
X_train, y_train, X_test, y_test = load_train_test_data(norm=norm)

In [11]:
import datetime
import time
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# One MLflow experiment per notebook execution; the timestamp keeps repeated runs apart.
mlflow.set_experiment(f'GBR_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')

# Seed for the candidate sampling of the randomized search, so a re-run draws the
# same 150 parameter combinations instead of a fresh random set.
SEARCH_SEED = 42

# Populated inside the loop below: one entry per target, in y_train.columns order.
models = []
rmse_scores = []

# Search space for GradientBoostingRegressor. Each key is a constructor argument;
# each list holds the values the randomized search may sample from.
gbr_params = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'min_weight_fraction_leaf': [0.0, 0.1],
    'max_depth': [3, 4, 5],
    'min_impurity_decrease': [0.0, 0.01],
    'init': [None, 'zero'],
    # NOTE(review): the estimator seed is searched like a hyperparameter; candidates
    # that draw None are fitted unseeded and are therefore not exactly reproducible.
    'random_state': [None, 42],
    'max_features': [None, 'sqrt', 'log2'],
    # Only used by the 'huber' and 'quantile' losses.
    'alpha': [0.9, 0.95],
    'verbose': [0],
    'max_leaf_nodes': [None, 4, 8],
    'warm_start': [False, True],
    'validation_fraction': [0.1],
    'n_iter_no_change': [None, 5],
    'tol': [1e-4],
    'ccp_alpha': [0.0, 0.01]
}

gb_regressor = GradientBoostingRegressor()
# NOTE(review): no `scoring` is given, so candidates are ranked by the estimator's
# default score (R^2 for regressors), while the metric logged below is RMSE.
rand_search = RandomizedSearchCV(
    estimator=gb_regressor,
    param_distributions=gbr_params,
    n_iter=150,
    cv=5,
    n_jobs=-1,
    verbose=3,
    random_state=SEARCH_SEED,
)

# Train one independent model per target column, each in its own MLflow run.
for target in y_train.columns:
    # perf_counter is monotonic, so the elapsed time is immune to wall-clock adjustments.
    start_time = time.perf_counter()
    print(f"Starting training for target: {target}")

    with mlflow.start_run(run_name=f"GBR_{target}"):

        # Fit the randomized search on this target; refit leaves the best
        # configuration trained on the full training set.
        rand_search.fit(X_train, y_train[target])
        best_estimator = rand_search.best_estimator_

        # Evaluate the best estimator on the held-out test split.
        y_pred = best_estimator.predict(X_test)
        rmse = float(np.sqrt(mean_squared_error(y_test[target], y_pred)))

        # MLflow logging
        mlflow.log_metric("rmse", rmse)

        # Log the best parameters found by the randomized search
        mlflow.log_params(rand_search.best_params_)
        mlflow.log_param("target", target)
        mlflow.log_param("norm", norm)

        # Log the best estimator as a model
        mlflow.sklearn.log_model(best_estimator, f"model_{target}_{norm}")

        # Keep the results in the kernel too, so later cells can use them
        # without reloading from MLflow (previously these lists stayed empty).
        models.append(best_estimator)
        rmse_scores.append(rmse)

    elapsed_time = time.perf_counter() - start_time
    print(f"Completed training for target: {target}. Time taken: {elapsed_time} seconds.")

2024/03/07 17:43:53 INFO mlflow.tracking.fluent: Experiment with name 'GBR_Norm3_20240307-174352' does not exist. Creating a new experiment.


Starting training for target: SiO2
Fitting 5 folds for each of 150 candidates, totalling 750 fits
Completed training for target: SiO2. Time taken: 21459.992425203323 seconds.
Starting training for target: TiO2
Fitting 5 folds for each of 150 candidates, totalling 750 fits




Completed training for target: TiO2. Time taken: 17017.658001184464 seconds.
Starting training for target: Al2O3
Fitting 5 folds for each of 150 candidates, totalling 750 fits




Completed training for target: Al2O3. Time taken: 15088.231660604477 seconds.
Starting training for target: FeOT
Fitting 5 folds for each of 150 candidates, totalling 750 fits




Completed training for target: FeOT. Time taken: 14825.886114120483 seconds.
Starting training for target: MgO
Fitting 5 folds for each of 150 candidates, totalling 750 fits




Completed training for target: MgO. Time taken: 14817.101302146912 seconds.
Starting training for target: CaO
Fitting 5 folds for each of 150 candidates, totalling 750 fits




Completed training for target: CaO. Time taken: 14858.943468809128 seconds.
Starting training for target: Na2O
Fitting 5 folds for each of 150 candidates, totalling 750 fits




Completed training for target: Na2O. Time taken: 14952.649617433548 seconds.
Starting training for target: K2O
Fitting 5 folds for each of 150 candidates, totalling 750 fits




Completed training for target: K2O. Time taken: 16660.1279630661 seconds.
