In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import mlflow
import datetime
import warnings

from lib.norms import Norm1Scaler, Norm3Scaler
from lib.reproduction import major_oxides
from experiments.optuna_run import get_data
from lib.cross_validation import get_cross_validation_metrics, perform_cross_validation
from lib.get_preprocess_fn import get_preprocess_fn
from lib.metrics import rmse_metric, std_dev_metric
from ngboost import NGBRegressor

In [8]:
# Normalisation variant; the training cell uses it to pick between
# Norm1Scaler and Norm3Scaler and logs it as an MLflow param.
norm = 3

# Columns handed to get_preprocess_fn as `drop_cols`: every major-oxide
# column plus the two sample-identifier columns.
drop_cols = [*major_oxides, "ID", "Sample Name"]

In [12]:
# Train one NGBoost regressor per major oxide and track every run in MLflow.
#
# For each target this cell:
#   1. logs the run parameters,
#   2. cross-validates an NGBRegressor on the folds returned by get_data,
#   3. fits a fresh NGBRegressor on the train split and scores it on the
#      test split,
#   4. logs the CV metrics and the test RMSE / std-dev to MLflow.
# Fitted models are collected in `models`, in the order of `major_oxides`.

# Silence FutureWarning / UserWarning noise (applies to the whole kernel
# session from here on, not only to this cell).
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

# Supported normalisation variants. Validate up front so that an unsupported
# `norm` fails loudly instead of silently falling back to Norm3Scaler while
# the runs are logged under a different norm value.
scaler_classes = {1: Norm1Scaler, 3: Norm3Scaler}
if norm not in scaler_classes:
    raise ValueError(
        f"Unsupported norm {norm!r}; expected one of {sorted(scaler_classes)}"
    )

# Single seed shared by the cross-validation model and the final model.
random_state = 42

models = []

# A timestamped experiment name gives every execution of this cell its own
# MLflow experiment.
mlflow.set_experiment(
    f'NGB_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}'
)

for target in major_oxides:
    folds, train, test = get_data(target)
    with mlflow.start_run(run_name=f"NGB_{target}"):
        # Log params first: training is slow, and if the run is interrupted
        # the partial run is still identifiable in the MLflow UI.
        mlflow.log_params({"target": target, "norm": norm})

        # One scaler instance per target, shared by the CV and final
        # preprocessing functions.
        scaler = scaler_classes[norm]()
        preprocess_kwargs = {
            "target_col": target,
            "drop_cols": drop_cols,
            "preprocessor": scaler,
        }

        # == Cross Validation ==
        cv_metrics = perform_cross_validation(
            model=NGBRegressor(random_state=random_state),
            preprocess_fn=get_preprocess_fn(**preprocess_kwargs),
            folds=folds,
            metric_fns=[rmse_metric, std_dev_metric],
        )

        mlflow.log_metrics(get_cross_validation_metrics(cv_metrics).as_dict())

        # == Training ==
        preprocess_fn = get_preprocess_fn(**preprocess_kwargs)
        X_train, y_train, X_test, y_test = preprocess_fn(train, test)

        # Fit a fresh model on the train split.
        model = NGBRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        models.append(model)

        # == Evaluation on the test split ==
        pred = model.predict(X_test)
        rmse = rmse_metric(y_test, pred)
        std_dev = std_dev_metric(y_test, pred)

        mlflow.log_metrics({"rmse": rmse, "std_dev": std_dev})

        # NOTE: fitted models are only kept in memory (in `models`); they are
        # not logged to MLflow as artifacts.

Running fold 1 with size: 1513 train and 415 test
Running fold 2 with size: 1518 train and 410 test
Running fold 3 with size: 1558 train and 370 test
[iter 0] loss=4.0852 val_loss=0.0000 scale=1.0000 norm=10.2342
[iter 0] loss=4.0899 val_loss=0.0000 scale=1.0000 norm=10.2236
[iter 0] loss=4.1244 val_loss=0.0000 scale=1.0000 norm=10.6688


KeyboardInterrupt: 