In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from lib.reproduction import major_oxides

In [3]:
from sklearn.model_selection import train_test_split
from lib.full_flow_dataloader import load_and_scale_data
from lib.utils import custom_train_test_split

norm = 3

train, test = load_and_scale_data(norm)

drop_cols = major_oxides + ["ID", "Sample Name"]

## - VALIDATION -
# split_train, split_val = custom_train_test_split(train, "Sample Name", test_size=0.2, random_state=42)

# X_train = split_train.drop(columns=drop_cols)
# y_train = split_train[major_oxides]
# X_val = split_val.drop(columns=drop_cols)
# y_val = split_val[major_oxides]

# Converting train set - comment out if using validation
X_train = train.drop(columns=drop_cols)
y_train = train[major_oxides]

# Converting test set
X_test = test.drop(columns=drop_cols)
y_test = test[major_oxides]

In [4]:
import datetime

mlflow.set_experiment(f'GPR_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')

2024/04/18 16:35:42 INFO mlflow.tracking.fluent: Experiment with name 'GPR_Norm3_20240418-163537' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/121599189731183263', creation_time=1713450945257, experiment_id='121599189731183263', last_update_time=1713450945257, lifecycle_stage='active', name='GPR_Norm3_20240418-163537', tags={}>

In [5]:
from ngboost import NGBRegressor
import warnings

# disable warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

models = []


for target in y_train.columns:
    with mlflow.start_run(run_name=f"NGB_{target}"):
        ngbr = NGBRegressor(random_state=42)
        ngbr.fit(X_train, y_train[target])

        y_pred = ngbr.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        mlflow.log_metric("rmse", float(rmse))
        mlflow.log_param("target", target)
        mlflow.log_param("norm", norm)

        models.append(ngbr)
        mlflow.sklearn.log_model(ngbr, f"model_{target}")

[iter 0] loss=4.1695 val_loss=0.0000 scale=1.0000 norm=11.1493
[iter 100] loss=3.1437 val_loss=0.0000 scale=2.0000 norm=6.9691
[iter 200] loss=2.4534 val_loss=0.0000 scale=2.0000 norm=4.0605
[iter 300] loss=2.0456 val_loss=0.0000 scale=2.0000 norm=3.3166
[iter 400] loss=1.8581 val_loss=0.0000 scale=2.0000 norm=3.0035
[iter 0] loss=1.2077 val_loss=0.0000 scale=1.0000 norm=0.8944
[iter 100] loss=0.2565 val_loss=0.0000 scale=2.0000 norm=0.9772
[iter 200] loss=-0.3455 val_loss=0.0000 scale=2.0000 norm=0.8381
[iter 300] loss=-0.6680 val_loss=0.0000 scale=2.0000 norm=0.8054
[iter 400] loss=-0.8126 val_loss=0.0000 scale=1.0000 norm=0.4052
[iter 0] loss=3.2855 val_loss=0.0000 scale=1.0000 norm=4.7661
[iter 100] loss=2.3983 val_loss=0.0000 scale=2.0000 norm=3.7614
[iter 200] loss=1.7422 val_loss=0.0000 scale=2.0000 norm=2.2410
[iter 300] loss=1.3566 val_loss=0.0000 scale=2.0000 norm=1.8570
[iter 400] loss=1.1683 val_loss=0.0000 scale=1.0000 norm=0.8562
[iter 0] loss=3.3389 val_loss=0.0000 scale