In [6]:
# Enable IPython's autoreload extension in mode 2, which (per the IPython docs)
# re-imports all modules before each cell runs, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from lib.reproduction import major_oxides

In [8]:
from sklearn.model_selection import train_test_split
from lib.full_flow_dataloader import load_and_scale_data
from lib.utils import custom_train_test_split

# Normalisation setting handed to the data loader. The same value is embedded
# in the MLflow experiment name and logged as the "norm" run parameter.
# NOTE(review): the meaning of each norm value is defined inside
# load_and_scale_data — confirm there.
norm = 1

train, test = load_and_scale_data(norm)

# Columns removed to form the model inputs: every target in major_oxides plus
# the "ID" and "Sample Name" columns.
drop_cols = major_oxides + ["ID", "Sample Name"]

# --- Optional validation split -------------------------------------------
# Uncomment the lines below (and comment out the full-train assignment that
# follows) to hold out 20% of the training data, split on "Sample Name".
#
# split_train, split_val = custom_train_test_split(train, "Sample Name", test_size=0.2, random_state=42)
#
# X_train = split_train.drop(columns=drop_cols)
# y_train = split_train[major_oxides]
# X_val = split_val.drop(columns=drop_cols)
# y_val = split_val[major_oxides]

# Full training set -> (inputs, targets). Comment out if using validation.
X_train, y_train = train.drop(columns=drop_cols), train[major_oxides]

# Test set -> (inputs, targets).
X_test, y_test = test.drop(columns=drop_cols), test[major_oxides]

In [9]:
import datetime

# The experiment name carries the norm setting and a second-resolution
# timestamp, so each execution of this cell selects a differently named
# MLflow experiment.
experiment_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
mlflow.set_experiment(f"GPR_Norm{norm}_{experiment_stamp}")

2024/04/18 16:29:08 INFO mlflow.tracking.fluent: Experiment with name 'GPR_Norm3_20240418-162904' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/905130345086743371', creation_time=1713450551527, experiment_id='905130345086743371', last_update_time=1713450551527, lifecycle_stage='active', name='GPR_Norm3_20240418-162904', tags={}>

In [10]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
import warnings

# Install process-wide "ignore" filters for FutureWarning and UserWarning
# (warnings.simplefilter edits the global filter list, so later cells are
# affected as well).
# NOTE(review): scikit-learn's ConvergenceWarning derives from UserWarning, so
# optimiser-convergence messages raised during fitting are hidden too —
# confirm this is intended.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action="ignore", category=_ignored_category)

# Fitted regressors, appended in the column order of y_train.
models = []

# Fit one Gaussian-process regressor per target column. Each fit is recorded
# as its own MLflow run named "GPR_<target>".
for target in y_train.columns:
    with mlflow.start_run(run_name=f"GPR_{target}"):
        # A fresh kernel per target: scaled RBF plus a white-noise term.
        kernel = 1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=1)
        gpr_reg = GaussianProcessRegressor(kernel=kernel, random_state=42)
        gpr_reg.fit(X_train, y_train[target])

        # RMSE of the predictions for X_test against the matching y_test column.
        y_true = y_test[target]
        y_pred = gpr_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))

        # Record the score and the settings that identify this run.
        mlflow.log_metric("rmse", float(rmse))
        mlflow.log_param("target", target)
        mlflow.log_param("norm", norm)

        # Keep the model in memory and store it as a run artifact.
        models.append(gpr_reg)
        mlflow.sklearn.log_model(gpr_reg, f"model_{target}")