In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules (e.g. the
# local `lib` and `experiments` packages used below) are reloaded before each
# cell runs and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import mlflow
import datetime

from lib.reproduction import major_oxides
from lib.full_flow_dataloader import load_full_flow_data
from lib.norms import Norm1Scaler, Norm3Scaler
from sklearn.cross_decomposition import PLSRegression
from lib.get_preprocess_fn import get_preprocess_fn
from lib.cross_validation import CustomKFoldCrossValidator, get_cross_validation_metrics, perform_cross_validation
from lib.metrics import rmse_metric, std_dev_metric


In [3]:
# Columns handed to get_preprocess_fn as `drop_cols`: every oxide column plus
# the "ID" and "Sample Name" columns.
drop_cols = [*major_oxides, "ID", "Sample Name"]

# Normalisation variant used for this experiment: 1 selects Norm1Scaler, and
# the value used here (3) selects Norm3Scaler. It is also embedded in the
# MLflow experiment name and logged as the "norm" run parameter.
norm = 3

In [4]:
from experiments.optuna_run import get_data


# Final fitted models, appended in the iteration order of `major_oxides`.
models = []

# Hyperparameters shared by every PLSRegression built in this cell (the
# cross-validation models and the final per-target models alike).
pls_params = {
    "n_components": 34,
    "scale": True,
    "max_iter": 500,
}

# Resolve the scaler class before anything is sent to MLflow. An unsupported
# `norm` is rejected instead of silently falling back to Norm3Scaler, which
# would leave the experiment name and the logged "norm" parameter describing a
# normalisation that was never applied.
scaler_classes = {1: Norm1Scaler, 3: Norm3Scaler}
if norm not in scaler_classes:
    raise ValueError(f"Unsupported norm {norm!r}; expected one of {sorted(scaler_classes)}")
scaler_cls = scaler_classes[norm]

# The timestamp in the name means every execution of this cell targets a new
# experiment.
mlflow.set_experiment(f'PLS_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')

for target in major_oxides:
    folds, train, test = get_data(target)
    with mlflow.start_run(run_name=f"PLS_{target}"):
        # Log the configuration first, so a run that fails during
        # cross-validation or training still records what it was attempting.
        mlflow.log_params({
            **pls_params,
            "target": target,
            "norm": norm
        })

        # == Cross Validation ==
        # The CV pipeline gets its own scaler instance; the final fit below
        # uses a separate one, so nothing can carry over between the two in
        # case the scaler keeps fitted state (NOTE(review): confirm whether
        # Norm1Scaler/Norm3Scaler are stateful).
        cv_scaler = scaler_cls()
        cv_metrics = perform_cross_validation(
            model=PLSRegression(**pls_params),
            preprocess_fn=get_preprocess_fn(target_col=target, drop_cols=drop_cols, preprocessor=cv_scaler),
            folds=folds,
            metric_fns=[rmse_metric, std_dev_metric],
        )
        mlflow.log_metrics(get_cross_validation_metrics(cv_metrics).as_dict())

        # == Training ==
        scaler = scaler_cls()
        preprocess_fn = get_preprocess_fn(target_col=target, drop_cols=drop_cols, preprocessor=scaler)
        X_train, y_train, X_test, y_test = preprocess_fn(train, test)

        model = PLSRegression(**pls_params)
        model.fit(X_train, y_train)
        models.append(model)

        # == Evaluation on the test split returned by the preprocessing step ==
        pred = model.predict(X_test)
        rmse = rmse_metric(y_test, pred)
        std_dev = std_dev_metric(y_test, pred)
        mlflow.log_metrics({"rmse": rmse, "std_dev": std_dev})

        # Model artifacts are not logged to MLflow; the fitted estimators are
        # only kept in memory in `models`.

  from .autonotebook import tqdm as notebook_tqdm
2024/05/17 16:02:07 INFO mlflow.tracking.fluent: Experiment with name 'PLS_Norm3_20240517-160207' does not exist. Creating a new experiment.


Running fold 1 with size: 1538 train and 390 test
Running fold 2 with size: 1523 train and 405 test
Running fold 3 with size: 1533 train and 395 test
Running fold 4 with size: 1534 train and 394 test
Running fold 1 with size: 1543 train and 385 test
Running fold 2 with size: 1529 train and 399 test
Running fold 3 with size: 1523 train and 405 test
Running fold 4 with size: 1529 train and 399 test
Running fold 1 with size: 1523 train and 405 test
Running fold 2 with size: 1509 train and 419 test
Running fold 3 with size: 1539 train and 389 test
Running fold 4 with size: 1558 train and 370 test
Running fold 1 with size: 1524 train and 404 test
Running fold 2 with size: 1533 train and 395 test
Running fold 3 with size: 1534 train and 394 test
Running fold 4 with size: 1538 train and 390 test
Running fold 1 with size: 1518 train and 410 test
Running fold 2 with size: 1543 train and 385 test
Running fold 3 with size: 1549 train and 379 test
Running fold 4 with size: 1518 train and 410 test
