In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import mlflow
import datetime

from lib.reproduction import major_oxides
from lib.norms import Norm1Scaler, Norm3Scaler
from sklearn.ensemble import ExtraTreesRegressor
from lib.get_preprocess_fn import get_preprocess_fn
from lib.cross_validation import get_cross_validation_metrics, perform_cross_validation
from lib.metrics import rmse_metric, std_dev_metric
from experiments.optuna_run import get_data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Normalisation variant; used below to choose the scaler and to label the
# MLflow experiment.
norm = 3

# Columns handed to the preprocessing step as `drop_cols`: every major-oxide
# column plus the two identifier columns.
drop_cols = major_oxides + [
    "ID",
    "Sample Name",
]

In [4]:
# Load the cross-validation folds and the train/test partitions for SiO2.
folds, train, test = get_data("SiO2")

# Leakage guard: a sample name must never appear in both the train and the
# test partition.
overlap = set(train["Sample Name"]) & set(test["Sample Name"])
if len(overlap) > 0:
    raise ValueError(f"Overlap detected ({len(overlap)}) between train and test on 'Sample Name': {overlap}")


In [5]:
# Train and evaluate one ExtraTreesRegressor per major oxide, tracking each
# target as its own MLflow run. Fitted models are collected in `models`, in the
# same order as `major_oxides`.
models = []

params = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 42,  # fixed seed for reproducible tree construction
}

# Resolve the scaler class up front and fail loudly on an unsupported value.
# Silently falling back to Norm3Scaler would produce an experiment whose name
# advertises a normalisation that was never applied.
scaler_classes = {1: Norm1Scaler, 3: Norm3Scaler}
if norm not in scaler_classes:
    raise ValueError(f"Unsupported norm {norm!r}; expected one of {sorted(scaler_classes)}")
scaler_cls = scaler_classes[norm]

# The timestamp in the name means every execution of this cell gets its own
# MLflow experiment.
mlflow.set_experiment(f'ExtraTrees_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')


for target in major_oxides:
    folds, train, test = get_data(target)

    # The data is loaded separately for every target, so the leakage guard has
    # to be applied to each split rather than to a single representative one.
    overlap = set(train["Sample Name"]) & set(test["Sample Name"])
    if overlap:
        raise ValueError(
            f"Overlap detected ({len(overlap)}) between train and test on 'Sample Name' "
            f"for target {target!r}: {overlap}"
        )

    with mlflow.start_run(run_name=f"ExtraTrees_{target}"):
        # Log the run configuration first, so that a run which fails part-way
        # is still identifiable, and include the model hyperparameters so the
        # run can be reproduced from the tracking data alone.
        mlflow.log_params({**params, "target": target, "norm": norm})

        # == Cross Validation ==
        scaler = scaler_cls()

        cv_metrics = perform_cross_validation(
            model=ExtraTreesRegressor(**params),
            preprocess_fn=get_preprocess_fn(target_col=target, drop_cols=drop_cols, preprocessor=scaler),
            folds=folds,
            metric_fns=[rmse_metric, std_dev_metric],
        )

        mlflow.log_metrics(get_cross_validation_metrics(cv_metrics).as_dict())

        # == Training ==
        preprocess_fn = get_preprocess_fn(target_col=target, drop_cols=drop_cols, preprocessor=scaler)
        X_train, y_train, X_test, y_test = preprocess_fn(train, test)

        # Fit a fresh model on the full training partition.
        model = ExtraTreesRegressor(**params)
        model.fit(X_train, y_train)
        models.append(model)

        # Evaluate on the held-out test partition.
        pred = model.predict(X_test)
        rmse = rmse_metric(y_test, pred)
        std_dev = std_dev_metric(y_test, pred)

        mlflow.log_metrics({"rmse": rmse, "std_dev": std_dev})
        # Model artifacts are not logged to MLflow; the fitted estimators are
        # only kept in memory in `models`.

2024/06/06 13:27:30 INFO mlflow.tracking.fluent: Experiment with name 'ExtraTrees_Norm3_20240606-132730' does not exist. Creating a new experiment.


Running fold 1 with size: 1528 train and 400 test
Running fold 2 with size: 1543 train and 385 test
Running fold 3 with size: 1533 train and 395 test
Running fold 4 with size: 1524 train and 404 test
Running fold 1 with size: 1538 train and 390 test
Running fold 2 with size: 1544 train and 384 test
Running fold 3 with size: 1528 train and 400 test
Running fold 4 with size: 1514 train and 414 test
Running fold 1 with size: 1528 train and 400 test
Running fold 2 with size: 1544 train and 384 test
Running fold 3 with size: 1534 train and 394 test
Running fold 4 with size: 1523 train and 405 test
Running fold 1 with size: 1529 train and 399 test
Running fold 2 with size: 1543 train and 385 test
Running fold 3 with size: 1533 train and 395 test
Running fold 4 with size: 1524 train and 404 test
Running fold 1 with size: 1528 train and 400 test
Running fold 2 with size: 1543 train and 385 test
Running fold 3 with size: 1534 train and 394 test
Running fold 4 with size: 1523 train and 405 test
