In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the project's `lib` and `experiments` packages imported in this
# notebook) are reloaded before each cell executes, without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import mlflow
import datetime

from lib.reproduction import major_oxides
from lib.norms import Norm1Scaler, Norm3Scaler
from sklearn.svm import SVR
from lib.get_preprocess_fn import get_preprocess_fn
from lib.cross_validation import get_cross_validation_metrics, perform_cross_validation
from lib.metrics import rmse_metric, std_dev_metric
from lib.cross_validation import StratifiedGroupKFoldSplit
from experiments.optuna_run import get_data

In [3]:
# Normalisation variant for this notebook; it is used elsewhere in the
# notebook to choose the scaler and to name the MLflow experiment.
norm = 3

# Columns passed to the preprocessing function as `drop_cols`:
# every major-oxide column plus the two identifier columns.
drop_cols = [*major_oxides, "ID", "Sample Name"]

In [5]:
# Fetch the folds plus the train/test partitions for the SiO2 target.
folds, train, test = get_data("SiO2")

# Sanity check: no "Sample Name" value may appear in both the train and the
# test partition. Any shared name aborts the cell with the offending names.
overlap = set(train["Sample Name"]) & set(test["Sample Name"])
if len(overlap) > 0:
    raise ValueError(f"Overlap detected ({len(overlap)}) between train and test on 'Sample Name': {overlap}")


No overlap between folds detected (0 samples)


ValueError: Overlap detected (69) between train and test on 'Sample Name': {'ultramafic', '152769', 'p11mt', '81', 'g8mt', 'jsc1385', 'mix5b', 'p7mt', 'lp11', 'jsc1406', '54951m', 'pg4', 'sancb', 'jb3', 'jsc1460', 'dkm390', 'g19mt', 'bwqc1', 'jsc1422', 'g21mt', 'dkm2190', 'moppin', 'jsc1409', 'sc3', 'gl3mt', 'gl4mt', 'g10mt', 'mc', '57a', 'vs211681', 'mix1b', 'lp30', 'jsc1405', 'm1mt', 'jsc1438', 'hvalfj011', 'icel009010', 'sc7', 'mix4b', 'jsc1381', 'ao6', 'jsc1428', 'p10mt', 'vzo106', 'ben', 'jsc1447', 'mi1', 'p2mt', 's4', 'lp4', 'jsc1393', 'gbw07104', '42877m', 'jsc1434', 'r62', 'mix2b', 'jsc1390', 'm2mt', 'jsc1374', 'gbw07311', 'jsc1468', 'ja1', 'jsc1458', 'r45', '54943bm', 'unszk', 'jsc1432', 'ja2', 'p1mt'}

In [8]:
# Fitted final models, appended in the iteration order of `major_oxides`.
models = []

# Hyperparameters shared by every SVR instance created in this cell.
svr_params = {"kernel": "poly", "C": 100, "epsilon": 0.1, "gamma": "scale", "degree": 2, "coef0": 1.0}

# Resolve the scaler through an explicit table instead of an `if/else`
# fallback: previously any `norm` other than 1 silently selected Norm3Scaler
# while the experiment name and the logged "norm" param recorded the wrong value.
scaler_classes = {1: Norm1Scaler, 3: Norm3Scaler}
if norm not in scaler_classes:
    raise ValueError(f"Unsupported norm {norm!r}; expected one of {sorted(scaler_classes)}")

# One timestamped experiment per execution of this cell; validation above runs
# first so that an invalid configuration does not create an empty experiment.
mlflow.set_experiment(f'SVR_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')


for target in major_oxides:
    folds, train, test = get_data(target)
    with mlflow.start_run(run_name=f"SVR_{target}"):
        # Log the configuration first so that a run which fails during
        # cross-validation or training still records what was attempted.
        mlflow.log_params({**svr_params, "target": target, "norm": norm})

        # == Cross Validation ==
        scaler = scaler_classes[norm]()

        cv_metrics = perform_cross_validation(
            model=SVR(**svr_params),
            preprocess_fn=get_preprocess_fn(target_col=target, drop_cols=drop_cols, preprocessor=scaler),
            folds=folds,
            metric_fns=[rmse_metric, std_dev_metric],
        )

        mlflow.log_metrics(get_cross_validation_metrics(cv_metrics).as_dict())

        # == Training ==
        # NOTE(review): the same `scaler` instance is reused from the
        # cross-validation step above — confirm it carries no fitted state
        # that could leak into the final train/test preprocessing.
        preprocess_fn = get_preprocess_fn(target_col=target, drop_cols=drop_cols, preprocessor=scaler)
        X_train, y_train, X_test, y_test = preprocess_fn(train, test)

        # Fit a fresh model on the full training partition and keep it.
        model = SVR(**svr_params)
        model.fit(X_train, y_train)
        models.append(model)

        # Evaluate on the held-out test partition.
        pred = model.predict(X_test)
        rmse = rmse_metric(y_test, pred)
        std_dev = std_dev_metric(y_test, pred)

        # Logging
        mlflow.log_metrics({"rmse": rmse, "std_dev": std_dev})
        # Model artifact logging is currently disabled:
        # mlflow.sklearn.log_model(model, f"model_{target}")

2024/05/17 15:00:54 INFO mlflow.tracking.fluent: Experiment with name 'SVR_Norm3_20240517-150054' does not exist. Creating a new experiment.


Running fold 1 with size: 1528 train and 400 test
Running fold 2 with size: 1558 train and 370 test
Running fold 3 with size: 1518 train and 410 test
Running fold 4 with size: 1524 train and 404 test
Running fold 1 with size: 1548 train and 380 test
Running fold 2 with size: 1529 train and 399 test
Running fold 3 with size: 1518 train and 410 test
Running fold 4 with size: 1529 train and 399 test
Running fold 1 with size: 1518 train and 410 test
Running fold 2 with size: 1534 train and 394 test
Running fold 3 with size: 1529 train and 399 test
Running fold 4 with size: 1548 train and 380 test
Running fold 1 with size: 1504 train and 424 test
Running fold 2 with size: 1548 train and 380 test
Running fold 3 with size: 1533 train and 395 test
Running fold 4 with size: 1544 train and 384 test
Running fold 1 with size: 1508 train and 420 test
Running fold 2 with size: 1533 train and 395 test
Running fold 3 with size: 1539 train and 389 test
Running fold 4 with size: 1548 train and 380 test
