In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import mlflow
import datetime

from lib.reproduction import major_oxides
from lib.norms import Norm1Scaler, Norm3Scaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from lib.get_preprocess_fn import get_preprocess_fn
from lib.cross_validation import get_cross_validation_metrics, perform_cross_validation
from lib.metrics import rmse_metric, std_dev_metric
from experiments.optuna_run import get_data

In [None]:
# Columns handed to get_preprocess_fn as `drop_cols`: every major-oxide column
# plus the two sample-identifier columns.
drop_cols = [*major_oxides, "ID", "Sample Name"]

# Normalisation variant for the experiment: selects the scaler used in the
# training loop (1 -> Norm1Scaler, 3 -> Norm3Scaler) and is embedded in the
# MLflow experiment name.
norm = 3

In [None]:
# Load the SiO2 split and make sure no sample name appears in both the
# training and the test partition.
folds, train, test = get_data("SiO2")

train_names = set(train["Sample Name"])
test_names = set(test["Sample Name"])
overlap = train_names & test_names
if len(overlap) > 0:
    raise ValueError(f"Overlap detected ({len(overlap)}) between train and test on 'Sample Name': {overlap}")


In [None]:
# Fitted pipelines, appended in (target, degree) iteration order.
models = []

# Polynomial degrees to sweep. Only degree 1 (plain linear regression) is
# evaluated right now; widen the range, e.g. range(1, 5), to also try 2-4.
degrees = tuple(range(1, 2))

# Explicit norm -> scaler mapping. Failing fast on an unsupported value avoids
# silently running Norm3Scaler while the experiment name and the logged "norm"
# param claim a different normalisation.
scaler_classes = {1: Norm1Scaler, 3: Norm3Scaler}
if norm not in scaler_classes:
    raise ValueError(f"Unsupported norm {norm!r}; expected one of {sorted(scaler_classes)}")

mlflow.set_experiment(f'PolyLinearRegression_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')


for target in major_oxides:
    folds, train, test = get_data(target)
    with mlflow.start_run(run_name=f"PolyLinearRegression_{target}"):
        # Params are logged once per run, before the sweep: MLflow refuses to
        # overwrite a param with a different value inside the same run, so
        # logging "degree" on every iteration would raise as soon as more than
        # one degree is evaluated. Per-degree results are kept apart by `step`.
        run_params = {"target": target, "norm": norm}
        if len(degrees) == 1:
            run_params["degree"] = degrees[0]
        else:
            run_params["degrees"] = ",".join(str(d) for d in degrees)
        mlflow.log_params(run_params)

        # One scaler instance per target, shared by cross-validation and the final fit.
        scaler = scaler_classes[norm]()

        for degree in degrees:
            # == Cross Validation ==
            cv_metrics = perform_cross_validation(
                model=make_pipeline(PolynomialFeatures(degree), LinearRegression()),
                preprocess_fn=get_preprocess_fn(target_col=target, drop_cols=drop_cols, preprocessor=scaler),
                folds=folds,
                metric_fns=[rmse_metric, std_dev_metric],
            )

            mlflow.log_metrics(get_cross_validation_metrics(cv_metrics).as_dict(), step=degree)

            # == Training ==
            preprocess_fn = get_preprocess_fn(target_col=target, drop_cols=drop_cols, preprocessor=scaler)
            X_train, y_train, X_test, y_test = preprocess_fn(train, test)

            # Fit on the full training split and keep the fitted pipeline.
            model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
            model.fit(X_train, y_train)
            models.append(model)

            # Evaluate on the held-out test split.
            pred = model.predict(X_test)
            rmse = rmse_metric(y_test, pred)
            std_dev = std_dev_metric(y_test, pred)

            # Test-set metrics, indexed by polynomial degree via `step`.
            mlflow.log_metrics({"rmse": rmse, "std_dev": std_dev}, step=degree)
            # Model artifact logging is currently disabled:
            # mlflow.sklearn.log_model(model, f"model_{target}_degree_{degree}")
