In [7]:
import logging

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from pathlib import Path

import sys
import os

# to allow importing `reproduction` from parent directory
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath("./train.ipynb"))))

from config import logger  # noqa: E402
from data_handling import CustomSpectralPipeline, load_data  # type: ignore  # noqa: E402
from reproduction import (  # noqa: E402
    major_oxides,
    masks,
    oxide_ranges,
    paper_individual_sm_rmses,
)  # noqa: E402
from utils import (  # noqa: E402
    custom_kfold_cross_validation,
    custom_train_test_split,
    filter_data_by_compositional_range,
)


# --- Run configuration -------------------------------------------------------
# Input locations, given relative to the notebook's working directory.
# `take_samples` is forwarded to `load_data` as its second argument;
# None presumably means "no sub-sampling" — TODO confirm against load_data.
take_samples = None
calib_loc = Path("../data/data/calib/ccam_calibration_compositions.csv")
dataset_loc = Path("../data/data/calib/calib_2015/1600mm/pls/")

# Send every MLflow tracking call made by this notebook to the local server.
mlflow.set_tracking_uri("http://localhost:5000")

In [4]:
# Read the calibration spectra from disk. `load_data` receives the directory as
# a plain string, and `take_samples` is passed through unchanged.
source_dir = str(dataset_loc)
logger.info("Loading data from location: %s", source_dir)
data = load_data(source_dir, take_samples)
logger.info("Data loaded successfully.")

Loading data from location: ../data/data/calib/calib_2015/1600mm/pls


Loading data: 100%|██████████| 414/414 [01:40<00:00,  4.13it/s]

Data loaded successfully.





In [5]:
# Configure the preprocessing pipeline with the masks, the composition table
# location and the list of oxides, then run it over the loaded data.
logger.info("Initializing CustomSpectralPipeline.")
pipeline_options = dict(
    masks=masks,
    composition_data_loc=calib_loc,
    major_oxides=major_oxides,
)
pipeline = CustomSpectralPipeline(**pipeline_options)

logger.info("Pipeline initialized. Fitting and transforming data.")
processed_data = pipeline.fit_transform(data)
logger.info("Data processing complete.")

Initializing CustomSpectralPipeline.
Pipeline initialized. Fitting and transforming data.


Transforming samples: 100%|██████████| 414/414 [00:29<00:00, 13.94it/s]


Data processing complete.


In [11]:
# Name of the MLflow experiment that groups every run started by this notebook.
# The same string is reused in the run names and in the influence-plot output
# directory of the training cell.
experiment_name = "PLS_Models_05-12-23_3"
# Make this the active experiment. The call is deliberately left as the last
# expression so the returned Experiment object is shown as the cell output.
mlflow.set_experiment(experiment_name)

2023/12/05 13:37:47 INFO mlflow.tracking.fluent: Experiment with name 'PLS_Models_05-12-23_3' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/203198903230460690', creation_time=1701779867540, experiment_id='203198903230460690', last_update_time=1701779867540, lifecycle_stage='active', name='PLS_Models_05-12-23_3', tags={}>

In [12]:
# -----------------------------------------------------------------------------
# Train one PLS sub-model per (oxide, compositional range) and track it in MLflow.
#
# For every oxide that has an entry in `oxide_ranges`, and for every
# compositional range defined for it, this cell:
#   1. filters the processed data to that range and splits it into train/test
#      (grouped by "Sample Name"),
#   2. runs the custom k-fold cross-validation on the training split, fitting a
#      PLSRegression per fold and recording the fold RMSE,
#   3. logs parameters, metrics and the best-scoring fold model to MLflow,
#   4. refits PLS on the whole training split and saves/logs an influence plot
#      (leverage vs. Q residuals) intended for manual outlier inspection.
#
# NOTE: the held-out `test` split is created but not evaluated in this cell.
# -----------------------------------------------------------------------------
k_folds = 5
random_state = 42
n_components = 25  # paper said 20-30 TODO: grid search this
influence_plot_dir = Path("plots/")

# Columns that are not spectral features (the oxide targets and the sample id).
# Hoisted out of the loops because it never changes.
non_feature_columns = major_oxides + ["Sample Name"]

for oxide in tqdm(major_oxides, desc="Processing oxides"):
    _oxide_ranges = oxide_ranges.get(oxide, None)
    if _oxide_ranges is None:
        logger.info("Skipping oxide: %s", oxide)
        continue

    for compositional_range in _oxide_ranges.keys():
        logger.debug(
            "Starting MLflow run for compositional range: %s, oxide: %s",
            compositional_range,
            oxide,
        )
        with mlflow.start_run(
            run_name=f"{experiment_name}_{compositional_range}_{oxide}",
        ):
            best_model = None
            best_rmse = float("inf")
            mlflow.log_param("n_components", n_components)
            mlflow.log_param("random_state", random_state)
            logger.info("Filtering data by compositional range.")
            data_filtered = filter_data_by_compositional_range(
                processed_data, compositional_range, oxide, oxide_ranges
            )

            train, test = custom_train_test_split(
                data_filtered,
                group_by="Sample Name",
                test_size=0.2,
                random_state=random_state,
            )

            logger.info("Performing custom k-fold cross-validation.")
            kf = custom_kfold_cross_validation(
                train,
                k=k_folds,
                group_by="Sample Name",
                random_state=random_state,
            )

            # ----- Cross-validation ----- #

            fold_rmse = []
            for train_data, test_data in kf:
                logger.debug("Defining PLSRegression model.")
                pls = PLSRegression(n_components=n_components)

                logger.debug("Extracting features and target for training.")
                X_train = train_data.drop(columns=non_feature_columns)
                y_train = train_data[oxide]
                logger.debug("Extracting features and target for testing.")
                X_test = test_data.drop(columns=non_feature_columns)
                y_test = test_data[oxide]

                logger.debug("Training the model.")
                pls.fit(X_train, y_train)
                logger.debug("Model training complete.")

                logger.debug("Predicting on test data.")
                y_pred = pls.predict(X_test)
                # sqrt(MSE) instead of `squared=False`: that keyword is
                # deprecated in scikit-learn 1.4 and removed in 1.6, while this
                # form gives the same single-target RMSE on every version.
                rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
                fold_rmse.append(rmse)
                logger.debug("Fold RMSE: %f", rmse)

                if rmse < best_rmse:
                    best_rmse = rmse
                    best_model = pls

            if not fold_rmse:
                # Without this guard the run would log avg_rmse=0 and try to
                # register `None` as a model.
                raise RuntimeError(
                    f"Cross-validation produced no folds for oxide {oxide!r}, "
                    f"compositional range {compositional_range!r}."
                )

            # Average over the folds that were actually evaluated rather than
            # the requested `k_folds`, so the mean stays correct if the
            # splitter yields a different number of folds.
            avg_rmse = sum(fold_rmse) / len(fold_rmse)

            logger.debug("Logging parameters, metrics, and model to MLflow.")
            mlflow.log_params(
                {
                    "masks": masks,
                    "range": oxide_ranges[oxide][compositional_range],
                    "k_folds": k_folds,
                    "compositional_range": compositional_range,
                    "oxide": oxide,
                    "n_spectra": len(train),
                }
            )

            metrics = {
                "avg_rmse": float(avg_rmse),
                "best_rmse": float(best_rmse),
                "paper_rmse": paper_individual_sm_rmses[compositional_range][oxide],
            } | {
                f"fold_{i}_rmse": float(rmse)
                for i, rmse in enumerate(fold_rmse, start=1)
            }

            mlflow.log_metrics(metrics)
            mlflow.sklearn.log_model(
                best_model,
                "model",
                registered_model_name=f"{oxide}_{compositional_range}",
            )

            # ----- Influence Plots for Outlier Removal ----- #

            # Refit on the whole training split (not a single fold).
            pls = PLSRegression(n_components=n_components)
            train_data = train.drop(columns=non_feature_columns)
            X_train = train_data.to_numpy()
            Y_train = train[oxide].to_numpy()
            pls.fit(X_train, Y_train)

            # Leverage: diagonal of the hat matrix T (T'T)^-1 T'. Only the
            # diagonal is computed, so no n_rows x n_rows matrix is built, and
            # `solve` avoids forming the explicit inverse.
            t = pls.x_scores_
            leverage = np.sum(t * np.linalg.solve(t.T @ t, t.T).T, axis=1)

            # Q residuals: squared reconstruction error per spectrum.
            # PLSRegression centres (and by default scales) X before extracting
            # scores, so `t @ x_loadings_.T` lives in that standardized space.
            # `inverse_transform` maps the reconstruction back to the original
            # units so it can be compared with the raw `X_train`.
            X_reconstructed = pls.inverse_transform(t)
            residuals = X_train - X_reconstructed
            Q = np.sum(residuals**2, axis=1)

            # Plotting the influence plot
            fig, ax = plt.subplots()
            ax.scatter(leverage, Q)
            ax.set_xlabel("Leverage")
            ax.set_ylabel("Residuals")
            ax.set_title("Influence Plot")
            plot_path = (
                influence_plot_dir
                / experiment_name
                / f"{oxide}_{compositional_range}_ip.png"
            )
            plot_path.parent.mkdir(parents=True, exist_ok=True)
            fig.savefig(plot_path)
            plt.close(fig)  # one figure per run: release it explicitly

            mlflow.log_artifact(str(plot_path))

            # # Identify outliers (this step is more qualitative and depends on your specific dataset)
            # outliers = identify_outliers(
            #     leverage, Q
            # )  # Implement this function based on your criteria

            # # Remove outliers and repeat the process
            # X_train = np.delete(X_train, outliers, axis=0)
            # Y_train = np.delete(Y_train, outliers, axis=0)

            logger.debug(
                "Compositional Range: %s, Oxide: %s, Average RMSE: %f",
                compositional_range,
                oxide,
                avg_rmse,
            )

Processing oxides:   0%|          | 0/8 [00:00<?, ?it/s]

Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'SiO2_Full' already exists. Creating a new version of this model...
2023/12/05 13:38:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SiO2_Full, version 7
Created version '7' of model 'SiO2_Full'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'SiO2_Low' already exists. Creating a new version of this model...
2023/12/05 13:38:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SiO2_Low, version 6
Created version '6' of model 'SiO2_Low'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'SiO2_Mid' already exists. Creating a new version of this model...
2023/12/05 13:38:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SiO2_Mid, version 6
Created version '6' of model 'SiO2_Mid'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'SiO2_High' already exists. Creating a new version of this model...
2023/12/05 13:39:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SiO2_High, version 6
Created version '6' of model 'SiO2_High'.
Processing oxides:  12%|█▎        | 1/8 [01:19<09:13, 79.03s/it]

Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'TiO2_Full' already exists. Creating a new version of this model...
2023/12/05 13:39:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: TiO2_Full, version 5
Created version '5' of model 'TiO2_Full'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'TiO2_Low' already exists. Creating a new version of this model...
2023/12/05 13:39:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: TiO2_Low, version 5
Created version '5' of model 'TiO2_Low'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'TiO2_Mid' already exists. Creating a new version of this model...
2023/12/05 13:40:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: TiO2_Mid, version 5
Created version '5' of model 'TiO2_Mid'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'TiO2_High' already exists. Creating a new version of this model...
2023/12/05 13:40:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: TiO2_High, version 5
Created version '5' of model 'TiO2_High'.
Processing oxides:  25%|██▌       | 2/8 [02:21<06:54, 69.12s/it]

Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'Al2O3_Full' already exists. Creating a new version of this model...
2023/12/05 13:40:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Al2O3_Full, version 5
Created version '5' of model 'Al2O3_Full'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'Al2O3_Low' already exists. Creating a new version of this model...
2023/12/05 13:40:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Al2O3_Low, version 5
Created version '5' of model 'Al2O3_Low'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'Al2O3_Mid' already exists. Creating a new version of this model...
2023/12/05 13:41:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Al2O3_Mid, version 5
Created version '5' of model 'Al2O3_Mid'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'Al2O3_High' already exists. Creating a new version of this model...
2023/12/05 13:41:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Al2O3_High, version 5
Created version '5' of model 'Al2O3_High'.
Processing oxides:  38%|███▊      | 3/8 [03:27<05:39, 67.84s/it]

Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'FeOT_Full' already exists. Creating a new version of this model...
2023/12/05 13:41:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FeOT_Full, version 5
Created version '5' of model 'FeOT_Full'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'FeOT_Low' already exists. Creating a new version of this model...
2023/12/05 13:42:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FeOT_Low, version 5
Created version '5' of model 'FeOT_Low'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'FeOT_Mid' already exists. Creating a new version of this model...
2023/12/05 13:42:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FeOT_Mid, version 5
Created version '5' of model 'FeOT_Mid'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'FeOT_High' already exists. Creating a new version of this model...
2023/12/05 13:42:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FeOT_High, version 5
Created version '5' of model 'FeOT_High'.
Processing oxides:  50%|█████     | 4/8 [04:49<04:53, 73.25s/it]

Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'MgO_Full' already exists. Creating a new version of this model...
2023/12/05 13:43:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MgO_Full, version 5
Created version '5' of model 'MgO_Full'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'MgO_Low' already exists. Creating a new version of this model...
2023/12/05 13:43:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MgO_Low, version 5
Created version '5' of model 'MgO_Low'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'MgO_Mid' already exists. Creating a new version of this model...
2023/12/05 13:43:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MgO_Mid, version 5
Created version '5' of model 'MgO_Mid'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'MgO_High' already exists. Creating a new version of this model...
2023/12/05 13:43:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MgO_High, version 5
Created version '5' of model 'MgO_High'.
Processing oxides:  62%|██████▎   | 5/8 [06:06<03:44, 74.72s/it]

Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'CaO_Full' already exists. Creating a new version of this model...
2023/12/05 13:44:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CaO_Full, version 5
Created version '5' of model 'CaO_Full'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'CaO_Low' already exists. Creating a new version of this model...
2023/12/05 13:44:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CaO_Low, version 5
Created version '5' of model 'CaO_Low'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'CaO_Mid' already exists. Creating a new version of this model...
2023/12/05 13:45:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CaO_Mid, version 5
Created version '5' of model 'CaO_Mid'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'CaO_High' already exists. Creating a new version of this model...
2023/12/05 13:45:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CaO_High, version 5
Created version '5' of model 'CaO_High'.
Processing oxides:  75%|███████▌  | 6/8 [07:28<02:34, 77.19s/it]

Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'Na2O_Full' already exists. Creating a new version of this model...
2023/12/05 13:45:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Na2O_Full, version 5
Created version '5' of model 'Na2O_Full'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'Na2O_Low' already exists. Creating a new version of this model...
2023/12/05 13:46:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Na2O_Low, version 5
Created version '5' of model 'Na2O_Low'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'Na2O_High' already exists. Creating a new version of this model...
2023/12/05 13:46:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Na2O_High, version 5
Created version '5' of model 'Na2O_High'.
Processing oxides:  88%|████████▊ | 7/8 [08:24<01:10, 70.19s/it]

Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'K2O_Full' already exists. Creating a new version of this model...
2023/12/05 13:46:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: K2O_Full, version 5
Created version '5' of model 'K2O_Full'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'K2O_Low' already exists. Creating a new version of this model...
2023/12/05 13:46:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: K2O_Low, version 5
Created version '5' of model 'K2O_Low'.


Filtering data by compositional range.
Performing custom k-fold cross-validation.


Registered model 'K2O_High' already exists. Creating a new version of this model...
2023/12/05 13:47:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: K2O_High, version 5
Created version '5' of model 'K2O_High'.
Processing oxides: 100%|██████████| 8/8 [09:23<00:00, 70.41s/it]


In [7]:
# Close any MLflow run that is still active. The training loop opens its runs
# with `with mlflow.start_run(...)`, so this presumably only matters after an
# interrupted loop — verify.
mlflow.end_run()