In [1]:
import logging
from pathlib import Path
from typing import Dict

import mlflow
import numpy as np
import pandas as pd
from dotenv import dotenv_values
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# from config import logger
from lib.data_handling import CustomSpectralPipeline, load_split_data  # type: ignore
from lib.norms import Norm1Scaler, Norm3Scaler
from lib.outlier_removal import (
    calculate_leverage_residuals,
    identify_outliers,
    plot_leverage_residuals,
)
from lib.reproduction import (
    major_oxides,
    masks,
    optimized_blending_ranges,
    oxide_ranges,
    paper_individual_sm_rmses,
    spectrometer_wavelength_ranges,
    training_info,
)
from lib.utils import custom_kfold_cross_validation, filter_data_by_compositional_range
from PLS_SM.inference import predict_composition_with_blending

env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")

if not comp_data_loc:
    print("Please set COMPOSITION_DATA_PATH in .env file")
    exit(1)

if not dataset_loc:
    print("Please set DATA_PATH in .env file")
    exit(1)

logger = logging.getLogger("train")

mlflow.set_tracking_uri("http://localhost:5000")

preformatted_data_path = Path("../data/_preformatted_sm/")
train_path = preformatted_data_path / "train.csv"
test_path = preformatted_data_path / "test.csv"

if (
    not preformatted_data_path.exists()
    or not train_path.exists()
    or not test_path.exists()
):
    take_samples = None

    logger.info("Loading data from location: %s", dataset_loc)
    # data = load_data(str(dataset_loc))
    train_data, test_data = load_split_data(
        str(dataset_loc), split_loc="../train_test_split.csv", average_shots=True
    )
    logger.info("Data loaded successfully.")

    logger.info("Initializing CustomSpectralPipeline.")
    pipeline = CustomSpectralPipeline(
        masks=masks,
        composition_data_loc=comp_data_loc,
        major_oxides=major_oxides,
    )
    logger.info("Pipeline initialized. Fitting and transforming data.")
    train_processed = pipeline.fit_transform(train_data)
    test_processed = pipeline.fit_transform(test_data)
    logger.info("Data processing complete.")

    preformatted_data_path.mkdir(parents=True, exist_ok=True)

    train_processed.to_csv(train_path, index=False)
    test_processed.to_csv(test_path, index=False)
else:
    logger.info("Loading preformatted data from location: %s", preformatted_data_path)
    train_processed = pd.read_csv(train_path)
    test_processed = pd.read_csv(test_path)

In [2]:
train_processed.shape

(1538, 5495)

In [3]:
from lib.reproduction import major_oxides

In [4]:
train_processed.head()

Unnamed: 0,246.688,246.741,246.79401,246.847,246.89999,246.953,247.007,247.06,247.11301,247.166,...,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name,ID
0,181627800000.0,162356700000.0,137014300000.0,111209300000.0,99880300000.0,106348600000.0,122620700000.0,138380200000.0,145977500000.0,146046700000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161336_ccs
1,200483900000.0,179342100000.0,152217400000.0,125571200000.0,113342800000.0,119450100000.0,136650600000.0,153770600000.0,161690600000.0,160635600000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161134_ccs
2,210447400000.0,188949800000.0,160787800000.0,131232400000.0,115454300000.0,119934300000.0,138120900000.0,156985500000.0,165394000000.0,163626200000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_162544_ccs
3,252546700000.0,227186900000.0,193114900000.0,156445400000.0,137305400000.0,143085600000.0,165532100000.0,188551800000.0,198612800000.0,195561700000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161514_ccs
4,217290700000.0,196582800000.0,169036700000.0,139365200000.0,125594800000.0,131780900000.0,150566600000.0,170207100000.0,178536100000.0,175534400000.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_160941_ccs


In [5]:
from sklearn.model_selection import train_test_split
from lib.utils import custom_train_test_split

train_cols = train_processed.columns
test_cols = test_processed.columns

scaler = (
    Norm1Scaler(reshaped=True)
    # if norm == 1
    # else 
    # Norm3Scaler(spectrometer_wavelength_ranges, reshaped=True)
)

train = scaler.fit_transform(train_processed)
test = scaler.fit_transform(test_processed)

# turn back into dataframe
train = pd.DataFrame(train, columns=train_cols)
test = pd.DataFrame(test, columns=test_cols)

drop_cols = major_oxides + ["ID", "Sample Name"]

## - VALIDATION -
split_train, split_val = custom_train_test_split(train, "Sample Name", test_size=0.2, random_state=42)

X_train = split_train.drop(columns=drop_cols)
y_train = split_train[major_oxides]
X_val = split_val.drop(columns=drop_cols)
y_val = split_val[major_oxides]

# Converting train set - comment out if using validation
# X_train = train.drop(columns=drop_cols)
# y_train = train[major_oxides]


# Converting test set
X_test = test.drop(columns=drop_cols)
y_test = test[major_oxides]

In [8]:
import xgboost as xgb
import pandas as pd

mlflow.set_experiment('XGBoost_Norm1')

models = []
rmse_scores = []

xgb_params = {
    'max_depth': 4,        # Slightly deeper trees since data is high-dimensional
    'min_child_weight': 5, # Higher to control over-fitting
    'gamma': 0.1,          # Minimum loss reduction required to make further partition
    'subsample': 0.7,      # Subsample ratio of the training instances
    'colsample_bytree': 0.5,   # Subsample ratio of columns when constructing each tree
    'colsample_bylevel': 0.5,  # Subsample ratio of columns for each level
    'colsample_bynode': 0.5,   # Subsample ratio of columns for each split
    'lambda': 1,           # L2 regularization term on weights (lambda)
    'alpha': 0.5,          # L1 regularization term on weights (alpha)
    'learning_rate': 0.05, # Step size shrinkage used in update to prevent overfitting
    'n_estimators': 100,   # Number of boosting rounds
    'objective': 'reg:squarederror', # Regression with squared loss
    'eval_metric': 'rmse'  # Evaluation metric for validation data
}

# Iterate over each target variable
for target in y_train.columns:
    with mlflow.start_run(run_name=f"XGB_{target}"):
        dtrain = xgb.DMatrix(X_train, label=y_train[target])

        # Log hyperparameters
        for param_key, param_value in xgb_params.items():
            mlflow.log_param(param_key, param_value)

        # Train the model
        bst = xgb.train(xgb_params, dtrain, num_boost_round=xgb_params['n_estimators'])
        models.append(bst)

        # (Optional) Log additional metrics or artifacts as needed
        # For example, logging the RMSE on a validation set (if you have one)
        dval = xgb.DMatrix(X_val, label=y_val[target])
        val_pred = bst.predict(dval)
        rmse_val = np.sqrt(mean_squared_error(y_val[target], val_pred))
        mlflow.log_metric("rmse_val", float(rmse_val))

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.



In [9]:
# To predict, use each model and combine results
predictions = pd.DataFrame()
for i, target in enumerate(y_test.columns):
    dtest = xgb.DMatrix(X_test)
    pred = models[i].predict(dtest)
    predictions[target] = pred

In [16]:
rmse_scores = pd.DataFrame()
for target in y_test.columns:
    rmse = np.sqrt(mean_squared_error(y_test[target], predictions[target]))
    rmse_scores[target] = [rmse]

rmse_scores.set_index([['XGB']], inplace=True) # type: ignore

rmse_scores.T

Unnamed: 0,XGB
SiO2,5.082647
TiO2,0.439449
Al2O3,2.055644
FeOT,5.019102
MgO,1.073851
CaO,1.216045
Na2O,0.527934
K2O,0.571002
