In [36]:
import logging
from pathlib import Path
from typing import Dict

import mlflow
import numpy as np
import pandas as pd
from dotenv import dotenv_values
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# from config import logger
from lib.data_handling import CustomSpectralPipeline, load_split_data  # type: ignore
from lib.norms import Norm1Scaler, Norm3Scaler
from lib.outlier_removal import (
    calculate_leverage_residuals,
    identify_outliers,
    plot_leverage_residuals,
)
from lib.reproduction import (
    major_oxides,
    masks,
    optimized_blending_ranges,
    oxide_ranges,
    paper_individual_sm_rmses,
    spectrometer_wavelength_ranges,
    training_info,
)
from lib.utils import custom_kfold_cross_validation, filter_data_by_compositional_range
from PLS_SM.inference import predict_composition_with_blending

env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")

if not comp_data_loc:
    print("Please set COMPOSITION_DATA_PATH in .env file")
    exit(1)

if not dataset_loc:
    print("Please set DATA_PATH in .env file")
    exit(1)

logger = logging.getLogger("train")

mlflow.set_tracking_uri("http://localhost:5000")

preformatted_data_path = Path("../data/_preformatted_sm/")
train_path = preformatted_data_path / "train.csv"
test_path = preformatted_data_path / "test.csv"

if (
    not preformatted_data_path.exists()
    or not train_path.exists()
    or not test_path.exists()
):
    take_samples = None

    logger.info("Loading data from location: %s", dataset_loc)
    # data = load_data(str(dataset_loc))
    train_data, test_data = load_split_data(
        str(dataset_loc), split_loc="../train_test_split.csv", average_shots=True
    )
    logger.info("Data loaded successfully.")

    logger.info("Initializing CustomSpectralPipeline.")
    pipeline = CustomSpectralPipeline(
        masks=masks,
        composition_data_loc=comp_data_loc,
        major_oxides=major_oxides,
    )
    logger.info("Pipeline initialized. Fitting and transforming data.")
    train_processed = pipeline.fit_transform(train_data)
    test_processed = pipeline.fit_transform(test_data)
    logger.info("Data processing complete.")

    preformatted_data_path.mkdir(parents=True, exist_ok=True)

    train_processed.to_csv(train_path, index=False)
    test_processed.to_csv(test_path, index=False)
else:
    logger.info("Loading preformatted data from location: %s", preformatted_data_path)
    train_processed = pd.read_csv(train_path)
    test_processed = pd.read_csv(test_path)

In [37]:
train_processed.shape

(1538, 6154)

In [38]:
from lib.reproduction import major_oxides

In [39]:
train_processed.head()

Unnamed: 0,240.811,240.86501,240.918,240.972,241.02699,241.07899,241.133,241.188,241.24001,241.29401,...,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name,ID
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161336_ccs
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161134_ccs
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_162544_ccs
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161514_ccs
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_160941_ccs


In [40]:
from sklearn.model_selection import train_test_split
from lib.utils import custom_train_test_split

train_cols = train_processed.columns
test_cols = test_processed.columns
norm = 3

scaler = (
    Norm1Scaler()
    if norm == 1
    else 
    Norm3Scaler()
)

train = scaler.fit_transform(train_processed)
test = scaler.fit_transform(test_processed)

# turn back into dataframe
train = pd.DataFrame(train, columns=train_cols)
test = pd.DataFrame(test, columns=test_cols)

drop_cols = major_oxides + ["ID", "Sample Name"]

## - VALIDATION -
# split_train, split_val = custom_train_test_split(train, "Sample Name", test_size=0.2, random_state=42)

# X_train = split_train.drop(columns=drop_cols)
# y_train = split_train[major_oxides]
# X_val = split_val.drop(columns=drop_cols)
# y_val = split_val[major_oxides]

# Converting train set - comment out if using validation
X_train = train.drop(columns=drop_cols)
y_train = train[major_oxides]

# Converting test set
X_test = test.drop(columns=drop_cols)
y_test = test[major_oxides]

In [41]:
import datetime
import xgboost as xgb
import pandas as pd

mlflow.set_experiment(f'XGBoost_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')

models = []
rmse_scores = []

xgb_params = {
    'max_depth': 4,        # Slightly deeper trees since data is high-dimensional
    'min_child_weight': 5, # Higher to control over-fitting
    'gamma': 0.1,          # Minimum loss reduction required to make further partition
    'subsample': 0.7,      # Subsample ratio of the training instances
    'colsample_bytree': 0.5,   # Subsample ratio of columns when constructing each tree
    'colsample_bylevel': 0.5,  # Subsample ratio of columns for each level
    'colsample_bynode': 0.5,   # Subsample ratio of columns for each split
    'lambda': 1,           # L2 regularization term on weights (lambda)
    'alpha': 0.5,          # L1 regularization term on weights (alpha)
    'learning_rate': 0.05, # Step size shrinkage used in update to prevent overfitting
    'n_estimators': 100,   # Number of boosting rounds
    'objective': 'reg:squarederror', # Regression with squared loss
    'eval_metric': 'rmse'  # Evaluation metric for validation data
}

# Iterate over each target variable
for target in y_train.columns:
    with mlflow.start_run(run_name=f"XGB_{target}"):
        dtrain = xgb.DMatrix(X_train, label=y_train[target])

        # Log hyperparameters
        for param_key, param_value in xgb_params.items():
            mlflow.log_param(param_key, param_value)

        # Train the model
        bst = xgb.train(xgb_params, dtrain, num_boost_round=xgb_params['n_estimators'])
        models.append(bst)

        # (Optional) Log additional metrics or artifacts as needed
        # For example, logging the RMSE on a validation set (if you have one)
        # dval = xgb.DMatrix(X_val, label=y_val[target])
        # val_pred = bst.predict(dval)
        # rmse_val = np.sqrt(mean_squared_error(y_val[target], val_pred))
        # mlflow.log_metric("rmse_val", float(rmse_val))

        dtest = xgb.DMatrix(X_test)
        pred = bst.predict(dtest)
        rmse = np.sqrt(mean_squared_error(y_test[target], pred))
        mlflow.log_metric("rmse", float(rmse))

2024/01/25 16:27:40 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Norm3_20240125-162740' does not exist. Creating a new experiment.
Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

