In [1]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from lib import full_flow_dataloader
import os

from lib.norms import Norm1Scaler, Norm3Scaler


os.chdir(os.path.dirname(os.getcwd()))

train_processed, test_processed = full_flow_dataloader.load_full_flow_data()

In [2]:
train_processed.shape

(1538, 6154)

In [3]:
from lib.reproduction import major_oxides

In [4]:
train_processed.head()

Unnamed: 0,240.811,240.86501,240.918,240.972,241.02699,241.07899,241.133,241.188,241.24001,241.29401,...,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name,ID
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161336_ccs
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161134_ccs
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_162544_ccs
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_161514_ccs
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.35,0.3,9.95,2.18,1.0,1.2,2.75,1.84,201426,201426_2013_11_06_160941_ccs


In [5]:
from sklearn.model_selection import train_test_split
from lib.utils import custom_train_test_split

train_cols = train_processed.columns
test_cols = test_processed.columns
norm = 3

scaler = Norm1Scaler() if norm == 1 else Norm3Scaler()

train = scaler.fit_transform(train_processed)
test = scaler.fit_transform(test_processed)

# turn back into dataframe
train = pd.DataFrame(train, columns=train_cols)
test = pd.DataFrame(test, columns=test_cols)

drop_cols = major_oxides + ["ID", "Sample Name"]

## - VALIDATION -
# split_train, split_val = custom_train_test_split(train, "Sample Name", test_size=0.2, random_state=42)

# X_train = split_train.drop(columns=drop_cols)
# y_train = split_train[major_oxides]
# X_val = split_val.drop(columns=drop_cols)
# y_val = split_val[major_oxides]

# Converting train set - comment out if using validation
X_train = train.drop(columns=drop_cols)
y_train = train[major_oxides]

# Converting test set
X_test = test.drop(columns=drop_cols)
y_test = test[major_oxides]

In [6]:
import datetime
import xgboost as xgb
import pandas as pd

mlflow.set_experiment(f'XGBoost_Norm{norm}_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}')

models = []
rmse_scores = []

xgb_params = {
    "max_depth": 4,  # Slightly deeper trees since data is high-dimensional
    "min_child_weight": 5,  # Higher to control over-fitting
    "gamma": 0.1,  # Minimum loss reduction required to make further partition
    "subsample": 0.7,  # Subsample ratio of the training instances
    "colsample_bytree": 0.5,  # Subsample ratio of columns when constructing each tree
    "colsample_bylevel": 0.5,  # Subsample ratio of columns for each level
    "colsample_bynode": 0.5,  # Subsample ratio of columns for each split
    "lambda": 1,  # L2 regularization term on weights (lambda)
    "alpha": 0.5,  # L1 regularization term on weights (alpha)
    "learning_rate": 0.05,  # Step size shrinkage used in update to prevent overfitting
    "n_estimators": 100,  # Number of boosting rounds
    "objective": "reg:squarederror",  # Regression with squared loss
    "eval_metric": "rmse",  # Evaluation metric for validation data
}

# Iterate over each target variable
for target in y_train.columns:
    with mlflow.start_run(run_name=f"XGB_{target}"):
        dtrain = xgb.DMatrix(X_train, label=y_train[target])

        # Log hyperparameters
        for param_key, param_value in xgb_params.items():
            mlflow.log_param(param_key, param_value)

        # Train the model
        bst = xgb.train(xgb_params, dtrain, num_boost_round=xgb_params["n_estimators"])
        models.append(bst)

        # (Optional) Log additional metrics or artifacts as needed
        # For example, logging the RMSE on a validation set (if you have one)
        # dval = xgb.DMatrix(X_val, label=y_val[target])
        # val_pred = bst.predict(dval)
        # rmse_val = np.sqrt(mean_squared_error(y_val[target], val_pred))
        # mlflow.log_metric("rmse_val", float(rmse_val))

        dtest = xgb.DMatrix(X_test)
        pred = bst.predict(dtest)
        rmse = np.sqrt(mean_squared_error(y_test[target], pred))
        mlflow.log_metric("rmse", float(rmse))

2024/01/26 16:40:36 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Norm3_20240126-164036' does not exist. Creating a new experiment.
Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

