In [1]:
import mlflow
import optuna
import sys
from pathlib import Path
import pandas as pd
import warnings
project_path = Path.cwd().parent
sys.path.append(project_path.as_posix())
from pipeline.p0_data_loader import DataLoader
from pipeline.p1_model_trainer import XGBoostTrainer
from pipeline.p2_optuna_hpo import objective
from pipeline.p3_model_evaluator import ModelEvaluator
from sklearn.model_selection import train_test_split
from loguru import logger

In [2]:
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Directory expected to hold the `.pqt` input files; stop here if it is missing.
data_path = Path.home().joinpath('data/craig_pfc_2023/step_2_cleaned')
assert data_path.exists()
# Show every `.pqt` file found directly inside that directory.
list(data_path.glob('*.pqt'))

[PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_phy.pqt'),
 PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_rrs.pqt'),
 PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_env.pqt'),
 PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_all.pqt')]

In [4]:
# Point the loader at the three input files inside data_path.
loader = DataLoader(
    data_path=data_path,
    rrs_file='df_rrs.pqt',
    phy_file='df_phy.pqt',
    env_file='df_env.pqt',
)
# load_data() returns three objects, unpacked here as dX, dX_env and dY.
dX, dX_env, dY = loader.load_data()

[32m2025-03-17 15:00:21.051[0m | [34m[1mDEBUG   [0m | [36mpipeline.p0_data_loader[0m:[36m__init__[0m:[36m18[0m - [34m[1mData directory set to /Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned[0m
[32m2025-03-17 15:00:21.051[0m | [34m[1mDEBUG   [0m | [36mpipeline.p0_data_loader[0m:[36m__init__[0m:[36m19[0m - [34m[1mRrs file used: df_rrs.pqt[0m
[32m2025-03-17 15:00:21.052[0m | [34m[1mDEBUG   [0m | [36mpipeline.p0_data_loader[0m:[36m__init__[0m:[36m20[0m - [34m[1mPhytoplankton file use df_phy.pqt[0m
[32m2025-03-17 15:00:21.052[0m | [34m[1mDEBUG   [0m | [36mpipeline.p0_data_loader[0m:[36m__init__[0m:[36m22[0m - [34m[1mEnv file used: df_env.pqt[0m


In [5]:
# Reduce features
# Keep every 10th column of dX and append the 'lat' and 'temp' columns of dX_env.
# NOTE: dX is rebound here, so re-running this cell alone would thin the columns
# again and append 'lat'/'temp' a second time; re-run from the load cell instead.
dX_env_sub = dX_env[['lat', 'temp']]
dX = pd.concat([dX.iloc[:, slice(None, None, 10)], dX_env_sub], axis=1)

In [6]:
# Subsample data for quicker debugging iterations
sample_size = 10000  # Use a smaller sample size for debugging
# Seed the draw so a Restart & Run All selects the same rows (and therefore the
# same metrics downstream). Cap the request at the number of available rows,
# because DataFrame.sample raises when n exceeds the population without replacement.
dX = dX.sample(n=min(sample_size, len(dX)), random_state=42)
# Pick the target rows that carry the same index labels as the sampled features.
dY = dY.loc[dX.index]

In [7]:
# Report the shapes of the subsampled feature and target frames.
logger.info(
    f"\nAfter subsampling: Features shape ={dX.shape},\nTargets shape ={dY.shape}"
)

[32m2025-03-17 15:00:24.810[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1m
After subsampling: Features shape =(10000, 53),
Targets shape =(10000, 7)[0m


In [8]:
# --- Step 2: Train/Test Split ---
# Hold out 20% of the rows; the fixed random_state keeps the partition repeatable.
split_parts = train_test_split(dX, dY, test_size=0.2, random_state=42)
dX_train, dX_test, dY_train, dY_test = split_parts
logger.info(
    f"\nTrain/Test split completed --> Train shape: {dX_train.shape}, Test shape: {dX_test.shape}"
)


[32m2025-03-17 15:00:24.818[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m
Train/Test split completed --> Train shape: (8000, 53), Test shape: (2000, 53)[0m


In [9]:
# --- Step 3: Train Initial Model & Evaluate ---
# Small, fixed hyperparameter set used only as a quick smoke test.
initial_params = dict(
    objective="reg:squarederror",
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100,
)
model_trainer = XGBoostTrainer(initial_params)
model = model_trainer.train_model(dX_train, dY_train)
logger.info("Initial model trained with basic hyperparameters.")

# Predict on the held-out rows, then score the predictions against the true
# targets; evaluate() returns four values, unpacked as MSE, R², MAE and RMSE.
preds = model.predict(dX_test)
evaluator = ModelEvaluator()
mse, r2, mae, rmse = evaluator.evaluate(dY_test, preds)


[32m2025-03-17 15:00:26.092[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mInitial model trained with basic hyperparameters.[0m


In [10]:
# Log the four metrics of the initial model, each formatted to three decimals.
logger.info("Initial Evaluation Results:")
metrics_line = f"MSE: {mse:.3f}, R2: {r2:.3f}, MAE: {mae:.3f}, RMSE: {rmse:.3f}"
logger.info(metrics_line)

[32m2025-03-17 15:00:26.132[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mInitial Evaluation Results:[0m
[32m2025-03-17 15:00:26.134[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mMSE: 0.002, R2: 0.757, MAE: 0.006, RMSE: 0.041[0m


In [11]:
# --- Step 4: Quick Hyperparameter Optimization Test using Optuna ---
# Run a small-scale hyperparameter optimization with 5 trials for debugging purposes.
with mlflow.start_run():
    # Seed the sampler explicitly: the default sampler is unseeded, so each run
    # would propose different hyperparameters and the reported "best" set could
    # not be reproduced. TPESampler is Optuna's default single-objective sampler,
    # so only the seeding changes, not the search algorithm.
    # NOTE(review): full reproducibility also requires `objective` itself to be
    # deterministic — confirm in pipeline.p2_optuna_hpo.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    # Each trial evaluates `objective` on the training split only; the test
    # split is not passed to the search.
    study.optimize(lambda trial: objective(trial, dX_train, dY_train), n_trials=5)
    best_params = study.best_trial.params
    print("Best hyperparameters from quick HPO test:", best_params)

logger.info("===  Basic Pipeline and Quick HPO Test Completed ===")

[I 2025-03-17 15:00:26,157] A new study created in memory with name: no-name-66811ffa-cf72-4710-af0e-7b9e02885f30
[I 2025-03-17 15:00:43,198] Trial 0 finished with value: 0.11714991469441438 and parameters: {'learning_rate': 0.003335253605086211, 'max_depth': 10, 'n_estimators': 237, 'subsample': 0.9663562244152417, 'colsample_bytree': 0.8693153033728531, 'gamma': 0.047656602310498085}. Best is trial 0 with value: 0.11714991469441438.
[I 2025-03-17 15:01:04,465] Trial 1 finished with value: 0.16739570240757748 and parameters: {'learning_rate': 0.0011373693253201329, 'max_depth': 7, 'n_estimators': 293, 'subsample': 0.8033356217905692, 'colsample_bytree': 0.7285081766863049, 'gamma': 3.218280590206628e-05}. Best is trial 0 with value: 0.11714991469441438.
[I 2025-03-17 15:01:12,335] Trial 2 finished with value: 0.03266205730100188 and parameters: {'learning_rate': 0.1775012165041938, 'max_depth': 3, 'n_estimators': 381, 'subsample': 0.5328384874271805, 'colsample_bytree': 0.624176510209

Best hyperparameters from quick HPO test: {'learning_rate': 0.12675939992184268, 'max_depth': 4, 'n_estimators': 294, 'subsample': 0.8036115207778418, 'colsample_bytree': 0.8510057935977824, 'gamma': 8.767235319443697e-06}


### Hyperparameter Explanation and Interpretation

1. **learning_rate**  
   - **What it does:** Controls the step size at each boosting iteration. A smaller value means the model learns more slowly but can yield a more robust model if combined with a larger number of estimators.
   - **Interpreting Values:**  
     - **High Value (closer to 0.3):** Faster learning; risk of overshooting the optimal solution, potentially leading to overfitting.
     - **Low Value (closer to 1e-3):** Slower learning; may require more estimators to converge, but can lead to better generalization.

2. **max_depth**  
   - **What it does:** Sets the maximum depth of each decision tree. This controls the complexity of the model.
   - **Interpreting Values:**  
     - **High Value (closer to 10):** Allows for deeper trees, capturing more complex patterns but with a higher risk of overfitting.
     - **Low Value (closer to 3):** Results in shallower trees, which may underfit if the data is complex, but generally increases model generalizability.

3. **n_estimators**  
   - **What it does:** Specifies the number of boosting rounds (i.e., trees) to build.
   - **Interpreting Values:**  
     - **High Value:** More trees can lead to better performance on training data, but might also cause overfitting if not regulated by other parameters.
     - **Low Value:** Fewer trees can lead to faster training and less overfitting, but might not capture enough complexity in the data.

4. **subsample**  
   - **What it does:** Represents the fraction of samples used for fitting each individual tree.
   - **Interpreting Values:**  
     - **High Value (closer to 1.0):** Uses most of the data for each tree, which can increase accuracy but may also increase overfitting.
     - **Low Value (closer to 0.5):** Uses fewer samples per tree, introducing randomness that can reduce overfitting but might also lead to underfitting if too low.

5. **colsample_bytree**  
   - **What it does:** Specifies the fraction of features (columns) used when building each tree.
   - **Interpreting Values:**  
     - **High Value (closer to 1.0):** More features are used, which can increase accuracy but also the risk of overfitting.
     - **Low Value (closer to 0.5):** Fewer features are used per tree, adding regularization and potentially improving generalizability.

6. **gamma**  
   - **What it does:** Sets the minimum loss reduction required to make a further partition on a leaf node. It acts as a regularization parameter.
   - **Interpreting Values:**  
     - **High Value:** Demands a larger reduction in loss to split a node, leading to simpler trees (more regularization). This can prevent overfitting.
     - **Low Value:** Allows more splits even if the loss reduction is small, potentially capturing more complex patterns but increasing the risk of overfitting.

### How to Use These Interpretations

- **Optimized Values Context:**  
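  After running your hyperparameter optimization, review the best parameters. Keep in mind that the 5-trial run above is only a smoke test of the pipeline; draw conclusions only from a study with enough trials to explore the search space: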
  - For example, if the optimized `learning_rate` is very low, it suggests that the model benefits from gradual learning, potentially indicating that the data is noisy or complex.
  - A high `max_depth` might mean your data has complex interactions, but you should verify that the performance on the validation set is not a result of overfitting.
  - Lower values for `subsample` or `colsample_bytree` indicate that some regularization was beneficial to avoid overfitting, especially given the high dimensionality of the full dataset (500+ features; this notebook trains on a reduced set of 53 features for quick debugging).
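  - A moderate to high `gamma` value might suggest that the model benefits from stronger regularization to avoid unnecessary splits. Conversely, a `gamma` close to zero (the quick test above returned about 8.8e-06) means splits are essentially unpenalized, so any regularization has to come from the other parameters.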
