In [1]:
import mlflow
import optuna
import sys
from pathlib import Path
import time
import pandas as pd
import warnings
project_path = Path.cwd().parent
sys.path.append(project_path.as_posix())
from pipeline.p0_data_loader import DataLoader
from pipeline.p1_model_trainer import XGBoostTrainer
from pipeline.p2_optuna_hpo import objective
from pipeline.p3_model_evaluator import ModelEvaluator
from sklearn.model_selection import train_test_split
from loguru import logger

In [2]:
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8088")

# Create a new MLflow Experiment
mlflow.set_experiment("TOA Full HPO")

In [4]:
data_path = Path.home() / 'data/craig_pfc_2023/step_2_cleaned'
assert data_path.exists()
[i for i in data_path.glob('*.pqt')]

[PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_phy.pqt'),
 PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_rrs.pqt'),
 PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_env.pqt'),
 PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_all.pqt'),
 PosixPath('/Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned/df_rrs_every_every10_51total_bands.pqt')]

In [5]:
loader = DataLoader(
    data_path=data_path, rrs_file = 'df_rrs_every_every10_51total_bands.pqt', 
    phy_file='df_phy.pqt', env_file='df_env.pqt')
dX, dX_env, dY =  loader.load_data()

[32m2025-03-19 12:58:11.294[0m | [34m[1mDEBUG   [0m | [36mpipeline.p0_data_loader[0m:[36m__init__[0m:[36m18[0m - [34m[1mData directory set to /Users/erdemkarakoylu/data/craig_pfc_2023/step_2_cleaned[0m
[32m2025-03-19 12:58:11.295[0m | [34m[1mDEBUG   [0m | [36mpipeline.p0_data_loader[0m:[36m__init__[0m:[36m19[0m - [34m[1mRrs file used: df_rrs_every_every10_51total_bands.pqt[0m
[32m2025-03-19 12:58:11.296[0m | [34m[1mDEBUG   [0m | [36mpipeline.p0_data_loader[0m:[36m__init__[0m:[36m20[0m - [34m[1mPhytoplankton file use df_phy.pqt[0m
[32m2025-03-19 12:58:11.296[0m | [34m[1mDEBUG   [0m | [36mpipeline.p0_data_loader[0m:[36m__init__[0m:[36m22[0m - [34m[1mEnv file used: df_env.pqt[0m


In [6]:
dX_env_sub = dX_env[['lat', 'temp']]
dX = pd.concat((dX, dX_env_sub), axis=1)

In [7]:
project_path

PosixPath('/Users/erdemkarakoylu/projex/toa_2_phyto_ml/multioutput_regression')

In [8]:
# --- Step 2: Train/Test Split ---
dX_train, dX_test, dY_train, dY_test = train_test_split(
    dX, dY, test_size=0.2)
logger.info(f'\nTrain/Test split completed --> Train shape: {dX_train.shape}, Test shape: {dX_test.shape}')
dX_train.to_parquet(project_path / 'models/dX_train.pqt')
dY_train.to_parquet(project_path / 'models/dY_train.pqt')
dX_test.to_parquet(project_path / 'models/dX_test.pqt')
dY_test.to_parquet(project_path / 'models/dY_test.pqt')

[32m2025-03-19 12:58:19.543[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m
Train/Test split completed --> Train shape: (1009285, 53), Test shape: (252322, 53)[0m
  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


In [None]:
# --- Step 3 Run Full HPO with MedianPruner and 5 warmup steps.
N_TRIALS= 50
start_time = time.time()
with mlflow.start_run():
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)  # Stop trials that fall below the median after 5 steps
    study = optuna.create_study(study_name="full_pipeline_training", direction="minimize", pruner=pruner)
    study.optimize(lambda trial: objective(trial, dX_train, dY_train), n_trials=N_TRIALS)
    best_params = study.best_trial.params
    print("Best hyperparameters from quick HPO test:", best_params)
elapsed_time = time.time() - start_time
logger.info("===  HPO Completed ===")
logger.info(f'---{N_TRIALS} trials took {elapsed_time:.2f} seconds.')

[I 2025-03-18 16:35:34,385] A new study created in memory with name: no-name-46b6c713-e7ba-472b-a18f-6fd674a3707b
[I 2025-03-18 16:39:28,061] Trial 0 finished with value: 0.026452138627502064 and parameters: {'learning_rate': 0.007754500643797615, 'max_depth': 10, 'n_estimators': 412, 'subsample': 0.5774657401715373, 'colsample_bytree': 0.7471236180477316, 'gamma': 0.5327565229364415}. Best is trial 0 with value: 0.026452138627502064.
[I 2025-03-18 16:43:43,611] Trial 1 finished with value: 0.013290540366581123 and parameters: {'learning_rate': 0.06490618570816657, 'max_depth': 8, 'n_estimators': 429, 'subsample': 0.8494034508700612, 'colsample_bytree': 0.7191864430412669, 'gamma': 0.0001353742710352044}. Best is trial 1 with value: 0.013290540366581123.
[I 2025-03-18 16:45:59,497] Trial 2 finished with value: 0.13970760866096155 and parameters: {'learning_rate': 0.0019455326214996456, 'max_depth': 4, 'n_estimators': 303, 'subsample': 0.8844555529089925, 'colsample_bytree': 0.572739354

Best hyperparameters from quick HPO test: {'learning_rate': 0.08301458467594765, 'max_depth': 10, 'n_estimators': 466, 'subsample': 0.6577479197695524, 'colsample_bytree': 0.8935835004842283, 'gamma': 8.631996891289372e-06}


Retraining optimized model:

In [None]:
best_params = study.best_trial.params
print("Best hyperparameters:", best_params)

Best hyperparameters: {'learning_rate': 0.08301458467594765, 'max_depth': 10, 'n_estimators': 466, 'subsample': 0.6577479197695524, 'colsample_bytree': 0.8935835004842283, 'gamma': 8.631996891289372e-06}


In [19]:
optimized_model_trainer = XGBoostTrainer(params=best_params)

In [22]:
optimized_model = optimized_model_trainer.train_model(dX_train, dY_train)

In [26]:
import joblib

# Save the final model to a file
joblib.dump(optimized_model, project_path / "models/optimized_model.pkl")
logger.info("Model saved to models/optimzied_model.pkl")


[32m2025-03-19 11:22:29.520[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mModel saved to models/optimzied_model.pkl[0m


In [None]:
# Reloading the optimzied mode
# Later, you can load it with:
#optimized_model = joblib.load("fmodels/optimized_model.pkl")
#logger.info("Model loaded successfully.")

In [27]:
dY_pred = optimized_model.predict(dX_test)

In [31]:
dY_pred = pd.DataFrame(
    dY_pred, columns=dY_test.columns
    )

In [32]:
dY_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252322 entries, 1956024 to 1633618
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   dia        252322 non-null  float64
 1   chl        252322 non-null  float64
 2   cya        252322 non-null  float64
 3   coc        252322 non-null  float64
 4   din        252322 non-null  float64
 5   pha        252322 non-null  float64
 6   tot_cphyl  252322 non-null  float64
dtypes: float64(7)
memory usage: 15.4 MB


In [35]:
dY_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252322 entries, 0 to 252321
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   dia        252322 non-null  float32
 1   chl        252322 non-null  float32
 2   cya        252322 non-null  float32
 3   coc        252322 non-null  float32
 4   din        252322 non-null  float32
 5   pha        252322 non-null  float32
 6   tot_cphyl  252322 non-null  float32
dtypes: float32(7)
memory usage: 6.7 MB


### A Short Guide to XGBOOST Hyperparameter Explanation and Interpretation

1. **learning_rate**  
   - **What it does:** Controls the step size at each boosting iteration. A smaller value means the model learns more slowly but can yield a more robust model if combined with a larger number of estimators.
   - **Interpreting Values:**  
     - **High Value (closer to 0.3):** Faster learning; risk of overshooting the optimal solution, potentially leading to overfitting.
     - **Low Value (closer to 1e-3):** Slower learning; may require more estimators to converge, but can lead to better generalization.

2. **max_depth**  
   - **What it does:** Sets the maximum depth of each decision tree. This controls the complexity of the model.
   - **Interpreting Values:**  
     - **High Value (closer to 10):** Allows for deeper trees, capturing more complex patterns but with a higher risk of overfitting.
     - **Low Value (closer to 3):** Results in shallower trees, which may underfit if the data is complex, but generally increases model generalizability.

3. **n_estimators**  
   - **What it does:** Specifies the number of boosting rounds (i.e., trees) to build.
   - **Interpreting Values:**  
     - **High Value:** More trees can lead to better performance on training data, but might also cause overfitting if not regulated by other parameters.
     - **Low Value:** Fewer trees can lead to faster training and less overfitting, but might not capture enough complexity in the data.

4. **subsample**  
   - **What it does:** Represents the fraction of samples used for fitting each individual tree.
   - **Interpreting Values:**  
     - **High Value (closer to 1.0):** Uses most of the data for each tree, which can increase accuracy but may also increase overfitting.
     - **Low Value (closer to 0.5):** Uses fewer samples per tree, introducing randomness that can reduce overfitting but might also lead to underfitting if too low.

5. **colsample_bytree**  
   - **What it does:** Specifies the fraction of features (columns) used when building each tree.
   - **Interpreting Values:**  
     - **High Value (closer to 1.0):** More features are used, which can increase accuracy but also the risk of overfitting.
     - **Low Value (closer to 0.5):** Fewer features are used per tree, adding regularization and potentially improving generalizability.

6. **gamma**  
   - **What it does:** Sets the minimum loss reduction required to make a further partition on a leaf node. It acts as a regularization parameter.
   - **Interpreting Values:**  
     - **High Value:** Demands a larger reduction in loss to split a node, leading to simpler trees (more regularization). This can prevent overfitting.
     - **Low Value:** Allows more splits even if the loss reduction is small, potentially capturing more complex patterns but increasing the risk of overfitting.