In [None]:
["82033", "82026", "82034", "82039", "82046", "82060", "82050"]

In [None]:
import os
import random
import sys
import warnings
from pathlib import Path

import geopandas as gpd
import numpy as np
import optuna
import pandas as pd
import torch
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

# Add parent directory to Python path to access src modules
sys.path.append(str(Path("..").resolve()))
from src.data_loaders.hydro_data_loader import (
    data_creator,
    find_valid_gauges,
    select_uncorrelated_features,
)
from src.models.catboost.optimizer import (
    limit_optuna_logging,
)
from src.utils.logger import setup_logger
from src.utils.metrics import kling_gupta_efficiency, nash_sutcliffe_efficiency

# Configure logging
warnings.filterwarnings("ignore")
limit_optuna_logging()  # Reduce Optuna logging verbosity
# ---- full reproducibility ----
SEED = 42
N_TRIALS = 100  # raise if you have time
ES_ROUNDS = 100  # early-stopping patience
GPU_OK = torch.cuda.is_available()

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
logger = setup_logger(
    name="CatboostTraining", log_file="../logs/notebooks/catboost.log", log_level="INFO"
)

# Read watershed and gauge geometry ../data from GeoPackage files
e_obs_ws = gpd.read_file("../data/Geometry/EOBSWatersheds2025.gpkg", ignore_geometry=False)
e_obs_ws.set_index("gauge_id", inplace=True)
e_obs_gauge = gpd.read_file("../data/Geometry/EOBSPoints2025.gpkg", ignore_geometry=False)
e_obs_gauge.set_index("gauge_id", inplace=True)
# Find valid gauges (no missing ../data)
logger.info("Finding gauges with valid ../data...")
full_gauges, partial_gauges = find_valid_gauges(e_obs_ws, Path("../data/HydroFiles"))
static_data = pd.read_csv(
    "../data/Geometry/static_data.csv", dtype={"gauge_id": str}, index_col="gauge_id"
)
static_data = static_data.loc[full_gauges + partial_gauges, :]
# Get uncorrelated features from static_data
uncorrelated_static_features = select_uncorrelated_features(static_data)
old_static_features = [
    "for_pc_sse",
    "crp_pc_sse",
    "inu_pc_ult",
    "ire_pc_sse",
    "lka_pc_use",
    "prm_pc_sse",
    "pst_pc_sse",
    "cly_pc_sav",
    "slt_pc_sav",
    "snd_pc_sav",
    "kar_pc_sse",
    "urb_pc_sse",
    "gwt_cm_sav",
    "lkv_mc_usu",
    "rev_mc_usu",
    "sgr_dk_sav",
    "slp_dg_sav",
    "ws_area",
    "ele_mt_sav",
]
combined_feature = list(set(old_static_features + uncorrelated_static_features))
combined_features = static_data[combined_feature].reset_index()
logger.info(f"Selected {len(combined_feature)} uncorrelated features from static_data.")
# Split full_gauges into train (80%) and test (20%) sets
train_gauges, test_gauges = train_test_split(full_gauges, test_size=0.15, random_state=42)
logger.info(
    f"Split {len(full_gauges)} gauges into {len(train_gauges)} train and {len(test_gauges)} test gauges"
)
logger.info(f"Train gauges: {len(train_gauges)} samples")
logger.info(f"Test gauges: {len(test_gauges)} samples")


2025-06-21 12:42:33 | CatboostTraining | INFO     | <module>:51 | Finding gauges with valid ../data...
2025-06-21 12:42:36 | CatboostTraining | INFO     | <module>:82 | Selected 29 uncorrelated features from static_data.
2025-06-21 12:42:36 | CatboostTraining | INFO     | <module>:85 | Split 302 gauges into 256 train and 46 test gauges
2025-06-21 12:42:36 | CatboostTraining | INFO     | <module>:88 | Train gauges: 256 samples
2025-06-21 12:42:36 | CatboostTraining | INFO     | <module>:89 | Test gauges: 46 samples


In [None]:
MODEL_DIR = Path("../data/res/catboost/")
for meteo_dataset in ["meteo_ru_nc_02", "e_obs", "era5_land"]:
    logger.info(f"\nCatBoost training for {meteo_dataset} dataset")
    model_path = MODEL_DIR / f"catboost_model_{meteo_dataset}.cbm"
    if model_path.is_file():
        logger.info(f"Model already exists for {meteo_dataset} at {model_path}. Skipping training.")
        continue
    # Define constants for file paths and directories
    METEO_DIR = Path(f"../data/MeteoData/ProcessedGauges/{meteo_dataset}/res/")
    HYDRO_DIR = Path("../data/HydroFiles/")
    TEMP_DIR = Path("../data/MeteoData/ProcessedGauges/era5_land/res/")
    # Log dataset creation
    logger.info(f"Creating training dataset with {len(train_gauges)} gauges...")
    data = data_creator(
        full_gauges=train_gauges,
        static_data=combined_features,
        meteo_dir=METEO_DIR,
        hydro_dir=HYDRO_DIR,
        temp_dir=TEMP_DIR,
    )

    logger.info(f"Creating test dataset with {len(test_gauges)} gauges...")
    test_data = data_creator(
        full_gauges=partial_gauges,
        static_data=combined_features,
        meteo_dir=METEO_DIR,
        hydro_dir=HYDRO_DIR,
        temp_dir=TEMP_DIR,
    )

    pc_features = [f for f in combined_feature if "_pc_" in f]

    for feature in combined_feature:
        data[feature] = data[feature].round(0).astype(int)
        test_data[feature] = test_data[feature].round(0).astype(int)

    data["day_of_year"] = data["day_of_year"].astype(int)
    test_data["day_of_year"] = test_data["day_of_year"].astype(int)

    # Define date ranges for train and validation sets
    logger.info("Splitting data into training and validation periods...")
    train_mask = ("2008-01-01" <= data["date"]) & (data["date"] <= "2018-12-31")
    valid_mask = ("2019-01-01" <= data["date"]) & (data["date"] <= "2020-12-31")
    test_mask = ("2021-01-01" <= data["date"]) & (data["date"] <= "2022-12-31")
    # Verify we have data in both splits
    train_count = train_mask.sum()
    valid_count = valid_mask.sum()
    test_count = test_mask.sum()
    logger.info(
        f"Training samples: {train_count}, Validation samples: {valid_count}, Test samples: {test_count}"
    )

    # Select numeric and integer features (excluding 'date', 'q', and categorical features)
    CATEGORICAL_FEATURES = ["gauge_id", "day_of_year"] + combined_feature

    NUMERIC_FEATURES = [col for col in data.columns if col not in ["date", "q"]]

    # Get indices for efficient slicing
    train_idx = data[train_mask].index
    valid_idx = data[valid_mask].index
    test_idx = test_data[test_mask].index

    train_pool = Pool(
        data=data.loc[train_idx, NUMERIC_FEATURES],
        label=data.loc[train_idx, "q"],
        cat_features=CATEGORICAL_FEATURES,
    )
    valid_pool = Pool(
        data=data.loc[valid_idx, NUMERIC_FEATURES],
        label=data.loc[valid_idx, "q"],
        cat_features=CATEGORICAL_FEATURES,
    )
    test_pool = Pool(
        data=test_data.loc[test_idx, NUMERIC_FEATURES],
        label=test_data.loc[test_idx, "q"],
        cat_features=CATEGORICAL_FEATURES,
    )

    logger.info(
        f"Prepared {len(NUMERIC_FEATURES)} features with {len(CATEGORICAL_FEATURES)} categorical features"
    )
    if GPU_OK:
        logger.info("GPU is available for training.")
    else:
        logger.warning("GPU is not available. Training will use CPU, which may be slower.")

    BASE_PARAMS = {
        "loss_function": "RMSE",
        "eval_metric": "RMSE",
        "early_stopping_rounds": ES_ROUNDS,
        "random_seed": SEED,
        "verbose": 0,
        "task_type": "GPU" if GPU_OK else "CPU",
        "devices": "0",
        # GPU-friendly fixed choices
        "grow_policy": "SymmetricTree",
        "bootstrap_type": "Bayesian",
        "border_count": 256,
        "gpu_ram_part": 0.95 if GPU_OK else None,
    }

    def objective(trial: optuna.Trial) -> float:
        """Objective function for Optuna hyperparameter optimization of CatBoostRegressor.

        This function suggests hyperparameters, trains a CatBoostRegressor on the training pool,
        evaluates it on the validation pool, and returns the validation RMSE.

        Args:
            trial (optuna.Trial): An Optuna trial object for suggesting hyperparameters.

        Returns:
            float: The RMSE score on the validation set.
        """
        params = BASE_PARAMS | {
            "iterations": trial.suggest_int("iterations", 420, 6000, step=420),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.15, log=True),
            "depth": trial.suggest_int("depth", 6, 15),
            "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 4, 20),
            "random_strength": trial.suggest_float("random_strength", 1e-4, 1.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.5),
        }

        model = CatBoostRegressor(**params)
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

        return model.best_score_["validation"]["RMSE"]

    study = optuna.create_study(
        "slp_dg_sav",
        study_name="catboost_mape",
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=20),
    )
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True, gc_after_trial=True)

    best_params = BASE_PARAMS | study.best_params
    # Save best CatBoost parameters to a file for reproducibility and future use

    best_params["iterations"] = 10000
    final_model = CatBoostRegressor(**best_params)

    final_model.fit(
        train_pool,
        eval_set=valid_pool,
        verbose=500,
        use_best_model=True,
    )

    # Save the trained CatBoost model to disk for reproducibility and future inference
    def save_catboost_model(model: CatBoostRegressor, output_dir: Path, meteo_dataset: str) -> None:
        """Save a trained CatBoost model to a file.

        Args:
            model (CatBoostRegressor): Trained CatBoost model.
            output_dir (Path): Directory where the model will be saved.
            meteo_dataset (str): Name of the meteorological dataset, used in the filename.

        Returns:
            None

        Raises:
            OSError: If the output directory does not exist and cannot be created.
            Exception: If saving the model fails.
        """
        output_dir.mkdir(parents=True, exist_ok=True)
        model_path = output_dir / f"catboost_model_{meteo_dataset}.cbm"
        try:
            model.save_model(model_path)
            logger.info(f"CatBoost model saved to {model_path}")
        except Exception as e:
            logger.error(f"Failed to save CatBoost model: {e}")
            raise

    save_catboost_model(final_model, MODEL_DIR, meteo_dataset)


2025-06-21 12:42:55 | CatboostTraining | INFO     | <module>:3 | 
CatBoost training for meteo_ru_nc_02 dataset
2025-06-21 12:42:55 | CatboostTraining | INFO     | <module>:6 | Model already exists for meteo_ru_nc_02 at ../data/res/catboost/catboost_model_meteo_ru_nc_02.cbm. Skipping training.
2025-06-21 12:42:55 | CatboostTraining | INFO     | <module>:3 | 
CatBoost training for e_obs dataset
2025-06-21 12:42:55 | CatboostTraining | INFO     | <module>:6 | Model already exists for e_obs at ../data/res/catboost/catboost_model_e_obs.cbm. Skipping training.
2025-06-21 12:42:55 | CatboostTraining | INFO     | <module>:3 | 
CatBoost training for era5_land dataset
2025-06-21 12:42:55 | CatboostTraining | INFO     | <module>:13 | Creating training dataset with 256 gauges...


2025-06-21 12:42:59 | CatboostTraining | INFO     | <module>:22 | Creating test dataset with 46 gauges...
2025-06-21 12:43:03 | CatboostTraining | INFO     | <module>:38 | Splitting data into training and validation periods...
2025-06-21 12:43:03 | CatboostTraining | INFO     | <module>:46 | Training samples: 1028608, Validation samples: 187136, Test samples: 186880


CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=5]=1.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [3]:
data

Unnamed: 0,date,t_mean,t_min,t_max,prcp,gauge_id,day_of_year,q,prcp_1,t_min_mean_1,...,tmp_dc_s01,glc_pc_s15,slp_dg_sav,slt_pc_sav,ele_mt_sav,urb_pc_sse,ire_pc_sse,crp_pc_sse,glc_pc_s18,rev_mc_usu
0,2007-01-01,0.336485,-1.087112,1.418032,2.442841,83063,1.0,,2.442841,-1.087112,...,-6,1,2,38,162,9,3,57,19,0
1,2007-01-02,3.193040,-0.505057,4.385016,3.590178,83063,2.0,,3.590178,-0.505057,...,-6,1,2,38,162,9,3,57,19,0
2,2007-01-03,1.961395,-0.295840,3.245567,3.210715,83063,3.0,,3.210715,-0.295840,...,-6,1,2,38,162,9,3,57,19,0
3,2007-01-04,2.767307,1.015368,5.202342,5.119605,83063,4.0,,5.119605,1.015368,...,-6,1,2,38,162,9,3,57,19,0
4,2007-01-05,0.287456,-1.204532,1.719060,0.485085,83063,5.0,,0.485085,-1.204532,...,-6,1,2,38,162,9,3,57,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496059,2022-12-27,-9.212279,-10.715747,-7.906977,0.301430,70348,361.0,0.449225,0.301430,-10.715747,...,-15,2,5,29,127,0,0,0,0,0
1496060,2022-12-28,-11.159609,-11.656342,-10.601622,0.463918,70348,362.0,0.449225,0.463918,-11.656342,...,-15,2,5,29,127,0,0,0,0,0
1496061,2022-12-29,-12.230348,-12.620804,-11.642210,1.076792,70348,363.0,0.449225,1.076792,-12.620804,...,-15,2,5,29,127,0,0,0,0,0
1496062,2022-12-30,-12.767002,-13.184958,-12.311711,0.389330,70348,364.0,0.442762,0.389330,-13.184958,...,-15,2,5,29,127,0,0,0,0,0


In [None]:
meteo_dataset = "mswep"
METEO_DIR = Path(f"../data/MeteoData/ProcessedGauges/{meteo_dataset}/res/")
HYDRO_DIR = Path("../data/HydroFiles/")
TEMP_DIR = Path("../data/MeteoData/ProcessedGauges/era5_land/res/")
# Log dataset creation
logger.info(f"Creating training dataset with {len(train_gauges)} gauges...")
data = data_creator(
    full_gauges=train_gauges,
    static_data=combined_features,
    meteo_dir=METEO_DIR,
    hydro_dir=HYDRO_DIR,
    temp_dir=TEMP_DIR,
)

logger.info(f"Creating test dataset with {len(test_gauges)} gauges...")
test_data = data_creator(
    full_gauges=partial_gauges,
    static_data=combined_features,
    meteo_dir=METEO_DIR,
    hydro_dir=HYDRO_DIR,
    temp_dir=TEMP_DIR,
)

pc_features = [f for f in combined_feature if "_pc_" in f]

for feature in combined_feature:
    data[feature] = data[feature].round(0).astype(int)
    test_data[feature] = test_data[feature].round(0).astype(int)


# Define date ranges for train and validation sets
logger.info("Splitting data into training and validation periods...")
train_mask = ("2008-01-01" <= data["date"]) & (data["date"] <= "2018-12-31")
valid_mask = ("2019-01-01" <= data["date"]) & (data["date"] <= "2020-12-31")
test_mask = ("2021-01-01" <= data["date"]) & (data["date"] <= "2022-12-31")
# Verify we have data in both splits
train_count = train_mask.sum()
valid_count = valid_mask.sum()
test_count = test_mask.sum()
logger.info(
    f"Training samples: {train_count}, Validation samples: {valid_count}, Test samples: {test_count}"
)

# Define features
# Select categorical features based on object dtype columns in data
CATEGORICAL_FEATURES = ["gauge_id"] + combined_feature
# Select numeric and integer features (excluding 'date')
NUMERIC_FEATURES = [c for c in data.columns if c not in ["date", "q"]]

# Get indices for efficient slicing
train_idx = data[train_mask].index
valid_idx = data[valid_mask].index
test_idx = test_data[test_mask].index

train_pool = Pool(
    data=data.loc[train_idx, NUMERIC_FEATURES],
    label=data.loc[train_idx, "q"],
    cat_features=CATEGORICAL_FEATURES,
)
valid_pool = Pool(
    data=data.loc[valid_idx, NUMERIC_FEATURES],
    label=data.loc[valid_idx, "q"],
    cat_features=CATEGORICAL_FEATURES,
)
test_pool = Pool(
    data=test_data.loc[:, NUMERIC_FEATURES],
    label=test_data.loc[:, "q"],
    cat_features=CATEGORICAL_FEATURES,
)

logger.info(
    f"Prepared {len(NUMERIC_FEATURES)} features with {len(CATEGORICAL_FEATURES)} categorical features"
)

In [None]:
if GPU_OK:
    logger.info("GPU is available for training.")
else:
    logger.warning("GPU is not available. Training will use CPU, which may be slower.")


BASE_PARAMS = {
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "early_stopping_rounds": ES_ROUNDS,
    "random_seed": SEED,
    "verbose": 0,
    "task_type": "GPU" if GPU_OK else "CPU",
    "devices": "0",
    # GPU-friendly fixed choices
    "grow_policy": "SymmetricTree",
    "bootstrap_type": "Bayesian",
    "border_count": 256,
    "gpu_ram_part": 0.95 if GPU_OK else None,
}


def objective(trial: optuna.Trial) -> float:
    """Objective function for Optuna hyperparameter optimization of CatBoostRegressor.

    This function suggests hyperparameters, trains a CatBoostRegressor on the training pool,
    evaluates it on the validation pool, and returns the validation RMSE.

    Args:
        trial (optuna.Trial): An Optuna trial object for suggesting hyperparameters.

    Returns:
        float: The RMSE score on the validation set.
    """
    params = BASE_PARAMS | {
        "iterations": trial.suggest_int("iterations", 420, 6000, step=420),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.15, log=True),
        "depth": trial.suggest_int("depth", 6, 15),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 4, 20),
        "random_strength": trial.suggest_float("random_strength", 1e-4, 1.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.5),
    }

    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    return model.best_score_["validation"]["RMSE"]


study = optuna.create_study(
    study_name="catboost_mape",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=20),
)
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True, gc_after_trial=True)

best_params = BASE_PARAMS | study.best_params
# Save best CatBoost parameters to a file for reproducibility and future use


final_model = CatBoostRegressor(**best_params)

final_model.fit(
    train_pool,
    eval_set=valid_pool,
    verbose=500,
    use_best_model=True,
)


# Save the trained CatBoost model to disk for reproducibility and future inference
def save_catboost_model(model: CatBoostRegressor, output_dir: str, meteo_dataset: str) -> None:
    """Save a trained CatBoost model to a file.

    Args:
        model (CatBoostRegressor): Trained CatBoost model.
        output_dir (str): Directory where the model will be saved.
        meteo_dataset (str): Name of the meteorological dataset, used in the filename.

    Returns:
        None

    Raises:
        OSError: If the output directory does not exist and cannot be created.
        Exception: If saving the model fails.
    """
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, f"catboost_model_{meteo_dataset}.cbm")
    try:
        model.save_model(model_path)
        logger.info(f"CatBoost model saved to {model_path}")
    except Exception as e:
        logger.error(f"Failed to save CatBoost model: {e}")
        raise


save_catboost_model(final_model, "../data/res/catboost/", meteo_dataset)


In [None]:
best_params = BASE_PARAMS | study.best_params
final_model = CatBoostRegressor(**best_params)

final_model.fit(
    train_pool,
    eval_set=valid_pool,
    verbose=500,
    use_best_model=True,
)


# Save the trained CatBoost model to disk for reproducibility and future inference
def save_catboost_model(model: CatBoostRegressor, output_dir: str, meteo_dataset: str) -> None:
    """Save a trained CatBoost model to a file.

    Args:
        model (CatBoostRegressor): Trained CatBoost model.
        output_dir (str): Directory where the model will be saved.
        meteo_dataset (str): Name of the meteorological dataset, used in the filename.

    Returns:
        None

    Raises:
        OSError: If the output directory does not exist and cannot be created.
        Exception: If saving the model fails.
    """
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, f"catboost_model_{meteo_dataset}.cbm")
    try:
        model.save_model(model_path)
        logger.info(f"CatBoost model saved to {model_path}")
    except Exception as e:
        logger.error(f"Failed to save CatBoost model: {e}")
        raise


save_catboost_model(final_model, "../data/res/catboost/", meteo_dataset)


In [None]:
import matplotlib.pyplot as plt

test_data["q_pred"] = final_model.predict(test_pool)


def plot_predictions_over_test_gauges_matplotlib(
    df: pd.DataFrame,
    gauge_column: str = "gauge_id",
    date_column: str = "date",
    pred_column: str = "q_pred",
    true_column: str = "q",
    figsize: tuple = (15, 7),
    linewidth: int = 2,
    legend_loc: str = "upper right",
    show: bool = True,
) -> None:
    """Plot predicted vs. true discharge for each test gauge using matplotlib.

    Args:
        df (pd.DataFrame): DataFrame containing predictions and true values.
        gauge_column (str): Name of the column with gauge IDs.
        date_column (str): Name of the column with dates.
        pred_column (str): Name of the column with predicted values.
        true_column (str): Name of the column with true values.
        figsize (tuple): Figure size in inches (width, height).
        linewidth (int): Width of the lines.
        legend_loc (str): Location of the legend.
        show (bool): Whether to display the plot immediately.

    Returns:
        None
    """
    gauges = df[gauge_column].unique()
    for gid in gauges:
        gauge_df = df[df[gauge_column] == gid].sort_values(date_column)
        plt.figure(figsize=figsize)
        # Plot true values as a line
        plt.plot(
            gauge_df[date_column],
            gauge_df[true_column],
            label="True Q",
            color="royalblue",
            linewidth=linewidth,
        )
        # Plot predicted values as a line
        plt.plot(
            gauge_df[date_column],
            gauge_df[pred_column],
            label="Predicted Q",
            color="firebrick",
            linewidth=linewidth,
            linestyle="--",
        )
        plt.title(f"Predicted vs. Observed Discharge — Gauge {gid}", fontsize=16)
        plt.xlabel("Date", fontsize=14)
        plt.ylabel("Discharge (q)", fontsize=14)
        plt.legend(loc=legend_loc, fontsize=12)
        plt.grid(True, linestyle=":", alpha=0.5)
        plt.tight_layout()
        if show:
            plt.show()


# Plot predictions for all test gauges in test_data
plot_predictions_over_test_gauges_matplotlib(test_data)


In [None]:
test_data.loc[test_data["gauge_id"] == "75158"]


In [None]:
test_gauges = test_data.loc[test_data["gauge_id"] == "49099", ["q", "q_pred"]]

(
    kling_gupta_efficiency(test_gauges["q"], test_gauges["q_pred"]),
    nash_sutcliffe_efficiency(test_gauges["q"], test_gauges["q_pred"]),
)