In [2]:
import json
import random
import sys
import warnings
from pathlib import Path

import optuna
from catboost import CatBoostRegressor

# Evaluate model performance

# Add parent directory to Python path to access src modules
sys.path.append(str(Path("..").resolve()))
from src.models.catboost.data_loaders import (
    create_pools,
    get_data_masks,
    get_feature_lists,
    round_static_features,
    split_gauges,
)
from src.readers.geo_char_reader import (
    get_combined_features,
    load_static_data,
)
from src.readers.geom_reader import load_geodata
from src.readers.hydro_data_reader import data_creator, find_valid_gauges
from src.utils.logger import setup_logger
from src.utils.metrics import evaluate_model

optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore")
# Set random seed for reproducibility
random.seed(42)
logger = setup_logger(name="CatboostTraining", log_file="../logs/catboost.log")


In [3]:
e_obs_ws, _ = load_geodata(folder_depth="..")
logger.info("Finding gauges with valid data...")
full_gauges, partial_gauges = find_valid_gauges(e_obs_ws, Path("../data/HydroFiles"))
static_data = load_static_data(full_gauges + partial_gauges, path_prefix="../")
combined_feature, combined_features_df = get_combined_features(static_data)
train_gauges, test_gauges = split_gauges(full_gauges)

2025-06-23 16:51:27 | CatboostTraining | INFO     | <module>:2 | Finding gauges with valid data...
2025-06-23 16:51:29 | hydro_atlas_loader | INFO     | get_combined_features:48 | Selected 29 uncorrelated features from static_data.
2025-06-23 16:51:29 | catboost_loader | INFO     | split_gauges:28 | Split 302 gauges into 256 train and 46 test gauges


In [5]:
METEO_DATASETS = ["meteo_ru_nc_02", "e_obs", "era5_land", "mswep"]
for meteo_dataset in METEO_DATASETS:
    meteo_dir = Path(f"../data/MeteoData/ProcessedGauges/{meteo_dataset}/res/")
    hydro_dir = Path("../data/HydroFiles/")
    temp_dir = Path("../data/MeteoData/ProcessedGauges/era5_land/res/")

    logger.info(f"Creating test dataset with {len(full_gauges)} gauges...")
    test_data = data_creator(
        full_gauges=full_gauges,
        static_data=combined_features_df,
        meteo_dir=meteo_dir,
        hydro_dir=hydro_dir,
        temp_dir=temp_dir,
    )
    test_data = round_static_features(test_data, combined_feature)
    test_data["day_of_year"] = test_data["day_of_year"].astype(int)
    train_mask, valid_mask, test_mask = get_data_masks(test_data)
    logger.info(
        f"Training samples: {train_mask.sum()}, Validation samples: {valid_mask.sum()}, Test samples: {test_mask.sum()}"
    )
    categorical_features, numeric_features = get_feature_lists(test_data, combined_feature)
    logger.info(
        f"Prepared {len(numeric_features)} features with {len(categorical_features)} categorical features"
    )
    _, _, test_pool = create_pools(
        test_data, test_data, train_mask, valid_mask, test_mask, numeric_features, categorical_features
    )

    # Path to the CatBoost model
    catboost_model_path = f"../data/res/catboost/catboost_model_{meteo_dataset}.cbm"
    model = CatBoostRegressor()
    model.load_model(catboost_model_path)

    eval_data = test_data[test_mask].copy()
    eval_data["q_pred"] = model.predict(test_pool)

    for gauge_id in train_gauges:
        save_path = Path(f"../data/res/catboost/{gauge_id}/{gauge_id}_{meteo_dataset}/")
        save_path.mkdir(parents=True, exist_ok=True)
        gauge_data = eval_data.loc[eval_data["gauge_id"] == gauge_id, ["date", "q", "q_pred"]].set_index(
            "date"
        )
        gauge_metrics = evaluate_model(gauge_data["q"], gauge_data["q_pred"])
        with open(save_path / "metrics.json", "w") as f:
            json.dump(gauge_metrics, f)


2025-06-23 16:52:15 | CatboostTraining | INFO     | <module>:7 | Creating test dataset with 302 gauges...
2025-06-23 16:52:19 | CatboostTraining | INFO     | <module>:18 | Training samples: 1213436, Validation samples: 220762, Test samples: 220460
2025-06-23 16:52:19 | CatboostTraining | INFO     | <module>:22 | Prepared 53 features with 31 categorical features


CatBoostError: catboost/libs/model/model_import_interface.h:19: Model file doesn't exist: ../data/res/catboost/catboost_model_meteo_ru_nc_02.cbm