In [3]:
import json
from pathlib import Path
import sys

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sys.path.append("../")
from src.readers.geom_reader import load_geodata
from src.timeseries_stats.metrics import evaluate_model
from src.utils.logger import setup_logger

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme and a 12x8
# default figure size for every matplotlib figure created afterwards.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 8)})

# Logger for this notebook, created by the project's setup_logger helper
# with the name "fine_tune" and the log file path passed below.
LOG = setup_logger("fine_tune", log_file="../logs/fine_tuning.log")

In [7]:
# Geometries returned by the project loader (unpacked as watersheds and
# gauges), plus the basemap layer read from the geometry folder.
ws, gauges = load_geodata(folder_depth="../")
common_index = gauges.index.tolist()
basemap_data = gpd.read_file("../data/geometry/basemap_2023.gpkg")

# Disabled: loading of the cluster assignments from Chapter 1. Kept for
# reference only; nothing in this cell reads `gauge_mapping`.
# gauge_mapping = pd.read_csv(
#     "../res/chapter_one/gauge_hybrid_mapping.csv",
#     index_col="gauge_id",
#     dtype={"gauge_id": str},
# )

# Gauges selected for fine-tuning: keep only the identifying columns and the
# geometry, and key the table by gauge_id.
fine_tune_gauges = gpd.read_file("../data/res/FineTuneGauges.gpkg")[
    ["gauge_id", "name_ru", "name_en", "geometry"]
].set_index("gauge_id")

# Split the full gauge table into the fine-tune subset and everything else.
# NOTE(review): this relies on gauge_id having the same dtype in both tables;
# a str/int mismatch would silently leave `rest_gauges` equal to `gauges`.
ft_index = fine_tune_gauges.index.to_list()
rest_gauges = gauges.loc[~gauges.index.isin(ft_index)]
rest_index = rest_gauges.index.to_list()

# NOTE(review): the message still mentions "hybrid classification" although
# the mapping load above is commented out; the count is simply len(gauges).
print(f"Loaded {len(gauges)} gauges with hybrid classification")


Loaded 996 gauges with hybrid classification


In [8]:
def load_metrics_from_folder(base_path: Path, gauge_id: str, dataset: str) -> dict | None:
    """Load metrics JSON file for a specific gauge and dataset."""
    metrics_file = base_path / gauge_id / f"{gauge_id}_{dataset}_prediction_metrics.json"
    if metrics_file.exists():
        with open(metrics_file) as f:
            return json.load(f)
    return None


def load_all_metrics(
    models: list[str],
    datasets: list[str],
    gauge_ids: list[str],
    rest_path: Path,
    ft_path: Path,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Collect per-gauge metrics from two prediction folders for every model/dataset.

    For each (model, dataset, gauge) combination two metrics files are looked
    up with ``load_metrics_from_folder``:

    * under ``rest_path / f"{model}_poor_gauges"``
    * under ``ft_path / model``

    Every file that is found contributes one row; the row holds the metrics
    from the JSON plus ``model`` and ``dataset`` columns. Missing files are
    skipped silently.

    Args:
        models: Model names; each is used as a folder name component.
        datasets: Dataset tags used in the metrics file names.
        gauge_ids: Gauge identifiers; converted to ``str`` before lookup.
        rest_path: Parent folder of the ``<model>_poor_gauges`` folders.
        ft_path: Parent folder of the plain ``<model>`` folders.

    Returns:
        A pair ``(poor_df, initial_df)``, both indexed by ``gauge_id``:
        ``poor_df`` holds the rows read from the ``<model>_poor_gauges``
        folders and ``initial_df`` the rows read from the ``<model>`` folders.
        A frame with no rows (but with the ``gauge_id`` index and the
        ``model``/``dataset`` columns) is returned for a source where no
        metrics file was found.
    """

    def _tagged_metrics(folder: Path, model: str, dataset: str, gauge_id: str) -> dict | None:
        # One metrics record labelled with its origin, or None if unavailable.
        metrics = load_metrics_from_folder(folder, gauge_id, dataset)
        if not metrics:
            return None
        return {**metrics, "model": model, "dataset": dataset, "gauge_id": gauge_id}

    def _as_frame(records: list[dict]) -> pd.DataFrame:
        # pd.DataFrame([]) has no "gauge_id" column, so set_index would raise
        # KeyError; build an explicitly shaped empty frame in that case.
        if not records:
            return pd.DataFrame(
                columns=["model", "dataset"],
                index=pd.Index([], name="gauge_id", dtype=object),
            )
        return pd.DataFrame(records).set_index("gauge_id")

    poor_data: list[dict] = []
    initial_data: list[dict] = []

    for model in models:
        poor_folder = rest_path / f"{model}_poor_gauges"
        initial_folder = ft_path / f"{model}"
        for dataset in datasets:
            for gauge_id in gauge_ids:
                gauge_key = str(gauge_id)

                poor_metrics = _tagged_metrics(poor_folder, model, dataset, gauge_key)
                if poor_metrics is not None:
                    poor_data.append(poor_metrics)

                initial_metrics = _tagged_metrics(initial_folder, model, dataset, gauge_key)
                if initial_metrics is not None:
                    initial_data.append(initial_metrics)

    return _as_frame(poor_data), _as_frame(initial_data)

In [10]:
model = "lstm"
meteo = "mswep"


def _nse_for_period(
    predictions_folder: str,
    meteo_source: str,
    gauge_ids: list,
    start: str,
    end: str,
) -> pd.DataFrame:
    """Compute NSE per gauge over one date window of saved predictions.

    For every gauge the file
    ``../data/predictions/<predictions_folder>/<gauge_id>/<gauge_id>_<meteo_source>_predictions.csv``
    is read with ``date`` as a parsed index, sliced to ``start``..``end``
    (label-based, both ends inclusive), and its ``q_obs``/``q_sim`` columns
    are passed to ``evaluate_model``; the ``"NSE"`` entry of the result is kept.

    Returns:
        A frame indexed by ``gauge_id`` with a single float ``NSE`` column.
    """
    scores = []
    for gauge_id in gauge_ids:
        window = pd.read_csv(
            f"../data/predictions/{predictions_folder}/{gauge_id}/{gauge_id}_{meteo_source}_predictions.csv",
            index_col="date",
            parse_dates=True,
        ).loc[start:end]
        scores.append(
            evaluate_model(observed=window["q_obs"], simulated=window["q_sim"])["NSE"]
        )

    # Build the column in one go with an explicit float dtype; filling a
    # None-initialised column cell by cell leaves it as object dtype.
    return pd.Series(
        scores,
        index=pd.Index(gauge_ids, name="gauge_id"),
        dtype="float64",
        name="NSE",
    ).to_frame()


# NOTE(review): the two windows are read from different prediction folders
# ("<model>_poor_gauges" for 2017-2018, "<model>" for 2019-2020), so the gap
# between the medians mixes a folder difference with a period difference.
# Confirm that this is the intended comparison.
metrics_17_18 = _nse_for_period(
    f"{model}_poor_gauges", meteo, ft_index, "2017-01-01", "2018-12-31"
)
metrics_19_20 = _nse_for_period(
    f"{model}", meteo, ft_index, "2019-01-01", "2020-12-31"
)

print(f"17-18 period: {metrics_17_18['NSE'].median():.2f}")
print(f"19-20 period: {metrics_19_20['NSE'].median():.2f}")


17-18 period: 0.58
19-20 period: 0.22
