In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import geopandas as gpd
import pandas as pd
import numpy as np
import sys
from pathlib import Path
from tqdm.notebook import tqdm
from neuralhydrology.nh_run import start_run, eval_run
from neuralhydrology.utils.config import Config
from scripts.file_manipulator import train_rewriter
sys.path.append("/app")
from visualizations.scripts.drawer import russia_plots, russia_plots_n

In [3]:
# Gauge ids grouped by record completeness: discharge (q_obs) and water level (h_obs).
q_obs = {bucket: [] for bucket in ("full", "partial", "empty")}
h_obs = {bucket: [] for bucket in ("full", "partial", "empty")}


def nan_fractor(df_col: pd.Series):
    """Return the share of missing entries in ``df_col`` (0.0 = none, 1.0 = all)."""
    # The mean of the boolean NaN mask equals (number of NaNs) / (number of rows)
    return df_col.isna().mean()


# Gauges that have a concatenated meteo NetCDF file. `nc_cond` stays a list for
# any other cell that uses it; the set next to it gives O(1) membership checks.
nc_cond = [file.stem for file in Path("/app/data/ws_related_meteo/nc_concat/").glob("*.nc")]
nc_cond_lookup = set(nc_cond)


def _completeness_bucket(series: pd.Series) -> str:
    """Classify a record by its NaN share: "full" (0), "partial" (<= 0.25) or "empty"."""
    nan_share = series.isna().mean()
    if nan_share == 0.0:
        return "full"
    if nan_share <= 0.25:
        return "partial"
    # More than a quarter missing. A file with no rows yields a NaN share and
    # also ends up here instead of being left out of every bucket.
    return "empty"


for file in tqdm(list(Path("/app/data/ais_data/results_2024").glob("*.csv"))):
    gauge_id = file.stem
    if gauge_id not in nc_cond_lookup:
        # No meteo forcing for this gauge, so it is not classified
        continue
    test = pd.read_csv(file, index_col="date")
    # Discharge completeness is judged on q_cms_s, level completeness on lvl_mbs
    q_obs[_completeness_bucket(test["q_cms_s"])].append(gauge_id)
    h_obs[_completeness_bucket(test["lvl_mbs"])].append(gauge_id)


  0%|          | 0/1918 [00:00<?, ?it/s]

In [4]:
# Watershed polygons indexed by gauge_id, keeping only rows with new_area <= 50000
ws_file = gpd.read_file("../data/geometry/russia_ws.gpkg").set_index("gauge_id")
ws_file = ws_file.loc[ws_file["new_area"] <= 50000]

# Gauge points indexed by gauge_id stored as text
gauges_file = gpd.read_file("../data/geometry/russia_gauges.gpkg")
gauges_file["gauge_id"] = gauges_file["gauge_id"].astype(str)
gauges_file = gauges_file.set_index("gauge_id")


def lim_definer(area: float):
    """Return the plot label of the size class that ``area`` falls into.

    Classes: below 100; 100 to 2 000; above 2 000 to 10 000; above 10 000 to
    50 000. Any other value (larger, or NaN) gets the "> 50 000" label.
    """
    lim_1, lim_2, lim_3, lim_4 = 100, 2000, 10000, 50000

    if area < lim_1:
        return "а) < 100 $км^2$"

    # Upper bounds are inclusive and tried in ascending order, so the first match
    # is the class. A NaN area fails every comparison and falls through.
    upper_bound_labels = (
        (lim_2, "б) 100 $км^2$ - 2 000 $км^2$"),
        (lim_3, "в) 2 000 $км^2$ - 10 000 $км^2$"),
        (lim_4, "г) 10 000 $км^2$ - 50 000 $км^2$"),
    )
    for upper_bound, label in upper_bound_labels:
        if area <= upper_bound:
            return label
    return "> 50 000 $км^2$"


# Label every watershed with its size class and make the labels an ordered-by-listing categorical
ws_file["size"] = ws_file["new_area"].apply(lim_definer)
size_categories = [
    "а) < 100 $км^2$",
    "б) 100 $км^2$ - 2 000 $км^2$",
    "в) 2 000 $км^2$ - 10 000 $км^2$",
    "г) 10 000 $км^2$ - 50 000 $км^2$",
]
ws_file["size"] = pd.Categorical(ws_file["size"], categories=size_categories)

basemap_data = gpd.read_file("../data/geometry/basemap_2023.gpkg")

# fixed umgs shapes: rows 26 and 27 are placed where rows 9 and 13 were,
# and rows 9 and 13 are moved to the end before renumbering
district_order = [*range(9), 26, *range(10, 13), 27, *range(14, 26), 9, 13]
basin_districts = (
    gpd.read_file("../data/geometry/basin_districts_2024.gpkg")
    .reset_index(drop=True)
    .reindex(district_order)
    .reset_index(drop=True)
    .rename(columns={"BAS_OKRUG": "name1"})
)


In [5]:
def _read_metric_table(csv_path: str) -> pd.DataFrame:
    """Read a per-gauge metrics CSV indexed by gauge_id, with the ids kept as strings."""
    return pd.read_csv(csv_path, index_col="gauge_id", dtype={"gauge_id": str})


hbv_table = _read_metric_table("/app/conclusions/tables/HBV_res_mle_NSE.csv")
gr4j_table = _read_metric_table("/app/conclusions/tables/GR4J_res_mle_NSE.csv")
rfr_table = _read_metric_table("/app/conclusions/tables/res_rfr.csv")
lstm_regional = _read_metric_table("/app/conclusions/tables/cudalstm_q_mm_day_mswep_era5_res.csv")
lstm_single = _read_metric_table("/app/conclusions/tables/Single_LSTM_table.csv")
lstm_no_static = _read_metric_table("/app/conclusions/tables/LSTM_no_static_res.csv")

# Gauges present in every model table and in the watershed layer
common_index = hbv_table.index
for other in (gr4j_table, rfr_table, lstm_regional, ws_file, lstm_single, lstm_no_static):
    common_index = common_index.intersection(other.index)

# Narrow both layers to the shared gauges, then fill the gauge layer with the
# watershed attributes (the all-NaN "size" column is taken from ws_file)
ws_file = ws_file.loc[common_index]
gauges_file = gauges_file.loc[common_index]
gauges_file["size"] = np.nan
gauges_file = gauges_file.combine_first(ws_file).set_crs(epsg=4326)


In [5]:
# Rebind ws_file and gauges_file from disk
gauges_file = gpd.read_file("../data/geometry/russia_gauges.gpkg")
gauges_file["gauge_id"] = gauges_file["gauge_id"].astype(str)
gauges_file = gauges_file.set_index("gauge_id")

ws_file = gpd.read_file("../data/geometry/russia_ws.gpkg").set_index("gauge_id")
ws_file = ws_file.loc[ws_file["new_area"] <= 50000]

In [8]:
# Watersheds whose gauge id is listed in q_obs["full"], and the gauge points for them
in_full_q = ws_file.index.isin(q_obs["full"])
ws_partial = ws_file.loc[in_full_q]
gauges_partial = gauges_file.loc[ws_partial.index]
# Export of both layers, currently disabled:
# ws_partial.to_file("/app/data/geometry/partial_ws.gpkg")
# gauges_partial.to_file("/app/data/geometry/partial_gauges.gpkg")

### Best meteorological forcing for each basin district

In [6]:
def read_table_gauge_str(
    table_path: str, index_filter: pd.Index = pd.Index([])
) -> tuple[pd.DataFrame, float]:
    """Read a metrics CSV indexed by string gauge ids and return it with its median NSE.

    Args:
        table_path: CSV with an "NSE" column and the gauge ids either in a
            "gauge_id" column or in an unnamed first column ("Unnamed: 0").
        index_filter: gauge ids to keep; an empty index keeps every row.

    Returns:
        The (optionally filtered) table and the median of its "NSE" column.
    """
    table = pd.read_csv(table_path)
    # rename() ignores labels that are absent, so this only acts on an unnamed id column
    table = table.rename(columns={"Unnamed: 0": "gauge_id"})
    table["gauge_id"] = table["gauge_id"].astype(str)
    table = table.set_index("gauge_id")

    if not index_filter.empty:
        keep_rows = table.index.isin(index_filter)
        table = table[keep_rows]

    return table, table["NSE"].median()


In [7]:
# NSE tables of the regional LSTM runs, one per meteo source, limited to the shared gauges
lstm_mswep_table, lstm_mswep_nse = read_table_gauge_str(
    table_path="/app/conclusions/tables/cudalstm_q_mm_day_mswep_res.csv", index_filter=common_index
)
lstm_era5_table, lstm_era5_nse = read_table_gauge_str(
    table_path="/app/conclusions/tables/cudalstm_q_mm_day_era5_res.csv", index_filter=common_index
)
lstm_era5_land_table, lstm_era5_land_nse = read_table_gauge_str(
    table_path="/app/conclusions/tables/cudalstm_q_mm_day_era5l_res.csv", index_filter=common_index
)
lstm_gpcp_table, lstm_gpcp_nse = read_table_gauge_str(
    table_path="/app/conclusions/tables/cudalstm_q_mm_day_gpcp_res.csv", index_filter=common_index
)

lstm_geom_nse = gauges_file.loc[common_index, ["geometry"]]

# Every table is selected in common_index order, so the values line up row by row
nse_by_column = {
    "NSE LSTM MSWEP": lstm_mswep_table,
    "NSE LSTM ERA5": lstm_era5_table,
    "NSE LSTM ERA5-Land": lstm_era5_land_table,
    "NSE LSTM GPCP": lstm_gpcp_table,
}
for column, table in nse_by_column.items():
    lstm_geom_nse[column] = table.loc[common_index, "NSE"].to_numpy()
lstm_geom_nse = lstm_geom_nse.dropna()

lstm_geom_nse["best NSE"] = lstm_geom_nse[
    ["NSE LSTM MSWEP", "NSE LSTM ERA5", "NSE LSTM ERA5-Land", "NSE LSTM GPCP"]
].max(axis=1)

# Compare row by row. `.isin(column)` would test whether the best value occurs
# anywhere in that column, so a gauge could be credited to a source only because
# another gauge happens to have the same NSE there.
best_nse = lstm_geom_nse["best NSE"]
mswep_index = lstm_geom_nse.index[best_nse == lstm_geom_nse["NSE LSTM MSWEP"]]
era5_index = lstm_geom_nse.index[best_nse == lstm_geom_nse["NSE LSTM ERA5"]]
era5_land_index = lstm_geom_nse.index[best_nse == lstm_geom_nse["NSE LSTM ERA5-Land"]]
gpcp_index = lstm_geom_nse.index[best_nse == lstm_geom_nse["NSE LSTM GPCP"]]

# On an exact tie within a row the source assigned last in this order wins
for name, index in {
    "MSWEP": mswep_index,
    "ERA5": era5_index,
    "ERA5-Land": era5_land_index,
    "GPCP": gpcp_index,
}.items():
    lstm_geom_nse.loc[index, "Лучшая модель"] = name


In [8]:
# Numeric code for each meteo source label
model_dict = dict(zip(("MSWEP", "ERA5-Land", "ERA5", "GPCP"), (0.0, 1.0, 2.0, 3.0)))
lstm_geom_nse["Лучшая модель, цифра"] = [
    model_dict[source] for source in lstm_geom_nse["Лучшая модель"]
]
lstm_geom_nse


Unnamed: 0_level_0,geometry,NSE LSTM MSWEP,NSE LSTM ERA5,NSE LSTM ERA5-Land,NSE LSTM GPCP,best NSE,Лучшая модель,"Лучшая модель, цифра"
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
70158,POINT (45.634 60.054),0.833943,0.820407,0.831432,0.740344,0.833943,MSWEP,0.0
8290,POINT (97.383 58.032),0.847587,0.850689,0.871560,0.833061,0.871560,ERA5-Land,1.0
70583,POINT (39.273 59.991),0.525068,0.624427,0.625878,0.439023,0.625878,ERA5-Land,1.0
76408,POINT (59.675 55.172),0.542724,0.306889,0.233973,0.446993,0.542724,MSWEP,0.0
72668,POINT (29.146 56.505),0.757252,0.580494,0.641525,0.479778,0.757252,MSWEP,0.0
...,...,...,...,...,...,...,...,...
10584,POINT (88.325 55.764),0.359196,0.263809,0.361511,-0.580998,0.361511,ERA5-Land,1.0
72577,POINT (28.593 59.383),0.815723,0.742813,0.823620,0.620366,0.823620,ERA5-Land,1.0
8347,POINT (98.665 55.537),0.411140,0.463229,0.329005,0.344148,0.463229,ERA5,2.0
48069,POINT (35.81 62.61),0.478429,0.711289,0.708054,0.572330,0.711289,ERA5,2.0


In [11]:
# For every basin district, store the most frequent source code among the gauges it intersects
for i, geom in enumerate(basin_districts["geometry"]):
    inside_district = [geom.intersects(gage_point) for gage_point in lstm_geom_nse["geometry"]]
    ugms_part = lstm_geom_nse.loc[inside_district, :]
    ugms_name = basin_districts.loc[i, "name1"]

    # value_counts() sorts by frequency, so its first label is the most common code
    code_counts = ugms_part["Лучшая модель, цифра"].value_counts()
    if code_counts.empty:
        district_code = np.nan
    else:
        district_code = int(code_counts.index[0])
    basin_districts.loc[i, ["Лучшая модель, цифра"]] = district_code

In [10]:
# config.yml of the trained run for each meteo source
mswep_config = (
    "/app/data/lstm_configs/model_runs/"
    "cudalstm_q_mm_day_mswep_no_autocorr_static_1103_191754/config.yml"
)
era5l_config = (
    "/app/data/lstm_configs/model_runs/"
    "cudalstm_q_mm_day_era5l_no_autocorr_static_1003_133332/config.yml"
)
er5_config = (
    "/app/data/lstm_configs/model_runs/"
    "cudalstm_q_mm_day_era5_no_autocorr_static_1203_220232/config.yml"
)
gpcp_config = (
    "/app/data/lstm_configs/model_runs/"
    "cudalstm_q_mm_day_gpcp_no_autocorr_static_1203_080402/config.yml"
)

In [13]:
configs = [mswep_config, era5l_config, er5_config, gpcp_config]
# Run folder name of each config (the directory that holds config.yml)
configs_names = [Path(cfg_path).parent.stem for cfg_path in configs]

# First *.log file found in each run folder, in the same order as `configs`
model_runs_dir = Path("/app/data/lstm_configs/model_runs")
logs = []
for cfg_name in configs_names:
    run_logs = list(model_runs_dir.glob(f"{cfg_name}/*.log"))
    logs.append(run_logs[0])


def best_epoch_finder(log_file: Path) -> int:
    """Return the epoch with the highest NSE reported in a training log.

    Only lines containing both "Epoch" and "NSE" are used. From each of them the
    integer after " Epoch " and the number after " NSE: " (up to the next comma)
    are taken. If an epoch is reported more than once its last value is kept; if
    several epochs share the highest NSE the one seen first is returned.

    Args:
        log_file: path of the log file to scan.

    Returns:
        The epoch number whose NSE is the largest.

    Raises:
        ValueError: if the log contains no line reporting an NSE for an epoch.
    """
    epoch_nse = {}
    with open(log_file, "r") as f:
        # Stream the file instead of loading every line into memory first
        for line in f:
            if "NSE" in line and "Epoch" in line:
                epoch = int(line.split(" Epoch ")[1].split(" ")[0])
                epoch_nse[epoch] = float(line.split(" NSE: ")[1].split(",")[0])

    if not epoch_nse:
        # Same exception type max() would raise on an empty dict, but with a usable message
        raise ValueError(f"No lines with both 'Epoch' and 'NSE' found in {log_file}")

    return max(epoch_nse, key=epoch_nse.get)


# Epoch with the highest logged NSE for each run, in the same order as `logs`
best_epochs = list(best_epoch_finder(log) for log in logs)

# Predictor sets; zipped with `configs` and `best_epochs` below, so the order
# of the three lists has to match
meteo_inputs = [
    ["prcp_mswep", "t_max_e5l", "t_min_e5l"],
    ["prcp_e5l", "t_max_e5l", "t_min_e5l"],
    ["prcp_e5", "t_max_e5", "t_min_e5"],
    ["prcp_gpcp", "t_max_e5l", "t_min_e5l"],
    # ["prcp_mswep", "t_max_e5", "t_min_e5"],
    # ["prcp_mswep", "prcp_gpcp", "prcp_e5", "prcp_e5l", "t_max_e5l", "t_min_e5l"],
]
# q_mm_day or lvl_sm
hydro_target = "q_mm_day"
# Sub-folder of /app/data/ws_related_meteo/ whose *.nc files are passed to train_rewriter
nc_variable = "nc_all_q"

# time series directory
ts_dir = Path("/app/data/time_series")
ts_dir.mkdir(exist_ok=True, parents=True)


# Basin list file: every id of ws_file.index, one per line, written as an integer
with open("/app/neural_forecast/blind_basins.txt", "w") as the_file:
    for gauge_name in ws_file.index:
        the_file.write(f"{int(gauge_name)}\n")

for meteo_input, config_name, best_epoch in zip(meteo_inputs, configs, best_epochs):
    # Rewrite the time-series files for this predictor set before evaluating the matching run
    train_rewriter(
        era_paths=list(str(i) for i in Path(f"/app/data/ws_related_meteo/{nc_variable}/").glob("*.nc")),
        ts_dir=ts_dir,
        hydro_target=hydro_target,
        area_index=ws_file.index,
        predictors=meteo_input,
        possible_nans=1,
    )
    cfg = Config(Path(config_name))

    gauge_size = ws_file.__len__()
    # Point the train / validation / test basin lists at the file written above
    cfg.update_config(
    {
        "train_basin_file": "/app/neural_forecast/blind_basins.txt",
        "validate_n_random_basins": gauge_size,
        "validation_basin_file": "/app/neural_forecast/blind_basins.txt",
        "test_basin_file": "/app/neural_forecast/blind_basins.txt",
    })
    # NOTE(review): eval_run is given only cfg.run_dir, not the updated `cfg` object;
    # confirm that the overrides above actually reach the evaluation.
    eval_run(run_dir=cfg.run_dir, period="test", epoch=best_epoch)


# Evaluation: 100%|██████████| 996/996 [01:42<00:00,  9.73it/s]
# Evaluation: 100%|██████████| 996/996 [01:43<00:00,  9.59it/s]
# Evaluation: 100%|██████████| 996/996 [01:42<00:00,  9.70it/s]
# Evaluation: 100%|██████████| 996/996 [01:42<00:00,  9.70it/s]


In [127]:
# Test-period result pickles written under each run folder for the evaluated epoch
lstm_runs_dir = Path("/app/data/lstm_configs/model_runs")

mswep_pickle = (
    lstm_runs_dir
    / "cudalstm_q_mm_day_mswep_no_autocorr_static_1103_191754"
    / "test"
    / "model_epoch024"
    / "test_results.p"
)
gpcp_pickle = (
    lstm_runs_dir
    / "cudalstm_q_mm_day_gpcp_no_autocorr_static_1203_080402"
    / "test"
    / "model_epoch024"
    / "test_results.p"
)
era5l_pickle = (
    lstm_runs_dir
    / "cudalstm_q_mm_day_era5l_no_autocorr_static_1003_133332"
    / "test"
    / "model_epoch026"
    / "test_results.p"
)
era5_pickle = (
    lstm_runs_dir
    / "cudalstm_q_mm_day_era5_no_autocorr_static_1203_220232"
    / "test"
    / "model_epoch020"
    / "test_results.p"
)


In [134]:
from conceptual_runs.scripts.data_readers import metric_df

# One metrics frame per gauge, built from the ERA5 run's test results
metric_frames = []

for gauge_id, res_dict in pd.read_pickle(era5_pickle).items():
    df_res = res_dict["1D"]["xr"]
    # The final time step is left out of the observations, the simulation and the dates
    obs = df_res["q_mm_day_obs"].to_numpy().squeeze()[:-1]
    pred = df_res["q_mm_day_sim"].to_numpy().squeeze()[:-1]

    draw_df = pd.DataFrame()
    draw_df.index = df_res.date[:-1]
    draw_df["Наблюдения, мм/день"] = obs
    draw_df["Модель, мм/день"] = pred

    gauge_metrics = metric_df(
        gauge_id=gauge_id,
        predictions=draw_df["Модель, мм/день"],
        targets=draw_df["Наблюдения, мм/день"],
    )
    metric_frames.append(gauge_metrics)

res_df = pd.concat(metric_frames)
res_df.to_csv("/app/conclusions/tables/era5_blind.csv")

  np.nansum((targets - predictions) ** 2) / np.nansum((targets - np.nanmean(targets)) ** 2)
  np.nansum((targets - predictions) ** 2) / np.nansum((targets - np.nanmean(targets)) ** 2)
  res_df = pd.concat(res_df)


In [160]:
def _read_blind_table(csv_path: str) -> pd.DataFrame:
    """Read a blind-test metrics CSV, using its unnamed first column as a string index."""
    return pd.read_csv(csv_path, index_col="Unnamed: 0", dtype={"Unnamed: 0": str})


era5_blind = _read_blind_table("/app/conclusions/tables/era5_blind.csv")
era5l_blind = _read_blind_table("/app/conclusions/tables/era5l_blind.csv")
gpcp_blind = _read_blind_table("/app/conclusions/tables/gpcp_blind.csv")
mswep_blind = _read_blind_table("/app/conclusions/tables/mswep_blind.csv")

# Copy so that adding columns never touches gauges_partial
lstm_geom_nse = gauges_partial.loc[:, ["geometry"]].copy()

# Align every NSE column on gauge_id. A positional `.values` assignment only
# gives the right result when the CSV rows are in exactly the same order, and
# have the same length, as gauges_partial. Gauges absent from a table become
# NaN and are removed by dropna() below.
blind_nse_by_column = {
    "NSE LSTM MSWEP": mswep_blind,
    "NSE LSTM ERA5": era5_blind,
    "NSE LSTM ERA5-Land": era5l_blind,
    "NSE LSTM GPCP": gpcp_blind,
}
for column, table in blind_nse_by_column.items():
    lstm_geom_nse[column] = table["NSE"].reindex(lstm_geom_nse.index)
lstm_geom_nse = lstm_geom_nse.dropna()

lstm_geom_nse["best NSE"] = lstm_geom_nse[
    ["NSE LSTM MSWEP", "NSE LSTM ERA5", "NSE LSTM ERA5-Land", "NSE LSTM GPCP"]
].max(axis=1)

# Compare row by row. `.isin(column)` would test whether the best value occurs
# anywhere in that column, so a gauge could be credited to a source only because
# another gauge happens to have the same NSE there.
best_nse = lstm_geom_nse["best NSE"]
mswep_index = lstm_geom_nse.index[best_nse == lstm_geom_nse["NSE LSTM MSWEP"]]
era5_index = lstm_geom_nse.index[best_nse == lstm_geom_nse["NSE LSTM ERA5"]]
era5_land_index = lstm_geom_nse.index[best_nse == lstm_geom_nse["NSE LSTM ERA5-Land"]]
gpcp_index = lstm_geom_nse.index[best_nse == lstm_geom_nse["NSE LSTM GPCP"]]

# On an exact tie within a row the source assigned last in this order wins
for name, index in {
    "MSWEP": mswep_index,
    "ERA5": era5_index,
    "ERA5-Land": era5_land_index,
    "GPCP": gpcp_index,
}.items():
    lstm_geom_nse.loc[index, "Лучшая модель"] = name

# Gauges per basin district (keyed by district name) and the district's median best NSE
ugms_dict = dict()

for i, geom in enumerate(basin_districts["geometry"]):
    ugms_part = lstm_geom_nse.loc[
        [geom.intersects(gage_point) for gage_point in lstm_geom_nse["geometry"]], :
    ]
    ugms_name = basin_districts.loc[i, "name1"]
    ugms_dict[ugms_name] = ugms_part
    basin_districts.loc[i, ["best NSE"]] = ugms_part[["best NSE"]].median()

# Rows 26 and 27 are blanked out regardless of the gauges found inside them
basin_districts.loc[[26, 27], ["best NSE"]] = np.nan


In [164]:
# Median NSE per meteo source and of the per-gauge best value
lstm_geom_nse.loc[
    :,
    ["NSE LSTM MSWEP", "NSE LSTM ERA5", "NSE LSTM ERA5-Land", "NSE LSTM GPCP", "best NSE"],
].median()

NSE LSTM MSWEP        0.437688
NSE LSTM ERA5         0.451681
NSE LSTM ERA5-Land    0.412412
NSE LSTM GPCP         0.371116
best NSE              0.617745
dtype: float64

In [157]:
# Best NSE per gauge
lstm_geom_nse.loc[:, "best NSE"]

gauge_id
9122     0.421621
9124     0.709472
9134     0.097049
9138     0.806075
9204     0.651574
           ...   
10342    0.594343
10103    0.215689
10111    0.138405
10127    0.864114
2027     0.822629
Name: best NSE, Length: 408, dtype: float64

FileNotFoundError: model_runs/config.yml

### Specific config based on meteo source

### Eval run on given data