In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import gc
import glob
import json
import random
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
import torch
import xarray as xr
from neuralhydrology.nh_run import eval_run, start_run
from neuralhydrology.utils.config import Config
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import tqdm

from scripts.comparison import gauge_cmip
from scripts.file_manipulator import (test_rewriter, train_cmip_val,
                                      train_rewriter)

# from neuralhydrology.evaluation import get_tester

import warnings
warnings.filterwarnings("ignore")

### Read all necessary data

In [7]:
# Dynamic (meteorological) input columns fed to the model.
era_input = ['prcp_e5l', 't_max_e5l', 't_min_e5l']
# q_mm_day or lvl_mbs or lvl_sm
hydro_target = 'lvl_sm'
# Only consulted for level targets: switches the source folder to 'nc_all_q_h'.
q_h_relation = False

# Static attributes used for every target variable. Kept in one place so the
# level and discharge configurations cannot drift apart.
static_parameters = ['for_pc_sse', 'crp_pc_sse',
                     'inu_pc_ult', 'ire_pc_sse',
                     'lka_pc_use', 'prm_pc_sse',
                     'pst_pc_sse', 'cly_pc_sav',
                     'slt_pc_sav', 'snd_pc_sav',
                     'kar_pc_sse', 'urb_pc_sse',
                     'gwt_cm_sav', 'lkv_mc_usu',
                     'rev_mc_usu', 'sgr_dk_sav',
                     'slp_dg_sav', 'ws_area',
                     'ele_mt_sav']

if hydro_target in ('lvl_mbs', 'lvl_sm'):
    # Level targets use one extra static attribute.
    static_parameters.append('height_bs')
    nc_variable = 'nc_all_q_h' if q_h_relation else 'nc_all_h'
else:
    nc_variable = 'nc_all_q'

# Load the geometry layer once and key it by gauge id; the resulting index is
# what the train-file writers receive as `area_index`.
ws_file = (
    gpd.read_file('../geo_data/geometry/russia_ws.gpkg')
    .set_index('gauge_id')
)

# Dates for the time-based data split, written day-first (DD/MM/YYYY).
# Each line holds the (start, end) pair of one period.
train_start_date, train_end_date = '01/01/2009', '31/12/2016'
validation_start_date, validation_end_date = '01/01/2017', '31/12/2018'
test_start_date, test_end_date = '01/01/2019', '31/12/2020'
# Dates for the CMIP comparison window.
compare_start, compare_end = '01/01/2015', '31/12/2020'


### compare CMIP models and select best ones

In [8]:
# Run gauge_cmip for every gauge that has a concatenated CMIP file and keep
# the per-gauge comparison tables in `compare_list`.
cmip_245_dir = "../geo_data/cmip_concat/cmip_245/"
# Number of leading rows of `number_df` that take part in the model selection.
n_top_positions = 7

compare_list = dict()
skipped_gauges = []
for f in tqdm(glob.glob(f"{cmip_245_dir}*.nc")):
    # Path.stem strips both the directory and the ".nc" suffix, whatever
    # path separator glob returns on this OS.
    gauge_id = Path(f).stem
    try:
        compare_element, model_order = gauge_cmip(
            gauge_id=gauge_id,
            cmip_storage=cmip_245_dir,
            era_storage=f"../geo_data/ws_related_meteo/{nc_variable}/",
            compare_start=compare_start,
            compare_end=compare_end,
        )
    except OSError:
        # FileNotFoundError is a subclass of OSError, so one handler covers
        # both cases. Remember the gauge instead of dropping it silently.
        skipped_gauges.append(gauge_id)
        continue

    compare_list[gauge_id] = compare_element

print(f"compared {len(compare_list)} gauges, skipped {len(skipped_gauges)}")
if not compare_list:
    # Without this guard pd.concat fails with an opaque
    # "No objects to concatenate" and `model_order` is undefined.
    raise ValueError(
        f"gauge_cmip did not succeed for any gauge found in {cmip_245_dir}"
    )


def _gauge_table(column):
    """Place `column` of every gauge's comparison table side by side.

    The index of each per-gauge column is reset, so row k of the result holds
    the k-th entry of every gauge; columns are labelled with the gauge ids.
    """
    table = pd.concat(
        [element[column].reset_index(drop=True)
         for element in compare_list.values()],
        axis=1,
    )
    table.columns = list(compare_list)
    return table


number_df = _gauge_table("number")
error_df = _gauge_table("error")

# Most frequent model number per row (first mode on ties), restricted to the
# leading rows, then de-duplicated.
# NOTE(review): np.unique returns its result sorted, so the keys written below
# follow ascending model number rather than row order - confirm that this is
# the ordering wanted for a file called "cmip_ranks.json".
res_models = np.unique(number_df.mode(axis=1).iloc[:n_top_positions, 0].values)

# `model_order` is the value returned by the last successful gauge_cmip call.
model_dict = {
    position: model_order[order]
    for position, order in enumerate(res_models, start=1)
}

with open("cmip_ranks.json", "w") as fp:
    json.dump(model_dict, fp)

  0%|          | 0/1886 [00:00<?, ?it/s]

In [16]:
# Unique model numbers taken from the first-mode column of the top 7 rows of number_df.
res_models

array([ 1.,  3.,  6.,  8., 15.])

In [17]:
# Mapping saved to cmip_ranks.json: 1-based position in res_models -> model_order entry.
model_dict

{1: 'miroc6', 2: 'noresm2_mm', 3: 'cmcc_esm2', 4: 'inm_cm4_8', 5: 'cesm2'}

In [13]:
# One column per successfully compared gauge; rows hold the re-indexed "number" values.
number_df

Unnamed: 0,75402,9283,3210,72102,76486,72004,75297,70211,72760,75710,...,75627,10641,12222,7039,8288,7062,12092,71277,19382,70466
0,8.0,7.0,16.0,1.0,8.0,19.0,8.0,6.0,13.0,19.0,...,4.0,16.0,19.0,16.0,19.0,7.0,16.0,1.0,15.0,17.0
1,13.0,10.0,5.0,19.0,15.0,1.0,3.0,3.0,8.0,8.0,...,9.0,19.0,3.0,19.0,16.0,8.0,19.0,2.0,13.0,16.0
2,3.0,1.0,3.0,13.0,3.0,16.0,19.0,7.0,6.0,3.0,...,13.0,15.0,16.0,15.0,5.0,3.0,15.0,3.0,3.0,19.0
3,19.0,5.0,19.0,3.0,13.0,3.0,13.0,15.0,19.0,15.0,...,1.0,6.0,15.0,6.0,6.0,9.0,8.0,4.0,4.0,2.0
4,15.0,15.0,17.0,16.0,6.0,13.0,15.0,8.0,3.0,13.0,...,6.0,9.0,8.0,11.0,3.0,6.0,5.0,5.0,8.0,11.0
5,6.0,17.0,9.0,4.0,4.0,17.0,6.0,13.0,1.0,16.0,...,15.0,5.0,6.0,3.0,9.0,15.0,3.0,6.0,16.0,8.0
6,1.0,4.0,10.0,6.0,1.0,4.0,16.0,19.0,16.0,6.0,...,2.0,1.0,9.0,1.0,10.0,13.0,6.0,7.0,7.0,13.0
7,9.0,14.0,1.0,17.0,7.0,5.0,7.0,1.0,15.0,1.0,...,19.0,7.0,7.0,7.0,15.0,1.0,7.0,8.0,10.0,10.0
8,7.0,6.0,7.0,5.0,16.0,6.0,17.0,17.0,17.0,7.0,...,16.0,3.0,1.0,10.0,1.0,5.0,9.0,9.0,5.0,1.0
9,16.0,3.0,6.0,9.0,19.0,15.0,1.0,5.0,9.0,17.0,...,8.0,17.0,5.0,17.0,7.0,16.0,13.0,10.0,1.0,9.0


In [15]:
# Comparison table stored for gauge '10002'; raises KeyError if that gauge was skipped above.
compare_list['10002']

Unnamed: 0,number,r,error
ipsl_cm6a_lr,17.0,0.066443,2.569475
access_cm2,10.0,0.11328,5.638792
miroc6,1.0,0.081598,9.378504
mpi_esm1_2_lr,14.0,0.039216,10.294606
cnrm_cm6_1,16.0,0.084799,11.696788
awi_cm_1_1_mr,2.0,0.015426,12.766587
bcc_csm2_mr,7.0,0.115705,15.01479
ec_earth3_veg_lr,5.0,0.113162,17.340603
cnrm_esm2_1,19.0,0.063092,19.591435
inm_cm5_0,13.0,0.067072,21.77414


In [None]:
# Repeat the comparison for the gauges stored in great_db. This rebinds
# `compare_list` (and `model_order`), replacing any earlier result.
great_db_dir = f'../geo_data/great_db/{nc_variable}/'

compare_list = dict()
for f in tqdm(glob.glob(f'{great_db_dir}*.nc')):
    # Path.stem strips the directory and the ".nc" suffix independently of
    # the path separator, unlike split('/')[-1][:-3].
    gauge_id = Path(f).stem
    compare_element, model_order = gauge_cmip(
        gauge_id=gauge_id,
        cmip_storage='../geo_data/cmip_concat_21_09/cmip_245/',
        era_storage=great_db_dir,
        compare_start=compare_start,
        compare_end=compare_end)
    compare_list[gauge_id] = compare_element

In [5]:
# Open the CMIP and ERA files of one gauge for manual inspection.
cmip_storage = '../geo_data/cmip_concat/cmip_245/'
era_storage = f'../geo_data/great_db/{nc_variable}/'

era_files = glob.glob(f'{era_storage}*.nc')
if not era_files:
    # A bare [0] on an empty list would only report "list index out of range".
    raise FileNotFoundError(f'no .nc files found in {era_storage}')
# The first file returned by glob decides which gauge is inspected.
gauge_id = Path(era_files[0]).stem

# Joining through Path avoids the doubled slash that f'{storage}/{gauge}'
# produced, since both storage strings already end with '/'.
cmip_gauge = xr.open_dataset(Path(cmip_storage) / f'{gauge_id}.nc')
era_gauge = xr.open_dataset(Path(era_storage) / f'{gauge_id}.nc')

### Read rankings to prepare input data

In [4]:
# Only the model names (the JSON values) are needed from the saved rankings.
with open('./cmip_ranks.json') as json_file:
    cmip_models = list(json.load(json_file).values())

# For every selected model: the three input column names carrying its suffix.
meteo_inputs = {
    model: [f'precipitation_{model}',
            f'daily_maximum_near_surface_air_temperature_{model}',
            f'daily_minimum_near_surface_air_temperature_{model}']
    for model in cmip_models
}

# time series directory
ts_dir = Path('../geo_data/time_series')

### NeuralHydrology train data
The training relies on a config file. **Main** attention should be paid to the **data_dir** argument.
This folder is the reference to the place where all required data are stored.

<br><br/>
The framework automatically looks for a **time_series** folder, which should contain data **for each gauge** in the desired experiment
and hold all available meteo data in **.nc** format

To include static parameters, **data_dir** should also have an **attributes** folder with a **.csv** file that is indexed by the gauges used

CMIP data of the model selected through the **cmip_cols** parameter will replace the **validation** data of **era_cols**

#### With CMIP

In [5]:
# Write the training files, passing both the ERA columns and the CMIP columns
# of one model to train_cmip_val.
print(f'train data for {hydro_target} with {nc_variable} initial data')
era_nc_files = glob.glob(f'../geo_data/great_db/{nc_variable}/*.nc')
# NOTE(review): val_end is the *test* end date, so the window handed to
# train_cmip_val spans validation start through test end - confirm intended.
train_cmip_val(
    era_pathes=era_nc_files,
    ts_dir=ts_dir,
    hydro_target=hydro_target,
    area_index=ws_file.index,
    era_cols=era_input,
    cmip_cols=meteo_inputs['noresm2_mm'],
    cmip_storage='../geo_data/cmip_concat_21_09/cmip_245/',
    val_start=validation_start_date,
    val_end=test_end_date,
)

train data for lvl_sm with nc_all_h initial data


#### Just ERA

In [5]:
# Write the training files using only the ERA predictor columns.
print(f'train data for {hydro_target} with {nc_variable} initial data')
era_only_files = glob.glob(f'../geo_data/great_db/{nc_variable}/*.nc')
train_rewriter(
    era_pathes=era_only_files,
    ts_dir=ts_dir,
    hydro_target=hydro_target,
    area_index=ws_file.index,
    predictors=era_input,
)

train data for lvl_sm with nc_all_h initial data


### Config setup

Here we define files for **train**, **validation** and **test**

It's possible to use different gauges for certain steps of the model training procedure,

but in this example we'll use every available gauge and split the data **not in space**, but into **different time periods**

'**train_start_date**': '01/01/2009',

'**train_end_date**': '31/12/2016',

'**validation_start_date**': '01/01/2017',

'**validation_end_date**': '31/12/2018',

'**test_start_date**': '01/01/2019',

'**test_end_date**': '31/12/2020'


In [6]:
# define variables require to perform hindcast
gauges = [file.split('/')[-1][:-3] for
          file in glob.glob(f'{ts_dir}/*.nc')]
random.shuffle(gauges)
gauge_size = len(gauges)

# with open('./every_basin.txt', 'w') as the_file:
#     for gauge_name in gauges:
#         the_file.write(f'{gauge_name}\n')

train_gauges = gauges[
    :int(len(gauges) * 0.8)]
with open('./basins_train.txt', 'w') as the_file:
    for gauge_name in train_gauges:
        the_file.write(f'{gauge_name}\n')

val_gauges = gauges[
    int(gauge_size * 0.8):int(gauge_size * 0.8) + int(gauge_size * 0.1)]
with open('./basins_val.txt', 'w') as the_file:
    for gauge_name in val_gauges:
        the_file.write(f'{gauge_name}\n')

test_gauges = gauges[
    int(gauge_size * 0.8) + int(gauge_size * 0.1):]
with open('./basins_test.txt', 'w') as the_file:
    for gauge_name in test_gauges:
        the_file.write(f'{gauge_name}\n')

# Start from the template config and override the experiment-specific fields.
cfg = Config(Path('./model_config.yml'))
# base model type [cudalstm, customlstm, ealstm, embcudalstm, mtslstm, gru, transformer]
# (has to match the if statement in modelzoo/__init__.py)
model_name = 'gru'
comment = 'era5_land'
# Single source for the experiment name and the dumped launch-config filename.
experiment_name = f'{model_name}_{hydro_target}_{comment}'
cfg.update_config(yml_path_or_dict={
    # define storage and experiment
    'experiment_name': experiment_name,
    'model': model_name,
    'run_dir': './model_runs/',
    'data_dir': '../geo_data/',
    # define inner parameters
    'static_attributes': static_parameters,
    'dynamic_inputs': era_input,
    # 'hindcast_inputs': era_input,
    # 'forecast_inputs': era_input,
    'target_variables': [hydro_target],
    # 'dynamics_embedding': {'type': 'fc', 'hiddens': [128, 64, 256],
    #                       'activation': 'tanh', 'dropout': 0.2},
    # 'statics_embedding': {'type': 'fc', 'hiddens': [128, 64, 256],
    #                      'activation': 'tanh', 'dropout': 0.2},
    # define files with gauge data
    # every stage uses the same basin list; the stages differ by time period
    'train_basin_file': './every_basin.txt',
    'validate_n_random_basins': gauge_size,
    'validation_basin_file': './every_basin.txt',
    'test_basin_file': './every_basin.txt',
    # define time periods
    # 'seq_length': 14,
    # 'forecast_seq_length': 10,
    'train_start_date': train_start_date,
    'train_end_date': train_end_date,
    'validation_start_date': validation_start_date,
    'validation_end_date': validation_end_date,
    'test_start_date': test_start_date,
    'test_end_date': test_end_date})
cfg.dump_config(folder=Path('./launch_configs'),
                filename=f'{experiment_name}.yml')

In [7]:
# Launch training from the dumped launch config. Training is only started when
# a CUDA device is present; otherwise say so instead of doing nothing silently.
launch_config = Path(
    f'./launch_configs/{model_name}_{hydro_target}_{comment}.yml')
if torch.cuda.is_available():
    start_run(config_file=launch_config)
else:
    print(f'CUDA is not available - training with {launch_config} was skipped')

2023-10-01 09:43:34,176: Logging to model_runs/gru_lvl_sm_era5_land_0110_094334/output.log initialized.
2023-10-01 09:43:34,176: ### Folder structure created at model_runs/gru_lvl_sm_era5_land_0110_094334
2023-10-01 09:43:34,177: ### Run configurations for gru_lvl_sm_era5_land
2023-10-01 09:43:34,177: additional_feature_files: None
2023-10-01 09:43:34,177: batch_size: 256
2023-10-01 09:43:34,178: cache_validation_data: True
2023-10-01 09:43:34,178: checkpoint_path: None
2023-10-01 09:43:34,178: clip_gradient_norm: 1
2023-10-01 09:43:34,179: data_dir: ../geo_data
2023-10-01 09:43:34,179: dataset: generic
2023-10-01 09:43:34,179: device: cuda:0
2023-10-01 09:43:34,180: dynamic_inputs: ['prcp_e5l', 't_max_e5l', 't_min_e5l']
2023-10-01 09:43:34,180: dynamics_embedding: None
2023-10-01 09:43:34,180: epochs: 30
2023-10-01 09:43:34,181: evolving_attributes: None
2023-10-01 09:43:34,181: experiment_name: gru_lvl_sm_era5_land
2023-10-01 09:43:34,181: forecast_hidden_size: 256
2023-10-01 09:43:3

2023-10-01 09:43:34,183: learning_rate: {0: 0.001, 10: 0.0005, 20: 0.0001}
2023-10-01 09:43:34,184: log_interval: 2
2023-10-01 09:43:34,184: log_n_figures: None
2023-10-01 09:43:34,184: log_tensorboard: False
2023-10-01 09:43:34,185: loss: RMSE
2023-10-01 09:43:34,185: metrics: ['NSE', 'RMSE']
2023-10-01 09:43:34,186: model: gru
2023-10-01 09:43:34,186: num_workers: 8
2023-10-01 09:43:34,186: optimizer: Adam
2023-10-01 09:43:34,187: output_activation: linear
2023-10-01 09:43:34,187: output_dropout: 0.4
2023-10-01 09:43:34,187: per_basin_test_periods_file: None
2023-10-01 09:43:34,188: per_basin_train_periods_file: None
2023-10-01 09:43:34,188: per_basin_validation_periods_file: None
2023-10-01 09:43:34,188: predict_last_n: 1
2023-10-01 09:43:34,189: regularization: None
2023-10-01 09:43:34,189: run_dir: model_runs/gru_lvl_sm_era5_land_0110_094334
2023-10-01 09:43:34,189: save_train_data: True
2023-10-01 09:43:34,190: save_validation_results: True
2023-10-01 09:43:34,190: save_weights_e