# Imports

In [1]:
# Standard-library, scikit-learn, and neuralhydrology imports.
import os
import random
from sklearn import model_selection
import sys
from neuralhydrology.datasetzoo import caravan

# Storage Directory

In [2]:
# Root directory that receives every basin list written by this notebook.
basin_list_dir = 'basin_lists'
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# gap of a separate os.path.isdir() test.
os.makedirs(basin_list_dir, exist_ok=True)

# Load Caravan Data

In [3]:
# Load the Caravan attribute table from the 'data' directory and keep only
# the rows that have no missing values.
attributes = caravan.load_caravan_attributes(data_dir='data')
attributes = attributes.dropna()

# The table is indexed by basin ID, so the index is the basin list.
basins = attributes.index.tolist()
print(f'There are {len(basins)} basins.')

There are 2425 basins.


# Write Basin Lists

## Full Caravan Basin List

In [None]:
# Write every basin ID to a single list file, one ID per line.
basin_list_file = os.path.join(basin_list_dir, 'caravan_basins.txt')
with open(basin_list_file, 'wt') as out:
    out.writelines(basin + '\n' for basin in basins)

## Per-Country Basin Lists

In [None]:
# Per-country basin lists: one file per basin-ID prefix (the text before the
# first underscore), each containing the basins that carry that prefix.

# Where to put the country basin lists.
countries_basin_list_dir = os.path.join(basin_list_dir, 'countries')
# makedirs creates any missing parent directory as well, and exist_ok=True
# keeps the cell safe to re-run.
os.makedirs(countries_basin_list_dir, exist_ok=True)

# Group the basins by prefix in a single pass instead of rescanning the whole
# basin list once per country. Insertion order keeps each group in the same
# order as `basins`.
basins_by_country = {}
for basin in basins:
    country, separator = basin.partition('_')[:2]
    country_basins = basins_by_country.setdefault(country, [])
    # An ID without an underscore still registers its prefix (so its file is
    # created), but it does not start with '<country>_' and is not listed.
    if separator:
        country_basins.append(basin)

# Sorted so that files are written in a deterministic order.
countries = sorted(basins_by_country)

for country in countries:
    basin_list_file = os.path.join(countries_basin_list_dir, f'caravan_{country}_basins.txt')
    with open(basin_list_file, 'wt') as f:
        f.writelines(basin + '\n' for basin in basins_by_country[country])

## Small Development Basin Lists

In [None]:
# Dev set: small random subsets of the basins for quick development runs.

# Where to put the dev basin lists.
dev_basin_list_dir = os.path.join(basin_list_dir, 'dev')
# makedirs creates any missing parent directory as well, and exist_ok=True
# keeps the cell safe to re-run.
os.makedirs(dev_basin_list_dir, exist_ok=True)

# Number of basins in each dev list; one file is written per entry.
length_of_dev_sets = [3, 6, 36]

for length in length_of_dev_sets:
    # random.sample draws without replacement, so every position in the list
    # is a different entry of `basins`. random.choices draws with replacement
    # and could put the same basin in a dev list more than once.
    # Raises ValueError if fewer than `length` basins are available.
    dev_basins = random.sample(basins, k=length)
    basin_list_file = os.path.join(dev_basin_list_dir, f'{length}_dev_basins.txt')
    with open(basin_list_file, 'wt') as f:
        f.writelines(basin + '\n' for basin in dev_basins)

## PUB K-Fold Basin Lists

In [None]:
# Number of k-fold splits for PUB experiments
pub_n_kfold = 10

# Number of ensemble members; an independent k-fold split is written for each.
ensembles = 10

# Where to put the PUB basin lists.
pub_basin_list_dir = os.path.join(basin_list_dir, 'pub')
# makedirs creates any missing parent directory as well, and exist_ok=True
# keeps the cell safe to re-run.
os.makedirs(pub_basin_list_dir, exist_ok=True)

In [None]:
for ens in range(ensembles):

    # Each ensemble member gets its own shuffled split. random_state=None
    # leaves the shuffle unseeded, so the splits differ from run to run.
    kf = model_selection.KFold(n_splits=pub_n_kfold, shuffle=True, random_state=None)

    for kfold, (train_index, test_index) in enumerate(kf.split(basins)):

        # One train file and one test file per (fold, ensemble member) pair.
        # NOTE(review): ':' is not permitted in file names on Windows file
        # systems -- confirm these lists are only generated on POSIX hosts.
        train_basin_file = os.path.join(pub_basin_list_dir, f'train_kfold:{kfold}_ens:{ens}.txt')
        test_basin_file = os.path.join(pub_basin_list_dir, f'test_kfold:{kfold}_ens:{ens}.txt')

        # The split yields positions into `basins`; map them back to basin
        # IDs and write one ID per line, train file first.
        outputs = (
            (train_basin_file, train_index),
            (test_basin_file, test_index),
        )
        for split_file, split_index in outputs:
            with open(split_file, 'wt') as fp:
                fp.writelines(f"{basins[idx]}\n" for idx in split_index)


## No South America

In [None]:
# All basins except those whose ID starts with 'br_' or 'cl_'.
south_america_prefixes = ('br_', 'cl_')
basin_list_file = os.path.join(basin_list_dir, 'caravan_basins_no_south_america.txt')
with open(basin_list_file, 'wt') as f:
    f.writelines(
        basin + '\n'
        for basin in basins
        if not basin.startswith(south_america_prefixes)
    )