# Imports & Paths

In [3]:
import numpy as np
import os
import pathlib
import shutil
from sklearn import model_selection

In [2]:
# Path to working directory.
# This is an absolute path on one specific machine. The master configs are read
# from, and the generated experiment configs are written to, locations beneath it.
# NOTE(review): hard-coded home-directory path -- edit before running elsewhere.
working_dir = pathlib.Path('/home/gsnearing/projects/lstm_data_assimilation')

In [3]:
# Basin list locations, both relative to the current working directory.
# `basin_file` is the full CAMELS basin list and must already be present;
# `basin_list_dir` holds the train/test basin lists generated by this notebook.
basin_file = pathlib.Path('531_basin_list.txt')
basin_list_dir = pathlib.Path('basin_lists')

# Throw away basin lists from any previous run and start from an empty directory.
# NOTE(review): the PUB cells look for 'basin_lists/531_basin_list.txt' inside the
# master configs -- confirm that nothing relies on a file stored in this directory.
if os.path.isdir(basin_list_dir):
    shutil.rmtree(basin_list_dir)
basin_list_dir.mkdir()

In [4]:
# Master configs are the templates from which every experiment config is derived:
# later cells read them, substitute run-specific values, and save the result elsewhere.
master_config_dir = working_dir.joinpath('master_configs')
master_simulation_config = master_config_dir.joinpath('simulation_config.yml')
master_autoregression_config = master_config_dir.joinpath('autoregression_config.yml')

In [5]:
# Root of the experiment configs. These are the configs that this notebook creates.
experiment_config_dir = working_dir / 'run_configs/'

# Time-split config directories, one per model type.
time_split_dir = experiment_config_dir / 'time_split'
simulation_time_split_config_dir = time_split_dir / 'simulation'
autoregression_time_split_config_dir = time_split_dir / 'autoregression'
assimilation_time_split_config_dir = time_split_dir / 'assimilation'

# PUB config directories, one per model type.
pub_dir = experiment_config_dir / 'pub'
simulation_pub_config_dir = pub_dir / 'simulation'
autoregression_pub_config_dir = pub_dir / 'autoregression'
assimilation_pub_config_dir = pub_dir / 'assimilation'

# Discard the configs of any previous run so the tree is rebuilt from scratch.
if os.path.isdir(experiment_config_dir):
    shutil.rmtree(experiment_config_dir)

# Create the tree. The order matters: every parent is listed before its children.
for new_config_dir in (
    experiment_config_dir,
    time_split_dir,
    simulation_time_split_config_dir,
    autoregression_time_split_config_dir,
    assimilation_time_split_config_dir,
    pub_dir,
    simulation_pub_config_dir,
    autoregression_pub_config_dir,
    assimilation_pub_config_dir,
):
    new_config_dir.mkdir()

In [6]:
# Paths to run directories (relative to the current working directory).
# Unlike the config directories, existing run directories are never deleted:
# anything already stored in them is left untouched.
run_dir = pathlib.Path('runs')

# Time-split run directories, one per model type.
time_split_run_dir = run_dir / 'time_split'
simulation_time_split_run_dir = time_split_run_dir / 'simulation'
autoregression_time_split_run_dir = time_split_run_dir / 'autoregression'
assimilation_time_split_run_dir = time_split_run_dir / 'assimilation'

# PUB run directories, one per model type.
pub_run_dir = run_dir / 'pub'
simulation_pub_run_dir = pub_run_dir / 'simulation'
autoregression_pub_run_dir = pub_run_dir / 'autoregression'
assimilation_pub_run_dir = pub_run_dir / 'assimilation'

# Ensure that the whole tree exists. Creating the leaves with parents=True also
# creates `run_dir`, `time_split_run_dir` and `pub_run_dir`; exist_ok=True makes
# the cell safe to re-run without a separate (race-prone) existence check.
for leaf_run_dir in (
    simulation_time_split_run_dir,
    autoregression_time_split_run_dir,
    assimilation_time_split_run_dir,
    simulation_pub_run_dir,
    autoregression_pub_run_dir,
    assimilation_pub_run_dir,
):
    leaf_run_dir.mkdir(parents=True, exist_ok=True)

# Experiment Parameters

In [4]:
# Ensemble size. Member i is initialized with seed i, so every run has an explicit seed.
n_ensemble = 1
seeds = [member for member in range(n_ensemble)]

# Lead times; each one is used as the shift applied to the observed-flow input
# in the autoregression configs.
lead_times = [1, 2, 4, 8, 10]

# Fractions of data to be withheld during inference:
# five evenly spaced values from 0 to 1, rounded to two decimals.
holdout_fractions = np.round(100 * np.linspace(0, 1, 5)) / 100
print('holdout fractions: ', holdout_fractions)

# Number of folds used for the PUB k-fold basin splits.
n_pub_kfold = 10

holdout fractions:  [0.   0.25 0.5  0.75 1.  ]


In [7]:
# Report how many runs the experiment parameters imply.

# Time-split AR/DA runs: one per (ensemble member, lead time, holdout fraction).
n_time_split_runs = n_ensemble * len(lead_times) * len(holdout_fractions)

# PUB runs: one per (ensemble member, k-fold split).
n_pub_split_runs = n_ensemble * n_pub_kfold

# Horizontal rule printed between the sections of the report.
separator = '-----------------------------------------------------'

# The total covers training runs only; a total for test runs is not reported.
report_lines = [
    separator,
    f'There are {n_ensemble} time-split simulation training & test runs.',
    f'There are {n_time_split_runs} time-split AR training & test runs.',
    f'There are {n_time_split_runs} time-split DA test runs.',
    separator,
    f'There are {n_pub_split_runs} PUB simulation training & test runs.',
    f'There are {n_pub_split_runs} PUB AR training & test runs.',
    f'There are {n_pub_split_runs} PUB DA test runs.',
    separator,
    f'There are {n_ensemble + n_time_split_runs + 2*n_pub_split_runs} total training runs.',
    separator,
]
for report_line in report_lines:
    print(report_line)


-----------------------------------------------------
There are 1 time-split simulation training & test runs.
There are 25 time-split AR training & test runs.
There are 25 time-split DA test runs.
-----------------------------------------------------
There are 10 PUB simulation training & test runs.
There are 10 PUB AR training & test runs.
There are 10 PUB DA test runs.
-----------------------------------------------------
There are 46 total training runs.
-----------------------------------------------------


# Time-Split Configs

## Simulation

In [9]:
# Pure simulation models (no DA and no AR) vary only by ensemble member,
# so exactly one config is written per seed.
count = 0
for count, seed in enumerate(seeds, start=1):

    # Fill in the master simulation config: random seed, then experiment name,
    # then run directory.
    filedata = (
        master_simulation_config.read_text()
        .replace('seed:', f'seed: {seed}')
        .replace('experiment_name:', f'experiment_name: simulation_seed_{seed}')
        .replace('run_dir:', f'run_dir: {simulation_time_split_run_dir}')
    )

    # Save the resulting config file, named after its seed.
    new_config = simulation_time_split_config_dir / f'seed_{seed}.yml'
    new_config.write_text(filedata)

# `count` ends up equal to the number of files created (0 if there are no seeds).
print(f'{count} config files were created.')

1 config files were created.


## Autoregression

In [10]:
# Autoregression experiments have dimensions related to:
# -- ensembles
# -- lead times
# -- holdout fraction
# One config file is written per (seed, lead, holdout) combination.

# The master config does not change between iterations, so read it once.
with open(master_autoregression_config, 'r') as f:
    master_filedata = f.read()

count = 0
for seed in seeds:
    for lead in lead_times:
        for holdout in holdout_fractions:

            # Count the number of files created.
            count += 1

            # Start every config from the unmodified master text.
            filedata = master_filedata

            # Change the experiment name.
            filedata = filedata.replace(
                'experiment_name:',
                f'experiment_name: autoregression_lead_{lead}_holdout_{holdout}_seed_{seed}')

            # Change the random seed.
            filedata = filedata.replace('seed:', f'seed: {seed}')

            # Add shifted inputs. This requires two changes to the config file --
            # one to request the shift and one to use the shifted data.
            filedata = filedata.replace('- xxx1\n', f'- {lead}\n')
            filedata = filedata.replace('- xxxQObs(mm/d)_shift1\n', f'- QObs(mm/d)_shift{lead}\n')

            # Add a random holdout fraction of the shifted input data.
            # Make sure the new top-level key starts on its own line even if the
            # master config has no trailing newline, and indent the nested keys
            # explicitly (two spaces per level) so the YAML structure is deliberate.
            if filedata and not filedata.endswith('\n'):
                filedata += '\n'
            filedata += (
                'random_holdout_from_dynamic_features:\n'
                f'  QObs(mm/d)_shift{lead}:\n'
                f'    missing_fraction: {holdout}\n'
                '    mean_missing_length: 5\n'
            )

            # Change run directory.
            filedata = filedata.replace('run_dir:', f'run_dir: {autoregression_time_split_run_dir}')

            # Save the resulting config file.
            new_config = autoregression_time_split_config_dir / f'lead_{lead}_holdout_{holdout}_seed_{seed}.yml'
            with open(new_config, 'w') as f:
                f.write(filedata)

print(f'{count} config files were created.')

25 config files were created.


# PUB Configs

In [11]:
# Only run PUB models for one leadtime.
# This value fills the shift placeholders in the PUB autoregression configs.
# `lead` is also the loop variable of the time-split autoregression cell, so it
# has to be assigned here before the PUB cells run.
lead = 1

## Create K-Fold Splits

In [12]:
# Load every basin ID from the full basin list: one ID per line,
# surrounding whitespace removed, blank lines dropped, result sorted.
with basin_file.open('r') as fp:
    basins = sorted(filter(None, map(str.strip, fp)))
print(f'There are {len(basins)} basins.')

There are 531 basins.


In [13]:
# Create basin list files.
# Every (seed, fold) pair gets one train file and one test file in `basin_list_dir`.
# `count` is initialized here so the cell does not depend on a value left over
# from an earlier cell.
count = 0
for seed in seeds:

    # Create a separate split for each ensemble member. The shuffle is seeded with
    # the ensemble seed so that re-running the notebook regenerates the same folds;
    # with an unseeded shuffle the basin lists would change on every run.
    kf = model_selection.KFold(n_splits=n_pub_kfold, random_state=seed, shuffle=True)

    for kfold, (train_index, test_index) in enumerate(kf.split(basins)):

        # Basin files.
        train_basin_file = basin_list_dir / f'train_kfold_{kfold}_seed_{seed}.txt'
        test_basin_file = basin_list_dir / f'test_kfold_{kfold}_seed_{seed}.txt'

        # Write one basin ID per line.
        with train_basin_file.open('w') as fp:
            fp.writelines(f"{basins[idx]}\n" for idx in train_index)

        with test_basin_file.open('w') as fp:
            fp.writelines(f"{basins[idx]}\n" for idx in test_index)

        # Count the number of files created (one train and one test file per fold).
        count += 2

print(f'{count} basin list files were created.')

## Simulation

In [14]:
# One PUB simulation config is written per (seed, k-fold split) pair.
count = 0
for seed in seeds:
    for kfold in range(n_pub_kfold):

        # Count the number of files created.
        count += 1

        # Basin lists belonging to this k-fold split.
        train_basin_file = basin_list_dir / f'train_kfold_{kfold}_seed_{seed}.txt'
        test_basin_file = basin_list_dir / f'test_kfold_{kfold}_seed_{seed}.txt'

        # Ordered (placeholder, replacement) pairs applied to the master config:
        # random seed, experiment name, basin files, run directory.
        # Validation and test both point at the test basin list of the fold.
        substitutions = [
            ('seed:', f'seed: {seed}'),
            ('experiment_name:', f'experiment_name: pub_simulation_kfold_{kfold}_seed_{seed}'),
            ('train_basin_file: basin_lists/531_basin_list.txt',
             f'train_basin_file: {train_basin_file}'),
            ('validation_basin_file: basin_lists/531_basin_list.txt',
             f'validation_basin_file: {test_basin_file}'),
            ('test_basin_file: basin_lists/531_basin_list.txt',
             f'test_basin_file: {test_basin_file}'),
            ('run_dir:', f'run_dir: {simulation_pub_run_dir}'),
        ]

        # Read the master config for simulation models and apply the substitutions.
        with open(master_simulation_config, 'r') as f:
            filedata = f.read()
        for placeholder, replacement in substitutions:
            filedata = filedata.replace(placeholder, replacement)

        # Save the resulting config file.
        new_config = simulation_pub_config_dir / f'kfold_{kfold}_seed_{seed}.yml'
        with open(new_config, 'w') as f:
            f.write(filedata)

print(f'{count} config files were created.')

10 config files were created.


## Autoregression

In [15]:
# One PUB autoregression config is written per (seed, k-fold split, holdout) triple.
# The lead time is the single value of `lead` assigned before this cell runs.

# The master config does not change between iterations, so read it once.
with open(master_autoregression_config, 'r') as f:
    master_filedata = f.read()

count = 0
for seed in seeds:
    for kfold in range(n_pub_kfold):
        for holdout in [0.5]:

            # Count the number of files created.
            count += 1

            # Start every config from the unmodified master text.
            filedata = master_filedata

            # Change the experiment name.
            filedata = filedata.replace(
                'experiment_name:',
                f'experiment_name: pub_autoregression_holdout_{holdout}_kfold_{kfold}_seed_{seed}')

            # Change the random seed.
            filedata = filedata.replace('seed:', f'seed: {seed}')

            # Add shifted inputs. This requires two changes to the config file --
            # one to request the shift and one to use the shifted data.
            filedata = filedata.replace('- xxx1\n', f'- {lead}\n')
            filedata = filedata.replace('- xxxQObs(mm/d)_shift1\n', f'- QObs(mm/d)_shift{lead}\n')

            # Add a random holdout fraction of the shifted input data.
            # Make sure the new top-level key starts on its own line even if the
            # master config has no trailing newline, and indent the nested keys
            # explicitly (two spaces per level) so the YAML structure is deliberate.
            if filedata and not filedata.endswith('\n'):
                filedata += '\n'
            filedata += (
                'random_holdout_from_dynamic_features:\n'
                f'  QObs(mm/d)_shift{lead}:\n'
                f'    missing_fraction: {holdout}\n'
                '    mean_missing_length: 5\n'
            )

            # Change the basin files based on kfold split.
            # Validation and test both point at the test basin list of the fold.
            train_basin_file = basin_list_dir / f'train_kfold_{kfold}_seed_{seed}.txt'
            test_basin_file = basin_list_dir / f'test_kfold_{kfold}_seed_{seed}.txt'

            filedata = filedata.replace('train_basin_file: basin_lists/531_basin_list.txt',
                                        f'train_basin_file: {train_basin_file}')
            filedata = filedata.replace('validation_basin_file: basin_lists/531_basin_list.txt',
                                        f'validation_basin_file: {test_basin_file}')
            filedata = filedata.replace('test_basin_file: basin_lists/531_basin_list.txt',
                                        f'test_basin_file: {test_basin_file}')

            # Change run directory.
            filedata = filedata.replace('run_dir:', f'run_dir: {autoregression_pub_run_dir}')

            # Save the resulting config file.
            new_config = autoregression_pub_config_dir / f'holdout_{holdout}_kfold_{kfold}_seed_{seed}.yml'
            with open(new_config, 'w') as f:
                f.write(filedata)

print(f'{count} config files were created.')

10 config files were created.
