# Imports & Paths

In [1]:
import glob
import numpy as np
import os
import pathlib
import re
import shutil
from typing import Any, Mapping

In [37]:
# Path to working directory.
# The default is the location this notebook was originally run from. Set the
# LSTM_DA_WORKING_DIR environment variable before starting the kernel to point
# the notebook at a different checkout without editing this cell.
working_dir = pathlib.Path(
    os.environ.get(
        'LSTM_DA_WORKING_DIR',
        '/home/gsnearing_google_com/lstm-data-assimilation',
    )
)

# Paths to run directories. Everything below is derived from `working_dir`.
run_dir = working_dir / 'runs/'

# Time-split experiment: one sub-directory per model type.
time_split_run_dir = run_dir / 'time_split'
simulation_time_split_run_dir = time_split_run_dir / 'simulation'
autoregression_time_split_run_dir = time_split_run_dir / 'autoregression'
assimilation_time_split_run_dir = time_split_run_dir / 'assimilation'

# PUB experiment: same layout as the time-split experiment.
pub_run_dir = run_dir / 'pub'
simulation_pub_run_dir = pub_run_dir / 'simulation'
autoregression_pub_run_dir = pub_run_dir / 'autoregression'
assimilation_pub_run_dir = pub_run_dir / 'assimilation'

# Experiment Parameters

In [3]:
# Ensemble size. Member i is run with the explicit seed i.
n_ensemble = 1
seeds = [*range(n_ensemble)]

# Lead times to evaluate.
lead_times = [1, 2, 4, 8, 10]

# Fractions of data withheld during inference: five evenly spaced values
# covering [0, 1], rounded to two decimal places.
holdout_fractions = np.round(100 * np.linspace(0, 1, 5)) / 100
print('holdout fractions: ', holdout_fractions)

# Number of k-fold splits used for the PUB runs.
n_pub_kfold = 10

holdout fractions:  [0.   0.25 0.5  0.75 1.  ]


In [4]:
# Default hyperparameters for data assimilation.
# The values were selected from the hypertuning results.
assimilation_config = dict(
    assimilation_lead_time=0,
    assimilation_targets=['c_n'],
    assimilation_window=5,
    epochs=100,
    history=20,
    learning_rate=0.1,
    learning_rate_drop_factor=0.1,
    learning_rate_epoch_drop=1001,
    loss='MSE',
    model_dropout=0.0,
    optimizer='Adam',
    predict_last_n=1,
    regularization=[],
    seq_length=365,
    target_variables=['QObs(mm/d)'],
)

# Create Assimilation Test Directories

In [None]:
def create_assim_dir(
    sim_dir: pathlib.Path,
    assim_dir: pathlib.Path,
    holdout: float,
    lead: int,
    assimilation_config: Mapping[str, Any],
):
    """Clone a trained simulation run directory and configure it for assimilation.

    Any directory already present at `assim_dir` is deleted and replaced by a
    fresh copy of `sim_dir`, so calling this function again never appends the
    extra settings a second time. The copied `config.yml` is then extended
    with two top-level sections: `random_holdout_from_dynamic_features` (for
    the `QObs(mm/d)` feature) and `assimilation_config`.

    Args:
        sim_dir: Trained run directory to copy. Must contain `config.yml`.
        assim_dir: Destination directory. Removed first if it already exists.
        holdout: Value written as `missing_fraction` for `QObs(mm/d)`.
        lead: Value written as `assimilation_lead_time`. Any
            `assimilation_lead_time` entry in `assimilation_config` is ignored.
        assimilation_config: Assimilation hyperparameters. Must provide
            `assimilation_window`, `epochs`, `history`, `learning_rate`,
            `learning_rate_drop_factor`, `learning_rate_epoch_drop`, `loss`,
            `model_dropout`, `optimizer`, `predict_last_n`, `regularization`
            and `seq_length`. The optional list entries `assimilation_targets`
            and `target_variables` default to `['c_n']` and `['QObs(mm/d)']`.

    Raises:
        KeyError: If a required hyperparameter is missing. The directory has
            already been copied at that point, but `config.yml` is untouched.
        OSError: If the directory cannot be copied or `config.yml` cannot be
            read or written.
    """
    # Always start from a clean copy of the trained run: an existing
    # assimilation directory is discarded rather than reused.
    if os.path.isdir(assim_dir):
        shutil.rmtree(assim_dir)
    shutil.copytree(sim_dir, assim_dir)

    # Read the config file to modify it.
    config_file = os.path.join(assim_dir, 'config.yml')
    with open(config_file, 'r') as file:
        yaml_file_data = file.read()

    # Without a trailing newline the first appended key would be glued onto
    # the last existing line and corrupt the YAML.
    if yaml_file_data and not yaml_file_data.endswith('\n'):
        yaml_file_data += '\n'

    # List-valued settings come from the config; the fallbacks are the values
    # that used to be hard-coded here.
    assimilation_targets = assimilation_config.get('assimilation_targets', ['c_n'])
    target_variables = assimilation_config.get('target_variables', ['QObs(mm/d)'])

    # Settings written as plain `key: value` lines, in output order.
    scalar_keys = (
        'assimilation_window',
        'epochs',
        'history',
        'learning_rate',
        'learning_rate_drop_factor',
        'learning_rate_epoch_drop',
        'loss',
        'model_dropout',
        'optimizer',
        'predict_last_n',
        'regularization',
        'seq_length',
    )

    # Random holdout applied to the observed-discharge input feature.
    new_lines = [
        'random_holdout_from_dynamic_features: ',
        '  QObs(mm/d): ',
        f'    missing_fraction: {holdout} ',
        '    mean_missing_length: 5 ',
    ]

    # Assimilation section; `lead` overrides the lead time in the config.
    new_lines += [
        'assimilation_config:',
        f'  assimilation_lead_time: {lead}',
        '  assimilation_targets:',
    ]
    new_lines += [f'  - {target}' for target in assimilation_targets]
    new_lines += [f'  {key}: {assimilation_config[key]}' for key in scalar_keys]
    new_lines.append('  target_variables:')
    new_lines += [f'  - {variable}' for variable in target_variables]

    yaml_file_data += ''.join(f'{line}\n' for line in new_lines)

    # Save the modified config file.
    with open(config_file, 'w') as file:
        file.write(yaml_file_data)

In [None]:
# Create one assimilation directory per (seed, holdout fraction, lead time)
# from the trained time-split simulation run of the same seed.
for seed in seeds:
    # The trailing part of the trained run's directory name is matched with a
    # wildcard. Sort the matches so the choice is deterministic if more than
    # one directory exists, and fail with a readable message if none does.
    sim_dir_pattern = str(simulation_time_split_run_dir / f'simulation_seed_{seed}_*')
    sim_dir_matches = sorted(glob.glob(sim_dir_pattern))
    if not sim_dir_matches:
        raise FileNotFoundError(
            f'No trained simulation run directory matches {sim_dir_pattern!r}.'
        )
    specific_sim_dir = sim_dir_matches[0]

    for holdout in holdout_fractions:
        for lead in lead_times:
            specific_assim_dir = assimilation_time_split_run_dir / f'assimilation_seed_{seed}_holdout_{holdout}_lead_{lead}'
            create_assim_dir(
                sim_dir=specific_sim_dir,
                assim_dir=specific_assim_dir,
                holdout=holdout,
                lead=lead,
                assimilation_config=assimilation_config,
            )

In [None]:
# Create one assimilation directory per (seed, k-fold split) from the trained
# PUB simulation run of the same seed and fold. Only a single holdout fraction
# (0.5) and a single lead time (1) are used for the PUB runs.
for seed in seeds:
    for kfold in range(n_pub_kfold):
        # The trailing part of the trained run's directory name is matched
        # with a wildcard. Sort the matches so the choice is deterministic if
        # more than one exists, and fail with a readable message if none does.
        sim_dir_pattern = str(simulation_pub_run_dir / f'pub_simulation_kfold_{kfold}_seed_{seed}_*')
        sim_dir_matches = sorted(glob.glob(sim_dir_pattern))
        if not sim_dir_matches:
            raise FileNotFoundError(
                f'No trained PUB simulation run directory matches {sim_dir_pattern!r}.'
            )
        specific_sim_dir = sim_dir_matches[0]

        for holdout in [0.5]:
            for lead in [1]:
                specific_assim_dir = assimilation_pub_run_dir / f'assimilation_kfold_{kfold}_seed_{seed}_holdout_{holdout}_lead_{lead}'
                create_assim_dir(
                    sim_dir=specific_sim_dir,
                    assim_dir=specific_assim_dir,
                    holdout=holdout,
                    lead=lead,
                    assimilation_config=assimilation_config,
                )

# Create Autoregression Test Directories

In [41]:
def create_autoregression_dir(
    ar_dir: pathlib.Path,
    test_dir: pathlib.Path,
    train_holdout: float,
    test_holdout: float,
    lead: int,
    seed: int,
):
    """Create an inference directory from a trained autoregression run.

    `ar_dir` is copied to `test_dir` unless `test_dir` already exists, in which
    case the existing directory is reused. The `config.yml` inside `test_dir`
    is then rewritten in two ways:

    * every `missing_fraction: <train_holdout>` becomes
      `missing_fraction: <test_holdout>`;
    * every occurrence of the trained run's directory name
      (`autoregression_lead_<lead>_holdout_<train_holdout>_seed_<seed>_<digits>_<digits>`)
      becomes the test directory name
      (`autoregression_lead_<lead>_train_holdout_<train_holdout>_test_holdout_<test_holdout>_seed_<seed>`).

    Args:
        ar_dir: Trained autoregression run directory to copy.
        test_dir: Destination directory; must contain `config.yml` after the copy.
        train_holdout: Holdout fraction the run was trained with.
        test_holdout: Holdout fraction to use for inference.
        lead: Lead time that appears in the run directory name.
        seed: Seed that appears in the run directory name.

    Raises:
        OSError: If the copy fails or `config.yml` cannot be read or written.
    """
    # Copy if the directory does not already exist. Only the "already there"
    # case is skipped; genuine copy failures (missing source, permissions, ...)
    # are no longer swallowed.
    if not os.path.isdir(test_dir):
        shutil.copytree(ar_dir, test_dir)

    # Read the config file to modify it.
    config_file = os.path.join(test_dir, 'config.yml')
    with open(config_file, 'r') as file:
        yaml_file_data = file.read()

    # Add random holdout for inference (autoregression).
    yaml_file_data = yaml_file_data.replace(
        f'missing_fraction: {train_holdout}',
        f'missing_fraction: {test_holdout}',
    )

    # Change all directory paths. The holdout value contains a '.', which must
    # be escaped so it matches only a literal dot; the raw string keeps `\d`
    # from being treated as an (invalid) string escape.
    train_holdout_token = re.escape(f'{train_holdout}')
    pattern = rf'autoregression_lead_{lead}_holdout_{train_holdout_token}_seed_{seed}_\d+_\d+'
    replace = f'autoregression_lead_{lead}_train_holdout_{train_holdout}_test_holdout_{test_holdout}_seed_{seed}'
    # A callable replacement inserts the text verbatim, with no backslash or
    # group-reference processing.
    yaml_file_data = re.sub(pattern, lambda _match: replace, yaml_file_data)

    # Save the modified config file.
    with open(config_file, 'w') as file:
        file.write(yaml_file_data)

In [None]:
# Create one inference (test) directory per
# (seed, lead time, train holdout, test holdout) from the trained
# autoregression run for that (seed, lead time, train holdout).
for seed in seeds:
    for lead in lead_times:
        for train_holdout in holdout_fractions:
            # The trained run does not depend on the test holdout, so look it
            # up once per train holdout. The trailing part of its directory
            # name is matched with a wildcard; sort the matches so the choice
            # is deterministic and fail with a readable message if none exists.
            ar_dir_pattern = str(autoregression_time_split_run_dir / f'autoregression_lead_{lead}_holdout_{train_holdout}_seed_{seed}_*')
            ar_dir_matches = sorted(glob.glob(ar_dir_pattern))
            if not ar_dir_matches:
                raise FileNotFoundError(
                    f'No trained autoregression run directory matches {ar_dir_pattern!r}.'
                )
            specific_ar_dir = ar_dir_matches[0]

            for test_holdout in holdout_fractions:
                print(seed, lead, train_holdout, test_holdout)
                test_ar_dir = autoregression_time_split_run_dir / f'autoregression_lead_{lead}_train_holdout_{train_holdout}_test_holdout_{test_holdout}_seed_{seed}'
                create_autoregression_dir(
                    ar_dir=specific_ar_dir,
                    test_dir=test_ar_dir,
                    train_holdout=train_holdout,
                    test_holdout=test_holdout,
                    lead=lead,
                    seed=seed,
                )