# Bootstrapping the $\chi^2$ $1\sigma$ confidence level

In [None]:
import os
import shutil
import subprocess

import astropy
import astropy.io.ascii
import astropy.table
import numpy as np

import dynamite as dyn

## Settings

In [None]:
# Name of the config file in the current directory.
# Must have n_max_iter=0, reattempt_failures=True.
config_file_name = 'user_test_config_ml_bs.yaml'

# Directory which will be created and which will hold all results.
output_dir = 'bootstrap'

# Number of kinematic data perturbation scenarios.
n_scen = 5

Prerequisite: in this notebook's directory, there has to be a valid config file pointing to a successful DYNAMITE run. The config file used for bootstrapping needs two settings: `n_max_iter` must be `0` (under `parameter_space_settings` -> `stopping_criteria`) to prevent DYNAMITE from doing more than mere weight solving of the best model, and `reattempt_failures` must be `True` (under `weight_solver_settings`). Both settings are checked by this notebook.

Example:
- Go to the `dev_tests/` directory.
- Execute `test_nnls.py` and verify that it completed successfully.
- Copy the config file (`cp user_test_config_ml.yaml user_test_config_ml_bs.yaml`), change `n_max_iter=10` -> `n_max_iter=0`, and make sure that `reattempt_failures` is `True`.
- Execute this notebook.

The following directories and files will be created by this notebook:

```
bootstrap.ipynb  (this notebook)

bootstrap.log    (this notebook's logfile)

    <output_dir>/
    |
    |---- bs_0000/      (base case for validation)
            |
            |---- <input_directory>/   (copy of original input directory)
                        |
                        ...
            |---- <output_directory>/
                        |
                        |---- models/
                                |
                                |---- orblib_xxx_yyy/  (directory of best model)
                                        |
                                        |---- datfil/  (copy of best model's orblib)
                                        |
                                        |---- infil/   (empty, will be created by DYNAMITE, unused)
                                        |
                                        |---- mlxx.yy
                                                |
                                                |---- orbit_weights.ecsv  (will contain the weights after weight solving)
                                                |
                                                |---- xxx.yaml            (copy of config file, will be created by DYNAMITE, unused)
                        |
                        |---- plots/           (empty, will be created by DYNAMITE, unused)
                        |
                        |---- all_models.ecsv  (all_models table, in the beginning, weights_done=all_done=False, in the end True)
            |
            |---- run.py        (Python script that runs the scenario)
            |
            |---- dynamite.log  (logfile of run.py)
            |
            |---- xxx.yaml      (copy of the config file, used by run.py)
    |
    |---- bs_0001/      (first scenario)
            |
            ...
            |---- <output_directory>/
                        |
                        |---- models/
                                |
                                |---- orblib_xxx_yyy/
                                        |
                                        |---- datfil/  (to save disk space, this is a SYMBOLIC LINK to datfil in bs_0000)
                                        |
                                        ...
    |
    |---- ...
    |
    |---- bs_<n_scen>/  (last scenario; the number is zero-padded to 4 digits, e.g. bs_0005/)
            |
            ...
    |
    |---- run_bootstrap.sh         (bash-script to execute all scenarios, including the base case)
    |
    |---- bootstrapping_stdout.log (stdout of run_bootstrap.sh)
    |
    |---- bootstrapping_stderr.log (stderr of run_bootstrap.sh, only if not empty)
    |
    |---- <which_chi2>_table.ecsv  (scenario results: chi2-values and chi2 mean and standard deviation)
```

In [None]:
# Later cells build paths by plain string concatenation, so the directory
# name has to end with a slash.
output_dir = output_dir if output_dir[-1] == '/' else output_dir + '/'

In [None]:
# Read the DYNAMITE configuration of the existing run; the notebook's own
# logfile is named via user_logfile.
c = dyn.config_reader.Configuration(
    filename=config_file_name,
    reset_logging=True,
    user_logfile='bootstrap',
    reset_existing_output=False,
)

In [None]:
# Sanity checks on the config file (warnings only, execution continues):
# n_max_iter must be 0 and reattempt_failures must be True so that only the
# weight solving of the best model is run and nothing else.
stopping_criteria = c.settings.parameter_space_settings['stopping_criteria']
n_max_iter = stopping_criteria['n_max_iter']
if n_max_iter > 0:
    print(f'***** parameter_space_settings -> stopping_criteria -> n_max_iter SHOULD BE ZERO, but it is {n_max_iter}! *****')

reattempt_failures = c.settings.weight_solver_settings['reattempt_failures']
if not reattempt_failures:
    print("***** weight_solver_settings -> reattempt_failures SHOULD BE TRUE, but it isn't! *****")

In [None]:
# Pick the chi2 flavour named in the config file and look up the best model
# (index, model object and chi2 value) in the all_models table.
which_chi2 = c.settings.parameter_space_settings['which_chi2']
model_idx = c.all_models.get_best_n_models_idx(n=1)[0]
model = c.all_models.get_model_from_row(model_idx)
best_chi2 = c.all_models.table[model_idx][which_chi2]

# Report the selection, one item per line.
selection_summary = '\n'.join([
    f'Selecting minimum {which_chi2} model:',
    f'model index = {model_idx}',
    f'{which_chi2} = {best_chi2}',
    f'model directory = {model.directory}',
    f'orblib directory = {model.directory_noml}',
])
print(selection_summary)

In [None]:
# function that writes the executable Python script, used later
def write_pythonscript(directory='', config_file=None):
    """Write the ``run.py`` script that runs a single scenario.

    The generated script reads a config file and hands the resulting
    configuration to ``dyn.model_iterator.ModelIterator``.

    Parameters
    ----------
    directory : str
        Directory in which ``run.py`` is created (default: current directory).
    config_file : str, optional
        Name of the config file the script shall read. Defaults to the
        notebook-level ``config_file_name``. Only the base name is written
        into the script because the script is meant to be run from inside
        ``directory``, next to its own copy of the config file.
    """
    if config_file is None:
        config_file = config_file_name
    # Strip any directory part: the config file is referenced relative to
    # the script's working directory.
    config_basename = os.path.basename(config_file)
    # !r emits a properly quoted and escaped Python string literal, so file
    # names containing quotes or backslashes cannot break the script.
    script = (
        'import dynamite as dyn\n'
        f'c = dyn.config_reader.Configuration({config_basename!r},\n'
        '                                    reset_logging=True,\n'
        '                                    reset_existing_output=False)\n'
        '_ = dyn.model_iterator.ModelIterator(c)\n'
    )
    # os.path.join works with and without a trailing slash on `directory`
    with open(os.path.join(directory, 'run.py'), 'w') as f:
        f.write(script)

In [None]:
# function that perturbs the GH kinematics data assuming Gaussian error, used later
def perturb_kins(c, directory, kin_files, rng=None):
    """Perturb Gauss-Hermite kinematics files in place, assuming Gaussian errors.

    For every file, the columns ``v``, ``sigma``, ``h3`` ... ``h<number_GH>``
    are replaced by draws from normal distributions centred on the tabulated
    values, with the matching ``d<column>`` columns as standard deviations.
    A draw is repeated as a whole until all perturbed ``sigma`` values are
    positive. The file is then overwritten in ``ascii.ecsv`` format.

    Parameters
    ----------
    c : configuration object
        Provides ``c.settings.weight_solver_settings['number_GH']``, the
        number of GH moments to perturb.
    directory : str
        Directory holding the kinematics files.
    kin_files : iterable of str
        Names of the kinematics files inside ``directory``.
    rng : numpy.random.Generator, optional
        Random number generator to draw from. Pass a seeded generator for
        reproducible scenarios; by default a fresh, unseeded one is created.
    """
    if rng is None:
        rng = np.random.default_rng()
    # the set of perturbed columns is the same for every file
    number_gh = c.settings.weight_solver_settings['number_GH']  # consider GH moments as specified in the config file
    data_cols = ['v', 'sigma'] + [f'h{i}' for i in range(3, number_gh + 1)]
    error_cols = [f'd{col}' for col in data_cols]
    sigma_row = data_cols.index('sigma')
    for kin_file in kin_files:
        kin_path = os.path.join(directory, kin_file)
        gh_kins = astropy.io.ascii.read(kin_path)  # read GH kinematics file
        measurements = np.array([gh_kins[col] for col in data_cols])
        errors = np.array([gh_kins[col] for col in error_cols])
        while True:
            new_data = rng.normal(loc=measurements, scale=errors)
            if np.min(new_data[sigma_row]) > 0:  # make sure sigma > 0
                break
            print('sigma <= 0, repeat drawing from error distribution...')
        # replace the measurement columns by their perturbed counterparts
        gh_kins |= {col: new_data[i] for i, col in enumerate(data_cols)}
        gh_kins.write(kin_path, format='ascii.ecsv', overwrite=True)

In [None]:
# create bootstrapping directories: base scenario bs_0000 + n_scen perturbed scenarios
#
# Every scenario directory receives: a copy of the input directory, a copy of
# the config file, the best model's directory, the orbit library (a real copy
# in bs_0000, a symbolic link to that copy elsewhere), a copy of the
# all_models table with the best model flagged as "not done", and run.py.
# For scen > 0, the kinematics in the copied input directory are perturbed.
input_directory = c.settings.io_settings['input_directory']
all_models_file = c.settings.io_settings['output_directory'] + c.settings.io_settings['all_models_file']
orblib_datfil = model.directory_noml + 'datfil'  # the best model's orbit library
base_datfil = output_dir + 'bs_0000/' + orblib_datfil  # its copy in the base scenario

# the kinematics files to perturb are the same for all scenarios
if c.system.is_bar_disk_system():
    stars = c.system.get_unique_bar_component()
else:
    stars = c.system.get_unique_triaxial_visible_component()
kin_files = [kin.datafile for kin in stars.kinematic_data]

for scen in range(n_scen + 1):  # scen=0 is the base case
    scen_dir = output_dir + f'bs_{scen:04d}/'
    # copy input directory
    shutil.copytree(input_directory,
                    scen_dir + input_directory,
                    dirs_exist_ok=True)  # creates intermediate directories
    # copy config file
    shutil.copy2(config_file_name, scen_dir)
    # make model directory
    os.makedirs(scen_dir + model.directory, exist_ok=True)
    # copy or link orbit library
    datfil = scen_dir + orblib_datfil
    if scen == 0:
        # copy orbit library
        shutil.copytree(orblib_datfil, datfil, dirs_exist_ok=True)
    else:  # create a symbolic link for the orbit library to save disk space
        # Remove leftovers of a previous run. islink() also detects dangling
        # links (isfile()/isdir() follow links and would miss them), and a
        # real directory needs rmtree() because unlink() cannot remove it.
        if os.path.islink(datfil) or os.path.isfile(datfil):
            os.unlink(datfil)
        elif os.path.isdir(datfil):
            shutil.rmtree(datfil)
        # Relative link target, computed from the link's own directory so
        # that it is correct for any nesting depth of the output directory.
        link_target = os.path.relpath(base_datfil, start=os.path.dirname(datfil))
        os.symlink(link_target, datfil, target_is_directory=True)
    # copy all_models table
    shutil.copy2(all_models_file, scen_dir + all_models_file)
    # cannot delete all entries from all_models table except model_idx because of get_ml_of_original_orblib()!
    # just set weights_done = all_done = False for the best model
    all_models = astropy.io.ascii.read(scen_dir + all_models_file)  # read all_models table
    if not (np.all(all_models['weights_done']) and np.all(all_models['all_done'])):
        print('***** The all_models table should NOT contain any weights_done=False or all_done=False entries, but it does! *****')
    all_models['weights_done'][model_idx] = False
    all_models['all_done'][model_idx] = False
    # save the all_models table
    all_models.write(scen_dir + all_models_file, format='ascii.ecsv', overwrite=True)
    if scen > 0:
        # perturb the kinematics in this scenario's copy of the input directory
        perturb_kins(c, directory=scen_dir + input_directory, kin_files=kin_files)
    # create Python script
    write_pythonscript(scen_dir)

In [None]:
# create a bash script that executes all the scenarios
# Each scenario runs in its own subshell: a failing `cd` then skips only that
# scenario instead of leaving the loop in the wrong working directory.
# `bs_*/` matches directories only, and "$s" is quoted against word splitting.
with open(output_dir + 'run_bootstrap.sh', 'w') as f:
    f.write('#!/bin/bash\n'
            'for s in bs_*/; do (cd "$s" && python run.py); done\n')

In [None]:
# execute the bootstrapping
# The script runs with output_dir as its working directory (cwd=...), so the
# notebook's own working directory is never changed. stdout and stderr are
# captured separately; merging stderr into stdout would leave p.stderr at
# None and the stderr logfile could never be written.
p = subprocess.run(['bash', 'run_bootstrap.sh'],
                   cwd=output_dir,
                   stdout=subprocess.PIPE,
                   stderr=subprocess.PIPE)
with open(output_dir + 'bootstrapping_stdout.log', 'w') as f:
    f.write(p.stdout.decode('UTF-8', errors='replace'))
stderr_text = p.stderr.decode('UTF-8', errors='replace')
if stderr_text:  # write the stderr logfile only if there is something to log
    with open(output_dir + 'bootstrapping_stderr.log', 'w') as f:
        f.write(stderr_text)
if p.returncode != 0:
    print(f'***** run_bootstrap.sh exited with return code {p.returncode}, check the logfiles in {output_dir}! *****')

In [None]:
# Gather the best model's chi2 from every scenario's all_models table
# (scenario 0 is the base case), then store the values together with their
# mean and standard deviation in an astropy table on disk.
scenario_ids = range(n_scen + 1)
scenario_dirs = [f'bs_{s:04d}/' for s in scenario_ids]
chi2 = [
    astropy.io.ascii.read(output_dir + sub_dir + all_models_file)[which_chi2][model_idx]
    for sub_dir in scenario_dirs
]

chi2_mean = np.mean(chi2)
chi2_std = np.std(chi2)
chi2_table = astropy.table.Table({'Scenario': scenario_ids,
                                  'Directory': scenario_dirs,
                                  which_chi2: chi2})
# mean and standard deviation travel along as table metadata
chi2_table.meta = {f'{which_chi2} mean': chi2_mean,
                   f'{which_chi2} standard deviation': chi2_std}

result_file = output_dir + which_chi2 + '_table.ecsv'
chi2_table.write(result_file, format='ascii.ecsv', overwrite=True)

In [None]:
# show the chi2 mean and standard deviation stored in the table's metadata
print(chi2_table.meta)
# print the table of per-scenario chi2 values
chi2_table.pprint_all()