# Generate and store benchmark environments

This notebook can be used to generate benchmark environments.

In [None]:
%load_ext autoreload
%autoreload 2
import os
import shutil

import pandas as pd

from src.environments.generic_environments import *

Generate the environments.

In [None]:
# setup
env_class = BarabasiAlbert  # environment class instantiated for every benchmark
num_envs = 20  # number of environments to generate
num_nodes = 50  # passed as `num_nodes` to the environment constructor
delete_existing = False  # if True, an existing output dir is emptied and regenerated
output_dir = f'../data/{env_class.__name__}/{num_nodes}_nodes_100_train_linear/'  # dir where to store the generated envs
config_file = '../src/config.py'  # copied next to the generated envs

# generate/empty folder for envs of same type
if os.path.isdir(output_dir) and not delete_existing:
    print(f"\nDirectory '{output_dir}' already exists, not generating benchmarks...")
else:
    if os.path.isdir(output_dir):
        print(f"\nDirectory '{output_dir}' already exists, delete existing benchmarks...")
        # Snapshot the listing first so that nothing is deleted while the
        # directory is still being iterated.
        with os.scandir(output_dir) as listing:
            stale_entries = list(listing)
        for entry in stale_entries:
            if entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)
            else:
                # regular files and symlinks (including links to directories)
                os.remove(entry.path)

    os.makedirs(output_dir, exist_ok=True)

    # Copy the config once, up front: it does not change between environments,
    # and shutil works on every platform (no dependency on a `cp` binary).
    if os.path.isfile(config_file):
        shutil.copy(config_file, os.path.join(output_dir, 'config.py'))
    else:
        print(f'Config file {config_file} not found, not copying it.')

    # generate benchmark envs
    for i in range(num_envs):
        env = env_class(num_nodes=num_nodes)
        env_path = os.path.join(output_dir, env.name + '.pth')
        env.save(env_path)
        print(f'\rGenerated {i + 1}/{num_envs} environments.', end='')


Take existing environments, modify them and store them separately.

In [None]:
# setup: which stored environments to modify and where to put the results
env_class = ErdosRenyi  # environment class used to load the stored envs
# if True, an existing target dir is emptied and regenerated;
# if False, an existing target dir is left untouched and nothing is generated
delete_existing = False  # delete existing benchmarks
source_dir = f'../data/{env_class.__name__}/20_nodes_100_train'  # dir where original envs are stored
target_dir = f'../data/{env_class.__name__}/20_nodes_100_train_normalised/'  # dir where to store the generated envs


# you can implement modifications here
def modify_env(env: Environment):
    """Normalise all train and test data of ``env`` in place and return it.

    Per-node mean and standard deviation are computed from the first
    observational training dataset only. They are stored on the environment
    as ``normalisation_means`` / ``normalisation_stds`` and then applied to
    the observational and interventional train and test datasets.

    :param env: the environment to modify; it is mutated.
    :return: the same ``env`` object.
    """
    # flag the environment as normalised
    env.cfg.normalise_data = True

    reference = env.observational_train_data[0]

    # per-node statistics of the reference dataset
    env.normalisation_means = {}
    env.normalisation_stds = {}
    for node, samples in reference.data.items():
        env.normalisation_means[node] = samples.mean()
        env.normalisation_stds[node] = samples.std()

    def _apply(experiment):
        # every dataset is scaled with the same reference statistics
        experiment.normalise(env.normalisation_means, env.normalisation_stds)

    # train data
    _apply(reference)
    for experiment in env.interventional_train_data:
        _apply(experiment)

    # test data
    for experiment in env.observational_test_data:
        _apply(experiment)
    for experiment_group in env.interventional_test_data.values():
        for experiment in experiment_group:
            _apply(experiment)

    return env


# Load every stored environment from `source_dir`, run it through
# `modify_env` and save the result under the same file name in `target_dir`.

# check if source envs available
if not os.path.isdir(source_dir):
    print(f'Source directory {source_dir} does not exist!')
elif os.path.isdir(target_dir) and not delete_existing:
    print(f"\nTarget directory '{target_dir}' already exists, not generating benchmarks...")
else:
    # generate/empty folder for target envs
    if os.path.isdir(target_dir):
        print(f"\nTarget directory '{target_dir}' already exists, delete existing benchmarks...")
        # Snapshot the listing first so that nothing is deleted while the
        # directory is still being iterated.
        with os.scandir(target_dir) as listing:
            stale_entries = list(listing)
        for entry in stale_entries:
            if entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)
            else:
                # regular files and symlinks (including links to directories)
                os.remove(entry.path)

    os.makedirs(target_dir, exist_ok=True)

    # load source envs (only '.pth' files directly inside source_dir)
    with os.scandir(source_dir) as listing:
        env_files = [entry for entry in listing
                     if entry.is_file() and entry.name.endswith('.pth')]
    for i, f in enumerate(env_files):
        env = env_class.load(os.path.abspath(f))
        modified_env = modify_env(env)
        # os.path.join works whether or not target_dir ends with a slash
        modified_env.save(os.path.join(target_dir, f.name))

        print(f'\rProcessed {i + 1}/{len(env_files)} environments in {source_dir}.', end='')



Export .csv dataset from existing environments.

In [None]:

# Export a .csv dataset for every stored environment ('.pth' file) found in
# the listed sub-directories of `data_dir`; the results are written to the
# same sub-directory below `out_dir`.
env_class = BarabasiAlbert
data_dir = '../data/'
out_dir = '../csv_data/'
sub_dirs = [f'{env_class.__name__}/20_nodes_100_train/',
            f'{env_class.__name__}/20_nodes_200_train/',
            ]

for sub_dir in sub_dirs:
    src_dir = os.path.join(data_dir, sub_dir)
    if not os.path.isdir(src_dir):
        # Report and carry on with the remaining sub-directories instead of
        # aborting the whole export on the first missing one.
        print(f'\nSource directory {src_dir} does not exist!')
        continue

    with os.scandir(src_dir) as listing:
        env_files = [os.path.abspath(entry) for entry in listing
                     if entry.is_file() and entry.name.endswith('.pth')]
    num_environments = len(env_files)

    tmp_dir = os.path.join(out_dir, sub_dir)
    os.makedirs(tmp_dir, exist_ok=True)
    for fidx, env_file in enumerate(env_files):
        print(f'\rExporting {fidx + 1}/{num_environments} datasets for environment {os.path.basename(env_file)}.', end='')

        Environment.export_csv_dataset(env_file, tmp_dir)



Export .csv dataset for BayesDAG from existing .csv datasets.

In [None]:


# BayesDAG export: re-write the existing .csv datasets without a header row
# (and without an index column) into the same sub-directory below `out_dir`.
env_class = ErdosRenyi
data_dir = '../csv_data/'
out_dir = '../bayesdag_data/'
sub_dirs = [
    f'{env_class.__name__}/20_nodes_100_train/',
    f'{env_class.__name__}/20_nodes_200_train/',
]

for sub_dir in sub_dirs:
    # absolute paths of all '.csv' files directly inside the source folder
    files = []
    for entry in os.scandir(os.path.join(data_dir, sub_dir)):
        if entry.is_file() and entry.name.endswith('.csv'):
            files.append(os.path.abspath(entry))
    num_environments = len(files)

    tmp_dir = os.path.join(out_dir, sub_dir)
    os.makedirs(tmp_dir, exist_ok=True)
    for position, csv_path in enumerate(files, start=1):
        csv_name = os.path.basename(csv_path)
        print(f'\rExporting {position}/{num_environments} datasets for environment {csv_name}.', end='')
        frame = pd.read_csv(csv_path)
        frame.to_csv(os.path.join(tmp_dir, csv_name), index=False, header=False)

