## Static dataset creation
This notebook walks through how to use the modules to create and save a static dataset for use in training the models.

We demonstrate this for our two data dimensionalities:
1) 0D, linear data
2) 2D, image data

For both data dimensionalities, we use the medium noise injection ($\sigma_y$ = 0.05) and the output noise injection mode, although both of these settings can be easily changed to simulate and save different types of data.

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from data.data import DataPreparation, MyDataLoader
from torch.utils.data import TensorDataset

## 1) Save 0D, linear data
Use the `DataPreparation()` class from `data` to simulate data.

In [2]:
# Module-level simulator object; the `prepare_data` function defined below
# reads this global `data` to sample parameters and simulate the dataset.
data = DataPreparation()

The function below packages together all of the options for the different tweaks that can be made to the data as it is prepared.

In [3]:
def prepare_data(noise, dim, injection, size_df):
    """Simulate a dataset on the module-level ``data`` object.

    Parameters
    ----------
    noise : str
        Noise-level label (this notebook uses 'low', 'medium', 'high').
        It is passed to ``DataPreparation.get_sigma`` or, for 0D "input"
        injection, straight to ``data.simulate_data``.
    dim : str
        Data dimensionality, "0D" or "2D".
    injection : str
        Noise injection mode. For "0D" it must be "input" or "output";
        for "2D" it is forwarded unchanged to ``get_sigma`` and
        ``simulate_data_2d``.
    size_df : int
        Dataset size, forwarded to ``data.sample_params_from_prior`` (and to
        ``data.simulate_data_2d`` for 2D data).

    Returns
    -------
    tuple
        ``(data, sigma)``. ``sigma`` is the value returned by
        ``DataPreparation.get_sigma``. It is ``None`` for 0D "input"
        injection, where the noise label is handed to ``simulate_data`` with
        ``vary_sigma=True`` and no scalar sigma is computed in this function.

    Raises
    ------
    ValueError
        If ``dim`` is not "0D"/"2D", or if ``dim`` is "0D" and ``injection``
        is not "input"/"output". Raised before ``data`` is modified.
    """
    if dim == "0D":
        # Validate first so an unsupported mode fails before any sampling,
        # instead of failing at the final `return` with an unbound `sigma`.
        if injection not in ("input", "output"):
            raise ValueError(
                f"Unsupported injection {injection!r} for 0D data; "
                "expected 'input' or 'output'."
            )
        data.sample_params_from_prior(size_df)
        x_grid = np.linspace(0, 10, 100)
        if injection == "input":
            vary_sigma = True
            print("are we varying sigma", vary_sigma)
            data.simulate_data(
                data.params,
                noise,
                x=x_grid,
                inject_type=injection,
                vary_sigma=vary_sigma,
            )
            # No scalar sigma is computed on this path; return None rather
            # than leaving the name unbound.
            sigma = None
        else:
            sigma = DataPreparation.get_sigma(
                noise,
                inject_type=injection,
                data_dimension=dim,
            )
            print(
                f"inject type is {injection}, "
                f"dim is {dim}, sigma is {sigma}"
            )
            data.simulate_data(
                data.params,
                sigma,
                x=x_grid,
                inject_type=injection,
            )
    elif dim == "2D":
        print("2D data")
        sigma = DataPreparation.get_sigma(
            noise,
            inject_type=injection,
            data_dimension=dim,
        )
        print(
            f"inject type is {injection}, dim is {dim}, sigma is {sigma}"
        )
        data.sample_params_from_prior(
            size_df,
            low=[0, 1, -1.5],
            high=[0.01, 10, 1.5],
            n_params=3,
            seed=42,
        )
        data.simulate_data_2d(
            size_df,
            data.params,
            sigma,
            image_size=32,
            inject_type=injection,
        )
    else:
        raise ValueError(
            f"Unsupported dim {dim!r}; expected '0D' or '2D'."
        )
    return data, sigma

In [None]:
# Settings for the 0D (linear) dataset.
noise, dim, injection = 'medium', '0D', 'output'
size_df = 10

# Simulate the dataset; the returned sigma is used later in the file name.
data, sigma = prepare_data(
    noise=noise, dim=dim, injection=injection, size_df=size_df
)

The dictionary element is what we're interested in.

In [None]:
# NOTE(review): this rebinds the name `dict`, hiding Python's built-in `dict`
# type for the rest of the session; the cells below read the dataset from it.
dict = data.get_dict()

Visualize what the dataset looks like.

In [None]:
# Scatter the simulated outputs against the inputs on the current axes.
ax = plt.gca()
ax.scatter(dict['inputs'], dict['output'], color='black', s=3)
ax.set_xlabel('input variable')
ax.set_ylabel('output variable');

Load the class you'll need to save the data.

In [None]:
# The file name encodes the settings used to simulate this dataset.
file_name = f'{dim}_{injection}_sigma_{sigma}_size_{size_df}'
saver = MyDataLoader()
saver.save_data_h5(file_name, dict)

## 2) Save 2D, imaging data
Re-initialize the DataPreparation() class and use the same `prepare_data` function as before.

In [41]:
# Fresh DataPreparation instance for the 2D dataset; `prepare_data` operates
# on this module-level `data` object.
data = DataPreparation()

In [42]:
# Settings for the 2D (image) dataset.
noise, dim, injection = 'medium', '2D', 'output'
size_df = 1000

# Simulate the dataset; the returned sigma is used later in the file name.
data, sigma = prepare_data(
    noise=noise, dim=dim, injection=injection, size_df=size_df
)

2D data
inject type is output, dim is 2D, sigma is 0.05
2D data generated,                 with noise injected type: output.


In [43]:
# Dictionary form of the simulated 2D dataset, saved in the next cell.
# NOTE(review): the name `dict` shadows Python's built-in `dict` type.
dict = data.get_dict()

In [44]:
# The file name encodes the settings used to simulate this dataset.
file_name = f'{dim}_{injection}_sigma_{sigma}_size_{size_df}'
saver = MyDataLoader()
saver.save_data_h5(file_name, dict)

In [45]:
# Dataset selection passed to DeepEnsemble.py in the next cell.
noise, dim, injection = "medium", "2D", "output"
size_df = 1000
data_path = "../data/"

# Ensemble/training settings and output location passed to the same script.
n_models, n_epochs = 5, 100
out_dir = "../DeepUQResources/"

In [46]:
# Run the DeepEnsemble.py script in a subshell. IPython replaces each $name
# with the value of the Python variable of that name (set in the cell above).
!python ../src/scripts/DeepEnsemble.py --noise_level $noise \
    --n_models $n_models --data_dimension $dim \
    --data_injection $injection --uniform --overwrite_final_checkpoint \
    --save_all_checkpoints --save_final_checkpoint --out_dir $out_dir \
    --n_epochs $n_epochs --size_df $size_df --data_path $data_path --verbose

Reading settings from cli and default,               dumping to temp config:  ./DeepUQResources/temp/temp_config_DE_20241009194433.yml
loaded this file:  2D_output_sigma_0.05_size_1000
df {'inputs': tensor([[[3.8165e-04, 4.0786e-04, 4.3822e-04,  ..., 1.5620e-05,
          1.2482e-05, 1.0394e-05],
         [4.2965e-04, 4.6385e-04, 5.0471e-04,  ..., 1.9418e-05,
          1.5489e-05, 1.2879e-05],
         [4.9179e-04, 5.3782e-04, 5.9467e-04,  ..., 2.5329e-05,
          2.0160e-05, 1.6732e-05],
         ...,
         [1.7157e-05, 2.0745e-05, 2.6182e-05,  ..., 7.6410e-04,
          6.9063e-04, 6.3078e-04],
         [1.3266e-05, 1.6011e-05, 2.0162e-05,  ..., 6.4822e-04,
          5.9607e-04, 5.5199e-04],
         [1.0745e-05, 1.2949e-05, 1.6276e-05,  ..., 5.6223e-04,
          5.2403e-04, 4.9058e-04]],

        [[2.4898e-15, 6.9285e-15, 2.1391e-14,  ..., 1.4118e-08,
          6.9968e-09, 3.6206e-09],
         [4.9008e-15, 1.3752e-14, 4.2831e-14,  ..., 1.8868e-08,
          8.9024e-09, 4.4111

In [None]:
# Plot the first five simulated examples as line + scatter, labelling each
# with its first two parameters, then label the axes by injection mode.
# NOTE(review): `inject` is not assigned in any cell above (earlier cells use
# `injection` with the values 'output'/'input'), so this cell raises NameError
# on a fresh kernel. The mode labels 'predictive'/'feature' also differ from
# the ones used by `prepare_data`.
plt.clf()
for i in range(5):
    if inject == 'predictive':
        # NOTE(review): the (1000, 101) shape is hard-coded; the 0D branch of
        # `prepare_data` simulates on np.linspace(0, 10, 100) — confirm that
        # this matches the actual shape of `data.input`.
        xs = np.reshape(data.input, (1000, 101))
        print(np.shape(data.input))
        print(np.shape(xs))
        plt.plot(xs[i], data.output[i])
        plt.scatter(xs[i], data.output[i],
                label = f'$m = ${data.params[i][0]}, $b = ${data.params[i][1]}')
    elif inject == 'feature':
        plt.plot(data.input[i], data.output[i])
        plt.scatter(data.input[i], data.output[i],
                    label = f'$m = ${data.params[i][0]}, $b = ${data.params[i][1]}')
plt.legend()
# Primed symbol marks the axis for the selected mode (y' for 'predictive',
# x' for 'feature').
if inject == 'predictive':
    plt.xlabel('x')
    plt.ylabel(r'$y^{\prime}$')
elif inject == 'feature':
    plt.xlabel(r'$x^{\prime}$')
    plt.ylabel('y')
plt.show()
    

## Save the data to h5

In [None]:
# NOTE(review): the next cell also creates a MyDataLoader and assigns it to
# `saver`, so this cell looks redundant.
saver = MyDataLoader()

In [None]:
# Save the current dataset dictionary to h5, using the same file-naming
# convention as the save cells earlier in this notebook:
# <dim>_<injection>_sigma_<sigma>_size_<size_df>.
saver = MyDataLoader()
saver.save_data_h5(f'{dim}_{injection}_sigma_{sigma}_size_{size_df}', dict)

## Save for all noise levels and all inject styles

In [None]:
# Simulate and save one dataset for every combination of noise level and
# injection mode, reusing `prepare_data` and the file-naming convention of the
# save cells above. Loop variables use their own names so the notebook-level
# `noise` / `injection` / `sigma` settings are left untouched.
noise_list = ['low', 'medium', 'high']
# `prepare_data` branches on 'input' / 'output' for the injection mode.
inject_list = ['input', 'output']

for noise_level in noise_list:
    for inject_type in inject_list:
        # NOTE(review): uses the current `dim` and `size_df`; assumes
        # DataPreparation.get_sigma accepts inject_type='input' — confirm.
        dataset, sigma_level = prepare_data(
            noise_level, dim, inject_type, size_df
        )
        saver.save_data_h5(
            f'{dim}_{inject_type}_sigma_{sigma_level}_size_{size_df}',
            dataset.get_dict(),
        )