In [9]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from yaml import safe_load, YAMLError
from src.utils import setup_output_folder
from src.snapshots import snapshots_assembly, data_normalization
from src.linear_reduction import SVD
from src.nonlinear_reduction import AutoEncoder
from src.data_split import DataSplitter
from src.postprocessing import compute_errors

In [11]:
# Load the experiment configuration from parameters.yaml into `params`.
with open("parameters.yaml", "r", encoding="utf-8") as stream:
    try:
        params = safe_load(stream)
    except YAMLError as exc:
        # Show the parser's message, then re-raise: swallowing the error would
        # leave `params` undefined (fresh kernel) or silently stale (re-run),
        # and every later cell depends on it.
        print(exc)
        raise

In [12]:
# Display the loaded configuration dictionary for reference.
params

{'experiment_name': 'epochs_5000',
 'random_state': 42,
 'normalization': {'snapshots': None,
  'svd': 'min_max',
  'autoencoder': None,
  'surrogate': None},
 'splitting': {'strategy': 'kfold',
  'number_of_folds_or_splits': 2,
  'train_size': 0.8,
  'validation_size': 0.1,
  'test_size': 0.1,
  'gap': 0},
 'snapshots': {'file_type_str': 'h5_fenics',
  'folder': 'data/input',
  'visualization_folder': 'data/visualization',
  'file_name_contains': ['concentration'],
  'dataset': None},
 'svd': {'trunc_basis': 300,
  'normalization': 'min_max',
  'svd_type': 'randomized_svd',
  'power_iterations': 1,
  'oversampling': 20},
 'auto_encoder': {'batch_size': 300,
  'num_epochs': 5000,
  'learning_rate': '1e-4',
  'weight_decay': '1e-8',
  'loss_function': 'smooth_l1_loss',
  'loss_parameters': {'beta': 0.2},
  'num_workers': 2,
  'number_of_hidden_layers': 5,
  'hidden_layers_sizes': [256, 128, 64, 32, 16],
  'hidden_layers_activation_function': ['leaky_relu',
   'leaky_relu',
   'leaky_rel

In [13]:
# Reuse the snapshot matrix if it is already in the kernel (assembly is only
# re-run when the name is missing), then report its shape on both paths.
try:
    snapshots
except NameError:
    # Only a missing variable triggers assembly; any other error (including
    # KeyboardInterrupt) propagates instead of being hidden by a bare except.
    filenames, snapshots = snapshots_assembly(params["snapshots"])
print(snapshots.shape)

(70801, 3000)


In [14]:
def pipeline_modes(backtest_flag=True, inference_flag=False):
    """Run SVD on the snapshots and fit auto-encoders on the spatial modes.

    Reads the notebook-level globals ``params`` (loaded configuration) and
    ``snapshots`` (assembled snapshot data).

    Args:
        backtest_flag: When true, split the training portion again into
            train/validation folds, fit one auto-encoder per fold and call
            ``compute_errors`` for the train and validation parts with
            ``modeling_type="backtest"``.
        inference_flag: When true, fit one auto-encoder on the whole training
            portion and call ``compute_errors`` for the train and test parts
            with ``modeling_type="inference"``.

    Returns:
        None. Errors are reported through ``compute_errors``; plots come from
        the ``SVD`` and ``AutoEncoder`` plotting methods.
    """
    # setup directories
    output_folder = setup_output_folder(params)

    # high dimensional data
    svd = SVD(snapshots, params, output_folder)
    svd.fit()
    svd.plot_singular_values()
    spatial_modes = svd.u
    print(f"spatial modes dim: {spatial_modes.shape}")

    # train_test split: the result is indexed by fold key, and the single
    # train/test partition is read from key 0.
    data_splitter = DataSplitter(params)
    folded_data = data_splitter.split_data(spatial_modes, train_test_flag=True)
    train_test_fold = 0
    total_train_data = folded_data[train_test_fold]["train"]
    total_test_data = folded_data[train_test_fold]["test"]

    print("train_data shape", total_train_data.shape)
    print("train_data type", type(total_train_data))

    if backtest_flag:
        # train_val split
        model_selection_data = data_splitter.split_data(total_train_data, train_test_flag=False)

        # fold artifacts:
        for fold in model_selection_data.keys():
            fold_artifacts = model_selection_data[fold]
            fold_train_data = fold_artifacts["train"]
            fold_train_indices = fold_artifacts["train_indices"]
            fold_validation_data = fold_artifacts["validation"]
            fold_validation_indices = fold_artifacts["validation_indices"]

            # preprocess high dimensional data
            # NOTE(review): validation data gets its own normalization object
            # rather than reusing the one obtained from the training data;
            # confirm this is intended (possible train/validation mismatch).
            normalized_spatial_train_modes, u_normalization_train_fold_obj = data_normalization(
                fold_train_data, params, "svd", transpose=False
            )
            normalized_spatial_val_modes, u_normalization_val_fold_obj = data_normalization(
                fold_validation_data, params, "svd", transpose=False
            )
            print(f"normalized spatial train modes dim: {normalized_spatial_train_modes.shape}")
            print(f"normalized spatial val modes dim: {normalized_spatial_val_modes.shape}")

            # fit high dimensional data
            auto_encoder = AutoEncoder(normalized_spatial_train_modes, params, output_folder)
            auto_encoder.fit()
            auto_encoder.plot_quantities_per_epoch("avg_loss_by_epoch")

            # compute error for training data
            normalized_train_predictions = auto_encoder.predict(normalized_spatial_train_modes)
            train_predictions = u_normalization_train_fold_obj.inverse_transform(normalized_train_predictions)
            compute_errors(fold, train_predictions, fold_train_data, fold_train_indices, output_folder, analysis_type="train", modeling_type="backtest")

            # compute error for validation data
            normalized_val_predictions = auto_encoder.predict(normalized_spatial_val_modes)
            val_predictions = u_normalization_val_fold_obj.inverse_transform(normalized_val_predictions)
            compute_errors(fold, val_predictions, fold_validation_data, fold_validation_indices, output_folder, analysis_type="validation", modeling_type="backtest")

    if inference_flag:
        # train for all data
        total_train_indices = folded_data[train_test_fold]["train_indices"]
        total_test_indices = folded_data[train_test_fold]["test_indices"]

        # normalize training and test data
        # NOTE(review): as in the backtest branch, the test data gets its own
        # normalization object; confirm this is intended.
        total_normalized_spatial_train_modes, u_normalization_total_train_obj = data_normalization(
            total_train_data, params, "svd", transpose=False
        )
        total_normalized_spatial_test_modes, u_normalization_total_test_obj = data_normalization(
            total_test_data, params, "svd", transpose=False
        )
        print(f"normalized total spatial train modes dim: {total_normalized_spatial_train_modes.shape}")

        # fit high dimensional data
        auto_encoder = AutoEncoder(total_normalized_spatial_train_modes, params, output_folder)
        auto_encoder.fit()
        auto_encoder.plot_quantities_per_epoch("avg_loss_by_epoch")

        # compute error for training data
        # The fold id is the train/test key used above (not the backtest loop
        # variable, which is unbound when backtest_flag is false), and the
        # reference is the un-normalized data, mirroring the backtest calls.
        total_normalized_train_predictions = auto_encoder.predict(total_normalized_spatial_train_modes)
        total_train_predictions = u_normalization_total_train_obj.inverse_transform(total_normalized_train_predictions)
        compute_errors(train_test_fold, total_train_predictions, total_train_data, total_train_indices, output_folder, analysis_type="train", modeling_type="inference")

        # compute error for test data
        total_normalized_test_predictions = auto_encoder.predict(total_normalized_spatial_test_modes)
        total_test_predictions = u_normalization_total_test_obj.inverse_transform(total_normalized_test_predictions)
        compute_errors(train_test_fold, total_test_predictions, total_test_data, total_test_indices, output_folder, analysis_type="test", modeling_type="inference")
    


In [15]:
%%capture
# Run the pipeline with the inference stage disabled (backtest_flag keeps its
# default of True); %%capture suppresses the cell's output.
pipeline_modes(inference_flag=False)

In [None]:
# TODO: move this to Google Docs
# TODO: surrogate