# Sum of Gaussians

Presentation of the neural model supported by kernel regression on the sum-of-Gaussians dataset.

*Note*: to see how the dataset was generated, see `dataset.ipynb`.

In [None]:
import sys

sys.path.append("..")  # the notebook runs from a subdirectory, so add the parent directory (repo root) to make `src` importable

In [None]:
import pandas as pd
import seaborn as sns
import torch
from pydentification.data.datamodules.simulation import SimulationDataModule
from pydentification.data.process import unbatch
from sklearn import metrics

from src.nonparametric import kernels
from src.nonparametric.memory import ExactMemoryManager
from src.training.module import BoundedSimulationTrainingModule

In [None]:
# apply seaborn's default theme to any plots created later
sns.set()

# Dataset

In [None]:
# locations of the dataset, the plot directory and the pre-trained network,
# all relative to the directory the notebook is run from
data_path = r"../data/csv/sum-of-gaussians.csv"
plot_path = r"../data/plots/sum-of-gaussians/"
model_path = r"../models/sum-of-gaussians.pt"

# index of the first test row: the final report compares predictions against `y` from this row onwards
train_size = 10_000

We do not plot this dataset, since it is 8-dimensional.

In [None]:
# load the raw CSV into a DataFrame and preview its first three rows
dataset = pd.read_csv(data_path)
dataset.head(3)

In [None]:
# we use a trick to generate static data using the data-module for simulation:
# the time and system dimensions are swapped to keep the implementation the same
dm = SimulationDataModule.from_csv(
    dataset_path=data_path,
    input_columns=["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"],
    output_columns=["y"],
    # `test_size` was never defined in this notebook (NameError), so derive it from `train_size`;
    # this keeps the test split aligned with the `iloc[train_size:]` targets used in the final report
    # NOTE(review): assumes an integer `test_size` is the number of trailing test samples - confirm
    test_size=len(dataset) - train_size,
    batch_size=32,  # used for prediction, we will not train the network here
    validation_size=0.0,  # no need for validation
    shift=1,
    forward_input_window_size=1,
    forward_output_window_size=1,
    forward_output_mask=0,
)

In [None]:
# setup the data for prediction
dm.setup("fit")
dm.setup("predict")

In [None]:
# print the shapes of the first training batch only
# (the system is 8-dimensional with a single time-step)
for x, y in dm.train_dataloader():
    print(x.shape, y.shape)
    break  # one batch is enough to inspect the shapes

In [None]:
# print the shapes of the first test batch only
for x, y in dm.test_dataloader():
    print(x.shape, y.shape)
    break  # one batch is enough to inspect the shapes

# Model

Create the model from the neural network we have trained before.

The settings for kernel regression were selected using a hyper-parameter search; they are the best we found for this problem.

In [None]:
# the notebook runs on CPU (see `memory_device` / `predict_device` in the model setup), so remap
# storages on load - otherwise a checkpoint saved from a CUDA device fails on CPU-only machines
# NOTE: this unpickles a full module object, so only load model files from a trusted source
network = torch.load(model_path, map_location="cpu")

In [None]:
# wrap the pre-trained network together with the kernel-regression settings
# NOTE(review): k, p and r are presumably neighbour count, norm order and radius - confirm in the module
model = BoundedSimulationTrainingModule(
    network=network,
    optimizer=torch.optim.Adam(network.parameters()),  # will not be used, since no training happens here
    lr_scheduler=None,
    bound_during_training=False,
    bound_crossing_penalty=0.0,
    bandwidth=0.91,  # a bandwidth of 0.9 produced a single NaN, so 0.91 is used instead
    kernel=kernels.box_kernel,
    memory_manager=ExactMemoryManager(),  # dataset is low-dimensional, so approximate nearest neighbours are not needed
    lipschitz_constant=0.25,  # known
    delta=0.1,  # user defined
    noise_variance=0.05,  # known from the dataset generation
    k=32,
    p=2,
    r=None,
    memory_device="cpu",
    predict_device="cpu",
)

In [None]:
# unbatch the dataset to prepare memory manager
x, y = unbatch(dm.train_dataloader())
x.shape, y.shape

In [None]:
# pass the unbatched training data to the model before predicting
# NOTE(review): presumably this fills the memory manager used by kernel regression - confirm in `prepare`
model.prepare(x, y)

# Test

Run the predictions with the trained network and kernel regression.

In [None]:
# predict on the test dataloader; `report` below reads the "network_predictions",
# "nonparametric_predictions" and "lower_bound" entries of the result
outputs = model.predict_dataloader(dm.test_dataloader())

In [None]:
def range_ratio_error(error, y_true):
    """Express `error` as a fraction of the spread (max - min) of `y_true`."""
    highest, lowest = y_true.max(), y_true.min()
    spread = highest - lowest
    return error / spread

def report(outputs, targets):
    """
    Print RMSE and range-ratio error for the network, the kernel regression and the bounds.

    :param outputs: mapping with "network_predictions", "nonparametric_predictions" and "lower_bound"
                    entries, each supporting `.numpy().flatten()`
    :param targets: array of true values, must provide `.max()` and `.min()`

    Note that the "BOUNDS" rows score the "lower_bound" entry directly against the targets.
    """

    def rmse(key):
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6,
        # so take the square root of the plain MSE, which works on every version
        predictions = outputs[key].numpy().flatten()
        return metrics.mean_squared_error(y_true=targets, y_pred=predictions) ** 0.5

    rmse_network = rmse("network_predictions")
    rmse_nonparametric = rmse("nonparametric_predictions")
    rmse_bound = rmse("lower_bound")

    print(f"RMSE NET:    {rmse_network:.4f}")
    print(f"RMSE KRE:    {rmse_nonparametric:.4f}")
    print(f"RMSE BOUNDS: {rmse_bound:.4f}", end="\n\n")
    print(f"RRR NET:     {range_ratio_error(error=rmse_network, y_true=targets):.2%}")
    print(f"RRR KRE:     {range_ratio_error(error=rmse_nonparametric, y_true=targets):.2%}")
    print(f"RRR BOUNDS:  {range_ratio_error(error=rmse_bound, y_true=targets):.2%}")

In [None]:
# targets are the `y` values from row `train_size` onwards
# NOTE(review): these should line up with the data-module's test split - verify against its `test_size`
report(outputs, dataset["y"].iloc[train_size:].values)