# Class for Grid Search

In [44]:
# external libraries
import numpy as np
from typing import Iterator
import itertools

# local libraries
from estimator import Estimator
from util_classes import Dataset
from optimizer import Optimizer
from nn import NeuralNetwork, LinearLayer, ActivationFunction

from loss import LossFunction

# def filter_dict_by_key(dictionary: dict, keys: list[str]) -> dict:
#     filtered_dict = {}
#     for key in keys:
#         if key in dictionary.keys():
#             filtered_dict[key] = dictionary[key]
#     return filtered_dict


class GridSearch:
    _net_keys = ["layers"]
    _optimizer_keys = ["eta", "l2_coeff", "alpha"]
    _loss_keys = ["fname"]
    _estimator_keys = ["batchsize"]
    # _global_keys = _net_keys + _optimizer_keys + _loss_keys + _estimator_keys

    # dictionary containing translations from exposed names to names to pass to functions internally
    _param_name_translations = {
        "layers": "layers",
        "l2": "l2_coeff",
        "momentum": "alpha",
        "eta": "eta",
        "loss": "fname",
        "batchsize": "batchsize",
    }

    # check param_grid and remove invalid values
    # TODO implement
    @staticmethod
    def _check_param_grid(hyper_grid) -> bool:
        return True

    def __init__(self, estimator: Estimator, hyper_grid: dict):
        if estimator == None or type(estimator) != Estimator:
            raise TypeError
        self._estimator = estimator
        if hyper_grid == None or type(hyper_grid) != dict:
            raise TypeError

        # check for wrong values
        if not GridSearch._check_param_grid(hyper_grid):
            raise ValueError

        # translate names of parameters and sort by key for better efficiency
        new_grid = {}
        for key in self._param_name_translations.keys():
            if key in hyper_grid:
                new_grid[self._param_name_translations[key]] = hyper_grid[key]
        self._hyper_grid = new_grid

    # returns a list of data folds through indexes
    def _generate_folds(self) -> Iterator[tuple[Dataset, Dataset]]:
        n_folds = self._n_folds
        dataset = self._dataset
        data_size = dataset.ids.shape[0]
        indices = np.arange(data_size)

        # TODO maybe shuffle not needed if we assume dataset has already been shuffled
        np.random.shuffle(indices)

        for index_lists in np.array_split(indices, n_folds):
            # make mask to split test and training set indices
            mask = np.zeros(data_size, dtype=bool)
            mask[index_lists] = True
            test_indices = indices[mask]
            train_indices = indices[~mask]
            # initialize test set and training set
            test_set = Dataset(
                ids=dataset.ids[test_indices],
                labels=dataset.labels[test_indices],
                data=dataset.data[test_indices],
            )
            train_set = Dataset(
                ids=dataset.ids[train_indices],
                labels=dataset.labels[train_indices],
                data=dataset.data[train_indices],
            )
            yield (train_set, test_set)

    # returns the best set of hyperparameters
    def k_fold(self, dataset: Dataset, n_folds: int, n_epochs: int):
        if isinstance(dataset, Dataset):
            self._dataset = dataset
        else:
            raise TypeError

        if type(n_folds) == int:
            self._n_folds = n_folds
        else:
            raise TypeError

        data_size = dataset.ids.shape[0]
        if n_folds > data_size:
            raise ValueError

        hyper_grid = self._hyper_grid
        estimator = self._estimator
        input_dim = dataset.shape[1][0]
        output_dim = dataset.shape[1][1]

        # check if output layer is correct for all combinations
        for layers in hyper_grid["layers"]:
            if layers[-1][0] != output_dim:
                raise ValueError(
                    "Number of units in last layer must be equal to the output dimension of the data"
                )

        # generates all combinations of hyperparameters
        keys, values = zip(*hyper_grid.items())
        param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

        combination_loss_list = []

        # iterates all combinations of hyperparameters
        for combination in param_combinations:
            # filter parameters for various classes
            loss_params = {key: combination[key] for key in self._loss_keys}
            estimator_params = {key: combination[key] for key in self._estimator_keys}
            optimizer_params = {key: combination[key] for key in self._optimizer_keys}
            net_params = {key: combination[key] for key in self._net_keys}

            # create dictionary of params to pass to constructors
            estimator_params["loss"] = LossFunction(**loss_params)
            estimator_params["optimizer"] = Optimizer(**optimizer_params)

            # create list of layers to create NN
            old_units = input_dim
            layer_list = []
            for layer in net_params["layers"][:-1]:
                layer_list.append(LinearLayer((old_units, layer[0])))
                # TODO maybe linear layers can be removed
                layer_list.append(ActivationFunction(fname=layer[1]))
                old_units = layer[0]
            last_layer = net_params["layers"][-1]
            layer_list.append(LinearLayer((old_units, last_layer[0])))
            if last_layer[1] != "linear":
                layer_list.append(ActivationFunction(fname=layer_list[1]))
            estimator_params["net"] = NeuralNetwork(layer_list)

            print(estimator_params)
            estimator.update_params(**estimator_params)

            test_loss_list = []

            # iterates folds of dataset
            for train_set, test_set in self._generate_folds():
                estimator.train(dataset=train_set, n_epochs=n_epochs)
                estimator.reset()
                test_loss_list.append(estimator.evaluate(test_set))

            test_loss_avg = sum(test_loss_list) / n_folds
            test_loss_std = np.std(test_loss_list)

            combination_loss_list.append((combination, test_loss_avg, test_loss_std))

        combination_loss_list.sort(key=lambda x: x[1])
        return combination_loss_list

    # returns an estimation of the risk for the model, average +- variance
    def nested_k_fold(dataset: Dataset, inner_n_folds: int, outer_n_folds: int):
        print("hello")

In [48]:
net = NeuralNetwork(
    [
        LinearLayer((8, 16)),
        ActivationFunction(),
        LinearLayer((16, 16)),
        ActivationFunction(),
        LinearLayer((16, 2)),
    ]
)
estimator = Estimator(net)
grid = {}
grid["eta"] = [0.001, 0.002]
grid["momentum"] = [0.1, 0.2]
grid["layers"] = [[(8, "ReLU"), (1, "linear")], [(4, "ReLU"), (1, "linear")]]
grid["loss"] = ["MSE"]
grid["l2"] = [0.1, 0.2]
grid["batchsize"] = [124]

In [49]:
from datasets import read_monks, read_ML_cup

# data = _ML_cup("train")
data = read_monks(1, "train")

In [50]:
selector = GridSearch(estimator, grid)
selector.k_fold(data, 5)

{'batchsize': 124, 'loss': <loss.LossFunction object at 0x7fd4023d18d0>, 'optimizer': <optimizer.Optimizer object at 0x7fd4023d2650>, 'net': <nn.NeuralNetwork object at 0x7fd4023d2c80>}
{'epoch': 1, 'loss': 148.543418689196}
{'epoch': 2, 'loss': 11.776068282685094}
{'epoch': 3, 'loss': 1.587608469741858}
{'epoch': 4, 'loss': 0.3498037083808738}
{'epoch': 5, 'loss': 0.14246572472548918}
{'epoch': 1, 'loss': 364.00627621048505}
{'epoch': 2, 'loss': 3.8031777447770327}
{'epoch': 3, 'loss': 0.2402034367025574}
{'epoch': 4, 'loss': 0.11789585404721063}
{'epoch': 5, 'loss': 0.12769578219417183}
{'epoch': 1, 'loss': 199.55025099625323}
{'epoch': 2, 'loss': 11.936864367712229}
{'epoch': 3, 'loss': 1.504491501845713}
{'epoch': 4, 'loss': 0.33154189603884743}
{'epoch': 5, 'loss': 0.15000368207013431}
{'epoch': 1, 'loss': 569.2166983607118}
{'epoch': 2, 'loss': 2.3118018028042284}
{'epoch': 3, 'loss': 0.10022971248422886}
{'epoch': 4, 'loss': 0.14198613132400054}
{'epoch': 5, 'loss': 0.1852421295

[({'layers': [(4, 'ReLU'), (1, 'linear')],
   'l2_coeff': 0.1,
   'alpha': 0.1,
   'eta': 0.002,
   'fname': 'MSE',
   'batchsize': 124},
  67.03715028397518,
  40.151153119165535),
 ({'layers': [(4, 'ReLU'), (1, 'linear')],
   'l2_coeff': 0.1,
   'alpha': 0.2,
   'eta': 0.001,
   'fname': 'MSE',
   'batchsize': 124},
  72.06727771382711,
  25.50924543985118),
 ({'layers': [(4, 'ReLU'), (1, 'linear')],
   'l2_coeff': 0.2,
   'alpha': 0.1,
   'eta': 0.001,
   'fname': 'MSE',
   'batchsize': 124},
  74.37118451913543,
  48.8114496687625),
 ({'layers': [(4, 'ReLU'), (1, 'linear')],
   'l2_coeff': 0.2,
   'alpha': 0.2,
   'eta': 0.002,
   'fname': 'MSE',
   'batchsize': 124},
  74.99752660249706,
  25.772080101489273),
 ({'layers': [(4, 'ReLU'), (1, 'linear')],
   'l2_coeff': 0.2,
   'alpha': 0.2,
   'eta': 0.001,
   'fname': 'MSE',
   'batchsize': 124},
  109.32292663675696,
  30.282932753705186),
 ({'layers': [(4, 'ReLU'), (1, 'linear')],
   'l2_coeff': 0.1,
   'alpha': 0.1,
   'eta': 0.