# Optimization techniques Lab. 6: Bayesian Optimization
## Introduction
**Goal.** The goal of this lab is to study the behavior of Bayesian optimization on a regression problem and on a classification problem.
Bayesian optimization is a probabilistic approach that uses Bayes' theorem, $P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}$. Briefly, we use the prior information $P(A)$ (random samples) to optimize a surrogate function, $P(B \mid A)$.

**Getting started.** The following cells contain the implementation of the methods that we will use throughout this lab, together with utilities. 


In [None]:
import numpy as np

from typing import Tuple, Callable, List
from warnings import catch_warnings, simplefilter
from matplotlib import pyplot
from numpy import arange, ndarray, sin, argmax, asarray, mean, vstack, pi
from numpy.random import normal, random
from scipy.stats import norm
from sklearn.datasets import make_blobs
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, Matern, RationalQuadratic, DotProduct
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from skopt import gp_minimize
from skopt.space import Integer
from skopt.utils import use_named_args

In [None]:
# objective function
# def objective(x: float, noise: float = 0.1) -> float:
#     return x ** 2 * sin(3 * pi * x) + np.random.uniform(-noise, noise)

def objective(x: float, noise: float = 0.05) -> float:
    """
    noisy objective function: sin(6 * pi * x) plus uniform noise

    :param x: point at which the objective is evaluated; an array is also
        accepted and is evaluated element-wise
    :param noise: half-width of the uniform noise interval [-noise, noise)
    :return: the noisy objective value(s), with the same shape as ``x``
    """

    # draw one independent noise term per element of x: a single shared draw
    # would shift every element of an array input by the same offset
    return sin(3 * 2 * pi * x) + np.random.uniform(-noise, noise, size=np.shape(x))

In [None]:
# dense grid over [0, 1) with step 0.001, stored as a single-column 2-D array
XS = arange(0, 1, 0.001)[:, np.newaxis]
# objective evaluated on that grid with the noise amplitude set to zero
YS = objective(XS, noise=0)


def surrogate(model: GaussianProcessRegressor, X: ndarray[float]) -> Tuple[ndarray[float], ndarray[float]]:
    """
    query the surrogate (the model approximating the objective function)

    :param model: fitted model exposing ``predict(X, return_std=True)``
    :param X: points at which the surrogate is evaluated
    :return: whatever ``model.predict(X, return_std=True)`` returns
        (the predicted mean and standard deviation)
    """

    # silence every warning raised while predicting; the previous warning
    # filters are restored as soon as the context manager exits
    with catch_warnings():
        simplefilter("ignore")
        prediction = model.predict(X, return_std=True)

    return prediction


def opt_acquisition(
        xs_known: ndarray[float],
        ys_known: ndarray[float],
        model: GaussianProcessRegressor,
        acquisition: Callable[[ndarray[float], ndarray[float], GaussianProcessRegressor], ndarray[float]],
        n_samples: int = 50
) -> float:
    """
    optimize the acquisition function by random search over [0, 1)

    :param xs_known: inputs sampled so far, forwarded to ``acquisition``
    :param ys_known: observations sampled so far (not used here, kept so the
        signature stays stable for callers)
    :param model: surrogate model, forwarded to ``acquisition``
    :param acquisition: scoring function, called as
        ``acquisition(xs_known, candidates, model)``
    :param n_samples: number of random candidates to score
    :return: the candidate with the highest acquisition score
    :raises ValueError: if ``n_samples`` is not positive, or if the number of
        scores differs from the number of candidates
    """

    if n_samples < 1:
        raise ValueError(f'n_samples must be positive, got {n_samples}')

    # random search: uniform candidates in [0, 1), one per row
    candidates = random(n_samples).reshape(-1, 1)

    # one score per candidate, flattened so a column of scores is accepted too
    scores = np.asarray(acquisition(xs_known, candidates, model), dtype=float).ravel()
    if scores.size != n_samples:
        raise ValueError(
            f'acquisition returned {scores.size} scores for {n_samples} candidates'
        )

    # argmax would pick a NaN score over every finite one, so an undefined
    # score is ranked last instead
    scores = np.where(np.isnan(scores), -np.inf, scores)

    return candidates[argmax(scores), 0]


def _save_iteration_plot(
        i: int,
        file: str,
        model: GaussianProcessRegressor,
        xs_known: ndarray[float],
        ys_known: ndarray[float],
        acquisition: Callable[[ndarray[float], ndarray[float], GaussianProcessRegressor], ndarray[float]],
        x_next: float
) -> None:
    """
    draw the surrogate and the acquisition function of one iteration and
    save the figure as ``'{file} {i}.svg'``
    """
    from pathlib import Path

    fig, (ax1, ax2) = pyplot.subplots(2, sharex=True, height_ratios=[3, 1], gridspec_kw={'hspace': 0})
    try:
        plot_approximation(ax1, model, xs_known, ys_known, x_next)
        plot_acquisition(ax2, model, xs_known, acquisition, x_next)

        # the legend is only drawn on the first figure of a run
        if i == 0:
            handles, labels = [], []
            for ax in fig.axes:
                ax_handles, ax_labels = ax.get_legend_handles_labels()
                handles.extend(ax_handles)
                labels.extend(ax_labels)
            fig.legend(handles, labels, loc='upper left')

        # savefig does not create missing folders, so make sure the target
        # folder (e.g. 'ei/') exists before writing
        target = Path(f'{file} {i}.svg')
        target.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(target)
    finally:
        # one figure is created per iteration: release it right away so
        # figures do not pile up over a run
        pyplot.close(fig)


def bayesian_optimization(
        generation: int,
        model: GaussianProcessRegressor,
        acquisition: Callable[[ndarray[float], ndarray[float], GaussianProcessRegressor], ndarray[float]],
        initial_points: List[float],
        file: str
) -> Tuple[ndarray[float], ndarray[float], GaussianProcessRegressor]:
    """
    run the Bayesian optimization loop on ``objective``

    The model is fitted on the initial points; then, ``generation`` times, the
    next point is chosen with ``opt_acquisition``, sampled with ``objective``,
    plotted to ``'{file} {i}.svg'`` and added to the dataset before the model
    is fitted again.

    :param generation: number of optimization steps
    :param model: surrogate model, fitted in place
    :param acquisition: acquisition function used to pick the next point
    :param initial_points: inputs sampled before the loop starts
    :param file: path prefix of the SVG files written at each step
    :return: all sampled inputs and observations (as single-column arrays)
        and the fitted model
    :raises ValueError: if ``initial_points`` is empty
    """

    if len(initial_points) == 0:
        raise ValueError('initial_points must contain at least one point')

    # reshape into rows and cols
    xs_known = asarray(initial_points).reshape(-1, 1)
    ys_known = asarray([objective(x) for x in xs_known]).reshape(-1, 1)

    # fit the model
    model.fit(xs_known, ys_known)

    # perform the optimization process
    for i in range(generation):
        # select the next point and sample it
        x_next = opt_acquisition(xs_known, ys_known, model, acquisition)
        y_next = objective(x_next)

        # plot the state of the model *before* the new sample is added
        _save_iteration_plot(i, file, model, xs_known, ys_known, acquisition, x_next)

        # add the data to the dataset
        xs_known = vstack((xs_known, [[x_next]]))
        ys_known = vstack((ys_known, [[y_next]]))

        # update the model
        model.fit(xs_known, ys_known)

    return xs_known, ys_known, model

In [None]:
def plot_approximation(
        ax,
        model,
        xs_known,
        ys_known,
        x_next,
):
    """
    draw the surrogate, the noise-free objective and the samples on ``ax``

    :param ax: matplotlib axes to draw on
    :param model: fitted model, queried on the global grid ``XS``
    :param xs_known: inputs sampled so far
    :param ys_known: observations sampled so far
    :param x_next: next sampling location, marked with a vertical line
    """
    grid = XS.ravel()
    truth = YS.ravel()

    mu, std = model.predict(XS, return_std=True)
    mean = mu.ravel()

    # band of 1.96 standard deviations around the surrogate mean
    # (roughly a 95% interval for a normal distribution)
    spread = 1.96 * std
    ax.fill_between(grid, mean + spread, mean - spread, alpha=0.1)

    # band of +/- 0.05 around the noise-free objective
    # (0.05 is also the default noise amplitude of objective())
    ax.fill_between(grid, truth + 0.05, truth - 0.05, alpha=0.1)

    ax.plot(XS, YS, 'y--', lw=1, label='objective')
    ax.plot(XS, mu, 'b-', lw=1, label='surrogate function')
    ax.plot(xs_known, ys_known, 'kx', mew=3, label='noisy samples')

    # next sampling location (unlabelled here)
    ax.axvline(x=x_next, ls='--', c='k', lw=1)


def plot_acquisition(
        ax,
        model,
        xs_known,
        acquisition: Callable[[ndarray[float], ndarray[float], GaussianProcessRegressor], ndarray[float]],
        x_next,
):
    """
    draw the acquisition function over the global grid ``XS`` on ``ax``

    :param ax: matplotlib axes to draw on
    :param model: fitted model, forwarded to ``acquisition``
    :param xs_known: inputs sampled so far, forwarded to ``acquisition``
    :param acquisition: acquisition function to evaluate on the grid
    :param x_next: next sampling location, marked with a vertical line
    """
    scores = acquisition(xs_known, XS, model)
    ax.plot(XS, scores, 'r-', lw=1, label='Acquisition function')
    ax.axvline(x=x_next, ls='--', c='k', lw=1, label='Next sampling location')

Implementation part
---
Your first step will be to implement the following functions:

1.   objective() is the function to optimize.
2.   initial_point() returns the initial set of points (a priori knowledge)
3.   acquisition_function() implements the acquisition function

In [None]:
def probability_of_improvement(
        xs_known: ndarray[float],
        xs_unknown: ndarray[float],
        model: GaussianProcessRegressor
) -> ndarray[float]:
    """
    probability of improvement (PI) acquisition function

    Scores each candidate with the probability, under the surrogate, that its
    value exceeds the best surrogate mean over the points sampled so far.

    :param xs_known: inputs sampled so far
    :param xs_unknown: candidate inputs to score
    :param model: fitted surrogate model
    :return: 1-D array with one probability per candidate
    """

    # best surrogate mean over the points sampled so far
    y_hat, _ = surrogate(model, xs_known)
    best = np.max(y_hat)

    mu, sd = surrogate(model, xs_unknown)
    # flatten both: a mean returned as an (n, 1) column would broadcast
    # against the 1-D std into an (n, n) matrix
    # NOTE(review): whether predict returns a column here depends on the
    # installed scikit-learn version - confirm
    mu = np.ravel(mu)
    sd = np.ravel(sd)
    improvement = mu - best

    # with zero predictive std the z-score is undefined (division by zero);
    # the prediction is then deterministic, so the probability is 1 when the
    # mean beats the best value and 0 otherwise
    uncertain = sd > 0
    z = improvement / np.where(uncertain, sd, 1.0)

    return np.where(uncertain, norm.cdf(z), (improvement > 0).astype(float))

In [None]:
def expected_improvement(
        xs_known: ndarray[float],
        xs_unknown: ndarray[float],
        model: GaussianProcessRegressor,
) -> ndarray[float]:
    """
    expected improvement (EI) acquisition function

    Scores each candidate with ``(mu - best) * cdf(z) + sd * pdf(z)``, where
    ``z = (mu - best) / sd`` and ``best`` is the best surrogate mean over the
    points sampled so far.

    :param xs_known: inputs sampled so far
    :param xs_unknown: candidate inputs to score
    :param model: fitted surrogate model
    :return: 1-D array with one expected improvement per candidate
    """

    # best surrogate mean over the points sampled so far
    y_hat, _ = surrogate(model, xs_known)
    best = np.max(y_hat)

    mu, sd = surrogate(model, xs_unknown)
    # flatten both: a mean returned as an (n, 1) column would broadcast
    # against the 1-D std into an (n, n) matrix
    # NOTE(review): whether predict returns a column here depends on the
    # installed scikit-learn version - confirm
    mu = np.ravel(mu)
    sd = np.ravel(sd)
    improvement = mu - best

    # with zero predictive std the z-score is undefined (division by zero);
    # the prediction is then deterministic, so the expected improvement is
    # simply the positive part of the improvement
    uncertain = sd > 0
    z = improvement / np.where(uncertain, sd, 1.0)
    ei = improvement * norm.cdf(z) + sd * norm.pdf(z)

    return np.where(uncertain, ei, np.maximum(improvement, 0.0))

Regression
---
## Questions:
- How does the prior knowledge change the optimization?
- How does the kernel change the optimization? (See the available [kernels](https://scikit-learn.org/stable/modules/gaussian_process.html#kernels-for-gaussian-processes).)
- How does the acquisition function affect the optimization?

In [None]:
def regression() -> None:
    """
    run Bayesian optimization on the 1-D objective and print the best sample
    """
    xs: ndarray[float]
    ys: ndarray[float]
    xs, ys, _ = bayesian_optimization(
        generation=10,
        model=GaussianProcessRegressor(ExpSineSquared(length_scale=1/3, periodicity=1/3)),
        acquisition=expected_improvement,
        initial_points=[1/3],
        file='ei/ei exp-sine-squared one-third'
    )

    # xs and ys are single-column arrays: index the column explicitly so that
    # plain scalars are formatted (formatting a 1-element array relies on an
    # array-to-scalar conversion that NumPy has deprecated)
    ix = int(argmax(ys))
    print(f'Best Result: x={xs[ix, 0]:.3f}, y={ys[ix, 0]:.3f}')


regression()

Classifier
---
## Questions:
- Try different ranges of hyperparameters. How do the results change?
- Does the model influence the choice of the hyperparameters?

In [None]:
def classifier() -> None:
    """
    tune a k-nearest-neighbours classifier on synthetic blobs with
    ``gp_minimize`` and print the best accuracy and hyperparameters
    """

    # synthetic classification dataset: 500 samples, 2 features, 3 centers
    features, labels = make_blobs(n_samples=500, centers=3, n_features=2)

    # model whose hyperparameters are tuned
    knn = KNeighborsClassifier()

    # hyperparameters explored by the optimizer, with their integer ranges
    dimensions = [Integer(1, 5, name='n_neighbors'), Integer(1, 2, name='p')]

    # the decorator passes each configuration as keyword arguments named
    # after the dimensions above
    @use_named_args(dimensions)
    def error_rate(**params):
        # apply the proposed configuration to the model
        knn.set_params(**params)
        with catch_warnings():
            # ignore generated warnings
            simplefilter("ignore")
            # accuracy of each fold of a 5-fold cross validation
            fold_scores = cross_val_score(knn, features, labels, cv=5, n_jobs=-1, scoring='accuracy')
            # the optimizer minimizes, so return the error instead of the accuracy
            return 1.0 - mean(fold_scores)

    # perform optimization
    outcome = gp_minimize(error_rate, dimensions)

    # summarize the findings
    best_neighbors, best_p = outcome.x[0], outcome.x[1]
    print('Best Accuracy: %.3f' % (1.0 - outcome.fun))
    print('Best Parameters: n_neighbors=%d, p=%d' % (best_neighbors, best_p))


classifier()

# BONUS

In the classifier, you saw the effect of hyperparameter tuning.
You can now change the acquisition functions in the regression problem by adding a slack variable as a hyperparameter. How does this variable affect the optimization?