In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import Callable, List, Tuple

In [None]:
class Experiment:
    """
    Experiment class to run optimization experiments with second order methods,
    on a given function g, with computable gradient and hessian.

    Typical usage: construct the experiment, choose an initial theta with
    set_theta() (or set_e() + generate_initial_theta()), attach an optimizer
    with set_optimizer() and a dataset with set_dataset(), then call run() or
    run_multiple().

    The optimizer must expose ``reset_lr()`` and ``step(X, Y, theta)`` returning
    a ``(theta, hessian)`` pair; ``hessian`` may be None for first-order methods.
    """

    def __init__(
        self,
        g: Callable,
        g_grad: Callable,
        g_grad_and_hessian: Callable,
        true_theta: np.ndarray = None,
        true_hessian: np.ndarray = None,
    ):
        """
        Initialize the experiment

        g, g_grad and g_grad_and_hessian are forwarded to the optimizer in
        set_optimizer(). true_theta / true_hessian are optional ground truths;
        the corresponding error is only logged when they are provided.
        """
        self.g = g
        self.g_grad = g_grad
        self.g_grad_and_hessian = g_grad_and_hessian
        self.true_theta = true_theta
        self.true_hessian = true_hessian
        # Declare every piece of state up front so the "is not set" checks below
        # raise the intended ValueError instead of an AttributeError.
        self.e = None
        self.theta = None
        self.hessian = None
        self.optimizer = None
        self.dataset = None
        self.theta_errors_avg = None
        self.hessian_errors_avg = None

    def set_e(self, e: float):
        """
        Set the noise level for generating random initial theta
        """
        self.e = e

    def set_theta(self, theta: np.ndarray):
        """
        Set the initial theta
        """
        self.theta = theta

    def generate_initial_theta(self):
        """
        Generate a random initial theta: true_theta plus Gaussian noise scaled by e

        Raises ValueError if e or true_theta has not been provided.
        """
        if self.e is None:
            raise ValueError("e is not set for generating random theta")
        if self.true_theta is None:
            raise ValueError("true_theta is not set for generating random theta")
        self.theta = (
            self.true_theta + np.random.randn(self.true_theta.shape[0]) * self.e
        )

    def set_optimizer(self, optimizer):
        """
        Set the optimizer, hand it the objective callables and reset its schedule
        """
        self.optimizer = optimizer
        self.optimizer.g = self.g
        self.optimizer.g_grad = self.g_grad
        self.optimizer.g_grad_and_hessian = self.g_grad_and_hessian
        self.optimizer.reset_lr()

    def set_dataset(self, dataset: List[Tuple[np.ndarray, np.ndarray]]):
        """
        Set the dataset, an iterable of (X, Y) samples
        """
        self.dataset = dataset

    def _log_errors(self, theta_error: List[float], hessian_error: List[float]):
        """
        Append the current theta / hessian errors to the given histories.
        Nothing is appended for a quantity whose ground truth is unknown.
        """
        if self.true_theta is not None:
            diff = self.theta - self.true_theta
            theta_error.append(np.dot(diff, diff))
        if self.true_hessian is not None:
            if self.hessian is None:
                # First-order optimizers return no hessian estimate
                hessian_error.append(np.nan)
            else:
                hessian_error.append(
                    np.linalg.norm(self.hessian - self.true_hessian, ord="fro")
                )

    def run(self, plot: bool = False) -> Tuple[List[float], List[float]]:
        """
        Run the experiment for a given theta, optimizer and dataset

        Returns (theta_error, hessian_error): the squared Euclidean parameter
        error and the Frobenius hessian error, logged before the first step and
        after every step. A history is empty when its ground truth is unknown.

        Raises ValueError if theta, the optimizer or the dataset is not set.
        """
        if self.theta is None:
            raise ValueError("Theta is not set")
        if self.optimizer is None:
            raise ValueError("Optimizer is not set")
        if self.dataset is None:
            raise ValueError("Dataset is not set")

        self.hessian = np.eye(self.theta.shape[0])  # Reset hessian to identity
        theta_error: List[float] = []
        hessian_error: List[float] = []
        self._log_errors(theta_error, hessian_error)
        self.optimizer.reset_lr()

        for X, Y in tqdm(self.dataset, desc="Optimizing", leave=False):
            # One step of optimization. Keyword arguments keep the call
            # independent of the positional order chosen by the optimizer.
            self.theta, self.hessian = self.optimizer.step(
                X=X, Y=Y, theta=self.theta
            )
            # Log parameter error after each update
            self._log_errors(theta_error, hessian_error)
        if plot:
            self.plot_errors(theta_error, hessian_error)
        return theta_error, hessian_error

    def run_multiple(self, num_runs: int = 10) -> Tuple[List[float], List[float]]:
        """
        Run the experiment multiple times, and return the average error

        Each run starts from a fresh random theta (see generate_initial_theta).
        The averages are taken per iteration over the runs, stored on the
        instance, plotted and returned; the hessian average is None when
        true_hessian is unknown.

        Raises ValueError if num_runs < 1.
        """
        if num_runs < 1:
            raise ValueError("num_runs must be at least 1")

        theta_runs = []
        hessian_runs = []
        for _ in tqdm(range(num_runs), desc="Runs"):
            self.generate_initial_theta()
            theta_error, hessian_error = self.run()
            theta_runs.append(theta_error)
            hessian_runs.append(hessian_error)

        # Average across runs (axis 0), keeping one value per logged iteration
        self.theta_errors_avg = (
            np.mean(theta_runs, axis=0) if self.true_theta is not None else None
        )
        self.hessian_errors_avg = (
            np.mean(hessian_runs, axis=0) if self.true_hessian is not None else None
        )
        self.plot_errors(self.theta_errors_avg, self.hessian_errors_avg)
        return self.theta_errors_avg, self.hessian_errors_avg

    def plot_errors(self, theta_error, hessian_error):
        """
        Plot the errors of estimated theta and hessian
        (each one only when its ground truth is known)
        """
        if self.true_theta is not None:
            plt.plot(theta_error)
            plt.title("Parameter error")
            plt.xlabel("Iteration")
            plt.show()
        if self.true_hessian is not None:
            plt.plot(hessian_error)
            plt.title("Hessian error (Frobenius norm)")
            plt.xlabel("Iteration")
            plt.show()

In [None]:
def sigmoid(x):
    """
    Logistic function 1 / (1 + exp(-x)), evaluated elementwise.

    Written as exp(-log(1 + exp(-x))) via np.logaddexp so that large negative
    inputs do not overflow inside exp (which would emit a RuntimeWarning).
    """
    return np.exp(-np.logaddexp(0.0, np.negative(x)))


def create_dataset_logistic(n: int, theta: np.ndarray, rng=None):
    """
    Sample a synthetic logistic-regression dataset.

    Features X are standard normal with shape (n, d - 1) where d = len(theta);
    a constant 1 is prepended to form phi, and labels are drawn as
    Y ~ Bernoulli(sigmoid(phi @ theta)), so theta[0] acts as the intercept.

    rng: optional np.random.Generator for reproducible sampling. When None,
    the global np.random state is used, as before.

    Returns a list of n (X_i, Y_i) pairs, X_i having shape (d - 1,).
    Raises ValueError if theta is empty.
    """
    theta = np.asarray(theta)
    d = len(theta)
    if d < 1:
        raise ValueError("theta must contain at least the intercept")
    # Both the np.random module and a Generator expose standard_normal/binomial
    source = np.random if rng is None else rng
    X = source.standard_normal((n, d - 1))
    phi = np.hstack([np.ones((n, 1)), X])
    Y = source.binomial(1, sigmoid(phi @ theta))
    return list(zip(X, Y))  # Maybe use a dataloader for large datasets


def g(X: np.ndarray, Y: np.ndarray, h: np.ndarray):
    """
    Compute the logistic loss log(1 + exp(phi @ h)) - (phi @ h) * Y,
    where phi is X with a constant 1 prepended along the last axis.

    Works for a batch X of shape (n, d) with Y of shape (n,), returning one
    loss per sample, and also for a single sample X of shape (d,) with a
    scalar Y, returning a scalar.
    """
    X = np.asarray(X)
    # Column (or single entry) of ones for the intercept, matching X's leading shape
    ones = np.ones(X.shape[:-1] + (1,))
    phi = np.concatenate([ones, X], axis=-1)
    dot_product = np.dot(phi, h)
    # logaddexp(0, z) == log(1 + exp(z)) without overflowing for large z
    return np.logaddexp(0.0, dot_product) - dot_product * Y


def g_grad(X: np.ndarray, Y: np.ndarray, h: np.ndarray):
    """
    Compute the gradient of the logistic loss, works only for a single data point

    X is the 1-D feature vector (without intercept), Y its label and h the
    parameter vector including the intercept as first entry. Returns
    (sigmoid(phi @ h) - Y) * phi with phi = [1, X].
    """
    # No per-call debug output: this function runs once per optimization step,
    # and Y may be a plain Python number without a .shape attribute.
    phi = np.hstack([np.ones((1,)), X])
    p = sigmoid(np.dot(phi, h))
    return (p - Y) * phi


def g_grad_and_hessian(X, Y, h):
    """
    Gradient and Hessian of the logistic loss for one data point.

    With phi = [1, X] and p = sigmoid(phi @ h):
        gradient = (p - Y) * phi
        hessian  = p * (1 - p) * outer(phi, phi)

    Single-sample only: np.outer flattens its inputs, so a batch would not
    yield per-sample Hessians. A batched variant would need something like
    np.einsum("i,ij,ik->ijk", p * (1 - p), phi, phi) instead.
    """
    features = np.hstack([np.ones((1,)), X])
    prob = sigmoid(np.dot(features, h))
    gradient = (prob - Y) * features
    curvature = prob * (1 - prob)
    return gradient, curvature * np.outer(features, features)

In [None]:
# Scratch checks of NumPy broadcasting, outer, atleast_2d, dot and einsum semantics.
# NOTE: this cell rebinds the module-level names a, X, Y and p several times;
# only the last assignment of each survives the cell.

# test broadcast numpy *
a = np.array([[1], [2]])  # like (p - Y) for a batch of 2 samples
X = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
# print(a * X)

# test outer product
# print(np.outer(X, X))

# test np.atleast_2d
X = np.atleast_2d(np.array([1, 2, 3]))
# print(X.shape)

# test np.dot
X = np.array([[1, 2, 3], [4, 5, 6]])
Y = np.array([[1], [2], [3]])
# print(np.dot(X, Y))

# test np.einsum
# "i,ij,ik->ijk" computes w[i] * X[i, j] * X[i, k], i.e. one weighted outer
# product per row; the two prints after it show the same thing row by row.
p = np.array([0.5, 0.6])
X = np.array([[1, 2], [3, 4]])
print(np.einsum("i,ij,ik->ijk", p * (1 - p), X, X))
print(p[0] * (1 - p[0]) * np.outer(X[0], X[0]))
print(p[1] * (1 - p[1]) * np.outer(X[1], X[1]))

In [None]:
class SGD:
    """
    Stochastic Gradient Descent optimizer
    Uses a learning rate lr = c_mu * iteration^(-mu)

    The objective callables (g, g_grad, g_grad_and_hessian) are attached after
    construction, e.g. by Experiment.set_optimizer; only g_grad is used here.
    """

    def __init__(self, mu: float, c_mu: float):
        """
        Initialize the optimizer

        mu: decay exponent of the learning rate
        c_mu: learning rate scale
        """
        self.mu = mu
        self.c_mu = c_mu
        self.iteration = 0
        # Placeholders so that step() can report a missing gradient with a
        # ValueError instead of failing with an AttributeError.
        self.g = None
        self.g_grad = None
        self.g_grad_and_hessian = None

    def reset_lr(self):
        """
        Reset the learning rate schedule by restarting the iteration counter
        """
        self.iteration = 0

    def step(self, X: np.ndarray, Y: np.ndarray, theta: np.ndarray):
        """
        Perform one optimization step

        Evaluates g_grad(X, Y, theta) and moves theta against it with the
        current learning rate. Returns (new_theta, None); the None stands in
        for the hessian estimate that second order optimizers would return.

        Raises ValueError if g_grad has not been set.
        """
        if self.g_grad is None:
            raise ValueError("g_grad is not set")
        self.iteration += 1
        grad = self.g_grad(X, Y, theta)
        lr = self.c_mu * self.iteration ** (-self.mu)
        theta = theta - lr * grad
        return theta, None

In [None]:
# Usage example
# Wires a logistic-regression experiment to the SGD optimizer.
# NOTE: statement order matters for the random stream: the initial theta is
# drawn from np.random before the dataset is sampled.
N = 100  # not used in this cell; only referenced by the commented-out loop further down
n = 10_000  # number of samples requested from create_dataset_logistic
# Ground-truth parameters; create_dataset_logistic prepends a constant feature,
# so the first entry multiplies that constant (intercept).
true_theta = np.array([0, 3, -9, 4, -9, 15, 0, -7, 1, 0])
exp = Experiment(g, g_grad, g_grad_and_hessian, true_theta=true_theta)
exp.set_e(1)  # scale of the Gaussian noise added to true_theta
exp.generate_initial_theta()
optimizer = SGD(0.5, 0.1)  # mu=0.5, c_mu=0.1 -> lr = 0.1 * iteration^(-0.5)
exp.set_optimizer(optimizer)
dataset = create_dataset_logistic(n, true_theta)
exp.set_dataset(dataset)

In [None]:
# Run the experiment
# Bind the returned error histories (one entry per step plus the initial one)
# so the notebook does not echo them as the cell's output; the plot still shows.
theta_error, hessian_error = exp.run(plot=True)

In [None]:
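# NOTE: outdated sketch kept for reference. Experiment.__init__ above takes no
# dataset/optimizer arguments; use set_dataset() / set_optimizer() if this loop is revived.
# for i in range(N):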
#     dataset = create_dataset_logistic(n, true_theta)

#     exp = Experiment(
#         g, g_grad, g_grad_and_hessian, dataset, optimizer, true_theta=true_theta
#     )
#     exp.set_theta(np.random.randn(10))
#     exp.run()

In [None]:
# NOTE(review): `experiment` is first assigned in a later cell (the torch usage
# example), so on a fresh kernel this line raises NameError. Presumably either
# `exp` was meant or this cell belongs after that assignment; confirm.
experiment.run()

In [None]:
import torch
from torch import nn
from torch.autograd import functional as autograd_f


class NewtonOptim(torch.optim.Optimizer):
    """
    Damped Newton optimizer.

    Each parameter tensor is updated independently with
        p <- p - lr * (H + damping * I)^(-1) * grad
    where grad and H are the gradient and Hessian of the loss with respect to
    that tensor alone (a block-diagonal approximation of the full Hessian).

    The Hessian is assembled row by row with autograd, which costs one backward
    pass per parameter entry, so this is only practical for small models.
    """

    def __init__(self, params, lr=1, damping=1e-5):
        """
        params: iterable of parameters or parameter groups
        lr: step size applied to the Newton direction
        damping: multiple of the identity added to the Hessian before inversion
        """
        defaults = dict(lr=lr, damping=damping)
        super().__init__(params, defaults)

    def step(self, closure=None):
        """
        Perform one Newton step and return the loss evaluated before the update.

        closure must re-evaluate the model and return the scalar loss with its
        autograd graph intact, i.e. it must not call backward() itself:
        gradients and Hessians are derived from the returned loss here.

        Raises ValueError if closure is None.
        """
        if closure is None:
            raise ValueError("NewtonOptim.step requires a closure returning the loss")

        pending = []
        with torch.enable_grad():
            loss = closure()
            for group in self.param_groups:
                for p in group["params"]:
                    if not p.requires_grad:
                        continue
                    # create_graph=True keeps the gradient differentiable so the
                    # Hessian rows can be obtained by differentiating it again.
                    (grad,) = torch.autograd.grad(
                        loss, p, create_graph=True, allow_unused=True
                    )
                    if grad is None:
                        # The loss does not depend on this parameter
                        continue

                    flat_grad = grad.reshape(-1)
                    size = flat_grad.numel()
                    hessian = torch.zeros(size, size, dtype=p.dtype, device=p.device)
                    # A loss that is linear in p has a constant gradient and
                    # therefore a zero Hessian; nothing to differentiate then.
                    if flat_grad.requires_grad:
                        for i in range(size):
                            (row,) = torch.autograd.grad(
                                flat_grad[i], p, retain_graph=True, allow_unused=True
                            )
                            if row is not None:
                                hessian[i] = row.reshape(-1)

                    damped = hessian + group["damping"] * torch.eye(
                        size, dtype=p.dtype, device=p.device
                    )
                    direction = torch.inverse(damped) @ flat_grad.detach()
                    # Reshape the flat direction back to the parameter's shape
                    pending.append((p, group["lr"] * direction.view_as(p)))

        # Apply all updates only after every gradient/Hessian has been taken,
        # so later parameters are not differentiated at already-moved values.
        with torch.no_grad():
            for p, delta in pending:
                p.sub_(delta)

        return loss

In [None]:
# Usage example
# Synthetic linear regression: Y = X @ theta_true + Gaussian noise (std 0.5).
# NOTE: this cell rebinds X, Y, g and dataset, which the NumPy cells above
# also use (g was the logistic loss function there).
# NOTE(review): `TensorDataset` is not imported in any cell visible here.
# Presumably torch.utils.data.TensorDataset; confirm and add the import.
# NOTE(review): the Experiment(...) call below passes
# (g, dataset, NewtonOptim, lr=..., theta_true=..., criterion=...), which does
# not match the Experiment.__init__ defined earlier in this notebook
# (g, g_grad, g_grad_and_hessian, true_theta, true_hessian). Presumably written
# against a different Experiment class; confirm.
theta_true = torch.tensor([1.5, -2.0, 1.0, 0.5, 3.0])
X = torch.randn(10000, 5)
noise = 0.5 * torch.randn(10000)
Y = X @ theta_true + noise
dataset = TensorDataset(X, Y)
g = nn.Linear(5, 1, bias=False)
criterion = nn.MSELoss()
experiment = Experiment(
    g, dataset, NewtonOptim, lr=0.001, theta_true=theta_true, criterion=criterion
)

In [None]:
def closure():
    """
    Evaluate the loss of the module-level model `g` on the module-level batch
    (`X`, `Y`) using `criterion`; targets are reshaped to a column to match the
    model output. Returns the loss without calling backward().
    """
    targets = Y.view(-1, 1)
    return criterion(g(X), targets)

In [None]:
# The loop rebinds the module-level X and Y on every iteration; closure() reads
# those names, so each step evaluates the loss on the current batch.
# NOTE(review): the Experiment class defined earlier in this notebook has no
# `dataloader` attribute. Presumably this targets a different Experiment
# implementation; confirm.
for X, Y in experiment.dataloader:
    experiment.optimizer.step(closure)

In [None]:
# NOTE(review): the Experiment class defined earlier in this notebook exposes
# plot_errors(theta_error, hessian_error), not plot_param_errors(). Confirm
# which Experiment implementation this cell is meant to use.
experiment.plot_param_errors()