In [207]:
from itertools import product

from math import inf
import numpy as np
import scipy.special
from sklearn.datasets import make_regression


def bfgs(f, f_grad, x0, max_iter=500, tol=1e-6):
    """
    BFGS algorithm for unconstrained optimization.

    Args:
        f: The objective function to be minimized.
        f_grad: The gradient of the objective function.
        x0: The initial guess for the solution.
        max_iter: The maximum number of iterations.
        tol: The tolerance for convergence.

    Returns:
        A tuple containing:
            - The optimal solution found.
            - The value of the objective function at the solution.
            - The number of iterations performed.
    """

    x = x0
    n = len(x0)
    B = np.eye(n)  # Initial approximation of the Hessian inverse
    grad = f_grad(x)

    for i in range(max_iter):
        # Compute search direction
        p = -np.linalg.solve(B, grad)

        # Perform line search
        t = line_search(f, f_grad, x, p)

        # Update x
        x_next = x + t * p

        # Update B (BFGS update)
        s = x_next - x
        y = f_grad(x_next) - grad
        if np.dot(s, y) > 0:  # Ensure positive definiteness
            rho = 1 / np.dot(s, y)
            B = (np.eye(n) - rho * np.outer(s, y)) @ B @ (np.eye(n) - rho * np.outer(y, s)) + rho * np.outer(s, s)
        else:
            B = np.eye(n)

        # Update x and gradient
        x = x_next
        grad = f_grad(x)

        # Check for convergence
        if np.linalg.norm(grad) < tol:
            break

    return x, f(x), i + 1


def line_search(f, f_grad, x, p, c1=1e-4, c2=0.9):
    """
    Backtracking line search.

    Args:
        f: The objective function.
        f_grad: The gradient of the objective function.
        x: The current point.
        p: The search direction.
        c1: The Armijo condition parameter.
        c2: The curvature condition parameter.

    Returns:
        The step size that satisfies the Wolfe conditions.
    """
    assert 0 < c1 < c2 < 1
    beta = inf
    alpha = 0.0
    t = 1.0
    s = np.dot(f_grad(x), p)

    for iter in range(10_000):
        # Check Armijo Condition
        if f(x + t * p) - f(x) <= c1 * t * s:
            # Check Wolfe Condition
            if f_grad(x + t * p) @ p >= c2 * s:
                break
            else:
                # Wolfe failed
                alpha = t
        else:
            # Armijo failed
            beta = t
        if beta < inf:
            t = (alpha + beta) / 2
        else:
            t = 2 * alpha

    if iter == 10_000 - 1:
        raise ValueError("Line search did not converge at x={x}, p={p}, t={t:.2e}".format(x=x, p=p, t=t))
    return t


# Example usage:
def f(x):
    return x[0] ** 2 + 5 * x[1] ** 4 + 0.1 * abs(x[0])


def f_grad(x, dx=1e-6):
    n = len(x)
    grad = np.zeros(n)
    f_val = f(x)
    for i in range(n):
        x_plus = x.copy()
        x_plus[i] += dx
        grad[i] = (f(x_plus) - f_val) / dx
    return grad


x0 = np.array([3.0, 4.0])
x_opt, f_opt, iterations = bfgs(f, f_grad, x0)

print("Optimal solution:", x_opt)
print("Optimal value:", f_opt)
print("Iterations:", iterations)

ValueError: Line search did not converge at x=[ 4.55014146e-08 -1.22990350e-01], p=[-1.46540516e+04  4.90101639e-02], t=6.21e-12

In [24]:
from scipy.optimize import minimize
from sklearn.datasets import make_regression

x, y = make_regression(n_samples=100, n_features=10, noise=10, n_informative=3, random_state=42)


def f(beta):
    # enet criterion
    y_hat = x @ beta
    mse = np.mean((y - y_hat) ** 2)
    penalty = 10.0 * np.sum(np.abs(beta)) + 5 * np.sum(beta ** 2)
    return mse + penalty


beta_0 = np.zeros(shape=(x.shape[1],))
result = minimize(f, beta_0, method='BFGS')
print(result)


  message: Desired error not necessarily achieved due to precision loss.
  success: False
   status: 2
      fun: 4095.9387337492435
        x: [ 1.869e+00 -3.943e-05  2.888e-06  1.031e+01 -8.730e-05
            -3.855e-06  3.466e-05  1.423e+00  6.406e-01 -5.678e-05]
      nit: 44
      jac: [-6.171e-02 -1.086e+01  1.628e+01 -2.887e-02 -8.228e+00
            -1.065e+01  2.563e+00  1.318e-02  6.421e-02 -3.080e-01]
 hess_inv: [[ 5.202e-02 -2.913e-05 ... -1.341e-03 -8.904e-03]
            [-2.913e-05  7.300e-07 ...  3.534e-06 -9.934e-05]
            ...
            [-1.341e-03  3.534e-06 ...  3.544e-02  6.536e-03]
            [-8.904e-03 -9.934e-05 ...  6.536e-03  1.052e-01]]
     nfev: 2088
     njev: 189


In [227]:
from sklearn.linear_model import ElasticNet


# Try SAGA algorithm

def f_enet(x, beta):
    # enet criterion
    y_hat = x @ beta
    mse = np.mean((y - y_hat) ** 2)
    penalty = 10.0 * np.sum(np.abs(beta)) + 5 * np.sum(beta ** 2)
    return mse + penalty


def f_grad_enet(x, beta, dx=1e-9):
    n = len(beta)
    grad = np.zeros(n)
    for i in range(n):
        beta_plus = beta.copy()
        beta_plus[i] += dx
        beta_minus = beta.copy()
        beta_minus[i] -= dx
        grad[i] = (f_enet(x, beta_plus) - f_enet(x, beta_minus)) / (2 * dx)
    return grad


def saga_enet(l1: float, l2: float, tol: float = 1e-6, max_iter: int = 50_000) -> np.ndarray:
    n, p = x.shape
    grad = np.zeros_like(x)
    g = np.zeros(x.shape[1])
    beta = np.zeros(x.shape[1])
    lr = 1e-1
    lr_target = 1e-8
    lr_factor = (lr_target / lr) ** (1 / max_iter)
    n_print = max_iter // 10

    for i in range(max_iter):
        idx = np.random.randint(n)
        prev = grad[idx, :]
        g -= prev
        x_i = x[idx, :]
        y_i = y[idx]
        grad_i = (
                     # MSE grad
                         (x_i @ beta - y_i) * x_i
                         # L1 grad
                         + l1 * np.sign(beta)
                         # L2 grad
                         + l2 * beta
                 ) / n
        grad[idx, :] = grad_i
        g += grad_i
        beta -= g * lr

        if np.linalg.norm(g) < tol:
            print(f"Converged at iteration {i:,}")
            break

        if i % n_print == 0:
            print(f"Iteration {i:,} grad norm: {np.linalg.norm(g):.2e}, lr: {lr:.2e}")
        lr *= lr_factor

    return beta


for l1, l2 in product([0.1, 1.0, 10.0], [0.1, 1.0, 10.0]):
    beta_saga = saga_enet(l1, l2)
    beta_cd = ElasticNet(alpha=l1 + l2, l1_ratio=l1 / (l1 + l2), fit_intercept=False).fit(x, y).coef_

    print(f"\nl1={l1}, l2={l2}, diff={np.linalg.norm(beta_saga - beta_cd)/np.linalg.norm(beta_cd):.3%}")
    for b, name in zip([beta_saga, beta_cd], ["SAGA", "CD"]):
        print(f"{name: >10}: {[round(float(b_), 3) for b_ in b]}")

# beta = saga_enet()
# print([round(float(b), 3) for b in beta])


Iteration 0 grad norm: 9.21e+00, lr: 1.00e-01
Iteration 5,000 grad norm: 3.74e-03, lr: 2.00e-02
Converged at iteration 7,362

l1=0.1, l2=0.1, diff=0.003%
      SAGA: [15.193, 0.399, 0.164, 57.463, 1.273, 0.205, -0.313, 8.553, 1.007, -1.499]
        CD: [15.193, 0.4, 0.163, 57.463, 1.273, 0.206, -0.314, 8.552, 1.007, -1.499]
Iteration 0 grad norm: 2.22e+00, lr: 1.00e-01
Iteration 5,000 grad norm: 5.08e-05, lr: 2.00e-02
Converged at iteration 5,917

l1=0.1, l2=1.0, diff=0.001%
      SAGA: [8.356, 0.122, -0.56, 32.263, 0.15, 0.264, 0.839, 5.6, 2.438, -1.725]
        CD: [8.356, 0.121, -0.56, 32.263, 0.15, 0.264, 0.839, 5.6, 2.438, -1.725]
Iteration 0 grad norm: 2.83e-02, lr: 1.00e-01
Iteration 5,000 grad norm: 2.18e-04, lr: 2.00e-02
Converged at iteration 6,346

l1=0.1, l2=10.0, diff=0.000%
      SAGA: [1.487, 0.024, -0.291, 6.081, -0.093, 0.037, 0.371, 1.247, 0.847, -0.465]
        CD: [1.487, 0.024, -0.291, 6.081, -0.093, 0.037, 0.371, 1.247, 0.847, -0.465]
Iteration 0 grad norm: 2.14e-

In [118]:
def adam_enet(beta_1: float = 0.98, beta_2: float = 0.99, max_iter: int = 200_000) -> np.ndarray:
    """
    Minimize the Elastic Net criterion using the Adam optimizer.

    Args:
      beta_1: Exponential decay rate for the first moment estimates.
      beta_2: Exponential decay rate for the second moment estimates.
      max_iter: Maximum number of iterations.

    Returns:
      An array of the estimated coefficients.
    """
    n, p = x.shape
    beta = np.zeros(p)
    m = np.zeros(p)
    v = np.ones(p)
    epsilon = 1e-8
    alpha = 0.001  # Learning rate

    for t in range(1, max_iter + 1):
        batch_idx = np.random.randint(n, size=32)
        batch = x[batch_idx, :]
        grad = f_grad_enet(batch, beta) / n
        m = beta_1 * m + (1 - beta_1) * grad
        v = beta_2 * v + (1 - beta_2) * np.square(grad)
        m_hat = m / (1 - beta_1 ** t)
        v_hat = v / (1 - beta_2 ** t)
        beta -= alpha * m_hat / (np.sqrt(v_hat) + epsilon)

        if t % (max_iter // 10) == 0:
            print(f"Iteration {t:,} grad norm: {np.linalg.norm(grad)}")

    return beta


beta = adam_enet()
print(np.round(beta, 3))

ValueError: operands could not be broadcast together with shapes (100,) (32,) 