In [136]:
import numpy as np
from matplotlib import pyplot as plt
import time
import torch
from torch import nn
import os

## Objective Funtions

In [312]:
def sphere(x):
    return torch.sum(x**2)

In [138]:
def ellipsoid(x):
    n = x.size(0)
    out = 0
    for i in range(n):
        out += x[i]**2 * 10**(6 * i / (n-1))
    return out

In [139]:
def cigar(x):
    return x[0]**2 + 10**6 * torch.sum(x[1:]**2)

In [140]:
def tablet(x):
    return 10**6 * x[0]**2 + torch.sum(x[1:]**2)

In [141]:
def parabolic_ridge(x):
    return -x[0] + 100*torch.sum(x[1:]**2)

In [142]:
def sharp_ridge(x):
    return -x[0] + 100*torch.norm(x[1:])

In [143]:
def diffpow(x):
    n = x.size(0)
    out = 0
    for i in range(n):
        out += torch.abs(x[i])**(2 + 10*i/(n-1))
    return out

In [144]:
def rosenbrock(x):
    n = x.size(0)
    out = 0
    for i in range(n-1):
        out += 100*(x[i+1] - x[i]**2)**2 + (x[i] - 1)**2
    return out

In [145]:
#NOTE: Only takes x in R^2
def booth(x):
    return (x[0] + 2*x[1] - 7)**2 + (2*x[0] + x[1] - 5)**2

In [146]:
#TODO: Add more

# Gradient-Based Optimization Methods

Source: https://algorithmsbook.com/optimization/files/optimization.pdf

In [93]:
#Gradient Descent
def gd_step(x, obj, params):
    lr = params["lr"]
    
    loss = obj(x)
    grad, = torch.autograd.grad(loss, inputs=x)
    x_new = x - lr*grad
    return x_new

In [185]:
#Momentum
def momentum_step(x, obj, params):
    lr = params["lr"]
    v = params["v"]
    decay = params["decay"]
    
    loss = obj(x)
    grad, = torch.autograd.grad(loss, inputs=x)
    v_new = decay*v - lr*grad
    x_new = x + v_new
    return x_new, v_new

In [196]:
#AdaGrad
def adagrad_step(x, obj, params):
    lr = params["lr"]
    s = params["s"]
    
    loss = obj(x)
    grad, = torch.autograd.grad(loss, inputs=x)
    s_new = s + grad**2
    x_new = x - (lr / (10e-8 + torch.sqrt(s_new)))*grad
    return x_new, s_new

In [96]:
#RMSProp

In [229]:
#Adam
def adam_step(x, obj, params):
    v = params["v"]
    v_decay = params["v_decay"]
    s = params["s"]
    s_decay = params["s_decay"]
    lr = params["lr"]
    k = params["k"] #stores number of iterations
    
    k += 1
    
    loss = obj(x)
    grad, = torch.autograd.grad(loss, inputs=x)
    
    v_new = v_decay*v + (1-v_decay)*grad
    s_new = s_decay*s + (1-s_decay)*(grad**2)
    
    v_hat = v_new / (1 - v_decay**k)
    s_hat = s_new / (1 - s_decay**k)
    
    x_new = x - lr * v_hat / (1e-8 + torch.sqrt(s_hat))
    return x_new, v_new, s_new

## CMA-ES

In [None]:
#TODO: 
#fix transposes
#make sure all matrix multiplications have correct dims

Source: http://www.cmap.polytechnique.fr/~nikolaus.hansen/evco_11_1_1_0.pdf

In [325]:
def oldcmaes_step(x, obj, params):
    l = params["l"]
    mu = params["mu"]
    C = params["C"]
    pc = params["pc"]
    cc = params["cc"]
    ccov = params["ccov"]
    s = params["s"]
    ps = params["ps"]
    cs = params["cs"]
    ds = params["ds"]
    chi = params["chi"]
    n = x.size(0)
    
    #compute B and D
    B, D_squared, B_transpose = torch.linalg.svd(C)
    D = torch.diag(torch.sqrt(D_squared))
    BD = torch.matmul(B,D)
    
    #Sample offspring
    m = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(n), torch.eye(n))
    z = m.sample((l,))
    offspring = x + s * torch.t(torch.matmul(BD, torch.t(z)))
        
    #Compute objective on each offspring
    evals = torch.zeros(l)
    for i in range(l):
        evals[i] = obj(offspring[i])
        
    #Get indices of top mu offspring
    top_evals, top_inds = torch.topk(evals, mu, largest=False)
    
    #Updates
    x_new = (1/mu) * torch.sum(offspring[top_inds], dim=0)
    z_avg = (1/mu) * torch.sum(z[top_inds], dim=0)
    
    #Covariance update
    pc_new = (1-cc)*pc + np.sqrt(cc*(2-cc)*mu) * torch.matmul(BD, z_avg)
    C_new = (1-ccov)*C + ccov*torch.outer(pc_new, pc_new)
    
    #Step size update
    ps_new = (1-cs)*ps + np.sqrt(cs*(2-cs)*mu) * torch.matmul(B, z_avg)
    s_new = s * torch.exp((1/ds) * (torch.norm(ps_new)-chi) / chi)
    
    return x_new, C_new, pc_new, s_new, ps_new

In [316]:
torch.outer(torch.arange(4),torch.arange(4))

tensor([[0, 0, 0, 0],
        [0, 1, 2, 3],
        [0, 2, 4, 6],
        [0, 3, 6, 9]])

Source: http://www.cmap.polytechnique.fr/~nikolaus.hansen/evco_11_1_1_0.pdf

In [326]:
def newcmaes_step(x, obj, params):
    l = params["l"]
    mu = params["mu"]
    C = params["C"]
    pc = params["pc"]
    cc = params["cc"]
    ccov = params["ccov"]
    acov = params["acov"]
    s = params["s"]
    ps = params["ps"]
    cs = params["cs"]
    ds = params["ds"]
    chi = params["chi"]
    n = x.size(0)
    
    #compute B and D
    B, D_squared, B_transpose = torch.linalg.svd(C)
    D = torch.diag(torch.sqrt(D_squared))
    BD = torch.matmul(B,D)
    
    #Sample offspring
    m = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(n), torch.eye(n))
    z = m.sample((l,))
    offspring = x + s * torch.matmul(BD, z) #FIX!!!!!
        
    #Compute objective on each offspring
    evals = torch.zeros(l)
    for i in range(l):
        evals[i] = obj(offspring[i])
        
    #Get indices of top mu offspring
    top_evals, top_inds = torch.topk(evals, mu, largest=False)
    
    #Updates
    x_new = (1/mu) * torch.sum(offspring[top_inds], dim=0)
    z_avg = (1/mu) * torch.sum(z[top_inds])
    
    #Covariance update
    pc_new = (1-cc)*pc + torch.sqrt(cc*(2-cc)*mu) * torch.matmul(BD, z_avg)
    
    outer_prod_sum = torch.zeros(n,n)
    for i in range(mu):
        outer_prod_sum += (1/mu) * torch.matmul(z[top_inds[i]], torch.transpose(z[top_inds[i]])) #FIX???
    bigZ = torch.matmul(BD, torch.matmul(outer_prod_sum, torch.transpose(BD))) #FIX???
    
    C_new = ((1-ccov)*C + 
             ccov * (acov*torch.matmul(pc_new, torch.transpose(pc_new)) +
                     (1-acov)*bigZ)) #FIX???
    
    #Step size update
    ps_new = (1-cs)*ps + np.sqrt(cs*(2-cs)*mu) * torch.matmul(B, z_avg)
    s_new = s * torch.exp((1/ds) * (torch.norm(ps_new)-chi) / chi)
    
    return x_new, C_new, pc_new, s_new, ps_new

Source: https://arxiv.org/pdf/1604.00772.pdf (see pg. 29)

In [318]:
def set_cmaes_params(n):
    l = 4 + np.floor(3 * np.log(n))
    mu = np.floor(l/2)
    
    w_prime = np.log((l+1)/2) - torch.log(torch.arange(l))
    
    mu_eff = (torch.sum(w_prime[:mu])**2) / torch.sum(w_prime[:mu]**2)
    mu_eff_neg = (torch.sum(w_prime[mu:])**2) / torch.sum(w_prime[mu:]**2)
    
    cm = 1
    
    cs = (mu_eff + 2) / (n + mu_eff + 5)
    ds = 1 + 2 * max(0, np.sqrt((mu_eff-1)/(n+1))-1) + cs
    
    cc = (4 + mu_eff/n) / (n + 4 + 2*mu_eff/n)
    c1 = 2 / ((n+1.3)**2 + mu_eff)
    cmu = min(1-c1, 2*(1/4 + mu_eff + 1/mu_eff - 2)/((n+2)**2 + mu_eff))
    
    amu = 1 + c1/cmu
    amu_eff = 1 + (2*mu_eff_neg) / (mu_eff + 2)
    a_pd = (1 - c1 - cmu) / (n*cmu)
    
    sum_w_prime_pos = torch.sum(w[:mu])
    sum_w_prime_neg = -torch.sum(w[mu:])
    
    w = torch.zeros(l)
    w[:mu] = w_prime[:mu] / sum_w_prime_pos
    w[mu:] = min(amu, amu_eff, a_pd) * w_prime[mu:] / sum_w_prime_neg
    
    chi = np.sqrt(1 - 1/(4*n) + 1/(21*n**2))

In [327]:
#TODO - finish hs and w_circle
def cmaes_step(x, obj, params):
    #Set parameters
    l = params["l"]
    mu = params["mu"]
    mu_eff = params["mu_eff"]
    C = params["C"]
    pc = params["pc"]
    cc = params["cc"]
    cm = params["cm"]
    c1 = params["c1"]
    cmu = params["cmu"]
    s = params["s"]
    ps = params["ps"]
    cs = params["cs"]
    ds = params["ds"]
    chi = params["chi"]
    w = params["w"]
    n = x.size(0)
    
    
    #Calculate B,D
    B, D_squared, B_transpose = torch.linalg.svd(C)
    D = torch.diag(torch.sqrt(D_squared))
    sqrtCinv = torch.matmul(B, torch.matmul(1/D, B_transpose))
    
    #Sample offspring
    m = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(n), torch.eye(n))
    z = m.sample((l,))
    y = torch.matmul(BD, z)
    offspring = x + s*y
    
    #Compute objective on each offspring
    evals = torch.zeros(l)
    for i in range(l):
        evals[i] = obj(offspring[i])
    
    #Get indices of top mu offspring
    top_evals, top_inds = torch.topk(evals, mu, largest=False)
    
    #Selection and recombination
    y_avg = torch.zeros(n)
    for i in range(mu):
        y_avg += w[i] * y[top_inds[i]]
    x_new = x + cm*s*y_avg

    #Covariance update    
    hs = 1#Annoying
    dhs = (1-hs) * cc * (2-cc)
    
    w_circle = torch.zeros(l)
    w_circle[:mu] = w[:mu]
    #TRICKY!#w_circle[mu:] = w[mu:] * (n/torch.norm)
    
    pc = (1-cc)*pc + hs*np.sqrt(cc*(2-cc)*mu_eff)*y_avg

    outer_prod_sum = torch.zeros(n,n)
    for i in range(l):
        outer_prod_sum += w_circle[i] * torch.matmul(y[top_inds[i]], torch.transpose(y[top_inds[i]])) #FIX???
    
    C = ((1 + c1*dhs - c1 - cmu*sumw)*C +
         c1*torch.matmul(pc, torch.transpose(pc)) +
         cmu*outer_prod_sum) #FIX???
    
    #Step size update
    ps_new = (1-cs)*ps + np.sqrt(cs*(2-cs)*mu_eff) * torch.matmul(sqrtCinv, y_avg) #TODO C^-1/2!!!
    s_new = s * torch.exp((cs/ds) * (torch.norm(ps)/chi - 1))
    
    return x_new, C_new, pc_new, s_new, ps_new

## Optimizer Wrapper

In [320]:
def step(optimizer, x, obj):
    if optimizer == "gd":
        x_new = gd_step(x, obj, parameters["gd"])
        
    elif optimizer == "momentum":
        x_new, v_new = momentum_step(x, obj, parameters["momentum"])
        parameters["momentum"]["v"] = v_new
        
    elif optimizer == "adagrad":
        x_new, s_new = adagrad_step(x, obj, parameters["adagrad"])
        parameters["adagrad"]["s"] = s_new
        
    elif optimizer == "rmsprop":
        pass
    
    elif optimizer == "adam":
        x_new, v_new, s_new = adam_step(x, obj, parameters["adam"])
        parameters["adam"]["v"] = v_new
        parameters["adam"]["s"] = s_new
    
    elif optimizer == "oldcmaes":
        x_new, C_new, pc_new, s_new, ps_new = oldcmaes_step(x, obj, parameters["oldcmaes"])
        parameters["oldcmaes"]["C"] = C_new
        parameters["oldcmaes"]["pc"] = pc_new
        parameters["oldcmaes"]["s"] = s_new
        parameters["oldcmaes"]["ps"] = ps_new
        
    elif optimizer == "newcmaes":
        x_new, C_new, pc_new, s_new, ps_new = newcmaes_step(x, obj, parameters["oldcmaes"])
        parameters["newcmaes"]["C"] = C_new
        parameters["newcmaes"]["pc"] = pc_new
        parameters["newcmaes"]["s"] = s_new
        parameters["newcmaes"]["ps"] = ps_new
        
    elif optimizer == "cmaes":
        pass
    
    else:
        print("Optimizer", optimizer, "not found.")
        
    return x_new

# Parameters

In [306]:
def set_params(n):
    gd_params = {
        "lr": 0.001
    }

    momentum_params = {
        "lr": 0.001,
        "v": 0,
        "decay": 0.9
    }

    adagrad_params = {
        "lr": 1,
        "s": 0
    }

    rmsprop_params = {}

    adam_params = {
        "lr": 0.1,
        "v": 0,
        "s": 0,
        "v_decay": 0.9,
        "s_decay": 0.999,
        "k": 0
    }
    
    l = int(4 + np.floor(3*np.log(n)))
    mu = int(np.floor(l/4))
    chi = np.sqrt(n) * (1 - 1/(4*n) + 1/(21*n**2))
    oldcmaes_params = {
        "l": l,
        "mu": mu,
        "C": torch.eye(n),
        "pc": 0,
        "cc": 4 / (n+4),
        "ccov": 2 / (n + np.sqrt(2))**2,
        "s": 1,
        "ps": 0,
        "cs": 4/(n+4),
        "ds": 1 + (n+4)/4,
        "chi": chi
    }
    
    acov = 1/mu
    newcmaes_params = {
        "l": l,
        "mu": mu,
        "C": torch.eye(n),
        "pc": 0,
        "cc": 4 / (n+4),
        "ccov": acov*2/(n+np.sqrt(2))**2 + (1-acov)*min(1, (2*mu-1)/((n+2)**2 + mu)),
        "acov": acov,
        "s": 1,
        "ps": 0,
        "cs": 4/(n+4),
        "ds": 1 + (n+4)/4,
        "chi": chi
    }
    
    #TODO
    cmaes_params = {}

    parameters = {
        "gd": gd_params,
        "momentum": momentum_params,
        "adagrad": adagrad_params,
        "rmsprop": rmsprop_params,
        "adam": adam_params,
        "oldcmaes": oldcmaes_params,
        "newcmaes": newcmaes_params,
        "cmaes": cmaes_params
    }
    
    return parameters

## Training/Tests

In [334]:
epochs = 500
optimizer = "adam"
obj = rosenbrock
x = torch.normal(0,1,(10,), requires_grad=True)
print("Initial x:", x)

parameters = set_params(10)

for epoch in range(epochs):
    if epoch % 50 == 0:
        print("epoch", epoch, "|",obj(x).item())
        #print(x)
    x = step(optimizer, x, obj)


Initial x: tensor([-0.4202,  0.7429, -1.8678,  1.6296, -0.3195, -1.4256,  1.1357, -0.2281,
        -0.5613, -1.1561], requires_grad=True)
epoch 0 | 2668.9189453125
epoch 50 | 10.202725410461426
epoch 100 | 8.658751487731934
epoch 150 | 8.111798286437988
epoch 200 | 7.730213165283203
epoch 250 | 7.432351589202881
epoch 300 | 7.1824631690979
epoch 350 | 6.960973262786865
epoch 400 | 6.75704288482666
epoch 450 | 6.564112186431885


In [61]:
t = torch.tensor([0.0,0.0,0.0], requires_grad=True)

In [62]:
rosenbrock(t)

tensor(2., grad_fn=<AddBackward0>)

In [68]:
print(t)
L = rosenbrock(t)
print(L)
L.backward()
print(t.grad)
newt = t - .001*t.grad
print(newt)
print(rosenbrock(newt))

tensor([0., 0., 0.], requires_grad=True)
tensor(2., grad_fn=<AddBackward0>)
tensor([-12., -12.,   0.])
tensor([0.0120, 0.0120, 0.0000], grad_fn=<SubBackward0>)
tensor(1.9663, grad_fn=<AddBackward0>)


In [42]:
m = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(3), torch.eye(3))
#offspring = m.sample()
offspring = m.sample((5,))

print(offspring)

evals = torch.zeros(5)
for i in range(5):
    evals[i] = sphere(offspring[i])

print(evals)

top_vals, top_inds = torch.topk(evals, 2)

print(top_vals)
print(offspring[top_inds])



tensor([[-0.1176,  1.0937,  1.1792],
        [-1.1386,  0.6213, -0.8826],
        [ 1.1929, -0.9009,  0.3181],
        [-1.4222,  0.3023, -1.2437],
        [ 0.4007, -1.0267,  1.2267]])
tensor([2.6007, 2.4616, 2.3357, 3.6608, 2.7196])
tensor([3.6608, 2.7196])
tensor([[-1.4222,  0.3023, -1.2437],
        [ 0.4007, -1.0267,  1.2267]])
