In [6]:
import torch as t

import utils

## Optimizers

In [7]:
def rosenbrocks_banana(x: t.Tensor, y: t.Tensor, a=1, b=100) -> t.Tensor:
    '''
    Evaluate Rosenbrock's banana function, shifted up by one.

    x, y: coordinates at which to evaluate (anything supporting arithmetic).
    a, b: shape constants of the function.

    Return: (a - x)^2 + b * (y - x^2)^2 + 1, so the smallest value is 1 rather than 0.
    '''
    offset_term = (a - x) ** 2
    valley_term = b * (y - x**2) ** 2
    return offset_term + valley_term + 1

# Plotting window: x from -2 to 2, y from -1 to 3.
x_range = [-2, 2]
y_range = [-1, 3]
# Build (but do not yet display) a figure of the function over that window.
# NOTE(review): log_scale=True presumably plots the values on a log scale — confirm in utils.plot_fn.
fig = utils.plot_fn(rosenbrocks_banana, x_range, y_range, log_scale=True)

In [8]:
# Display the figure created by utils.plot_fn in the previous cell.
fig.show()

In [40]:
from typing import Callable


def opt_fn_with_sgd(fn: Callable, xy: t.Tensor, lr=0.001, momentum=0.98, n_iters: int = 100):
    '''
    Optimize a given function with torch.optim.SGD, starting from the specified point.

    fn: callable taking (x, y) and returning a scalar tensor to minimise.
    xy: shape (2,). The (x, y) starting point. Must have requires_grad=True; it is
        updated in place by the optimizer.
    lr: learning rate passed to torch.optim.SGD.
    momentum: momentum passed to torch.optim.SGD.
    n_iters: number of steps.

    Return: (n_iters, 2). The (x,y) BEFORE each step. So out[0] is the starting point.
    '''
    assert xy.requires_grad
    optimizer = t.optim.SGD([xy], lr=lr, momentum=momentum)
    trajectory = t.zeros(n_iters, 2)

    for n in range(n_iters):
        # Record the position BEFORE stepping so that trajectory[0] is the start point.
        # Detaching keeps the history buffer out of the autograd graph.
        trajectory[n] = xy.detach()
        loss = fn(*xy)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return trajectory


In [41]:
# Starting point for the optimisation; requires_grad=True is asserted by opt_fn_with_sgd.
xy = t.tensor([-1.5, 2.5], requires_grad=True)
# Same plotting window as the earlier plot of the function.
x_range = [-2, 2]
y_range = [-1, 3]

# Hand the optimisation routine, the objective and the start point to the plotting helper.
# NOTE(review): show_min=True presumably marks the function's minimum on the plot — confirm in utils.
fig = utils.plot_optimization_sgd(opt_fn_with_sgd, rosenbrocks_banana, xy, x_range, y_range, lr=0.001, momentum=0.98, show_min=True)

fig.show()

In [55]:
from typing import Iterable


class SGD:
    params: list

    def __init__(self, params: Iterable[t.nn.parameter.Parameter], lr: float, momentum: float, weight_decay: float):
        '''Implements SGD with momentum.

        Like the PyTorch version, but assume nesterov=False, maximize=False, and dampening=0
            https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD

        params: parameters to optimise (materialised into a list, so a generator is fine).
        lr: learning rate, stored as `self.gamma`.
        momentum: momentum factor, stored as `self.mu`.
        weight_decay: L2 penalty coefficient, stored as `self.lam`.
        '''
        self.gamma = lr
        self.mu = momentum
        self.lam = weight_decay
        self.thetas = list(params)
        # One momentum buffer per parameter, holding the previous step's update direction.
        self.previous_g_t = [t.zeros_like(p) for p in self.thetas]
        # Number of calls to step() so far.
        self.t = 0

    def zero_grad(self) -> None:
        '''Reset every parameter's gradient to a fresh tensor of zeros.'''
        for param in self.thetas:
            param.grad = t.zeros_like(param)

    @t.inference_mode()
    def step(self) -> None:
        '''Apply one SGD update to every parameter that has a gradient.

        Parameters whose `.grad` is None are skipped. `param.grad` itself is never
        modified: the update direction is built on a private copy.
        '''
        for i, param in enumerate(self.thetas):
            if param.grad is None:
                continue
            # Work on a copy so weight decay / momentum do not leak into param.grad
            # and the momentum buffer never aliases the live gradient tensor.
            g_t = param.grad.clone()
            if self.lam != 0:
                g_t += self.lam * param
            if self.mu != 0 and self.t > 0:
                g_t += self.mu * self.previous_g_t[i]
            param -= self.gamma * g_t
            self.previous_g_t[i] = g_t
        self.t += 1

    def __repr__(self) -> str:
        # Read the attributes __init__ actually sets (gamma / mu / lam).
        return f"lr={self.gamma}, momentum={self.mu}, weight_decay={self.lam}"

# Run the course-provided checks against the SGD class defined above.
utils.test_sgd(SGD)


Testing configuration:  {'lr': 0.1, 'momentum': 0.0, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'momentum': 0.7, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'momentum': 0.5, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'momentum': 0.5, 'weight_decay': 0.05}

Testing configuration:  {'lr': 0.2, 'momentum': 0.8, 'weight_decay': 0.05}


In [59]:
class RMSprop:
    def __init__(
        self,
        params: Iterable[t.nn.parameter.Parameter],
        lr: float,
        alpha: float,
        eps: float,
        weight_decay: float,
        momentum: float,
    ):
        '''Implements RMSprop.

        Like the PyTorch version, but assumes centered=False
            https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html#torch.optim.RMSprop

        params: parameters to optimise (materialised into a list).
        lr: learning rate, stored as `self.gamma`.
        alpha: smoothing constant of the running average of squared gradients.
        eps: term added to the denominator for numerical stability.
        weight_decay: L2 penalty coefficient, stored as `self.lam`.
        momentum: momentum factor, stored as `self.mu`.
        '''
        self.alpha = alpha
        self.eps = eps
        self.gamma = lr
        self.mu = momentum
        self.lam = weight_decay
        self.thetas = list(params)
        # Per-parameter state: last effective gradient, running mean of squared
        # gradients, and the momentum buffer.
        self.previous_g_t = [t.zeros_like(p) for p in self.thetas]
        self.previous_v_t = [t.zeros_like(p) for p in self.thetas]
        self.previous_b_t = [t.zeros_like(p) for p in self.thetas]
        # Number of calls to step() so far.
        self.t = 0

    def zero_grad(self) -> None:
        '''Reset every parameter's gradient to a fresh tensor of zeros.'''
        for param in self.thetas:
            param.grad = t.zeros_like(param)

    @t.inference_mode()
    def step(self) -> None:
        '''Apply one RMSprop update to every parameter that has a gradient.

        Parameters whose `.grad` is None are skipped. `param.grad` itself is never
        modified: weight decay is applied to a private copy.
        '''
        for i, param in enumerate(self.thetas):
            if param.grad is None:
                continue
            # Copy first so the in-place weight-decay add does not alter param.grad.
            g_t = param.grad.clone()

            if self.lam != 0:
                g_t += self.lam * param

            v_t = self.alpha * self.previous_v_t[i] + (1 - self.alpha) * (g_t ** 2)
            denom = v_t ** 0.5 + self.eps
            if self.mu > 0:
                b_t = self.mu * self.previous_b_t[i] + g_t / denom
                param -= self.gamma * b_t
                self.previous_b_t[i] = b_t
            else:
                param -= self.gamma * g_t / denom
            self.previous_g_t[i] = g_t
            self.previous_v_t[i] = v_t
        self.t += 1

    def __repr__(self) -> str:
        # Read the attributes __init__ actually sets (gamma / mu / lam).
        return f"lr={self.gamma}, momentum={self.mu}, alpha={self.alpha}, eps={self.eps}, weight_decay={self.lam}"
    

# Run the course-provided checks against the RMSprop class defined above.
utils.test_rmsprop(RMSprop)


Testing configuration:  {'lr': 0.1, 'alpha': 0.9, 'eps': 0.001, 'weight_decay': 0.0, 'momentum': 0.0}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.0}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.5}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.0}


In [60]:
class Adam:
    def __init__(
        self,
        params: Iterable[t.nn.parameter.Parameter],
        lr: float,
        betas: tuple[float, float],
        eps: float,
        weight_decay: float,
    ):
        '''Implements Adam.

        Like the PyTorch version, but assumes amsgrad=False and maximize=False
            https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam

        params: parameters to optimise (materialised into a list).
        lr: learning rate, stored as `self.gamma`.
        betas: (beta1, beta2) decay rates for the first and second moment estimates.
        eps: term added to the denominator for numerical stability.
        weight_decay: L2 penalty coefficient, stored as `self.lam`.
        '''
        self.gamma = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.lam = weight_decay
        self.thetas = list(params)
        # Per-parameter state: last effective gradient and the two moment estimates.
        self.previous_g_t = [t.zeros_like(p) for p in self.thetas]
        self.previous_m_t = [t.zeros_like(p) for p in self.thetas]
        self.previous_v_t = [t.zeros_like(p) for p in self.thetas]
        # 1-based step counter: it is the exponent in the bias correction, so it
        # must already be 1 during the first call to step().
        self.t = 1

    def zero_grad(self) -> None:
        '''Reset every parameter's gradient to a fresh tensor of zeros.'''
        for param in self.thetas:
            param.grad = t.zeros_like(param)

    @t.inference_mode()
    def step(self) -> None:
        '''Apply one Adam update to every parameter that has a gradient.

        Parameters whose `.grad` is None are skipped. `param.grad` itself is never
        modified: weight decay is applied to a private copy.
        '''
        for i, param in enumerate(self.thetas):
            if param.grad is None:
                continue
            # maximize=false, so the gradient is used as-is. Copy it first so the
            # in-place weight-decay add does not alter param.grad.
            g_t = param.grad.clone()

            if self.lam != 0:
                g_t += self.lam * param

            m_t = self.beta1 * self.previous_m_t[i] + (1 - self.beta1) * g_t
            v_t = self.beta2 * self.previous_v_t[i] + (1 - self.beta2) * g_t ** 2

            # Bias-corrected moment estimates.
            m_t_u = m_t / (1 - self.beta1 ** self.t)
            v_t_u = v_t / (1 - self.beta2 ** self.t)

            param -= self.gamma * m_t_u / (v_t_u ** 0.5 + self.eps)

            self.previous_g_t[i] = g_t
            self.previous_m_t[i] = m_t
            self.previous_v_t[i] = v_t
        self.t += 1

    def __repr__(self) -> str:
        # Report Adam's own hyperparameters, read from the attributes __init__ sets.
        return f"lr={self.gamma}, betas=({self.beta1}, {self.beta2}), eps={self.eps}, weight_decay={self.lam}"


# Run the course-provided checks against the Adam class defined above.
utils.test_adam(Adam)


Testing configuration:  {'lr': 0.1, 'betas': (0.8, 0.95), 'eps': 0.001, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'betas': (0.8, 0.9), 'eps': 0.001, 'weight_decay': 0.05}

Testing configuration:  {'lr': 0.2, 'betas': (0.9, 0.95), 'eps': 0.01, 'weight_decay': 0.08}


## Learning Rate Schedulers

In [62]:
class ExponentialLR():
    def __init__(self, optimizer, gamma):
        '''Implements ExponentialLR.

        Like the PyTorch version, but assumes last_epoch=-1 and verbose=False
            https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ExponentialLR.html

        optimizer: object whose learning rate lives in its `gamma` attribute.
        gamma: multiplicative decay factor applied on every call to step().
        '''
        self.optimizer, self.gamma = optimizer, gamma

    def step(self):
        '''Scale the optimizer's learning rate by the decay factor.'''
        decayed_lr = self.optimizer.gamma * self.gamma
        self.optimizer.gamma = decayed_lr

    def __repr__(self):
        return "gamma={}".format(self.gamma)

# Run the course-provided checks for ExponentialLR, driving the SGD class defined earlier.
utils.test_ExponentialLR(ExponentialLR, SGD)

Testing ExponentialLR, training loop has 30 epochs, 4 batches per epoch

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  gamma=0.5

Testing configuration:
	optimizer:  lr=0.01, momentum=0.9, weight_decay=0.1 
	scheduler:  gamma=0.5

All tests in `test_ExponentialLR` passed!


In [63]:
class StepLR():
    def __init__(self, optimizer, step_size, gamma=0.1):
        '''Implements StepLR.

        Like the PyTorch version, but assumes last_epoch=-1 and verbose=False
            https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html

        optimizer: object whose learning rate lives in its `gamma` attribute.
        step_size: number of step() calls between consecutive decays.
        gamma: multiplicative decay factor.
        '''
        self.optimizer = optimizer
        self.step_size = step_size
        self.gamma = gamma
        # Calls to step() since the last decay.
        self.steps = 0

    def step(self):
        '''Count one call; on every `step_size`-th call decay the learning rate and restart the count.'''
        self.steps += 1

        if self.steps != self.step_size:
            return

        self.optimizer.gamma = self.optimizer.gamma * self.gamma
        self.steps = 0

    def __repr__(self):
        return "gamma={}, step_size={}".format(self.gamma, self.step_size)

# Run the course-provided checks for StepLR, driving the SGD class defined earlier.
utils.test_StepLR(StepLR, SGD)

Testing StepLR, training loop has 30 epochs, 4 batches per epoch

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  step_size=30, gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  step_size=3, gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  step_size=1, gamma=0.5

Testing configuration:
	optimizer:  lr=0.01, momentum=0.9, weight_decay=0.1 
	scheduler:  step_size=3, gamma=0.5

All tests in `test_StepLR` passed!


In [64]:
class MultiStepLR():
    def __init__(self, optimizer, milestones, gamma=0.1):
        '''Implements MultiStepLR.

        Like the PyTorch version, but assumes last_epoch=-1 and verbose=False
            https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.MultiStepLR.html

        optimizer: object whose learning rate lives in its `gamma` attribute.
        milestones: collection of step counts at which the learning rate is decayed.
        gamma: multiplicative decay factor.
        '''
        self.optimizer = optimizer
        self.milestones = milestones
        self.gamma = gamma
        # Total number of step() calls so far.
        self.steps = 0

    def step(self):
        '''Count one call and decay the learning rate if the count is a milestone.'''
        self.steps += 1

        if self.steps not in self.milestones:
            return

        self.optimizer.gamma = self.optimizer.gamma * self.gamma

    def __repr__(self):
        return "gamma={}, milestones={}".format(self.gamma, self.milestones)

# Run the course-provided checks for MultiStepLR, driving the SGD class defined earlier.
utils.test_MultiStepLR(MultiStepLR, SGD)

Testing MultiStepLR, training loop has 30 epochs, 4 batches per epoch

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  milestones=[40], gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  milestones=[10], gamma=0.5

Testing configuration:
	optimizer:  lr=0.01, momentum=0.9, weight_decay=0.1 
	scheduler:  milestones=[10, 15], gamma=0.5

All tests in `test_MultiStepLR` passed!
