In [None]:
# default_exp learner

In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.notebook.showdoc import show_doc

# Optimizer

> Define the general fastai optimizer and the variants

## Optimizer -

In [None]:
# export
class Optimizer():
    "Base optimizer class for the fastai library, updating `params` with `steppers`"
    def __init__(self, params, steppers, **defaults):
        steppers,params = L(steppers),L(params)
        # Explicit kwargs win over stepper defaults; earlier steppers win over later ones
        for stepper in steppers:
            merged = dict(getattr(stepper, 'defaults', {}))
            merged.update(defaults)
            defaults = merged
        # A flat collection of params is wrapped into a single group
        is_grouped = isinstance(params[0], (L,list))
        self.param_groups = params if is_grouped else L([params])
        self.step_func = compose(*steppers)
        # One independent copy of the hyper-parameters per param group
        self.hypers = L(dict(defaults) for _ in self.param_groups)

    def grad_params(self):
        "Return the `(param, hypers)` pairs of every param with a grad, group after group"
        pairs = []
        for group,hyper in zip(self.param_groups, self.hypers):
            pairs.extend((p,hyper) for p in group if p.grad is not None)
        return pairs

    def zero_grad(self):
        "Detach then zero the grad of each parameter that has one"
        for p,_ in self.grad_params():
            grad = p.grad
            grad.detach_()
            grad.zero_()

    def step(self):
        "Call the composed steppers on each parameter that has a grad, with its group's hypers"
        for p,hyper in self.grad_params():
            self.step_func(p, **hyper)

### Initializing an Optimizer

`params` will be used to create the `param_groups` of the optimizer. If it's a collection (or a generator) of parameters, it will be a `L` containing one `L` with all the parameters. To define multiple parameter groups, `params` should be passed as a collection (or a generator) of `L`s.

> Note: In PyTorch, `model.parameters()` returns a generator with all the parameters, that you can directly pass to `Optimizer`.

In [None]:
# A flat collection of params becomes a single param group
opt = Optimizer([1,2,3], noop)
test_eq(opt.param_groups, [[1,2,3]])
# Other iterables are accepted too, since `params` is wrapped in an `L` first
opt = Optimizer(range(3), noop)
test_eq(opt.param_groups, [[0,1,2]])
# A collection of lists is kept as is: one param group per inner list
opt = Optimizer([[1,2],[3]], noop)
test_eq(opt.param_groups, [[1,2],[3]])
# Same thing when the groups come from a generator
opt = Optimizer(([o,o+1] for o in range(0,4,2)), noop)
test_eq(opt.param_groups, [[0,1],[2,3]])

`steppers` is a list of functions that will be composed when applying the step. For instance, you can compose a function making the SGD step with one applying weight decay. Additionally, each `stepper` can have a `defaults` attribute that contains hyper-parameters and their default values. Those are all gathered at initialization, and new values can be passed to override those defaults with the `defaults` kwargs.

Once the defaults have all been pulled off, they are copied as many times as there are `param_groups` and stored in `hypers`. To apply different hyper-parameters to different groups (differential learning rates, or no weight decay for certain layers for instance), you will need to adjust those values after the init.

In [None]:
def tst_arg(p, lr=0, **kwargs):
    "Dummy stepper returning `p` untouched; it only exists to carry a default for `lr`"
    return p
tst_arg.defaults = {'lr': 1e-2}

# Hyper-parameters default to the values found in the stepper's `defaults` attribute
opt = Optimizer([1,2,3], tst_arg)
test_eq(opt.hypers, [{'lr': 1e-2}])
# Keyword arguments passed at init override those defaults
opt = Optimizer([1,2,3], tst_arg, lr=0.1)
test_eq(opt.hypers, [{'lr': 0.1}])
# With several param groups, there is one dict of hyper-parameters per group
opt = Optimizer([[1,2],[3]], tst_arg)
test_eq(opt.hypers, [{'lr': 1e-2}, {'lr': 1e-2}])
opt = Optimizer([[1,2],[3]], tst_arg, lr=0.1)
test_eq(opt.hypers, [{'lr': 0.1}, {'lr': 0.1}])

### Basic steppers

To be able to give examples of optimizer steps, we will need some steppers, like the following:

In [None]:
#export
def sgd_step(p, lr, **kwargs):
    "Basic SGD step: subtract `lr` times the gradient from `p` in place, then return `p`"
    # `add_(other, alpha=...)` replaces the deprecated `add_(scalar, tensor)` overload
    p.data.add_(p.grad.data, alpha=-lr)
    return p

In [None]:
def tst_param(val, grad):
    "Build a float tensor holding `val` whose `grad` attribute holds `grad`, for testing"
    param = tensor([val]).float()
    param.grad = tensor([grad]).float()
    return param

In [None]:
p = tst_param(1., 0.1)
p = sgd_step(p, 1.)
# p moves to 1. - lr*grad = 1. - 1.*0.1 = 0.9
test_eq(p, tensor([0.9]))
# The gradient itself is left untouched by the step
test_eq(p.grad, tensor([0.1]))

In [None]:
#export
def weight_decay(p, lr, wd, **kwargs):
    "Shrink `p` in place by the factor `1 - lr*wd` (weight decay) and return it"
    decay = 1 - lr*wd
    p.data.mul_(decay)
    return p
weight_decay.defaults = {'wd': 0.}

In [None]:
p = tst_param(1., 0.1)
p = weight_decay(p, 1., 0.1)
# p is scaled by 1 - lr*wd = 1 - 1.*0.1 = 0.9
test_eq(p, tensor([0.9]))
# Weight decay acts on the parameter only, the gradient is unchanged
test_eq(p.grad, tensor([0.1]))

In [None]:
#export
def l2_reg(p, lr, wd, **kwargs):
    "L2 regularization as adding `wd*p` to `p.grad`"
    # `lr` is accepted for signature parity with the other steppers but is not used here.
    # `add_(other, alpha=...)` replaces the deprecated `add_(scalar, tensor)` overload
    p.grad.data.add_(p.data, alpha=wd)
    return p
l2_reg.defaults = dict(wd=0.)

In [None]:
p = tst_param(1., 0.1)
p = l2_reg(p, 1., 0.1)
# L2 regularization acts on the gradient only, the parameter is unchanged
test_eq(p, tensor([1.]))
# The gradient becomes grad + wd*p = 0.1 + 0.1*1. = 0.2
test_eq(p.grad, tensor([0.2]))

> Warning: Weight decay and L2 regularization are the same thing for basic SGD, but for more complex optimizers, they are very different. See [Decoupled Weight Decay Regularization](https://arxiv.org/abs/1711.05101) for more information.

### Making the step

In [None]:
# Display the signature and docstring of `Optimizer.step`
show_doc(Optimizer.step)

<h4 id="Optimizer.step" class="doc_header"><code>Optimizer.step</code><a href="https://github.com/fastai/fastai_docs/tree/master/dev/__main__.py#L22" class="source_link" style="float:right">[source]</a></h4>

> <code>Optimizer.step</code>()

Execute steppers on all parameters that have a grad

This method will loop over all param groups, then all parameters for which `grad` is not None and call each function in `steppers`, passing it the parameter `p` with the hyper-parameters in the corresponding dict in `hypers`.

In [None]:
#test basic step
# Helper building four fresh params with values 0..3 and gradients 0.1*value
def tst_params(): return [tst_param(i, 0.1*i) for i in range(4)]

params = tst_params()
opt = Optimizer(params, sgd_step, lr=0.1)
opt.step()
# Each param i moves to i - lr*grad = i - 0.1*(0.1*i) = 0.99*i
test_close([p.item() for p in params], [i*0.99 for i in range(4)])

In [None]:
#test two steps
params = tst_params()
opt = Optimizer(params, [weight_decay, sgd_step], lr=0.1, wd=0.1)
opt.step()
# Weight decay scales by 1 - lr*wd = 0.99 and the SGD step removes lr*grad = 0.01*i,
# so each param ends up at about 0.98*i
test_close([p.item() for p in params], [i*0.98 for i in range(4)])

In [None]:
#test None gradients are ignored
params = tst_params()
opt = Optimizer(params, sgd_step, lr=0.1)
# Remove the gradient of the last param so the step skips it
params[-1].grad = None
opt.step()
# The first three params are updated to 0.99*i, the last one keeps its value of 3.
test_close([p.item() for p in params], [0., 0.99, 1.98, 3.])

In [None]:
#test discriminative lrs
params = tst_params()
opt = Optimizer([params[:2], params[2:]], sgd_step, lr=0.1)
# Lower the learning rate of the first group only
opt.hypers[0]['lr'] = 0.01
opt.step()
# First group steps with lr=0.01 (0 -> 0, 1 -> 0.999), second with lr=0.1 (2 -> 1.98, 3 -> 2.97)
test_close([p.item() for p in params], [0., 0.999, 1.98, 2.97])

In [None]:
# Display the signature and docstring of `Optimizer.zero_grad`
show_doc(Optimizer.zero_grad)

<h4 id="Optimizer.zero_grad" class="doc_header"><code>Optimizer.zero_grad</code><a href="https://github.com/fastai/fastai_docs/tree/master/dev/__main__.py#L16" class="source_link" style="float:right">[source]</a></h4>

> <code>Optimizer.zero_grad</code>()

Zero all the grad attributes of the parameters

In [None]:
params = tst_params()
opt = Optimizer(params, [weight_decay, sgd_step], lr=0.1, wd=0.1)
opt.zero_grad()
# Every gradient must be zero afterwards
[test_eq(p.grad, tensor([0.])) for p in params];

In [None]:
# Display the signature and docstring of `Optimizer.grad_params`
show_doc(Optimizer.grad_params)

<h4 id="Optimizer.grad_params" class="doc_header"><code>Optimizer.grad_params</code><a href="https://github.com/fastai/fastai_docs/tree/master/dev/__main__.py#L11" class="source_link" style="float:right">[source]</a></h4>

> <code>Optimizer.grad_params</code>()

Helper function to loop over param groups then params that have a grad

This is used by `Optimizer.step` and `Optimizer.zero_grad` to loop first over the `param_groups` then over all the parameters that have a gradient, and return the tuples `(p,hyper)` where `hyper` is the dictionary of hyper-parameters associated to the parameter group `p` is in.

In [None]:
params = tst_params()
opt = Optimizer([params[:2], params[2:]], sgd_step, lr=0.1)
# Only the hyper-parameters of the first group are changed
opt.hypers[0]['lr'] = 0.01
# All four params have a grad, so each comes back paired with the hypers of its own group
test_eq(opt.grad_params(), [(tensor([0.]), {'lr': 0.01}),
                            (tensor([1.]), {'lr': 0.01}),
                            (tensor([2.]), {'lr': 0.1}),
                            (tensor([3.]), {'lr': 0.1})])

## StatefulOptimizer -

In [None]:
#export
class StatefulOptimizer(Optimizer):
    "`Optimizer` that can have state through `stats`"
    def __init__(self, params, steppers, stats=None, **defaults):
        # `stats` are callables invoked as `stat(state, p, **hypers)` that return the updated state
        self.stats = L(stats)
        # Gather the defaults declared by the stats; explicit kwargs and earlier stats take priority
        for stat in self.stats: defaults = {**getattr(stat, 'defaults', {}), **defaults}
        super().__init__(params, steppers, **defaults)
        # Maps each parameter to its own state dict
        self.state = {}

    def step(self):
        "Update the stats and execute the steppers on all parameters that have a grad"
        for p,hyper in self.grad_params():
            # A parameter seen for the first time starts from an empty state
            state = self.state.get(p, {})
            for stat in self.stats: state = stat(state, p, **hyper)
            # Steppers receive the state entries as keyword arguments next to the hyper-parameters
            self.step_func(p, **state, **hyper)
            self.state[p] = state

    def _init_state(self, p):
        "Create a state for p and call all the statistics to initialize it"
        state = {}
        # Stats without an `init_state` attribute contribute nothing. The fallback must accept `p`,
        # since it is called exactly like a real `init_state` would be.
        for stat in self.stats: state = {**getattr(stat, "init_state", lambda p:{})(p), **state}
        self.state[p] = state

The difference between a `StatefulOptimizer` and an `Optimizer` is that a `StatefulOptimizer` keeps a state for things like moving averages of gradients. It does so with `stats`, which are functions that take the state associated to a parameter, that parameter, plus the optimizer hyper-parameters, and return the updated state. That state can then be used by any stepper. It is initialized to an empty dictionary the first time we try to access it, then the `stat` function will have to properly initialize it.

In [None]:
def tst_stat(state, p, **kwargs):
    "Test statistic keeping the running sum of `p` in `state['sum']`"
    running = state['sum'] if 'sum' in state else torch.zeros_like(p)
    state['sum'] = running + p.data
    return state
tst_stat.defaults = dict(mom=0.9)

#Test StatefulOptimizer init
# The `defaults` of a stat end up in the hyper-parameters...
opt = StatefulOptimizer([1,2,3], noop, stats=tst_stat)
test_eq(opt.hypers, [{'mom': 0.9}])
# ...unless a keyword argument overrides them
opt = StatefulOptimizer([1,2,3], noop, stats=tst_stat, mom=0.99)
test_eq(opt.hypers, [{'mom': 0.99}])

#Test stat
x = torch.randn(4,5)
# The first call starts from an empty state, so the sum is just x
state = tst_stat({}, x)
assert 'sum' in state
test_eq(state['sum'], x)
# The second call accumulates on top of the stored sum
state = tst_stat(state, x)
test_eq(state['sum'], 2*x)

## Statistics

In [None]:
# export
def average_grad(state, p, mom, dampening=False, **kwargs):
    "Keep track of the moving average of the gradients of `p` in `state['grad_avg']`, using `mom`"
    # Stats are called as `stat(state, p, **hypers)` and must return the state they updated.
    # The average starts at zero the first time a parameter is seen.
    if 'grad_avg' not in state: state['grad_avg'] = torch.zeros_like(p.grad.data)
    # With dampening the new gradient is weighted by `1-mom`, otherwise it is added in full
    damp = 1-mom if dampening else 1.
    # `add_(other, alpha=...)` replaces the deprecated `add_(scalar, tensor)` overload
    state['grad_avg'].mul_(mom).add_(p.grad.data, alpha=damp)
    return state
average_grad.defaults = dict(mom=0.9)

In [None]:
#export
def momentum_step(p, lr, grad_avg, **kwargs):
    "SGD step with momentum: subtract `lr` times `grad_avg` from `p` in place, then return `p`"
    # `add_(other, alpha=...)` replaces the deprecated `add_(scalar, tensor)` overload
    p.data.add_(grad_avg, alpha=-lr)
    return p