In [None]:
#|default_exp sgd

# Accelerated SGD

In [None]:
#|export
import torch

from miniai.datasets import *
from miniai.conv import *
from miniai.learner import *
from miniai.activations import *
from miniai.init import *

In [None]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
import fastcore.all as fc
from collections.abc import Mapping
from pathlib import Path
from operator import attrgetter,itemgetter
from functools import partial
from copy import copy
from contextlib import contextmanager

import torchvision.transforms.functional as TF,torch.nn.functional as F
from torch import tensor,nn,optim
from torch.utils.data import DataLoader,default_collate
from torch.nn import init
from torch.optim import lr_scheduler
from torcheval.metrics import MulticlassAccuracy
from datasets import load_dataset,load_dataset_builder

from miniai.datasets import *
from miniai.conv import *
from miniai.learner import *
from miniai.activations import *
from miniai.init import *

In [None]:
from fastcore.test import test_close

# Compact tensor printing: 2 decimals, wide lines, no scientific notation.
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)

import logging
# Suppress log records at WARNING severity and below.
logging.disable(logging.WARNING)

# NOTE(review): `set_seed` comes from miniai; if it also seeds torch, it supersedes
# the `torch.manual_seed(1)` call above — confirm and drop the redundant one.
set_seed(42)

In [None]:
# Names of the input and target columns in the dataset dict.
xl, yl = 'image', 'label'
name = "fashion_mnist"
dsd = load_dataset(name)

bs = 1024
# Normalization constants applied to every image tensor.
xmean, xstd = 0.28, 0.35

@inplace
def transformi(b):
    "Replace the images in batch `b` with tensors normalized by `xmean` and `xstd`."
    normed = []
    for img in b[xl]:
        normed.append((TF.to_tensor(img) - xmean) / xstd)
    b[xl] = normed

# The transform is attached to the dataset dict, then wrapped in dataloaders.
tds = dsd.with_transform(transformi)
dls = DataLoaders.from_dd(tds, bs, num_workers=4)

In [None]:
# Track multiclass accuracy as a metric.
metrics = MetricsCB(accuracy=MulticlassAccuracy())
# Presumably restricts activation statistics to `GeneralRelu` modules — confirm against `ActivationStats`.
astats = ActivationStats(fc.risinstance(GeneralRelu))
# Callback list shared by the training runs below.
cbs = [DeviceCB(), metrics, ProgressCB(plot=True), astats]
# Activation and weight-init factories configured with the same 0.1 leak.
act_gr = partial(GeneralRelu, leak=0.1, sub=0.4)
iw = partial(init_weights, leaky=0.1)
# Callback list for LR-finder runs; not used in the cells visible in this notebook.
lrf_cbs = [DeviceCB(), LRFinderCB()]

## Optimizers

### SGD

In [None]:
class SGD:
    """Plain stochastic gradient descent with optional weight decay.

    `params` may be any iterable of tensors; `lr` is the learning rate and `wd` the
    weight-decay factor. Subclasses override `opt_step` (and optionally `reg_step`)
    to change the update rule. `i` counts completed calls to `step`.
    """
    def __init__(self, params, lr, wd=0.):
        # Materialize `params` so a generator can be iterated again on every step.
        self.params = list(params)
        self.lr,self.wd = lr,wd
        self.i = 0

    def step(self):
        "Apply `reg_step` then `opt_step` to every parameter that has a gradient."
        with torch.no_grad():
            for p in self.params:
                # Frozen or unused parameters have `grad is None`; there is nothing to apply.
                if p.grad is None: continue
                self.reg_step(p)
                self.opt_step(p)
        # Incremented once per call, after all parameters have been updated.
        self.i +=1

    def opt_step(self, p): p -= p.grad * self.lr
    def reg_step(self, p):
        # Weight decay applied directly to the weights rather than through the gradient.
        if self.wd != 0: p *= 1 - self.lr*self.wd

    def zero_grad(self):
        "Zero the gradient of every parameter that has one."
        for p in self.params:
            if p.grad is not None: p.grad.data.zero_()

### Main Ideas of the `SGD` Class

1. **Initialization (`__init__`):**
   - **Parameters:** Takes a list of parameters (`params`), a learning rate (`lr`), and an optional weight decay (`wd`).
   - **Setup:** Initializes the list of parameters and stores attributes for later use. A counter `i` is also initialized.

2. **Optimization Step (`step`):**
   - **No Gradient Tracking:** Executes the optimization without tracking gradients (`torch.no_grad()`).
   - **Parameter Updates:** For each parameter, performs regularization and optimization steps. The counter `i` is incremented after each step.

3. **Optimization Step (`opt_step`):**
   - **Parameter Update:** Updates each parameter by subtracting the gradient scaled by the learning rate (`p -= p.grad * self.lr`).

4. **Regularization Step (`reg_step`):**
   - **Weight Decay:** If weight decay is applied, it adjusts the parameters by multiplying them by `(1 - lr * wd)`.

5. **Zero Gradient (`zero_grad`):**
   - **Gradient Reset:** Sets the gradients of all parameters to zero to prepare for the next optimization step.

In [None]:
# Reseed before building the model; the same seed is used before every optimizer run below.
set_seed(42)
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
# `opt_func=SGD` plugs the hand-written optimizer class into the learner.
learn = TrainLearner(model, dls, F.cross_entropy, lr=0.4, cbs=cbs, opt_func=SGD)

In [None]:
learn.fit(3)

Consider the difference between *weight decay* and *L2 regularization*:

``` python
weight -= lr*wd*weight
```

...vs...

``` python
weight.grad += wd*weight
```

### Momentum

In [None]:
# 100 noisy samples of a downward parabola on [-4, 4]; the noise has standard deviation 0.1.
xs = torch.linspace(-4, 4, 100)
ys = 1 - (xs/3) ** 2 + torch.randn(100) * 0.1

In [None]:
# One panel per smoothing factor: scatter the raw points and overlay their
# exponentially weighted moving average in red.
_,axs = plt.subplots(2,2, figsize=(12,8))
betas = [0.5,0.7,0.9,0.99]
for beta,ax in zip(betas, axs.flatten()):
    ax.scatter(xs,ys)
    # The average starts at 0 and is not bias-corrected, so early values are pulled
    # toward 0, more strongly for larger beta.
    avg,res = 0,[]
    for yi in ys:
        avg = beta*avg + (1-beta)*yi
        res.append(avg)
    ax.plot(xs,np.array(res), color='red');
    ax.set_title(f'beta={beta}')

In [None]:
class Momentum(SGD):
    "SGD that steps along an exponentially weighted moving average of the gradients."
    def __init__(self, params, lr, wd=0., mom=0.9):
        super().__init__(params, lr=lr, wd=wd)
        self.mom = mom

    def opt_step(self, p):
        # The running average is stored on the parameter itself and starts at zero.
        prev = p.grad_avg if hasattr(p, 'grad_avg') else torch.zeros_like(p.grad)
        p.grad_avg = prev*self.mom + p.grad*(1-self.mom)
        p -= self.lr * p.grad_avg

1. **Initialization (`__init__`):**
   - **Inherits from `SGD`:** Uses the base class `SGD` for its core functionality.
   - **Momentum Parameter (`mom`):** Introduces a momentum term (`mom`), which is used to update the moving average of gradients.

2. **Optimization Step (`opt_step`):**
   - **Gradient Averaging:**
     - **Initialization:** If a parameter does not already have a `grad_avg` attribute, it initializes it to zeros with the same shape as the gradient.
     - **Update Averaging:** Updates the moving average of gradients using the momentum term (`grad_avg = grad_avg * mom + grad * (1 - mom)`).
   - **Parameter Update:** Updates each parameter using the learning rate and the averaged gradient (`p -= self.lr * p.grad_avg`).


In [None]:
# Same seed and architecture as the SGD run, so only the optimizer and lr differ.
set_seed(42)
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
# Trained with lr=1.5 here, versus 0.4 for the plain SGD run.
learn = TrainLearner(model, dls, F.cross_entropy, lr=1.5, cbs=cbs, opt_func=Momentum)

In [None]:
learn.fit(3)

In [None]:
astats.color_dim()

### RMSProp

In [None]:
class RMSProp(SGD):
    "SGD whose step is divided by the root of a moving average of squared gradients."
    def __init__(self, params, lr, wd=0., sqr_mom=0.99, eps=1e-5):
        super().__init__(params, lr=lr, wd=wd)
        self.sqr_mom = sqr_mom
        self.eps = eps

    def opt_step(self, p):
        grad_sq = p.grad**2
        # On first use the running average is seeded with the current squared gradient.
        prev = p.sqr_avg if hasattr(p, 'sqr_avg') else grad_sq
        p.sqr_avg = prev*self.sqr_mom + grad_sq*(1-self.sqr_mom)
        # `eps` is added outside the square root to keep the denominator away from zero.
        p -= self.lr * p.grad/(p.sqr_avg.sqrt() + self.eps)

### Main Ideas of the `RMSProp` Class

1. **Initialization (`__init__`):**
   - **Inherits from `SGD`:** It leverages the base functionality of `SGD`.
   - **Additional Parameters:**
     - **Squared Gradient Momentum (`sqr_mom`)**: This controls how much of the previous squared gradients contribute to the current step (similar to exponential moving average).
     - **Epsilon (`eps`)**: A small constant added to avoid division by zero, ensuring numerical stability.

2. **Optimization Step (`opt_step`):**
   - **Squared Gradient Averaging:**
     - **Initialization:** If a parameter doesn't have a `sqr_avg` attribute, it initializes it to the squared gradient:
     $$ p.sqr\_avg = p.grad^2 $$
     - **Exponential Moving Average:** Updates the moving average of squared gradients, which gives more weight to recent gradient squares and less to past ones:  
     $$ p.sqr\_avg = p.sqr\_avg \times \text{sqr\_mom} + p.grad^2 \times (1 - \text{sqr\_mom}) $$
     
   - **Adaptive Learning Rate:**
     - The gradient is scaled down based on the square root of the moving average of squared gradients. This creates an adaptive learning rate, where large gradients are scaled down and small gradients are allowed to influence updates more:
     $$ p \mathrel{-=} \frac{\text{lr} \times p.grad}{\sqrt{p.sqr\_avg} + \epsilon} $$

### Key Insights:

1. **Why Use `sqr_avg`?**
   - The averaging of squared gradients helps to stabilize the learning process. If a gradient is large, its squared value will dominate, slowing down the update for that parameter. If the gradient is small, the update proceeds with less dampening. This adaptive adjustment of learning rates helps prevent exploding or vanishing gradients in different dimensions.

2. **The Role of `eps`:**
   - The epsilon term prevents division by zero, especially when the squared gradient values are very small. Without it, division by zero or extremely large updates could destabilize training.

3. **When is RMSProp Useful?**
   - **Adaptive Learning Rates:** RMSProp is particularly effective when the scale of the gradients varies significantly across dimensions, making it ideal for deep networks where gradient magnitudes can fluctuate wildly.
   - **Stabilization:** By adapting the learning rate based on the variance of past gradients, RMSProp can make training more robust and prevent overshooting, especially in non-convex loss surfaces.

### Why would a dominating squared value slow down updates?

1. **Larger Gradients = Larger Squared Values:**
   - When the gradient for a particular parameter \( p \) is large, its squared value will be even larger. This causes \( p.sqr\_avg \) to increase.
   - **Example:**  
     If \( p.grad = 2 \), then \( p.grad^2 = 4 \). This value adds more weight to the moving average \( p.sqr\_avg \).

2. **Impact on Learning Rate:**
   - The update rule for RMSProp is:
     $$ 
     p \mathrel{-=} \frac{\text{lr} \times p.grad}{\sqrt{p.sqr\_avg} + \epsilon} 
     $$
   - As \( p.sqr\_avg \) grows larger, the denominator \( \sqrt{p.sqr\_avg} \) increases.
   - A larger denominator reduces the size of the update step because the gradient \( p.grad \) is divided by a bigger number.

3. **Slowing Down Updates:**
   - When \( p.sqr\_avg \) is large (due to large squared gradients), the effective learning rate for that parameter becomes smaller. This **slows down the updates** for that parameter.
   - **Intuition:** If a gradient is consistently large, RMSProp interprets it as a sign that updates should be more cautious, preventing overshooting during optimization.

### Why is this good?
- **Adaptive Adjustment:** If a parameter experiences large fluctuations in its gradient, slowing down its update helps stabilize the training. Conversely, if a parameter has small gradients, RMSProp allows larger updates.
- **Preventing Exploding Gradients:** When gradients are large, updates can become too aggressive and lead to exploding gradients. RMSProp curbs this by reducing the effective learning rate for large gradients.

In [None]:
# Same seed and architecture as the earlier runs.
set_seed(42)
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
# RMSProp divides each step by the root of the running squared gradient; it is run
# with lr=3e-3, far smaller than the 0.4 / 1.5 used for SGD / Momentum.
learn = TrainLearner(model, dls, F.cross_entropy, lr=3e-3, cbs=cbs, opt_func=RMSProp)
learn.fit(3)

In [None]:
astats.color_dim()

### Adam

In [None]:
class Adam(SGD):
    "Bias-corrected moving averages of the gradient and of its square drive the step."
    def __init__(self, params, lr, wd=0., beta1=0.9, beta2=0.99, eps=1e-5):
        super().__init__(params, lr=lr, wd=wd)
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

    def opt_step(self, p):
        # Both running averages are stored on the parameter and start at zero.
        for state in ('avg', 'sqr_avg'):
            if not hasattr(p, state): setattr(p, state, torch.zeros_like(p.grad.data))
        # `self.i` counts completed steps, so the current step number is i+1.
        step_no = self.i+1
        p.avg = self.beta1*p.avg + (1-self.beta1)*p.grad
        p.sqr_avg = self.beta2*p.sqr_avg + (1-self.beta2)*(p.grad**2)
        # Undo the bias toward zero caused by the zero initialization.
        unbias_avg = p.avg / (1 - (self.beta1**step_no))
        unbias_sqr_avg = p.sqr_avg / (1 - (self.beta2**step_no))
        # Note that `eps` is added inside the square root here.
        p -= self.lr * unbias_avg / (unbias_sqr_avg + self.eps).sqrt()

### Adam Optimizer

Adam combines the ideas from both **Momentum** and **RMSProp** to provide a more robust and adaptive optimization algorithm. The key components are:

1. **Running Average of Gradients (Momentum-like Behavior):**
   $$ 
   p.avg = \beta_1 \cdot p.avg + (1 - \beta_1) \cdot p.grad
   $$
   - This computes an exponentially weighted moving average of the gradients, similar to Momentum, where \( \beta_1 \) controls the smoothness of this average.
   - \( p.avg \) accumulates the momentum of the gradient over time.

2. **Running Average of Squared Gradients (RMSProp-like Behavior):**
   $$ 
   p.sqr\_avg = \beta_2 \cdot p.sqr\_avg + (1 - \beta_2) \cdot (p.grad^2)
   $$
   - This tracks the moving average of squared gradients, where \( \beta_2 \) controls the smoothness of this moving average.
   - This part mimics the behavior of RMSProp by adapting the learning rate based on recent squared gradients.

3. **Bias-Correction:**
   - To counteract the bias introduced by initializing \( p.avg \) and \( p.sqr\_avg \) to zeros, we apply bias correction:
     $$
     \text{unbias\_avg} = \frac{p.avg}{1 - \beta_1^{t+1}}
     $$
     $$
     \text{unbias\_sqr\_avg} = \frac{p.sqr\_avg}{1 - \beta_2^{t+1}}
     $$

   - This ensures that early iterations don't underestimate the gradient values.

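4. **Update Step:**
   $$
   p \mathrel{-=} \frac{\text{lr} \cdot \text{unbias\_avg}}{\sqrt{\text{unbias\_sqr\_avg} + \epsilon}}
   $$

   - Note that the implementation above adds \( \epsilon \) *inside* the square root; the original Adam paper (and PyTorch) add it outside, i.e. \( \sqrt{\text{unbias\_sqr\_avg}} + \epsilon \). With the same \( \epsilon \) the two variants are not numerically equivalent.
   - Adam combines the two running averages (of the gradient and squared gradient) to scale the update.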
   - The bias-corrected running average of the gradient is divided by the square root of the bias-corrected squared gradient, ensuring that the update is adaptive and stable.

### Key Insights:

- **Adaptive Learning Rates:** Adam adjusts the learning rate for each parameter based on the gradient magnitudes and momentum, making it effective for sparse gradients or noisy data.
- **Bias Correction:** The bias correction ensures that early updates are accurate, even though the moving averages are initialized to zero.
- **Stability:** By combining momentum with RMSProp's adaptive learning, Adam offers a stable and efficient way to optimize deep networks.

In [None]:
# Same seed and architecture as the earlier runs.
set_seed(42)
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
# Hand-written Adam with lr=6e-3.
learn = TrainLearner(model, dls, F.cross_entropy, lr=6e-3, cbs=cbs, opt_func=Adam)
learn.fit(3)

## Schedulers

We've already seen how we can easily write a custom LR-adjusting callback or `Learner`, or can use the predefined PyTorch schedulers. We'll use the predefined ones for now since there's nothing new to learn in implementing them ourselves.

In [None]:
' '.join(o for o in dir(lr_scheduler) if o[0].isupper() and o[1].islower())

In [None]:
' '.join(filter(lambda x: x[0].isupper() and x[1].islower(), dir(lr_scheduler)))

In [None]:
# No `opt_func` is passed, so the learner's default optimizer is used.
# NOTE(review): `SingleBatchCB` presumably stops training after one batch — confirm in miniai.learner.
learn = TrainLearner(get_model(), dls, F.cross_entropy, lr=6e-3, cbs=[DeviceCB(), SingleBatchCB()])
# Fitting populates `learn.opt`, which the next cells inspect.
learn.fit(1)

In [None]:
opt = learn.opt
# Names of the optimizer's attributes and methods that do not start with an underscore.
' '.join(o for o in dir(opt) if o[0]!='_')

In [None]:
opt

In [None]:
# Take the model's first parameter and look up the state the optimizer keeps for it;
# `opt.state` is indexed by the parameter tensor itself.
param = next(iter(learn.model.parameters()))
st = opt.state[param]

In [None]:
st

In [None]:
len(opt.param_groups)

In [None]:
pg = opt.param_groups[0]

In [None]:
list(pg)

In [None]:
# Second positional argument is `T_max`: the schedule reaches its minimum LR after 100 scheduler steps.
sched = lr_scheduler.CosineAnnealingLR(opt, 100)

Cosine Annealing is a learning rate scheduler that reduces the learning rate following a cosine curve. This technique is designed to progressively lower the learning rate as training progresses, helping the model converge more effectively.

### Key Concepts:

1. **Cosine Function for Learning Rate:**
   The learning rate is updated according to a cosine function over a predefined number of iterations or epochs:
   $$
   \text{lr}(t) = \text{lr}_{\text{min}} + \frac{\text{lr}_{\text{max}} - \text{lr}_{\text{min}}}{2} \left( 1 + \cos\left( \frac{t}{T} \pi \right) \right)
   $$
   where:
   - \( t \) is the current time step (iteration or epoch).
   - \( T \) is the total number of time steps (iterations or epochs).
   - \( \text{lr}_{\text{max}} \) is the initial (maximum) learning rate, used at \( t = 0 \).
   - \( \text{lr}_{\text{min}} \) is the minimum learning rate, reached at \( t = T \).

2. **Smooth Decay:**
   The learning rate starts at its maximum value, then decays smoothly following the shape of the cosine function. This smooth decay helps the model avoid overshooting during training and leads to better convergence.

3. **Restarts (Optional):**
   Sometimes, the cosine annealing schedule includes restarts, where the learning rate is periodically reset to a higher value (or even the maximum). This allows the model to escape local minima and explore other regions of the loss landscape before continuing the decay. The length of each cycle can stay fixed or grow by a constant factor \( T_{\text{mult}} \geq 1 \) after every restart:
   $$
   T_{i+1} = T_{\text{mult}} \cdot T_i
   $$

### Advantages:
- **Gradual Learning Rate Reduction:** Cosine annealing offers a smooth and gradual decrease in the learning rate, which can help models converge better compared to abrupt drops.
- **Helps Escape Local Minima:** The optional restarts can provide a form of exploration during training, allowing the model to avoid getting stuck in local minima.

Cosine annealing is particularly useful in tasks where a smooth decay in the learning rate improves convergence.


In [None]:
sched.base_lrs

In [None]:
sched.get_last_lr()

In [None]:
def sched_lrs(sched, steps):
    "Step `sched` and its optimizer `steps` times, then plot the learning rates reported along the way."
    # `get_last_lr` is sampled once before any step and once after each step.
    history = [sched.get_last_lr()]
    for _ in range(steps):
        # The optimizer is stepped before the scheduler, the order PyTorch expects.
        sched.optimizer.step()
        sched.step()
        history.append(sched.get_last_lr())
    plt.plot(history)

In [None]:
# 110 steps exceeds the scheduler's T_max of 100, so the plot continues past the end of the annealing period.
sched_lrs(sched, 110)

### Scheduler callbacks

In [None]:
#|export
class BaseSchedCB(Callback):
    "Create a scheduler from the learner's optimizer at fit time; `_step` advances it only while training."
    def __init__(self, sched):
        # `sched` is a factory that is later called with the optimizer.
        self.sched = sched

    def before_fit(self, learn):
        self.schedo = self.sched(learn.opt)

    def _step(self, learn):
        if not learn.training: return
        self.schedo.step()

In [None]:
#|export
class BatchSchedCB(BaseSchedCB):
    "Call `_step` after every batch."
    def after_batch(self, learn):
        self._step(learn)

In [None]:
#|export
class HasLearnCB(Callback):
    "Keep a reference to the learner on `self.learn` for the duration of a fit."
    def before_fit(self, learn):
        self.learn = learn

    def after_fit(self, learn):
        # Drop the reference once fitting has finished.
        self.learn = None

In [None]:
#|export
class RecorderCB(Callback):
    "Record, after every training batch, the value of each function passed as a keyword argument."
    def __init__(self, **d):
        # Maps a record name to a function that is called with this callback.
        self.d = d

    def before_fit(self, learn):
        # Start from empty records on every fit and track the first param group.
        self.recs = {name: [] for name in self.d}
        self.pg = learn.opt.param_groups[0]

    def after_batch(self, learn):
        if learn.training:
            for name, fn in self.d.items(): self.recs[name].append(fn(self))

    def plot(self):
        # `legend` and `show` run inside the loop, once per recorded quantity.
        for name, vals in self.recs.items():
            plt.plot(vals, label=name)
            plt.legend()
            plt.show()

In [None]:
def _lr(cb):
    "Return the `lr` entry of the param group stored on `cb.pg`."
    group = cb.pg
    return group['lr']

In [None]:
len(dls.train)

In [None]:
# Total number of training batches over 3 epochs, used as the length of the cosine schedule.
tmax = 3 * len(dls.train)
# A factory rather than a scheduler: the scheduler callback calls it with the optimizer at fit time.
sched = partial(lr_scheduler.CosineAnnealingLR, T_max=tmax)

In [None]:
set_seed(42)
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
# Record the learning rate of the first param group after every training batch.
rec = RecorderCB(lr=_lr)
# The cosine schedule is stepped after each training batch.
xtra = [BatchSchedCB(sched),rec]
learn = TrainLearner(model, dls, F.cross_entropy, lr=2e-2, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(3)

In [None]:
rec.plot()

In [None]:
#|export
class EpochSchedCB(BaseSchedCB):
    "Call `_step` after every epoch."
    def after_epoch(self, learn):
        self._step(learn)

In [None]:
# T_max=3 matches the 3 epochs fitted below, since the scheduler is now stepped per epoch.
sched = partial(lr_scheduler.CosineAnnealingLR, T_max=3)
set_seed(42)
# `rec` is reused from the previous run; its records are reset in `before_fit`.
xtra = [EpochSchedCB(sched),rec]
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
learn = TrainLearner(model, dls, F.cross_entropy, lr=2e-2, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(3)

In [None]:
rec.plot()

### 1cycle training

[Paper](https://arxiv.org/abs/1803.09820) by Leslie Smith.

1Cycle is a training strategy that adjusts the learning rate and momentum dynamically throughout the training process. It was popularized by Leslie Smith and is particularly effective for faster convergence and better generalization in deep learning models.

### Key Concepts:

1. **Learning Rate Schedule:**
   The learning rate is increased in the first half of the training and then decreased in the second half, following a triangular shape:
   $$
   \text{lr}(t) =
   \begin{cases}
   \text{lr}_{\text{min}} + (\text{lr}_{\text{max}} - \text{lr}_{\text{min}}) \cdot \frac{t}{T/2} & \text{if } t \leq T/2 \\
   \text{lr}_{\text{max}} - (\text{lr}_{\text{max}} - \text{lr}_{\text{min}}) \cdot \frac{t - T/2}{T/2} & \text{if } t > T/2
   \end{cases}
   $$
   - \( \text{lr}_{\text{max}} \): Maximum learning rate (reached halfway through training).
   - \( \text{lr}_{\text{min}} \): Minimum learning rate (start and end of the cycle).
   - \( t \): Current time step.
   - \( T \): Total training time.

2. **Momentum Schedule:**
   Momentum follows an inverse pattern of the learning rate. When the learning rate is high, momentum is low, and vice versa:
   $$
   \text{mom}(t) =
   \begin{cases}
   \text{mom}_{\text{max}} - (\text{mom}_{\text{max}} - \text{mom}_{\text{min}}) \cdot \frac{t}{T/2} & \text{if } t \leq T/2 \\
   \text{mom}_{\text{min}} + (\text{mom}_{\text{max}} - \text{mom}_{\text{min}}) \cdot \frac{t - T/2}{T/2} & \text{if } t > T/2
   \end{cases}
   $$
   - \( \text{mom}_{\text{max}} \): Maximum momentum.
   - \( \text{mom}_{\text{min}} \): Minimum momentum.

3. **Super-convergence:**
   The key insight behind 1Cycle is the concept of super-convergence. By increasing the learning rate aggressively and then decreasing it, the model is pushed to learn faster and converge more effectively. The initial increase helps the model escape poor local minima, and the decrease ensures fine-tuning at the end of training.

4. **Key Insights:**
   - **Fast Learning:** The initial learning rate increase encourages the model to explore the parameter space more broadly.
   - **Effective Fine-tuning:** The final learning rate decrease allows the model to settle into a good minimum.
   - **Improved Generalization:** This cyclical pattern helps improve the generalization of the model, often leading to better performance on unseen data.

### Advantages:
- **Faster Training:** Helps models converge faster without compromising accuracy.
- **Less Hyperparameter Tuning:** Reduces the need for manually adjusting the learning rate, as it adapts throughout the cycle.
- **Better Generalization:** Leads to more robust models that generalize well to new data.

1Cycle is particularly useful for large-scale deep learning tasks where achieving faster convergence is important.


In [None]:
def _beta1(cb):
    "Return the first element of the `betas` entry of the param group stored on `cb.pg`."
    betas = cb.pg['betas']
    return betas[0]
rec = RecorderCB(lr=_lr, mom=_beta1)

In [None]:
set_seed(42)
lr,epochs = 6e-2,5
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
# The scheduler is stepped once per training batch, so the schedule length is
# epochs * batches-per-epoch.
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
# `rec` records both the learning rate and beta1 (the momentum term) per batch.
xtra = [BatchSchedCB(sched), rec]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
rec.plot()

## Export -

In [None]:
import nbdev; nbdev.nbdev_export()