# Cyclical LR policy
- Modified: Oct 26, 2019

This notebook experiments with the cyclical learning rate policy suggested by [Smith2017](https://arxiv.org/abs/1506.01186)

## Load libraries

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os,sys
import re

sys.dont_write_bytecode = True

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from pathlib import Path

import pdb, typing

import joblib


In [None]:
import holoviews as hv
import xarray as xr

from holoviews import opts
from holoviews.operation.datashader import datashade, shade, dynspread, rasterize
from holoviews.streams import Stream, param
from holoviews import streams

hv.notebook_extension('bokeh')
hv.Dimension.type_formatters[np.datetime64] = '%Y-%m-%d'

# Dashboards
import param as pm, panel as pn
pn.extension()

In [None]:
# Geoviews visualization default options
H,W, = 250,250
opts.defaults(
    opts.RGB(height=H, width=W, tools=['hover'], active_tools=['wheel_zoom']),
    opts.Image(height=H, width=W, tools=['hover'], active_tools=['wheel_zoom'], framewise=True),#axiswise=True ),
    opts.Points( tools=['hover'], active_tools=['wheel_zoom']),
    opts.Curve( tools=['hover'], active_tools=['wheel_zoom'], padding=0.1),

)

In [None]:
# Resolve the project folders relative to the kernel's current working directory.
this_nb_path = Path(os.getcwd())
ROOT = this_nb_path.parent
SCRIPTS = ROOT/'codes'
paths2add = [this_nb_path, SCRIPTS]

print("Project root: ", str(ROOT))
print("this nb path: ", str(this_nb_path))
print('Scripts folder: ', str(SCRIPTS))

# Prepend each folder to sys.path (only if it is not already there) so that
# modules stored in them can be imported from this notebook.
for p in paths2add:
    if str(p) not in sys.path:
        sys.path.insert(0, str(p))
        print(str(p), "added to the path\n")
        
# print(sys.path)

In [None]:
import ipywidgets
from ipywidgets import interact
def f(x):
    return x

interact(f, x=10)

In [None]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [None]:
class TriangleLR():
    """Triangular cyclical learning-rate schedule.

    The value returned for an iteration index moves linearly from `min_lr`
    (at the start of a cycle) to `max_lr` (after `stepsize` iterations) and
    back to `min_lr`; the pattern repeats every `2*stepsize` iterations.
    """

    def __init__(self, min_lr:float, max_lr:float, stepsize:int):
        """
        min_lr (float): lower bound of the learning rate
        max_lr (float): upper bound of the learning rate
        stepsize (int): half of one cycle, in iterations
            - cycle_length = 2*stepsize iterations
        """
        self.min_lr, self.max_lr, self.stepsize = min_lr, max_lr, stepsize

    def __call__(self, x:int):
        """
        Return the learning rate for iteration index `x`.

        x (int): iteration index
        """
        half_cycle = self.stepsize
        # Position inside the current cycle, then its distance from the peak
        # (the peak sits exactly `stepsize` iterations into the cycle).
        position = x % (2 * half_cycle)
        distance_from_peak = abs(position - half_cycle)
        slope = (self.min_lr - self.max_lr) / half_cycle
        return slope * distance_from_peak + self.max_lr

    def step(self):
        """No-op: the schedule keeps no state and is queried by iteration index."""
        return None
    
class ConstLR():
    """Learning-rate schedule that returns the same value for every iteration."""

    def __init__(self, lr):
        """
        lr: the learning rate handed back on every call
        """
        self.lr = lr

    def __call__(self, x:int):
        """
        Return the constant learning rate.

        x (int): iteration index (not used)
        """
        constant_rate = self.lr
        return constant_rate

    def step(self):
        """No-op: there is no state to advance."""
        return None

In [None]:
min_lr = 1e-3
max_lr = 1.
stepsize = 10
TLR = TriangleLR(min_lr, max_lr, stepsize)

In [None]:
TLR(0)

In [None]:
xs = np.arange(100)
ys = [TLR(x) for x in xs]
hv.Curve((xs, ys)).opts(width=1000)

## Set random seed
- https://github.com/pytorch/pytorch/issues/7068#issuecomment-484918113


In [None]:
import random 
def random_seed(seed_value, use_cuda):
    """Seed the NumPy, PyTorch and Python random number generators.

    seed_value: seed passed to every generator
    use_cuda: when truthy, also seed the CUDA generators and configure cuDNN
        to be deterministic with benchmarking turned off
    """
    # CPU-side generators
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    random.seed(seed_value)

    if not use_cuda:
        return

    # GPU-side generators and cuDNN behaviour
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
     

In [None]:
# check
for i in range(3):
    random_seed(0, False)
    lin = nn.Linear(1,1); 
    print(lin.weight.data.item(), lin.bias.data.item())

## Find a reasonable bound of learning rates
 
> "There is a simple way to estimate reasonable minimum and maximum boundary values with one training run of the network for a few epochs. It is a “LR range test”; run your model for several epochs while letting the learning rate increase linearly between low and high LR values. This test is enormously valuable whenever you are facing a new architecture or dataset" (Smith2017)

```python
c = 0
while c < 15:
    for i, (x,y) in enumerate(dl):
        if c >= 15:
            break
        print(i, x,y)
        c += 1

```

In [None]:
def found_soln(model, true_w, true_b, threshold=0.1):
    """Return True when the model's weight and bias are both close to the targets.

    Args:
    - model: object exposing scalar parameters via `model.weight.data.item()`
      and `model.bias.data.item()`
    - true_w, true_b: target weight and bias
    - threshold: maximum allowed relative error for each parameter

    The error is measured relative to the magnitude of the target, so negative
    targets are handled correctly (dividing by the signed target would make the
    ratio negative and the check would always pass). When a target is exactly
    zero the relative error is undefined, so the absolute error is compared
    against `threshold` instead.
    """
    def _error(estimate, target):
        # Relative error w.r.t. |target|; absolute error when target == 0.
        scale = abs(target)
        deviation = abs(estimate - target)
        return deviation / scale if scale > 0 else deviation

    weight_err = _error(model.weight.data.item(), true_w)
    bias_err = _error(model.bias.data.item(), true_b)
    return bool(weight_err < threshold and bias_err < threshold)

In [None]:
def lr_range_test(model, dataloader, optimizer, loss_fn,
                  lr_gen, n_iters, print_every=None):
    """Train `model` for `n_iters` iterations, setting the learning rate per iteration.

    Args:
    - model: module to train; when `print_every` is set it must expose
      `weight` and `bias` parameters (their gradients are printed)
    - dataloader: iterable of `(x, y)` batches; it is re-iterated as many
      times as needed to reach `n_iters`
    - optimizer: optimizer whose first parameter group's `lr` is overwritten
      on every iteration
    - loss_fn: callable `loss_fn(pred, y)` returning a scalar loss tensor
    - lr_gen: callable mapping the 0-based iteration index to a learning rate
    - n_iters: total number of optimisation steps to run
    - print_every: if truthy, print the lr and gradients every that many steps

    Returns:
    - (count, accs, losses): number of steps taken, an (always empty) list
      kept for interface compatibility, and the per-step loss values

    Raises:
    - ValueError: if `dataloader` yields no batches while steps remain
      (otherwise the outer loop would never terminate)
    """
    model.train()
    accs = []  # never filled in; kept so the return value keeps its 3-tuple shape
    losses = []
    count = 0
    while count < n_iters:
        n_batches = 0
        for x, y in dataloader:
            n_batches += 1
            # Add a trailing feature dimension. Done out-of-place so the batch
            # objects owned by the caller are not modified (an in-place
            # unsqueeze would grow their shape on every pass over a reusable
            # container of tensors).
            x = x.unsqueeze(-1)
            y = y.unsqueeze(-1)

            pred = model(x)
            loss = loss_fn(pred, y)
            losses.append(loss.item())

            # backprop (i.e. update the weights) using this iteration's lr
            lr = lr_gen(count)
            optimizer.param_groups[0]['lr'] = lr
            optimizer.zero_grad()

            loss.backward()
            optimizer.step()
            count += 1

            if print_every and count%print_every == 0:
                print(f'\nIter: {count}')
                print(f'lr:  {lr}')
                print(f"dW: {model.weight.grad.data}")
                print(f"db: {model.bias.grad.data}")

            if count >= n_iters:
                break

        if n_batches == 0:
            raise ValueError("dataloader yielded no batches; cannot run the requested iterations")

    return count, accs, losses

## Simple 1d data

In [None]:
class TableDS(Dataset):
    """Dataset over two parallel indexable sequences of inputs and targets."""

    def __init__(self, xs, ys):
        """
        xs: indexable collection of inputs
        ys: indexable collection of targets, indexed in step with `xs`
        """
        super().__init__()
        self.xs, self.ys = xs, ys

    def __len__(self):
        """Number of samples, taken from the length of `xs`."""
        return len(self.xs)

    def __getitem__(self, i):
        """Return the i-th (input, target) pair, each wrapped in a NumPy array."""
        sample_x = np.array(self.xs[i])
        sample_y = np.array(self.ys[i])
        return sample_x, sample_y

In [None]:
# Synthetic 1d regression data: y = true_w * x + true_b + noise
xs = np.linspace(0, 10, 500, dtype=np.float32)
true_w, true_b = 2.5, 1.0
# Draw the noise from a dedicated, seeded generator so the data are identical on
# every run, and centre it on zero: uniform noise on [0, 1) has mean 0.5, which
# would shift the best-fit intercept to true_b + 0.5 and make the later
# "difference ratio for bias" cells compare against the wrong target.
noise_rng = np.random.RandomState(0)
noise = noise_rng.rand(len(xs)) - 0.5
ys = (true_w*xs + true_b + noise).astype(np.float32)
hv.Curve((xs, ys)).opts(width=800)


In [None]:
ds = TableDS(xs, ys)

In [None]:

    
dl = DataLoader(ds, batch_size=2)
print(f'Iterations in an epoch: {len(dl)}')
x,y = next(iter(dl))
print(x.shape, y.shape)

In [None]:
# Helpers
def get_clean_model(set_seed=None):
    """Build a freshly initialised `nn.Linear(1, 1)` model and print its parameters.

    set_seed: when not None, the random generators are seeded with this value
        (CPU only) before the model is created, making the initial weights
        repeatable; when None the current generator state is used as-is.
    """
    should_seed = set_seed is not None
    if should_seed:
        random_seed(set_seed, use_cuda=False)

    fresh_model = nn.Linear(1, 1)
    initial_params = (fresh_model.weight.data, fresh_model.bias.data)
    print(f'Initial weights : {initial_params}')
    return fresh_model

def show_lr_generator(lr_gen, n_iters):
    """Return an `hv.Curve` of the learning rate for iterations 0 .. n_iters-1.

    lr_gen: callable mapping an iteration index to a learning rate
    n_iters: number of iterations to evaluate
    """
    iterations = np.arange(n_iters)
    rates = list(map(lr_gen, iterations))
    return hv.Curve((iterations, rates))

- Experiment 1

In [None]:
# Seed for model initialisation. It was never defined, so this cell (and every
# later cell calling get_clean_model(seed)) raised NameError on a fresh kernel.
# Using one value for all experiments makes each run start from the same weights.
seed = 0
model = get_clean_model(seed)
# The lr given here is only a placeholder: lr_range_test overwrites it on every iteration.
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

In [None]:
# lr schedule
n_cycles = 4 # number of passes over `dl` (epochs) to train for
n_iters = n_cycles * len(dl) # total number of training iterations
max_lr = 0.0001
divide_factor = 3.
min_lr = max_lr/divide_factor
stepsize = 1* len(dl) # one epoch, in iterations (a full triangle cycle = 2*stepsize = two epochs)

# Schedules compared in this experiment: triangular between min_lr and max_lr,
# and a constant rate held at min_lr.
tri_lr = TriangleLR(min_lr, max_lr, stepsize)
const_lr = ConstLR(min_lr)

In [None]:
ii, _, loss_tri_lr = lr_range_test(model, dl, optimizer, loss_fn, 
                                 tri_lr, n_iters, print_every=None);

In [None]:
# Compare with the constant learning rate scheduler (with lr at min_lr),
# starting again from a freshly initialised model and a new optimizer.
model = get_clean_model(seed)
optimizer = optim.SGD(model.parameters(), lr=0.01)
ii, _, loss_const_lr = lr_range_test(model, dl, optimizer, loss_fn, 
                                 const_lr, n_iters, print_every=None);

In [None]:
# show_lr_generator needs the number of iterations to evaluate; plot the
# schedule over the same n_iters the experiment was run for.
hv_lr = show_lr_generator(tri_lr, n_iters)

# Top: learning-rate schedule. Bottom: loss with the triangular schedule (blue)
# overlaid on the loss with the constant schedule (green).
layout = (
    hv_lr.opts(color='red', ylim=(min_lr, max_lr)) +
    hv.Curve(loss_tri_lr).opts(color='blue') * hv.Curve(loss_const_lr).opts(color='green')
)

layout.opts(
    opts.Overlay(shared_axes=False),
    opts.Curve(padding=0.1, width=1000, axiswise=True,shared_axes=False)
).cols(1)


In [None]:
print('Number of iterations: ', ii)
print('Trained weight and bias: ', model.weight.data.item(), model.bias.data.item())

In [None]:
print('Difference ratio for weight: ', abs(model.weight.data.item()-true_w)/true_w)
print('Difference ratio for bias: ', abs(model.bias.data.item()-true_b)/true_b)


Interesting that bias is harder to learn (or, takes longer to learn)


- Experiment 2
Set `stepsize` to 1/2 epoch. Equivalent to setting `cycle_len` to one epoch.


In [None]:
model = get_clean_model(seed)
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

In [None]:
n_cycles = 4 # range-test iteration counts in cycle
n_iters = n_cycles * len(dl)

In [None]:
# min_lr and max_lr are reused from Experiment 1
# (max_lr = 0.0001, divide_factor = 3., min_lr = max_lr/divide_factor)
stepsize = int(0.5* len(dl)) # half an epoch, in iterations (a full triangle cycle = one epoch)
tri_lr2 = TriangleLR(min_lr, max_lr, stepsize)
# NOTE(review): const_lr is rebuilt at max_lr here, but no cell visible in this
# experiment runs it; the plot below still uses loss_const_lr from Experiment 1 — confirm intent.
const_lr = ConstLR(max_lr)

In [None]:
ii, _, loss_tri_lr2 = lr_range_test(model, dl, optimizer, loss_fn, 
                                 tri_lr2, n_iters, print_every=None);

In [None]:
hv_lr = show_lr_generator(tri_lr2, n_iters)

layout = (
    hv_lr.opts(color='red', ylim=(min_lr, max_lr)) +
    (
        hv.Curve(loss_const_lr).opts(color='black',line_alpha=0.5, line_width=0.3) *
        hv.Curve(loss_tri_lr).opts(color='blue') * 
        hv.Curve(loss_tri_lr2).opts(color='green')
    )
)

layout.opts(
    opts.Overlay(shared_axes=False),
    opts.Curve(padding=0.1, width=1000, axiswise=True,shared_axes=False)
).cols(1)



In [None]:
print('Number of iterations: ', ii)
print('Trained weight and bias: ', model.weight.data.item(), model.bias.data.item())

In [None]:
print('Difference ratio for weight: ', abs(model.weight.data.item()-true_w)/true_w)
print('Difference ratio for bias: ', abs(model.bias.data.item()-true_b)/true_b)


- Experiment 3: even smaller stepsize

In [None]:
model = get_clean_model(seed)
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

In [None]:
# n_cycles = 4 # range-test iteration counts in cycle
# n_iters = n_cycles * len(dl)

In [None]:
# min_lr and max_lr are reused from Experiment 1
# (max_lr = 0.0001, divide_factor = 3., min_lr = max_lr/divide_factor)
stepsize = int( (1/4)*len(dl)) # a quarter of an epoch, in iterations (a full triangle cycle = half an epoch)
tri_lr3 = TriangleLR(min_lr, max_lr, stepsize)

In [None]:
ii, _, loss_tri_lr3 = lr_range_test(model, dl, optimizer, loss_fn, 
                                 tri_lr3, n_iters, print_every=None);

In [None]:
# show_lr_generator needs the number of iterations to evaluate; plot the
# schedule over the same n_iters the experiment was run for.
hv_lr = show_lr_generator(tri_lr3, n_iters)

# Top: learning-rate schedule of this experiment.
# Bottom: loss histories - constant lr (thin black), Experiment 1 (blue),
# Experiment 2 (green) and this experiment (yellow).
layout = (
    hv_lr.opts(color='red', ylim=(min_lr, max_lr)) +
    (
        hv.Curve(loss_const_lr).opts(color='black',line_alpha=0.5, line_width=0.3) *
        hv.Curve(loss_tri_lr).opts(color='blue') * 
        hv.Curve(loss_tri_lr2).opts(color='green') *
        hv.Curve(loss_tri_lr3).opts(color='yellow')
    )
)

layout.opts(
    opts.Overlay(shared_axes=False),
    opts.Curve(padding=0.1, width=1000, axiswise=True,shared_axes=False)
).cols(1)




In [None]:
print('Number of iterations: ', ii)
print('Trained weight and bias: ', model.weight.data.item(), model.bias.data.item())

In [None]:
print('Difference ratio for weight: ', abs(model.weight.data.item()-true_w)/true_w)
print('Difference ratio for bias: ', abs(model.bias.data.item()-true_b)/true_b)


- Let's make a function to run experiments


In [None]:
def run_experiment(n_iters, dl, lr_scheduler, seed=1, to_show=True):
    """
    Train a fresh one-unit linear model (i.e. a linear regression problem) for
    `n_iters` iterations under `lr_scheduler` and return the per-iteration losses.
    The model architecture is fixed.

    Args:
    - n_iters (int): number of training iterations to run
    - dl: dataloader yielding (x, y) batches
    - lr_scheduler: TriangleLR or ConstLR (any callable mapping an iteration
      index to a learning rate)
    - seed (None or int): random seed for the clean model (model weights)
        - None if randomness in initializing the model weights is desired
        - any int to set the seed
    - to_show (bool): when True, display the learning-rate schedule and the
      loss curve

    Returns:
    - list of loss values, one per iteration
    """
    model = get_clean_model(seed)
    optimizer = optim.SGD(model.parameters(), lr=0.0001) # this lr is always overridden by lr_scheduler
    loss_fn = nn.MSELoss()
    _, _, loss_lr = lr_range_test(model, dl, optimizer, loss_fn, lr_scheduler, n_iters)

    # Visualization
    if to_show:
        hv_lr = show_lr_generator(lr_scheduler, n_iters)

        # Only pin the y-range when the scheduler exposes its bounds: ConstLR has
        # no min_lr/max_lr attributes, so reading them unconditionally would
        # raise AttributeError for a scheduler this function claims to accept.
        lr_plot_opts = dict(color='red')
        if hasattr(lr_scheduler, 'min_lr') and hasattr(lr_scheduler, 'max_lr'):
            lr_plot_opts['ylim'] = (lr_scheduler.min_lr, lr_scheduler.max_lr)

        layout = (
            hv_lr.opts(**lr_plot_opts) +
            hv.Curve(loss_lr).opts(color='blue')
        )

        display(
            layout.opts(
                opts.Overlay(shared_axes=False),
                opts.Curve(padding=0.1, width=800, axiswise=True, shared_axes=False)
            ).cols(1)
        )

    return loss_lr

In [None]:
dl = DataLoader(ds, batch_size=1)
run_experiment(n_iters, dl, tri_lr, seed=0);

In [None]:
# Same schedule as the previous cell, but with batch size 2.
# The loader built here is dl_2; the call previously referenced an undefined `dl_16`.
dl_2 = DataLoader(ds, batch_size=2); print('epoch size: ', len(dl_2))
run_experiment(n_iters, dl_2, tri_lr, seed=0);

In [None]:
dl_2 = DataLoader(ds, batch_size=2); print('epoch size: ', len(dl_2))
tri_lr_2 = TriangleLR(2*min_lr, 2*max_lr, stepsize)
run_experiment(n_iters, dl_2, tri_lr_2, seed=0);

### Summary

Syncing the period of cyclic learning rate scheduler with the epoch (ie. how many iterations in an epoch) can be a good indicator on whether a single epoch is good enough to learn the mapping. Here, as we see, it is enough -- linear regression is a fairly easy task to be solved with stochastic gradient descent. 

More things to try
- what about if we set stepsize = 1/2 ep?
- what about if we change the batch size? 
    - rule of thumb is to increase lr as we increase the batch size.
    - there is also a paper saying the ratio of lr:batchsize is important for SGD to converge to a flatter minimum
        - it was a paper with Bengio, saw it in a Stack Overflow comment (TODO: add link)
            

## TODO: 
Modified: Oct 27, 2019


Summarize the effect of `stepsize` wrt the epoch length
- here one epoch = 500 iterations


## Experiment with `max_lr`
- What if I increase the max_lr to 1e-3?


In [None]:
model = nn.Linear(1, 1)
print(f'Initial weights : {model.weight.data, model.bias.data}')

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [None]:
loss_fn = nn.MSELoss()

In [None]:
n_cycles = 1 # range-test iteration counts in cycle
n_iters = n_cycles * len(dl)

In [None]:
max_lr = 0.01
divide_factor = 3.
min_lr = max_lr/divide_factor
stepsize = 2 * len(dl) # 2epochs in iteration unit
const_lr = ConstLR(max_lr)

In [None]:
ii_2, _, loss_list2 = lr_range_test(model, dl, optimizer, loss_fn, const_lr, n_iters, print_every=None);

In [None]:
%%opts Curve [shared_axes=False]
# NOTE(review): `loss_list` is not assigned in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel. The caption below describes the blue
# curve as the cyclic (triangle) run — presumably a triangle-LR loss history is meant
# to be plotted here; confirm which run and define it before this cell.
hv.Curve(loss_list).opts(width=1000) * hv.Curve(loss_list2).opts(width=1000, color='red')

^ red: constant learning rate, blue: cyclic (triangle) learning rate

In [None]:
ii_2, model.weight.data, model.bias.data

## Demo videos
- https://recordit.co/97dS4ayylC

## Learning rate and batch size

Hyperparameters in training a neural network are coupled with each other. If I change one parameter (e.g. batch size), then other parameters (e.g. learning rate) need to be adjusted. Another example is that if I change the learning rate of a momentum-based optimizer (e.g. Adam), it's recommended to adjust the momentum accordingly.  

- Decrease the momentum as the learning rate increases, following the triangle learning rate (linear increase, linear decrease) schedule: the intuition is that since we are continuously increasing the learning rate, we don't want the momentum to push the effective step size even further up (or down, when coming down)
    
- Reference: fastai implementation 

Let's see how learning rate and batch size affect each other in the simple 1d linear regression (ie. a `nn.Linear` layer with a single output unit)


In [None]:
bs = 10
dl = DataLoader(ds, batch_size=bs, num_workers=0)
print(f'Iterations in an epoch: {len(dl)}')

In [None]:
model = get_clean_model(set_seed=seed)
print(f'Initial weights : {model.weight.data, model.bias.data}')

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [None]:
loss_fn = nn.MSELoss()

In [None]:
n_cycles = 3 # range-test iteration counts in cycle
n_iters = n_cycles * len(dl)

In [None]:
max_lr = 0.01
divide_factor = 3.
min_lr = max_lr/divide_factor
stepsize = 2 * len(dl) # 2epochs in iteration unit
triangle_lr = TriangleLR(min_lr, max_lr, stepsize)

tri_lr = TriangleLR(min_lr, max_lr, stepsize)
print('batch size: ', dl.batch_size)

In [None]:
iter_count , _ , l3 = lr_range_test(model, dl, optimizer, loss_fn, tri_lr, n_iters, print_every=None);

In [None]:
model.weight.shape

In [None]:
x,y = next(iter(dl))

In [None]:
x.shape, y.shape