In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
#export
from exp.nb_07 import *

## ConvNet

Getting the MNIST data and a CNN

In [None]:
# Load the dataset (MNIST, per the heading above), normalize both splits and wrap them
# in datasets / dataloaders.
# NOTE(review): `normalize_to` presumably normalizes both splits with the training-set
# statistics — confirm in nb_07.
x_train,y_train,x_valid,y_valid = get_data()

x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)

# `bs` is passed to `get_dls` below; `nh` is not used by any cell visible in this notebook.
nh,bs = 50,512
# Largest label + 1, i.e. the number of classes assuming labels start at 0.
c = y_train.max().item()+1
loss_func = F.cross_entropy

data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

In [None]:
# Callback factories handed to every learner/runner created below via `cbs=cbfs`.
# NOTE(review): `view_tfm(1,28,28)` presumably builds a transform that reshapes each
# input batch to 1x28x28 images — confirm in nb_07.
mnist_view = view_tfm(1,28,28)
cbfs = [Recorder,
        partial(AvgStatsCallback,accuracy),
        CudaCallback,
        partial(BatchTransformXCallback, mnist_view)]

In [None]:
# Output channel counts of the successive conv layers (they match the `Conv2d` shapes
# printed further down when the model's ConvLayers are listed).
nfs = [8,16,32,64,64]

In [None]:
class ConvLayer(nn.Module):
    """A `nn.Conv2d` followed by a `GeneralRelu`.

    The layer exposes `weight` and a settable `bias` so that initialization code
    can rescale / shift it without knowing its internals. Note that `bias` is not
    the convolution's own bias: it is the negated `sub` attribute of the activation.
    """

    def __init__(self, ni, nf, ks=3, stride=2, sub=0., **kwargs):
        super().__init__()
        # Padding of `ks//2` on each side; the conv keeps its own learnable bias too.
        self.conv = nn.Conv2d(ni, nf, kernel_size=ks, stride=stride,
                              padding=ks//2, bias=True)
        # Extra keyword arguments are forwarded untouched to the activation.
        self.relu = GeneralRelu(sub=sub, **kwargs)

    def forward(self, x):
        "Apply the convolution, then the activation."
        pre_activation = self.conv(x)
        return self.relu(pre_activation)

    @property
    def weight(self):
        "The weight tensor of the underlying convolution."
        return self.conv.weight

    @property
    def bias(self):
        "Effective bias of the layer, stored as `-relu.sub`."
        return -self.relu.sub

    @bias.setter
    def bias(self, v):
        # Sign flip mirrors the getter, so `layer.bias -= x` increases `relu.sub` by `x`.
        self.relu.sub = -v

In [None]:
# Build a fresh learner/runner pair whose model is made of `ConvLayer`s.
# NOTE(review): 0.6 is presumably the learning rate — confirm against `get_learn_run` in nb_07.
learn,run = get_learn_run(nfs, data, 0.6, ConvLayer, cbs=cbfs)

Now we're going to look at [All You Need is a Good Init](https://arxiv.org/pdf/1511.06422.pdf), which introduces *Layer-wise Sequential Unit-Variance* (*LSUV*). We initialize our neural net with the usual technique, then we pass a batch through the model and check the outputs of the linear and convolutional layers. We can then rescale the weights according to the standard deviation we observe on the activations, and subtract the mean we observe from the initial bias. That way we will have activations that stay normalized.

We repeat this process until we are satisfied with the mean/variance we observe.

Let's start by looking at a baseline:

In [None]:
# Baseline: train with the default initialization (the output shows two train/valid rounds).
run.fit(2, learn)

train: [1.07668421875, tensor(0.6470, device='cuda:0')]
valid: [0.15817132568359374, tensor(0.9531, device='cuda:0')]
train: [0.159655205078125, tensor(0.9503, device='cuda:0')]
valid: [0.10942960205078126, tensor(0.9659, device='cuda:0')]


Now we recreate our model and we'll try again with LSUV:

In [None]:
# Recreate the learner/runner with the same arguments as the baseline so the
# LSUV experiment starts from a freshly built model.
learn,run = get_learn_run(nfs, data, 0.6, ConvLayer, cbs=cbfs)

In [None]:
#export
def get_batch(dl, run):
    """Fetch the first batch of `dl` and push it through the `begin_batch` callbacks.

    The batch is stored on `run` as `xb`/`yb`, every callback in `run.cbs` is
    attached to `run`, then the `begin_batch` event is fired. The returned pair is
    re-read from `run` afterwards, so any change made by the callbacks is reflected.
    """
    first_batch = next(iter(dl))
    run.xb, run.yb = first_batch
    for callback in run.cbs:
        callback.set_runner(run)
    run('begin_batch')
    return run.xb, run.yb

In [None]:
# One training batch, after the `begin_batch` callbacks have processed it.
xb,yb = get_batch(data.train_dl, run)

We only want the outputs of convolutional or linear layers. To find them, we need a recursive function. We can use `sum(list, [])` to concatenate the lists the function finds.

In [None]:
#export
def find_modules(m, cond):
    """Recursively collect the modules under `m` (inclusive) for which `cond` is true.

    The search stops descending as soon as a module matches: the children of a
    matching module are not inspected. Results are returned as a flat list.
    """
    if cond(m):
        return [m]
    # One list of matches per child; `sum(..., [])` concatenates them into one list.
    matches_per_child = [find_modules(child, cond) for child in m.children()]
    return sum(matches_per_child, [])

def is_lin_layer(l):
    """Return `True` if `l` is a convolutional (1d/2d/3d) or fully connected layer.

    `nn.ReLU` is deliberately NOT part of the tuple: it is an activation, not a
    linear layer, and matching it would make callers that look for linear/conv
    layers (e.g. through `find_modules`) pick up activations as well.
    """
    lin_layers = (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)
    return isinstance(l, lin_layers)

In [None]:
# Collect every `ConvLayer` of the model; these expose the `bias`/`weight` attributes
# that `lsuv_module` adjusts further down.
mods = find_modules(learn.model, lambda o: isinstance(o,ConvLayer))

In [None]:
mods

[ConvLayer(
   (conv): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 )]

In [None]:
def append_stat(hook, mod, inp, outp):
    """Hook function: store the mean and std of `outp` on `hook` as Python floats.

    `mod` and `inp` are accepted but unused. Each call overwrites
    `hook.mean` / `hook.std`, so only the latest forward pass is remembered.
    """
    activations = outp.data
    mean_val = activations.mean().item()
    std_val = activations.std().item()
    hook.mean, hook.std = mean_val, std_val

In [None]:
# Module-level handle on the model, moved to the GPU. `lsuv_module` below runs its
# forward passes through this global `mdl`.
mdl = learn.model.cuda()

In [None]:
# Attach `append_stat` to each ConvLayer, run one forward pass on the batch, then print
# the mean/std recorded for each layer's output.
# NOTE(review): `Hooks` presumably removes the hooks when the `with` block exits — confirm in nb_06/nb_07.
with Hooks(mods, append_stat) as hooks:
    mdl(xb)
    for hook in hooks: print(hook.mean,hook.std)

0.3913503587245941 0.79465651512146
0.3963846266269684 0.7818126678466797
0.30636632442474365 0.6088879108428955
0.3001363277435303 0.523555338382721
0.1954185962677002 0.30904772877693176


The idea is then to change the bias and weights accordingly to make the activations have a mean closer to 0 and a std closer to 1.

In [None]:
#export
def lsuv_module(m, xb, model=None):
    """Apply LSUV to a single module `m` using the batch `xb`.

    The output statistics of `m` are recorded by a hook running `append_stat`.
    First the observed mean is repeatedly subtracted from `m.bias` until it is
    within 1e-3 of 0, then `m.weight.data` is repeatedly divided in place by the
    observed std until the std is within 1e-3 of 1.

    m:     module to initialize; needs a settable `bias` and a `weight` tensor
           (e.g. `ConvLayer`).
    xb:    input batch fed to the whole model.
    model: model that contains `m`. Defaults to the module-level `mdl`, which is
           what the original two-argument call relied on; pass it explicitly when
           this function is used outside the notebook.

    Returns the `(mean, std)` recorded by the last forward pass. Because the
    weights are rescaled after the mean has been centered, the returned mean is
    generally not 0 any more. Neither loop has an iteration cap.
    """
    # Only look up the global when no model is given, so an explicit `model`
    # works even where `mdl` is not defined.
    model = mdl if model is None else model
    h = Hook(m, append_stat)
    try:
        # `model(xb) is not None` is always true for a tensor output; it is there to
        # refresh `h.mean` / `h.std` with a forward pass before each check.
        while model(xb) is not None and abs(h.mean)  > 1e-3: m.bias -= h.mean
        while model(xb) is not None and abs(h.std-1) > 1e-3: m.weight.data /= h.std
    finally:
        # Always detach the hook, even if a forward pass raises.
        h.remove()
    return h.mean,h.std

In [None]:
# Run LSUV on each ConvLayer in turn (in the order found in the model) and print the
# (mean, std) each call returns.
for m in mods: print(lsuv_module(m, xb))

(0.10112699866294861, 0.9999999403953552)
(0.04836009442806244, 0.9999999403953552)
(0.13652455806732178, 1.000000238418579)
(0.13894684612751007, 0.9999998211860657)
(0.3087792992591858, 1.0000001192092896)


Training then starts from a better footing.

In [None]:
%time run.fit(2, learn)

train: [0.42438078125, tensor(0.8629, device='cuda:0')]
valid: [0.14604696044921875, tensor(0.9548, device='cuda:0')]
train: [0.128675537109375, tensor(0.9608, device='cuda:0')]
valid: [0.09168212280273437, tensor(0.9733, device='cuda:0')]
CPU times: user 4.09 s, sys: 504 ms, total: 4.6 s
Wall time: 4.61 s


LSUV is particularly useful for more complex and deeper architectures that are hard to initialize to get unit variance at the last layer.

## Export

In [None]:
!python notebook2script.py 07a_lsuv.ipynb

Converted 07a_lsuv.ipynb to exp/nb_07a.py
