In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
#export
from exp.nb_05 import *
torch.set_num_threads(2)

## ConvNet

In [None]:
# Load the train/validation tensors and wrap each split in a Dataset
# (get_data and Dataset arrive via the star-import of exp.nb_05).
# NOTE(review): at this point the Datasets hold the raw, not yet normalized, inputs.
x_train,y_train,x_valid,y_valid = get_data()
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)

In [None]:
#export
def normalize_to(train, valid):
    "Normalize `train` and `valid` via `normalize`, both with the mean/std of `train`."
    mean = train.mean()
    std = train.std()
    train_n = normalize(train, mean, std)
    valid_n = normalize(valid, mean, std)
    return train_n, valid_n

In [None]:
# Normalize both splits with the training-set statistics, then rebuild the
# datasets. The Dataset objects created earlier wrap the raw tensors, so
# without rebuilding them the normalized inputs would never reach the
# DataBunch used for training (assumes `normalize` returns new tensors).
x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)

In [None]:
# bs is handed to get_dls below (presumably the batch size); nh is not used
# by any cell visible in this notebook.
nh,bs = 50,512
loss_func = F.cross_entropy

# Bundle the dataloaders returned by get_dls into a DataBunch. Later cells
# (get_runner, get_cnn_model calls) read `data` as a global.
data = DataBunch(*get_dls(train_ds, valid_ds, bs))

In [None]:
#export
class Lambda(nn.Module):
    "Module wrapper whose `forward` simply applies the stored callable `func`."
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        fn = self.func
        return fn(x)

def flatten(x):
    "View `x` as 2-D: dim 0 is kept, all remaining dims are collapsed into one."
    first = x.shape[0]
    return x.view(first, -1)

In [None]:
def mnist_resize(x):
    "View `x` as a stack of single-channel 28x28 images, shape (-1, 1, 28, 28)."
    image_shape = (1, 28, 28)
    return x.view(-1, *image_shape)

In [None]:
def get_cnn_model(data):
    "Fixed four-conv CNN for flat 28x28 inputs; the last conv emits `data.c` channels."
    # Spatial size per stride-2 conv is noted as input -> output.
    body = [
        Lambda(mnist_resize),                                        # -> (-1, 1, 28, 28)
        nn.Conv2d( 1,  8, 5, padding=2, stride=2), nn.ReLU(),        # 28 -> 14
        nn.Conv2d( 8, 16, 3, padding=1, stride=2), nn.ReLU(),        # 14 -> 7
        nn.Conv2d(16, 16, 3, padding=1, stride=2), nn.ReLU(),        #  7 -> 4
        nn.Conv2d(16, data.c, kernel_size=3, padding=1, stride=2),   #  4 -> 2, no activation
    ]
    # Average each channel map down to 1x1, then drop the trailing 1x1 dims.
    head = [nn.AdaptiveAvgPool2d(1), Lambda(flatten)]
    return nn.Sequential(*body, *head)

In [None]:
def get_runner(model, lr=0.6, cbs=None, loss_func = F.cross_entropy):
    "Return `(Learner, Runner)` for `model`: SGD at `lr` on the global `data`, accuracy stats plus `cbs`."
    sgd = optim.SGD(model.parameters(), lr=lr)
    learner = Learner(model, sgd, loss_func, data)
    # The stats callback always comes first; user callbacks are appended after it.
    callbacks = [AvgStatsCallback([accuracy])]
    callbacks.extend(listify(cbs))
    return learner, Runner(callbacks)

In [None]:
model = get_cnn_model(data)
learn,run = get_runner(model)

In [None]:
%time run.fit(1, learn)

train: [1.98849421875, tensor(0.3357)]
valid: [0.65527548828125, tensor(0.7967)]
CPU times: user 8min 18s, sys: 8min 6s, total: 16min 25s
Wall time: 17.9 s


## CUDA

In [None]:
#export
class CudaCallback(Callback):
    "Callback that calls `.cuda()` on the model at fit start and on every batch's xb/yb."
    def begin_fit(self, run):
        run.model.cuda()

    def begin_batch(self, run):
        xb, yb = run.xb, run.yb
        run.xb, run.yb = xb.cuda(), yb.cuda()

In [None]:
model = get_cnn_model(data)
learn,run = get_runner(model, cbs=CudaCallback())

In [None]:
%time run.fit(3, learn)

train: [2.18338203125, tensor(0.2367, device='cuda:0')]
valid: [2.3074763671875, tensor(0.1064, device='cuda:0')]
train: [2.2986975, tensor(0.1136, device='cuda:0')]
valid: [2.2859005859375, tensor(0.1064, device='cuda:0')]
train: [2.2883159375, tensor(0.1416, device='cuda:0')]
valid: [2.30157578125, tensor(0.1064, device='cuda:0')]
CPU times: user 4.16 s, sys: 932 ms, total: 5.09 s
Wall time: 5.12 s


## Refactor model

In [None]:
def conv2d(ni, nf, ks=3, stride=2, act=True):
    "Conv2d (`ni` -> `nf` channels, padding `ks//2`) optionally followed by a ReLU, as a Sequential."
    conv = nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride)
    if not act:
        return nn.Sequential(conv)
    return nn.Sequential(conv, nn.ReLU())

In [None]:
#export
class BatchTransformXCallback(Callback):
    "Callback that replaces each input batch `run.xb` with `tfm(run.xb)`."
    _order=2
    def __init__(self, tfm):
        self.tfm = tfm

    def begin_batch(self, run):
        transformed = self.tfm(run.xb)
        run.xb = transformed

def resize_tfm(*size):
    "Return a function that views its input as shape `(-1, *size)`."
    target = (-1,) + size
    def _inner(x):
        return x.view(*target)
    return _inner

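With the reshape moved out of the model into a batch-transform callback, and adaptive average pooling at the end, the model can now work on inputs of any size: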

In [None]:
#export
def get_cnn_model(data, nfs):
    """Build a stride-2 CNN from a list of hidden channel counts.

    `nfs` gives the output channels of each hidden conv; a 1-channel input and
    a final conv with `data.c` output channels are added around it. The first
    conv uses a 5x5 kernel and the rest 3x3, matching the hand-written model
    earlier in the notebook. The final conv gets no activation, so the pooled
    output can act as unbounded logits.

    Returns an `nn.Sequential` ending in `AdaptiveAvgPool2d(1)` + `flatten`,
    i.e. an output of shape (batch, data.c).
    """
    # Accept any sequence (list or tuple) without mutating the caller's object.
    chans = [1, *nfs, data.c]
    n_convs = len(chans) - 1
    last = n_convs - 1   # index of the final (logits) conv
    layers = [
        conv2d(chans[i], chans[i+1], 5 if i==0 else 3, act=i!=last)
        for i in range(n_convs)
    ]
    return nn.Sequential(
        *layers,
        nn.AdaptiveAvgPool2d(1),
        Lambda(flatten)
    )

In [None]:
nfs = [8,16,16]

In [None]:
# The refactored model no longer reshapes its input, so a batch transform
# callback views each xb as (-1, 1, 28, 28) before it reaches the model.
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, cbs=[CudaCallback(), BatchTransformXCallback(resize_tfm(1,28,28))])

In [None]:
run.fit(3, learn)

train: [1.8743634375, tensor(0.3463, device='cuda:0')]
valid: [0.64924033203125, tensor(0.8046, device='cuda:0')]
train: [0.575943828125, tensor(0.8127, device='cuda:0')]
valid: [0.432177783203125, tensor(0.8466, device='cuda:0')]
train: [0.4277487890625, tensor(0.8507, device='cuda:0')]
valid: [0.3820326904296875, tensor(0.8605, device='cuda:0')]


## Batchnorm

### Custom

In [None]:
def var_dims(x, dim):
    "Biased variance of `x` over the dims in `dim` (kept as size-1 dims), computed as E[x^2] - E[x]^2."
    # Number of elements reduced over = product of the reduced dim sizes.
    count = 1
    for d in dim:
        count = count * x.shape[d]
    mean = x.sum(dim, keepdim=True) / count
    mean_of_sq = x.pow(2).sum(dim, keepdim=True) / count
    return mean_of_sq - mean**2

In [None]:
class BatchNorm(nn.Module):
    """Hand-written batch norm over dims (0,2,3) of a 4-D activation.

    Keeps running per-channel `means`/`vars` as buffers and learns a
    per-channel scale (`mults`) and shift (`adds`). In training mode the
    running statistics are updated from the current batch first; `forward`
    then always normalizes with the running statistics.

    nf:  number of channels (dim 1 of the input).
    mom: weight kept on the old running statistic at each update.
    eps: added to the variance before the square root.
    """
    def __init__(self, nf, mom=0.9, eps=1e-5):
        super().__init__()
        # Our mom is (1-pytorch_mom), since pytorch is weird: mom is the weight
        # on the OLD running value, so mom=0.9 matches nn.BatchNorm2d's
        # default momentum=0.1.
        self.mom,self.eps = mom,eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))
        self.register_buffer('vars',  torch.ones (1,nf,1,1))
        self.register_buffer('means', torch.zeros(1,nf,1,1))

    def update_stats(self, x):
        "Blend the batch mean/variance of `x` into the running statistics in place."
        m = x.mean((0,2,3), keepdim=True)
        v = var_dims(x, (0,2,3)) # remove this once pytorch supports x.std(dim:tuple)
        # Cut the running stats off from the previous iteration's graph.
        self.means.detach_()
        self.vars.detach_()
        # lerp_(end, w) computes self + w*(end - self), so the weight on the new
        # batch statistic must be (1-mom) for `mom` of the old value to be kept.
        self.means.lerp_(m, 1-self.mom)
        self.vars.lerp_ (v, 1-self.mom)

    def forward(self, x):
        if self.training: self.update_stats(x)
        # (x - means) is a fresh tensor, so the in-place ops below never touch the input.
        x = (x-self.means).div_((self.vars+self.eps).sqrt())
        return x.mul_(self.mults).add_(self.adds)

In [None]:
def conv2d(ni, nf, ks=3, stride=2, act=True, bn=True):
    "Conv2d (padding `ks//2`) followed by an optional ReLU and an optional `BatchNorm(nf)`."
    # The conv bias is dropped when a batchnorm layer follows
    use_bias = not bn
    conv = nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=use_bias)
    extras = []
    if act: extras.append(nn.ReLU())
    if bn:  extras.append(BatchNorm(nf))
    return nn.Sequential(conv, *extras)

In [None]:
# get_cnn_model looks up `conv2d` when it is called, so this model is built
# with the conv2d redefined just above (conv + ReLU + custom BatchNorm).
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, lr=0.4, cbs=[CudaCallback(), BatchTransformXCallback(resize_tfm(1,28,28))])

In [None]:
%time run.fit(6, learn)

train: [0.0682016064453125, tensor(0.9835, device='cuda:0')]
valid: [0.07673294677734376, tensor(0.9793, device='cuda:0')]
train: [0.0568701123046875, tensor(0.9865, device='cuda:0')]
valid: [0.08219365234375, tensor(0.9775, device='cuda:0')]
train: [0.0498054931640625, tensor(0.9881, device='cuda:0')]
valid: [0.06638473510742188, tensor(0.9805, device='cuda:0')]
train: [0.0449722265625, tensor(0.9890, device='cuda:0')]
valid: [0.06726471557617188, tensor(0.9809, device='cuda:0')]
train: [0.04034088623046875, tensor(0.9906, device='cuda:0')]
valid: [0.05902564697265625, tensor(0.9831, device='cuda:0')]
train: [0.0355039892578125, tensor(0.9913, device='cuda:0')]
valid: [0.05814517211914062, tensor(0.9834, device='cuda:0')]
CPU times: user 7.59 s, sys: 176 ms, total: 7.76 s
Wall time: 7.79 s


### jit

In [None]:
#export
from torch.jit import ScriptModule, script_method, script
from typing import *

In [None]:
# TorchScript counterpart of var_dims: biased variance of `x` over the three
# dims in `dim`, computed as E[x^2] - E[x]^2 with the reduced dims kept.
@script
def var1(x, dim:Tuple[int,int,int]):
    # Number of elements reduced over (product of the reduced dim sizes)
    numel = 1
    for i in dim: numel = numel * x.shape[i]
    s1 = x.sum(dim, keepdim=True)/numel          # E[x]
    s2 = x.pow(2).sum(dim, keepdim=True)/numel   # E[x^2]
    return s2 - s1**2

In [None]:
class BatchNorm(ScriptModule):
    "TorchScript version of the custom batch norm above; `mom` is the weight kept on the old running stats."
    __constants__ = ['mom', 'eps']
    def __init__(self, nf, mom=0.9, eps=1e-5):
        super().__init__()
        self.mom,self.eps = mom,eps
        # Learnable per-channel scale and shift
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))
        # Running stats are plain tensor attributes created directly on the GPU
        # rather than registered buffers (the register_buffer variant is left
        # commented out below).
        # NOTE(review): as plain attributes they are presumably neither moved by
        # .cuda()/.to() nor saved in state_dict, and this requires CUDA at
        # construction time — confirm.
        self.vars = torch.ones (1,nf,1,1).cuda()
        self.means =torch.zeros(1,nf,1,1).cuda()
#         self.register_buffer('vars',  torch.ones (1,nf,1,1))
#         self.register_buffer('means', torch.zeros(1,nf,1,1))

    @script_method
    def update_stats(self, x):
        # Per-channel batch statistics over dims (0,2,3), reduced dims kept
        m = x.mean((0,2,3), keepdim=True)
        v = var1(x, (0,2,3)) # remove this once pytorch supports x.std(dim:tuple)
        self.means.detach_()
        self.vars.detach_()
        # running = mom*running + (1-mom)*batch; add(alpha, t) is the
        # scalar-first form that adds alpha*t
        self.means = self.means.mul(self.mom).add(1-self.mom, m)
        self.vars  = self.vars.mul (self.mom).add(1-self.mom, v)
#         self.means *= self.mom
#         self.means += (1-self.mom)*m
#         self.vars  *= self.mom
#         self.vars  += (1-self.mom)*v

    @script_method
    def forward(self, x):
        # In training the running stats are refreshed first; normalization
        # always uses the running stats, never the raw batch stats.
        if self.training: self.update_stats(x)
        x = (x-self.means).div((self.vars+self.eps).sqrt())
        return x.mul(self.mults).add(self.adds)

In [None]:
# conv2d refers to the name `BatchNorm`, which now resolves to the
# ScriptModule version defined above, so this model uses the jit batchnorm.
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, lr=0.4, cbs=[CudaCallback(), BatchTransformXCallback(resize_tfm(1,28,28))])

In [None]:
%time run.fit(6, learn)

train: [0.04733970703125, tensor(0.9881, device='cuda:0')]
valid: [0.06652548217773438, tensor(0.9810, device='cuda:0')]
train: [0.042660810546875, tensor(0.9893, device='cuda:0')]
valid: [0.06790380249023438, tensor(0.9806, device='cuda:0')]
train: [0.03964593994140625, tensor(0.9897, device='cuda:0')]
valid: [0.05937213134765625, tensor(0.9828, device='cuda:0')]
train: [0.0349819873046875, tensor(0.9918, device='cuda:0')]
valid: [0.06547821655273438, tensor(0.9830, device='cuda:0')]
train: [0.0333117529296875, tensor(0.9919, device='cuda:0')]
valid: [0.06497180786132813, tensor(0.9812, device='cuda:0')]
train: [0.03085428955078125, tensor(0.9923, device='cuda:0')]
valid: [0.06038035888671875, tensor(0.9831, device='cuda:0')]
CPU times: user 5.12 s, sys: 92 ms, total: 5.21 s
Wall time: 5.24 s


In [None]:
torch.__version__

'1.0.0.dev20190313'

### Builtin batchnorm

In [None]:
#export
def conv2d(ni, nf, ks=3, stride=2, act=True, bn=True):
    "Conv2d (padding `ks//2`) followed by an optional ReLU and an optional `nn.BatchNorm2d(nf)`."
    # The conv bias is dropped when a batchnorm layer follows.
    use_bias = not bn
    conv = nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=use_bias)
    extras = []
    if act: extras.append(nn.ReLU())
    if bn:  extras.append(nn.BatchNorm2d(nf))
    return nn.Sequential(conv, *extras)

In [None]:
# Rebuild the model so it picks up the conv2d defined just above, which uses
# PyTorch's built-in nn.BatchNorm2d.
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, lr=0.4, cbs=[CudaCallback(), BatchTransformXCallback(resize_tfm(1,28,28))])

In [None]:
%time run.fit(6, learn)

train: [0.049129072265625, tensor(0.9879, device='cuda:0')]
valid: [0.08254159545898437, tensor(0.9761, device='cuda:0')]
train: [0.0455973291015625, tensor(0.9886, device='cuda:0')]
valid: [0.06360925903320312, tensor(0.9835, device='cuda:0')]
train: [0.04050991943359375, tensor(0.9893, device='cuda:0')]
valid: [0.06290653686523437, tensor(0.9829, device='cuda:0')]
train: [0.03743682373046875, tensor(0.9906, device='cuda:0')]
valid: [0.060035589599609376, tensor(0.9838, device='cuda:0')]
train: [0.0341969921875, tensor(0.9914, device='cuda:0')]
valid: [0.057718115234375, tensor(0.9842, device='cuda:0')]
train: [0.031028759765625, tensor(0.9922, device='cuda:0')]
valid: [0.06487869262695313, tensor(0.9823, device='cuda:0')]
CPU times: user 5 s, sys: 68 ms, total: 5.07 s
Wall time: 5.09 s


### With scheduler

In [None]:
# Two schedule phases with weights 0.5 each.
# NOTE(review): presumably a linear ramp 0.2 -> 0.8 over the first half of
# training, then 0.8 -> 0.2 over the second half; combine_scheds and
# sched_lin come from exp.nb_05 — confirm there.
sched = combine_scheds([0.5, 0.5], [sched_lin(0.2, 0.8), sched_lin(0.8, 0.2)]) 

In [None]:
# Same setup as before, plus a Recorder and a ParamScheduler that applies
# `sched` to the 'lr' hyper-parameter (both come from exp.nb_05).
# NOTE(review): the lr=0.4 passed here is presumably overridden by the
# scheduler during training — confirm.
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, lr=0.4, cbs=[Recorder(), CudaCallback(),
    BatchTransformXCallback(resize_tfm(1,28,28)), ParamScheduler('lr', sched)])

In [None]:
run.fit(3, learn)

train: [0.682678984375, tensor(0.8528, device='cuda:0')]
valid: [0.3456614501953125, tensor(0.8981, device='cuda:0')]
train: [0.157797470703125, tensor(0.9604, device='cuda:0')]
valid: [0.164066064453125, tensor(0.9537, device='cuda:0')]
train: [0.08712751953125, tensor(0.9780, device='cuda:0')]
valid: [0.07777939453125, tensor(0.9789, device='cuda:0')]


## More layers

In [None]:
# Same two-phase shape as the earlier schedule, but the second phase ends at
# 0.01 instead of 0.2.
# NOTE(review): phase semantics are defined in exp.nb_05 — confirm there.
sched = combine_scheds([0.5, 0.5], [sched_lin(0.2, 0.8), sched_lin(0.8, 0.01)]) 

In [None]:
# Deeper variant: five hidden convs (8,16,32,64,32 channels) plus the final
# data.c-channel conv that get_cnn_model appends.
model = get_cnn_model(data, [8,16,32,64,32])
learn,run = get_runner(model, lr=0.4, cbs=[Recorder(), CudaCallback(),
    BatchTransformXCallback(resize_tfm(1,28,28)), ParamScheduler('lr', sched)])

In [None]:
run.fit(8, learn)

train: [0.498337421875, tensor(0.9000, device='cuda:0')]
valid: [0.2021446044921875, tensor(0.9479, device='cuda:0')]
train: [0.122512021484375, tensor(0.9708, device='cuda:0')]
valid: [0.127957421875, tensor(0.9635, device='cuda:0')]
train: [0.082052060546875, tensor(0.9785, device='cuda:0')]
valid: [0.07273471069335938, tensor(0.9799, device='cuda:0')]
train: [0.0615674462890625, tensor(0.9834, device='cuda:0')]
valid: [0.09457974853515624, tensor(0.9709, device='cuda:0')]
train: [0.042147236328125, tensor(0.9881, device='cuda:0')]
valid: [0.07584328002929687, tensor(0.9770, device='cuda:0')]
train: [0.025492666015625, tensor(0.9933, device='cuda:0')]
valid: [0.04980409545898438, tensor(0.9852, device='cuda:0')]
train: [0.0147454931640625, tensor(0.9970, device='cuda:0')]
valid: [0.04962567443847656, tensor(0.9873, device='cuda:0')]
train: [0.009472476806640625, tensor(0.9987, device='cuda:0')]
valid: [0.04268844909667969, tensor(0.9887, device='cuda:0')]


## Export

In [None]:
!./notebook2script.py 06_cuda_cnn_bn.ipynb

Converted 06_cuda_cnn_bn.ipynb to nb_06.py
