# Imagenet(te) training

In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
#export
from exp.nb_12 import *

## Gather the data

In [None]:
# Location of the 160px Imagenette data, as returned by `datasets.untar_data`
# (presumably downloaded and extracted on first use — confirm in `datasets`).
path = datasets.untar_data(datasets.URLs.IMAGENETTE_160)

In [None]:
# Alternative training transforms (random resized crop), kept for reference:
#tfms = [make_rgb, RandomResizedCrop(128,scale=(0.35,1)), to_byte_tensor, to_float_tensor, PilRandomFlip()]
# Training transforms actually used: tilted random crop followed by a random flip.
tfms = [make_rgb, PilTiltRandomCrop(128, 160, magnitude=0.2), to_byte_tensor, to_float_tensor, PilRandomFlip()]
il = ImageItemList.from_files(path, tfms=tfms)
# Split with `grandparent_splitter` (validation folder name 'val'), then label with `parent_labeler`.
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='val'))
ll = label_by_func(sd, parent_labeler)

# The validation items get their own transform list: `CenterCrop(128)` and no random flip.
ll.valid.x.tfms = [make_rgb, CenterCrop(128), to_byte_tensor, to_float_tensor]

In [None]:
# Batch size handed to `get_dls`; the learning rate further down is scaled by it.
bs=64

train_dl,valid_dl = get_dls(ll.train,ll.valid,bs, num_workers=4)
# NOTE(review): 3 and 10 are presumably the input-channel count and the number
# of classes — confirm against the `DataBunch` signature.
data = DataBunch(train_dl, valid_dl, 3, 10)

### Model

In [None]:
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 `nn.Conv2d` with padding 1 and the given stride."""
    settings = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **settings)


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm layers.

    The first conv carries `stride`; `downsample`, when given, is applied to
    the block input before it is added back to the conv path.
    """
    # Ratio between the block's output channels and `planes`.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Main path: conv-bn-relu, then conv-bn (no activation before the sum).
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut: the raw input unless a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 conv, 3x3 conv (strided), 1x1 conv.

    The last conv widens the output to `planes * expansion` channels.
    `downsample`, when given, is applied to the block input before it is
    added back to the conv path.
    """
    # Ratio between the block's output channels and `planes`.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * self.expansion
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Main path: two conv-bn-relu stages, then conv-bn without activation.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Shortcut: the raw input unless a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

def conv2d(ni, nf, stride):
    """Return a bias-free 3x3 conv (padding 1) followed by batch norm and in-place ReLU."""
    stem_conv = nn.Conv2d(ni, nf, kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Sequential(stem_conv, nn.BatchNorm2d(nf), nn.ReLU(inplace=True))

class XResNet(nn.Module):
    """ResNet-style classifier with a three-conv stem and average-pooled shortcuts.

    `block` is the residual block class (it must expose an `expansion`
    attribute), `layers` holds the number of blocks for each of the four
    stages, and `num_classes` sizes the final linear layer.
    """

    def __init__(self, block, layers, num_classes=1000):
        super().__init__()
        # Running input-channel count, read and updated by `_make_layer`.
        self.inplanes = 64

        # Stem: 3 -> 32 (stride 2) -> 32 -> 64 channels, then a stride-2 max pool.
        self.conv1 = conv2d(3, 32, 2)
        self.conv2 = conv2d(32, 32, 1)
        self.conv3 = conv2d(32, 64, 1)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; all but the first start with a stride-2 block.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Head: global average pool, then a linear classifier.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Pass 1: Kaiming-normal convs, batch norms reset to weight 1 / bias 0.
        for mod in self.modules():
            if isinstance(mod, nn.BatchNorm2d):
                nn.init.constant_(mod.weight, 1)
                nn.init.constant_(mod.bias, 0)
            elif isinstance(mod, nn.Conv2d):
                nn.init.kaiming_normal_(mod.weight, mode='fan_out', nonlinearity='relu')

        # Pass 2 (kept separate so pass 1 cannot overwrite it): zero the scale of
        # the last batch norm in every residual block and draw the classifier
        # weights from N(0, 0.01).
        for mod in self.modules():
            if isinstance(mod, BasicBlock):
                mod.bn2.weight = nn.Parameter(torch.zeros_like(mod.bn2.weight))
            elif isinstance(mod, Bottleneck):
                mod.bn3.weight = nn.Parameter(torch.zeros_like(mod.bn3.weight))
            elif isinstance(mod, nn.Linear):
                mod.weight.data.normal_(0, 0.01)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of `blocks` residual blocks and advance `self.inplanes`."""
        out_planes = planes * block.expansion

        # A projection shortcut is needed whenever the first block changes the
        # resolution or the channel count.
        shortcut = None
        if stride != 1 or self.inplanes != out_planes:
            path = [nn.AvgPool2d(kernel_size=2, stride=2)] if stride == 2 else []
            path.append(nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=1, bias=False))
            path.append(nn.BatchNorm2d(out_planes))
            shortcut = nn.Sequential(*path)

        # Only the first block carries the stride and the shortcut.
        stage = [block(self.inplanes, planes, stride, shortcut)]
        self.inplanes = out_planes
        for _ in range(1, blocks):
            stage.append(block(self.inplanes, planes))
        return nn.Sequential(*stage)

    def forward(self, x):
        feature_stages = (self.conv1, self.conv2, self.conv3, self.maxpool,
                          self.layer1, self.layer2, self.layer3, self.layer4,
                          self.avgpool)
        for stage in feature_stages:
            x = stage(x)

        # Flatten everything but the batch dimension before the classifier.
        flat = x.view(x.size(0), -1)
        return self.fc(flat)

In [None]:
def xresnet34(**kwargs):
    """Build an `XResNet` from `BasicBlock` with stage depths [3, 4, 6, 3]."""
    depths = [3, 4, 6, 3]
    return XResNet(BasicBlock, depths, **kwargs)
def xresnet50(**kwargs):
    """Build an `XResNet` from `Bottleneck` with stage depths [3, 4, 6, 3]."""
    depths = [3, 4, 6, 3]
    return XResNet(Bottleneck, depths, **kwargs)

### XResNet

In [None]:
# Single in-place ReLU module, shared by `conv_layer` and `ResBlock.forward`.
act_fn = nn.ReLU(inplace=True)
class NoOp(nn.Module):
    """Identity module: `forward` hands its input back untouched."""

    def forward(self, x):
        return x

class Flatten(nn.Module):
    """Collapse every dimension after the first into one via `view`."""

    def forward(self, x):
        batch = x.size(0)
        return x.view(batch, -1)

def conv(ni, nf, ks=3, stride=1, bias=False):
    """Return an `nn.Conv2d` with a `ks` x `ks` kernel and padding of `ks // 2`."""
    half_kernel = ks // 2
    return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=half_kernel, bias=bias)

In [None]:
def init_cnn(m, a=0):
    """Recursively initialise `m` and all of its descendants.

    Any module with a non-None `bias` gets that bias set to 0; `nn.Conv2d`
    and `nn.Linear` weights are drawn with `kaiming_normal_` using slope `a`.
    """
    bias = getattr(m, 'bias', None)
    if bias is not None:
        nn.init.constant_(bias, 0)
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight, a=a)
    for child in m.children():
        init_cnn(child, a)

In [None]:
def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    """Return `conv` + batch norm, optionally followed by the shared `act_fn`.

    The batch-norm weight starts at 0 when `zero_bn` is set and at 1 otherwise.
    """
    norm = nn.BatchNorm2d(nf)
    initial_scale = 0. if zero_bn else 1.
    nn.init.constant_(norm.weight, initial_scale)
    parts = [conv(ni, nf, ks, stride=stride), norm]
    if act:
        parts.append(act_fn)
    return nn.Sequential(*parts)

In [None]:
class ResBlock(nn.Module):
    """Residual block covering both the basic and the bottleneck layout.

    `ni` and `nh` are given before expansion; the block consumes
    `ni * expansion` channels and produces `nh * expansion` channels.
    With `expansion == 1` the conv path is two 3x3 conv layers, otherwise it
    is 1x1 -> 3x3 -> 1x1. `stride` is carried by the 3x3 conv.
    """
    def __init__(self, expansion, ni, nh, stride=1):
        super().__init__()
        nf,ni = nh*expansion,ni*expansion
        # The leading NoOp keeps the Sequential indices the same for both layouts.
        self.convs = nn.Sequential(
            NoOp() if expansion==1 else conv_layer(ni, nh, 1),
            conv_layer(ni if expansion==1 else nh, nh, stride=stride),
            conv_layer(nh, nf, 3 if expansion==1 else 1, zero_bn=True, act=False))
        # Projection shortcut is conv + batch norm only: the activation must be
        # applied once, after the residual sum, not on the shortcut itself.
        self.idconv = NoOp() if ni==nf else conv_layer(ni, nf, 1, act=False)
        # ceil_mode makes the pooled size agree with the strided 3x3 conv
        # (padding 1) on odd spatial sizes, so the two paths can be added.
        self.pool = NoOp() if stride==1 else nn.AvgPool2d(2, ceil_mode=True)

    def forward(self, x):
        # Pool first so the 1x1 projection runs on the reduced resolution.
        return act_fn(self.convs(x) + self.idconv(self.pool(x)))

In [None]:
class XResNet(nn.Sequential):
    """XResNet assembled as a flat `nn.Sequential`.

    Args:
        expansion: channel multiplier handed to every `ResBlock`.
        layers: number of `ResBlock`s in each stage (at most four entries).
        num_classes: output size of the final linear layer.
        c_in: number of input channels fed to the stem (defaults to 3).
    """
    def __init__(self, expansion, layers, num_classes=1000, c_in=3):
        # Pre-expansion widths; the first entry is chosen so that
        # block_szs[0] * expansion equals the 64 channels leaving the stem.
        block_szs = [64//expansion,64,128,256,512]
        # Only the first stage keeps the resolution; later stages use stride 2.
        blocks = [self._make_layer(expansion, block_szs[i], block_szs[i+1], l, 1 if i==0 else 2)
                  for i,l in enumerate(layers)]
        super().__init__(
            conv_layer(c_in, 16, stride=2), conv_layer(16, 32), conv_layer(32, 64),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            *blocks,
            nn.AdaptiveAvgPool2d(1), Flatten(),
            nn.Linear(block_szs[-1]*expansion, num_classes),
        )
        init_cnn(self)

    def _make_layer(self, expansion, ni, nf, blocks, stride):
        """Return one stage of `blocks` ResBlocks; only the first takes `ni` and `stride`."""
        return nn.Sequential(
            *[ResBlock(expansion, ni if i==0 else nf, nf, stride if i==0 else 1)
              for i in range(blocks)])

In [None]:
def xresnet34(**kwargs):
    """Build an `XResNet` with expansion 1 and stage depths [3, 4, 6, 3]."""
    depths = [3, 4, 6, 3]
    return XResNet(1, depths, **kwargs)
def xresnet50(**kwargs):
    """Build an `XResNet` with expansion 4 and stage depths [3, 4, 6, 3]."""
    depths = [3, 4, 6, 3]
    return XResNet(4, depths, **kwargs)

## Train

In [None]:
from torch.nn.parallel import DataParallel

In [None]:
def get_learner(arch, data, lr, opt_func, loss_func=F.cross_entropy, cb_funcs=None):
    """Instantiate a model by calling `arch()` and wrap it in a `Learner`."""
    model = arch()
    learner_kwargs = dict(lr=lr, cb_funcs=cb_funcs, opt_func=opt_func)
    return Learner(model, data, loss_func, **learner_kwargs)

In [None]:
# Peak learning rate: 4e-3 at batch size 256, scaled proportionally to `bs`.
lr = 4e-3 * bs/256
# Learning rate goes lr/10 -> lr, then lr -> 0; momentum goes 0.95 -> 0.85 -> 0.95.
# NOTE(review): [0.5, 0.5] presumably gives each phase half of training — confirm in `combine_scheds`.
sched_lr  = combine_scheds([0.5,0.5], [sched_cos(lr/10.,lr), sched_cos(lr, 0)])
sched_mom = combine_scheds([0.5,0.5], [sched_cos(0.95,0.85), sched_cos(0.85, 0.95)])

In [None]:
# Callback classes / factories; passed to `get_learner` as `cb_funcs`.
cbfs = [partial(AvgStatsCallback,accuracy),
        ProgressCallback,
        CudaCallback,
        partial(BatchTransformXCallback, norm_imagenette),
        partial(MixUp, alpha=0.2), 
        partial(ParamScheduler, 'lr', sched_lr),
        partial(ParamScheduler, 'mom', sched_mom)]

In [None]:
# Statistics objects handed to the optimizer through `opt_func`.
stats = [AverageGrad(dampening=True), AverageSqrGrad(), StepCount()]
loss_func = LabelSmoothingCrossEntropy()
# Model factory: xresnet34 with a 10-way output layer.
arch = partial(xresnet34, num_classes=10)

In [None]:
# Optimizer factory: `StatefulOptimizer` driven by `AdamStep` and the `stats` list.
opt_func = partial(StatefulOptimizer, steppers=AdamStep(), stats=stats,
               mom=0.9, mom_sqr=0.99, eps=1e-7)

In [None]:
# Build the Learner from the architecture, data, callbacks, optimizer and loss configured so far.
learn = get_learner(arch, data, lr, cb_funcs=cbfs, opt_func=opt_func, loss_func=loss_func)

In [None]:
#export
def get_batch(dl, learn):
    """Return the first batch of `dl` after `learn` has processed it.

    The batch is stored on `learn.xb` / `learn.yb`, then `do_begin_fit(0)`,
    the 'begin_batch' event and the 'after_fit' event are triggered in that
    order, and whatever `learn.xb` / `learn.yb` hold afterwards is returned.
    """
    first_batch = next(iter(dl))
    learn.xb, learn.yb = first_batch
    learn.do_begin_fit(0)
    for event in ('begin_batch', 'after_fit'):
        learn(event)
    return learn.xb, learn.yb

In [None]:
#export
def model_summary(model, find_all=False):
    """Print the output shape of each hooked sub-module of `model` for one batch.

    With `find_all` the hooks go on every module selected by
    `find_modules(model, is_lin_layer)`; otherwise on the direct children of
    `model`.

    NOTE: the batch comes from the module-level `data` and `learn` objects,
    which must exist before this function is called.
    """
    xb, _ = get_batch(data.valid_dl, learn)
    mods = find_modules(model, is_lin_layer) if find_all else model.children()
    print_shape = lambda hook,mod,inp,out: print(out.shape)
    # Run the model that was hooked; calling `learn.model` here would print
    # nothing whenever `model` is not `learn.model`.
    with Hooks(mods, print_shape): model(xb)

In [None]:
# Put the model on the GPU, then print the output shape of each top-level child.
learn.model = learn.model.cuda()
model_summary(learn.model)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time


torch.Size([128, 16, 64, 64])
torch.Size([128, 32, 64, 64])
torch.Size([128, 64, 64, 64])
torch.Size([128, 64, 32, 32])
torch.Size([128, 64, 32, 32])
torch.Size([128, 128, 16, 16])
torch.Size([128, 256, 8, 8])
torch.Size([128, 512, 4, 4])
torch.Size([128, 512, 1, 1])
torch.Size([128, 512])
torch.Size([128, 10])


In [None]:
# Train; the recorded output below shows five epochs (0-4).
learn.fit(5)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.929024,0.404839,1.418647,0.62,00:16
1,1.535243,0.619823,1.336317,0.65,00:16
2,1.411618,0.677912,1.600393,0.59,00:16
3,1.295553,0.733752,0.983768,0.828,00:16
4,1.1838,0.792384,0.910198,0.854,00:16


## Export

In [None]:
!./notebook2script.py 13_train_imagenette.ipynb

Converted 13_train_imagenette.ipynb to exp/nb_13.py
