In [1]:
from fastai2.basics import *
from fastai2.vision.all import *
from fastai2.callback.all import *
from fastai2.distributed import *
from fastprogress import fastprogress
from torchvision.models import *
from fastai2.vision.models.xresnet import *
from fastai2.callback.mixup import *
from fastscript import *
import kornia
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from collections import OrderedDict
from torch import Tensor
from torch.jit.annotations import List

torch.backends.cudnn.benchmark = True
fastprogress.MAX_COLS = 80

def get_dbunch(size, woof, bs, sh=0., workers=None):
    """Download Imagenette/Imagewoof and wrap it in a fastai data bunch.

    size:    crop size given to `RandomResizedCrop`; values <= 224 select the
             `_320` dataset URL, larger values the full dataset URL.
    woof:    truthy for Imagewoof, falsy for Imagenette.
    bs:      batch size.
    sh:      forwarded as `sh` to `RandomErasing`; a falsy value (the default)
             disables the batch transform entirely.
    workers: dataloader worker count; defaults to `min(8, num_cpus())`.
    """
    use_small = size <= 224
    if woof:
        url = URLs.IMAGEWOOF_320 if use_small else URLs.IMAGEWOOF
    else:
        url = URLs.IMAGENETTE_320 if use_small else URLs.IMAGENETTE
    source = untar_data(url)

    if workers is None:
        workers = min(8, num_cpus())

    # Train/valid split comes from the grandparent folder name ('val' = validation),
    # labels from the parent folder name.
    dblock = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        splitter=GrandparentSplitter(valid_name='val'),
        get_items=get_image_files,
        get_y=parent_label,
    )
    per_item = [RandomResizedCrop(size, min_scale=0.35), FlipItem(0.5)]
    per_batch = RandomErasing(p=0.9, max_count=3, sh=sh) if sh else None
    return dblock.databunch(
        source, path=source, bs=bs, num_workers=workers,
        item_tfms=per_item, batch_tfms=per_batch,
    )

In [15]:
from torch.nn.utils import weight_norm, spectral_norm
def _conv1d_spect(ni:int, no:int, ks:int=1, stride:int=1, padding:int=0, bias:bool=False):
    "Build a Kaiming-initialised `nn.Conv1d` (zero bias, if any) wrapped in spectral normalization."
    layer = nn.Conv1d(ni, no, ks, stride=stride, padding=padding, bias=bias)
    nn.init.kaiming_normal_(layer.weight)
    # `nn.Conv1d` only creates a bias parameter when `bias` is truthy.
    if layer.bias is not None:
        nn.init.zeros_(layer.bias)
    return spectral_norm(layer)

class SimpleSelfAttention(Module):
    """Channel-wise self-attention with a learned residual gate.

    The input is flattened to (batch, n_in, N), batch-normalised, and used to
    form a (batch, n_in, n_in) matrix `xbn @ xbn^T`, clamped to [-10, 10].
    That matrix multiplies a spectrally-normalised 1D convolution of the same
    normalised input, and the product is added back to the flattened input
    scaled by `tanh(gamma)`. `gamma` starts at 0, so the layer is initially an
    identity mapping. The output has the same shape as the input.

    n_in: number of channels (size of dim 1 of the input).
    ks:   kernel size of the 1D convolution (padding is `ks//2`).
    sym:  stored on the instance but not used anywhere in this class.

    NOTE(review): relies on `Module` (from the fastai star imports) to run
    `nn.Module.__init__`, since no explicit `super().__init__()` call is made
    here -- confirm against the installed fastai version.
    """
    def __init__(self, n_in:int, ks=1, sym=False):
        self.sym,self.n_in = sym,n_in
        self.bn = nn.BatchNorm1d(n_in)
        # Small initial scale / zero shift for the normalised activations.
        self.bn.weight.data.fill_(0.1)
        self.bn.bias.data.fill_(0)
        self.conv = _conv1d_spect(n_in, n_in, ks, padding=ks//2, bias=False)
        # Residual gate; tanh(0) == 0 so the attention branch is off at init.
        self.gamma = nn.Parameter(tensor([0.]))

    def forward(self,x):
        size = x.size()
        # Collapse every dim after (batch, channel) into one axis.
        x = x.view(*size[:2],-1)
        xbn = self.bn(x)
        convx = self.conv(xbn)
        # (B, C, N) @ (B, N, C) -> (B, C, C); clamped in place to bound the magnitudes.
        xxT = torch.bmm(xbn,xbn.permute(0,2,1).contiguous()).clamp_(-10,10)
        # `convx` is already (B, C, N'), so no reshape is needed before the product.
        o = torch.bmm(xxT, convx)
        # `torch.tanh` replaces the deprecated `F.tanh`.
        o = torch.tanh(self.gamma) * o + x
        return o.view(*size).contiguous()
    
def SEModule(ch, reduction, act_cls=defaults.activation):
    """Squeeze-and-excitation gate assembled as a `SequentialEx`.

    Global average pool -> 1x1 `ConvLayer` down to `nf` channels (activation
    `act_cls`) -> 1x1 `ConvLayer` back to `ch` channels with a sigmoid ->
    `ProdLayer`. `nf` is `ch // reduction` rounded up to a multiple of 8.
    Presumably `ProdLayer` multiplies the gate with the block input kept by
    `SequentialEx` -- confirm against fastai.
    """
    squeezed = math.ceil(ch//reduction/8)*8
    stages = [
        nn.AdaptiveAvgPool2d(1),
        ConvLayer(ch, squeezed, ks=1, norm_type=None, act_cls=act_cls),
        ConvLayer(squeezed, ch, ks=1, norm_type=None, act_cls=nn.Sigmoid),
        ProdLayer(),
    ]
    return SequentialEx(*stages)

class _Transition(nn.Sequential):
    """Transition between dense blocks: BatchNorm -> MishJit -> 1x1 conv -> MaxBlurPool.

    The sub-module names ('norm', 'relu', 'conv', 'pool') are kept as-is; note
    that 'relu' actually holds a `MishJit` activation. The conv weight gets an
    orthogonal init and the BatchNorm starts with weight 1 and bias 1e-3.
    """
    def __init__(self, num_input_features, num_output_features):
        super().__init__(OrderedDict([
            ('norm', nn.BatchNorm2d(num_input_features)),
            ('relu', MishJit()),
            ('conv', nn.Conv2d(num_input_features, num_output_features,
                               kernel_size=1, stride=1, bias=True)),
            # Positional args kept exactly as before (kernel size 2, then False).
            ('pool', kornia.contrib.MaxBlurPool2d(2, False)),
        ]))
        nn.init.orthogonal_(self.conv.weight.data)
        nn.init.ones_(self.norm.weight)
        nn.init.constant_(self.norm.bias, 1e-3)
        
    
class _DenseLayer(nn.Module):
    """One dense layer: BN -> MishJit -> 1x1 conv -> BN -> MishJit -> depthwise 3x3 conv -> 1x1 conv.

    The first 1x1 conv widens to `bn_size * growth_rate` channels, the 3x3
    conv is depthwise (`groups` equals its channel count) and the last 1x1 conv
    projects down to `growth_rate` channels. All three convs are bias-free
    and orthogonally initialised; the last one uses gain `pp`.

    Args:
        num_input_features: channels of the (concatenated) input.
        growth_rate: channels produced by this layer.
        bn_size: bottleneck width multiplier.
        drop_rate: dropout probability applied to the output (0 disables it).
        pp: gain for the orthogonal init of the final 1x1 conv.
        memory_efficient: accepted for signature compatibility but NOT used;
            no checkpointing is performed by this layer.
    """
    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate,pp=0.5, memory_efficient=False):
        super().__init__()
        bottleneck = bn_size * growth_rate
        self.feats = nn.Sequential(
            nn.BatchNorm2d(num_input_features),
            MishJit(),
            nn.Conv2d(num_input_features, bottleneck,
                      kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(bottleneck),
            MishJit(),
            # Depthwise 3x3: one filter per channel.
            nn.Conv2d(bottleneck, bottleneck,
                      kernel_size=3, stride=1, padding=1,
                      groups=bottleneck, bias=False),
            # Pointwise projection down to the growth rate.
            nn.Conv2d(bottleneck, growth_rate,
                      kernel_size=1, stride=1, padding=0,
                      groups=1, bias=False),
        )
        # Same init order as before: first 1x1, depthwise 3x3, final 1x1 (with gain).
        nn.init.orthogonal_(self.feats[2].weight.data)
        nn.init.orthogonal_(self.feats[-2].weight.data)
        nn.init.orthogonal_(self.feats[-1].weight.data, gain=pp)

        self.drop_rate = drop_rate

    def forward(self, input):
        "Accept a single Tensor or a sequence of Tensors (concatenated on dim 1) and return the new features."
        if isinstance(input, Tensor):
            prev_features = input
        else:
            prev_features = torch.cat(input, 1)

        new_features = self.feats(prev_features)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate,
                                     training=self.training)
        return new_features
        
class DenseNet(nn.Module):
    r"""DenseNet-BC variant, based on
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    As built here the network is: a three-convolution stem (with `MishJit` and a
    `MaxBlurPool2d`), then one stage per entry of `block_config`. Each stage is
    a `_DenseBlock`, a `_Transition` that halves the channel count (every stage
    except the last), a `MishJit` + `SEModule` pair and a `SimpleSelfAttention`
    layer. A final BatchNorm + `MishJit`, global average pooling and a linear
    classifier follow.

    Args:
        growth_rate (int) - how many filters each dense layer adds (`k` in paper)
        block_config (sequence of ints) - how many dense layers in each stage
        num_init_features (int) - channels produced by the stem; its first two
          convolutions use a quarter and a half of this value
        bn_size (int) - multiplicative factor for the bottleneck width
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
        memory_efficient (bool) - forwarded unchanged to every `_DenseBlock`.
          Default: *False*.
    """

    __constants__ = ['features']

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, memory_efficient=False):

        super().__init__()

        # Stem: three 3x3 convs widening quarter -> half -> full, only the first strided.
        quarter, half = num_init_features//4, num_init_features//2
        stem = OrderedDict()
        stem['conv01'] = nn.Conv2d(3, quarter, kernel_size=3, stride=2, padding=1, bias=False)
        stem['relu01'] = MishJit()
        stem['norm01'] = nn.BatchNorm2d(quarter)
        stem['conv02'] = nn.Conv2d(quarter, half, kernel_size=3, stride=1, padding=1, bias=False)
        stem['relu02'] = MishJit()
        stem['norm02'] = nn.BatchNorm2d(half)
        stem['conv03'] = nn.Conv2d(half, num_init_features, kernel_size=3, stride=1, padding=1, bias=False)
        stem['pool0'] = kornia.contrib.MaxBlurPool2d(3, False)
        stem['norm03'] = nn.BatchNorm2d(num_init_features)
        self.features = nn.Sequential(stem)
        for conv_name in ('conv01', 'conv02', 'conv03'):
            nn.init.orthogonal_(getattr(self.features, conv_name).weight.data)

        # Stages: dense block, optional transition, then SE and self-attention.
        width = num_init_features
        last_stage = len(block_config) - 1
        for idx, depth in enumerate(block_config):
            stage = idx + 1
            dense = _DenseBlock(
                num_layers=depth,
                num_input_features=width,
                bn_size=bn_size,
                growth_rate=growth_rate,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient
            )
            self.features.add_module(f'denseblock{stage}', dense)
            width = width + depth * growth_rate
            if idx != last_stage:
                # Every stage but the last is followed by a channel-halving transition.
                shrink = _Transition(num_input_features=width,
                                     num_output_features=width // 2)
                self.features.add_module(f'transition{stage}', shrink)
                width = width // 2
            squeeze_excite = nn.Sequential(
                MishJit(),
                SEModule(width, reduction=8, act_cls=MishJit))
            self.features.add_module(f'se{stage}', squeeze_excite)
            self.features.add_module(f'SA{stage}',
                                     SimpleSelfAttention(width, ks=1, sym=False))

        # Head: final norm + activation, then the linear classifier.
        self.features.add_module('norm5', nn.BatchNorm2d(width))
        self.features.add_module('relu5', MishJit())
        self.classifier = nn.Linear(width, num_classes)

        # The stock torchvision init loop (kaiming for convs, constants for
        # BatchNorm / Linear bias) is intentionally not applied; the layers
        # are initialised where they are built.

    def forward(self, x):
        "Run the feature extractor, global-average-pool to 1x1, flatten and classify."
        feats = self.features(x)
        pooled = F.adaptive_avg_pool2d(feats, (1, 1))
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)
class _DenseBlock(nn.ModuleDict):
    """Stack of `num_layers` `_DenseLayer`s with dense (concatenating) connectivity.

    Layer `i` (0-based) receives `num_input_features + i * growth_rate` input
    channels and is created with init gain `2 * (1 - i / num_layers)`, so the
    gain shrinks linearly from 2 towards 0 with depth. `forward` returns the
    block input concatenated with every layer's output along dim 1.
    """
    _version = 2

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, memory_efficient=False):
        super().__init__()
        for idx in range(num_layers):
            gain = 2.0*(1.0  - float(idx) / float(num_layers))
            dense_layer = _DenseLayer(
                num_input_features + idx * growth_rate,
                growth_rate=growth_rate,
                bn_size=bn_size,
                drop_rate=drop_rate,
                pp=gain,
                memory_efficient=memory_efficient,
            )
            self.add_module(f'denselayer{idx + 1}', dense_layer)

    def forward(self, init_features):
        collected = [init_features]
        for dense_layer in self.values():
            # Each layer sees the running list of all earlier feature maps.
            collected.append(dense_layer(collected))
        return torch.cat(collected, 1)


In [13]:

@call_parse
def main(
        gpu:   Param("GPU to run on", int)=None,
        woof:  Param("Use imagewoof (otherwise imagenette)", int)=0,
        lr:    Param("Learning rate", float)=1e-2,
        size:  Param("Size (px: 128,192,256)", int)=128,
        sqrmom:Param("sqr_mom", float)=0.99,
        mom:   Param("Momentum", float)=0.9,
        eps:   Param("epsilon", float)=1e-6,
        epochs:Param("Number of epochs", int)=5,
        bs:    Param("Batch size", int)=64,
        mixup: Param("Mixup", float)=0.,
        opt:   Param("Optimizer (adam,rms,sgd,ranger)", str)='ranger',
        arch:  Param("Architecture", str)='xresnet50',
        sh:    Param("Random erase max proportion", float)=0.,
        sa:    Param("Self-attention", int)=0,
        sym:   Param("Symmetry for self-attention", int)=0,
        beta:  Param("SAdam softplus beta", float)=0.,
        act_fn:Param("Activation function", str)='MishJit',
        fp16:  Param("Use mixed precision training", int)=0,
        pool:  Param("Pooling method", str)='AvgPool',
        dump:  Param("Print model; don't train", int)=0,
        runs:  Param("Number of times to repeat training", int)=1,
        meta:  Param("Metadata (ignored)", str)='',
        blurpool: Param("Convert MaxPool to MaxPoolBlur", bool)=False,
        ):
    "Distributed training of Imagenette."
    # NOTE: this notebook version always trains the hard-coded DenseNet built
    # in the loop below. `arch`, `act_fn`, `pool`, `sa`, `sym`, `meta` and
    # `blurpool` are accepted so existing positional calls keep working, but
    # none of them influences the model. The former `globals()[...]` lookup of
    # `arch`/`act_fn`/`pool` was removed: its results were never used and it
    # raised KeyError for any name not present in the notebook namespace.

    #gpu = setup_distrib(gpu)
    if gpu is not None: torch.cuda.set_device(gpu)
    if   opt=='adam'  : opt_func = partial(Adam, mom=mom, sqr_mom=sqrmom, eps=eps)
    elif opt=='rms'   : opt_func = partial(RMSprop, sqr_mom=sqrmom)
    elif opt=='sgd'   : opt_func = partial(SGD, mom=mom)
    elif opt=='ranger': opt_func = partial(ranger, mom=mom, sqr_mom=sqrmom, eps=eps, beta=beta)
    else:
        # Fail early instead of hitting an unbound `opt_func` when the Learner is built.
        raise ValueError(f"Unknown optimizer {opt!r}; expected one of: adam, rms, sgd, ranger")

    dbunch = get_dbunch(size, woof, bs, sh=sh)
    # Printed when gpu is None or 0.
    if not gpu: print(f'lr: {lr}; size: {size}; sqrmom: {sqrmom}; mom: {mom}; eps: {eps}')

    for run in range(runs):
        print(f'Run: {run}')
        # Active model: DenseNet-121 layout, growth rate 32.
        # Other configurations tried here:
        #   "DenseNet92": DenseNet(64, (4, 8, 10, 10), 64, num_classes=10, drop_rate=0)
        #   169 layout:   DenseNet(32, (6, 12, 32, 32), 64, num_classes=10, drop_rate=0)
        model =  DenseNet(32, (6, 12, 24, 16), 64, num_classes=10, drop_rate=0)
        learn = Learner(dbunch, model, opt_func=opt_func,
                        metrics=[accuracy,top_k_accuracy], loss_func=LabelSmoothingCrossEntropy())

        # With `dump` set, hand back the untrained learner of the first run.
        if dump: return learn
        if fp16: learn = learn.to_fp16()
        cbs = MixUp(mixup) if mixup else []
        n_gpu = torch.cuda.device_count()
        if gpu is None and n_gpu: learn.to_parallel()
        if num_distrib()>1: learn.to_distributed(gpu) # Requires `-m fastai.launch`
        learn.fit_flat_cos(epochs, lr, wd=2e-2, cbs=cbs)

In [7]:
# Positional arguments for `main`, in the order of its parameters.
# NOTE(review): `main` in this notebook builds a hard-coded DenseNet, so the
# arch / sa / sym / pool entries below do not select the model.
args = (
    0, 1,                       # gpu, woof
    2e-3, 128,                  # lr, size
    0.99, 0.95, 1e-8,           # sqrmom, mom, eps
    5, 64, 0,                   # epochs, bs, mixup
    'ranger', 'xse_resnext50',  # opt, arch
    0, 1, 0, 0,                 # sh, sa, sym, beta
    'MishJit', 1, 'MaxPool',    # act_fn, fp16, pool
    0, 5, '',                   # dump, runs, meta
)

# with big eps
# NOTE(review): eps here is 1e-8, smaller than the 1e-6 used by the other
# runs in this notebook -- confirm what "big eps" refers to.
main(*args, False)  # trailing False is `blurpool`



lr: 0.002; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-08
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.976702,1.956672,0.383813,0.865106,00:49
1,1.708972,1.716778,0.452532,0.905828,00:50
2,1.532671,1.405896,0.598371,0.942988,00:51
3,1.385892,1.466076,0.584118,0.941206,00:51
4,1.169069,1.147183,0.723085,0.966149,00:51


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.919206,1.837562,0.397811,0.859761,00:49
1,1.67483,1.73787,0.451005,0.887249,00:51
2,1.488483,1.411613,0.600916,0.944261,00:51
3,1.337778,1.422461,0.607534,0.943243,00:51
4,1.153859,1.13336,0.719522,0.967676,00:51


Run: 2


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.95838,1.838036,0.412828,0.865615,00:49
1,1.704462,1.617699,0.516416,0.912446,00:51
2,1.513661,1.407609,0.608806,0.93688,00:51
3,1.362641,1.370318,0.622296,0.93917,00:51
4,1.169387,1.127925,0.733011,0.962586,00:51


Run: 3


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.161414,00:20,,,


KeyboardInterrupt: 

In [None]:
# Positional arguments for `main`, in the order of its parameters.
args = (
    0, 1,                       # gpu, woof
    2e-3, 128,                  # lr, size
    0.99, 0.95, 1e-6,           # sqrmom, mom, eps
    5, 64, 0,                   # epochs, bs, mixup
    'ranger', 'xse_resnext50',  # opt, arch
    0, 1, 0, 0,                 # sh, sa, sym, beta
    'MishJit', 1, 'MaxPool',    # act_fn, fp16, pool
    0, 5, '',                   # dump, runs, meta
)

# orig: bn-relu-conv
main(*args, False)  # trailing False is `blurpool`



lr: 0.002; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-06
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.962159,1.924368,0.352762,0.851616,00:47
1,1.705863,1.629452,0.492237,0.911937,00:48
2,1.514522,1.567181,0.531687,0.924408,00:48
3,1.378323,1.313132,0.655638,0.949096,00:48
4,1.173259,1.151271,0.71494,0.963349,00:48


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.976798,1.919778,0.363706,0.855943,00:47
1,1.725205,1.617176,0.508272,0.911173,00:48
2,1.521696,1.607497,0.511326,0.919318,00:48
3,1.367846,1.309831,0.645711,0.953423,00:48
4,1.187248,1.159004,0.724103,0.960295,00:48


Run: 2


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.959609,1.878694,0.379231,0.86816,00:47
1,1.706527,1.561373,0.524815,0.920336,00:48


In [5]:
# Positional arguments for `main`, in the order of its parameters.
args = (
    0, 1,                       # gpu, woof
    2e-3, 128,                  # lr, size
    0.99, 0.95, 1e-6,           # sqrmom, mom, eps
    5, 64, 0,                   # epochs, bs, mixup
    'ranger', 'xse_resnext50',  # opt, arch
    0, 1, 0, 0,                 # sh, sa, sym, beta
    'MishJit', 1, 'MaxPool',    # act_fn, fp16, pool
    0, 5, '',                   # dump, runs, meta
)

# with SA in transition
main(*args, False)  # trailing False is `blurpool`



lr: 0.002; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-06
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.95498,1.780111,0.42199,0.886231,00:42
1,1.709653,1.618769,0.48791,0.9127,00:43
2,1.539177,1.57009,0.521252,0.925681,00:43
3,1.380435,1.264338,0.667091,0.958259,00:43
4,1.194264,1.131327,0.723085,0.966404,00:43


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.96942,1.815082,0.407483,0.86256,00:42
1,1.826952,00:17,,,


KeyboardInterrupt: 

In [9]:
# Positional arguments for `main`, in the order of its parameters.
args = (
    0, 1,                       # gpu, woof
    2e-3, 128,                  # lr, size
    0.99, 0.95, 1e-6,           # sqrmom, mom, eps
    5, 64, 0,                   # epochs, bs, mixup
    'ranger', 'xse_resnext50',  # opt, arch
    0, 1, 0, 0,                 # sh, sa, sym, beta
    'MishJit', 1, 'MaxPool',    # act_fn, fp16, pool
    0, 5, '',                   # dump, runs, meta
)

main(*args, False)  # trailing False is `blurpool`



lr: 0.002; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-06
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.973392,1.852281,0.392212,0.864088,00:50
1,1.732984,1.665331,0.485365,0.90481,00:52
2,1.534169,1.419242,0.593026,0.950369,00:52
3,1.405187,1.509602,0.595062,0.930771,00:52
4,1.189365,1.144592,0.722576,0.964622,00:52


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.962642,1.853955,0.401883,0.852889,00:51
1,1.703128,1.76223,0.461695,0.899466,00:52
2,1.518852,1.4489,0.582336,0.937643,00:52
3,1.380126,1.448274,0.5859,0.937898,00:52
4,1.162218,1.122532,0.730466,0.968185,00:52


Run: 2


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.967425,1.786834,0.425299,0.879868,00:50
1,1.702835,1.913977,0.415882,0.884449,00:52
2,1.550661,1.456265,0.573428,0.934843,00:52
3,1.400527,1.41343,0.597608,0.936625,00:52
4,1.186695,1.157393,0.71265,0.961568,00:52


Run: 3


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.979757,1.884656,0.416645,0.878341,00:51
1,1.703187,1.871628,0.438279,0.872741,00:52
2,1.512196,1.411778,0.59888,0.940443,00:52
3,1.377462,1.380255,0.619751,0.94197,00:52
4,1.164511,1.122599,0.736829,0.967167,00:52


Run: 4


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.960906,1.794399,0.414609,0.876813,00:51
1,1.708656,1.762611,0.44846,0.907356,00:52
2,1.530324,1.40756,0.608043,0.938661,00:52
3,1.369507,1.402649,0.605752,0.935098,00:52
4,1.174044,1.135999,0.730211,0.965386,00:52


In [9]:

@call_parse
def main(
        gpu:   Param("GPU to run on", int)=None,
        woof:  Param("Use imagewoof (otherwise imagenette)", int)=0,
        lr:    Param("Learning rate", float)=1e-2,
        size:  Param("Size (px: 128,192,256)", int)=128,
        sqrmom:Param("sqr_mom", float)=0.99,
        mom:   Param("Momentum", float)=0.9,
        eps:   Param("epsilon", float)=1e-6,
        epochs:Param("Number of epochs", int)=5,
        bs:    Param("Batch size", int)=64,
        mixup: Param("Mixup", float)=0.,
        opt:   Param("Optimizer (adam,rms,sgd,ranger)", str)='ranger',
        arch:  Param("Architecture", str)='xresnet50',
        sh:    Param("Random erase max proportion", float)=0.,
        sa:    Param("Self-attention", int)=0,
        sym:   Param("Symmetry for self-attention", int)=0,
        beta:  Param("SAdam softplus beta", float)=0.,
        act_fn:Param("Activation function", str)='MishJit',
        fp16:  Param("Use mixed precision training", int)=0,
        pool:  Param("Pooling method", str)='AvgPool',
        dump:  Param("Print model; don't train", int)=0,
        runs:  Param("Number of times to repeat training", int)=1,
        meta:  Param("Metadata (ignored)", str)='',
        blurpool: Param("Convert MaxPool to MaxPoolBlur", bool)=False,
        ):
    "Distributed training of Imagenette."
    # NOTE: this notebook version always trains the hard-coded DenseNet built
    # in the loop below. `arch`, `act_fn`, `pool`, `sa`, `sym`, `meta` and
    # `blurpool` are accepted so existing positional calls keep working, but
    # none of them influences the model. The former `globals()[...]` lookup of
    # `arch`/`act_fn`/`pool` was removed: its results were never used and it
    # raised KeyError for any name not present in the notebook namespace.

    #gpu = setup_distrib(gpu)
    if gpu is not None: torch.cuda.set_device(gpu)
    if   opt=='adam'  : opt_func = partial(Adam, mom=mom, sqr_mom=sqrmom, eps=eps)
    elif opt=='rms'   : opt_func = partial(RMSprop, sqr_mom=sqrmom)
    elif opt=='sgd'   : opt_func = partial(SGD, mom=mom)
    elif opt=='ranger': opt_func = partial(ranger, mom=mom, sqr_mom=sqrmom, eps=eps, beta=beta)
    else:
        # Fail early instead of hitting an unbound `opt_func` when the Learner is built.
        raise ValueError(f"Unknown optimizer {opt!r}; expected one of: adam, rms, sgd, ranger")

    dbunch = get_dbunch(size, woof, bs, sh=sh)
    # Printed when gpu is None or 0.
    if not gpu: print(f'lr: {lr}; size: {size}; sqrmom: {sqrmom}; mom: {mom}; eps: {eps}')

    for run in range(runs):
        print(f'Run: {run}')
        # Active model: growth rate 40 on the (6, 12, 24, 16) layout.
        # (This line used to be labelled '#169'; the 169 layout is (6, 12, 32, 32).)
        # Other configurations tried here:
        #   121 layout, growth 32: DenseNet(32, (6, 12, 24, 16), 64, num_classes=10, drop_rate=0)
        #   "DenseNet92":          DenseNet(64, (4, 8, 10, 10), 64, num_classes=10, drop_rate=0)
        model =  DenseNet(40, (6, 12, 24, 16), 64, num_classes=10, drop_rate=0)
        learn = Learner(dbunch, model, opt_func=opt_func,
                        metrics=[accuracy,top_k_accuracy], loss_func=LabelSmoothingCrossEntropy())

        # With `dump` set, hand back the untrained learner of the first run.
        if dump: return learn
        if fp16: learn = learn.to_fp16()
        cbs = MixUp(mixup) if mixup else []
        n_gpu = torch.cuda.device_count()
        if gpu is None and n_gpu: learn.to_parallel()
        if num_distrib()>1: learn.to_distributed(gpu) # Requires `-m fastai.launch`
        learn.fit_flat_cos(epochs, lr, wd=2e-2, cbs=cbs)
        
# Positional arguments for `main`, in the order of its parameters.
args = (
    0, 1,                       # gpu, woof
    2e-3, 128,                  # lr, size
    0.99, 0.95, 1e-8,           # sqrmom, mom, eps
    5, 64, 0,                   # epochs, bs, mixup
    'ranger', 'xse_resnext50',  # opt, arch
    0, 1, 0, 0,                 # sh, sa, sym, beta
    'MishJit', 1, 'MaxPool',    # act_fn, fp16, pool
    0, 5, '',                   # dump, runs, meta
)

main(*args, False)  # trailing False is `blurpool`


lr: 0.002; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-08
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.987061,2.159393,0.35607,0.835836,01:10
1,1.755712,1.638576,0.484347,0.913973,01:05
2,1.529839,1.645974,0.497582,0.904301,01:05
3,1.415944,1.440724,0.598626,0.946297,01:05
4,1.190369,1.155054,0.72283,0.963095,01:05


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,,00:00,,,


AttributeError: yb

In [10]:

@call_parse
def main(
        gpu:   Param("GPU to run on", int)=None,
        woof:  Param("Use imagewoof (otherwise imagenette)", int)=0,
        lr:    Param("Learning rate", float)=1e-2,
        size:  Param("Size (px: 128,192,256)", int)=128,
        sqrmom:Param("sqr_mom", float)=0.99,
        mom:   Param("Momentum", float)=0.9,
        eps:   Param("epsilon", float)=1e-6,
        epochs:Param("Number of epochs", int)=5,
        bs:    Param("Batch size", int)=64,
        mixup: Param("Mixup", float)=0.,
        opt:   Param("Optimizer (adam,rms,sgd,ranger)", str)='ranger',
        arch:  Param("Architecture", str)='xresnet50',
        sh:    Param("Random erase max proportion", float)=0.,
        sa:    Param("Self-attention", int)=0,
        sym:   Param("Symmetry for self-attention", int)=0,
        beta:  Param("SAdam softplus beta", float)=0.,
        act_fn:Param("Activation function", str)='MishJit',
        fp16:  Param("Use mixed precision training", int)=0,
        pool:  Param("Pooling method", str)='AvgPool',
        dump:  Param("Print model; don't train", int)=0,
        runs:  Param("Number of times to repeat training", int)=1,
        meta:  Param("Metadata (ignored)", str)='',
        blurpool: Param("Convert MaxPool to MaxPoolBlur", bool)=False,
        ):
    "Distributed training of Imagenette."
    # NOTE: this notebook version always trains the hard-coded DenseNet built
    # in the loop below. `arch`, `act_fn`, `pool`, `sa`, `sym`, `meta` and
    # `blurpool` are accepted so existing positional calls keep working, but
    # none of them influences the model. The former `globals()[...]` lookup of
    # `arch`/`act_fn`/`pool` was removed: its results were never used and it
    # raised KeyError for any name not present in the notebook namespace.

    #gpu = setup_distrib(gpu)
    if gpu is not None: torch.cuda.set_device(gpu)
    if   opt=='adam'  : opt_func = partial(Adam, mom=mom, sqr_mom=sqrmom, eps=eps)
    elif opt=='rms'   : opt_func = partial(RMSprop, sqr_mom=sqrmom)
    elif opt=='sgd'   : opt_func = partial(SGD, mom=mom)
    elif opt=='ranger': opt_func = partial(ranger, mom=mom, sqr_mom=sqrmom, eps=eps, beta=beta)
    else:
        # Fail early instead of hitting an unbound `opt_func` when the Learner is built.
        raise ValueError(f"Unknown optimizer {opt!r}; expected one of: adam, rms, sgd, ranger")

    dbunch = get_dbunch(size, woof, bs, sh=sh)
    # Printed when gpu is None or 0.
    if not gpu: print(f'lr: {lr}; size: {size}; sqrmom: {sqrmom}; mom: {mom}; eps: {eps}')

    for run in range(runs):
        print(f'Run: {run}')
        # Active model: growth rate 24 on the (6, 12, 24, 16) layout.
        # (This line used to be labelled '#169'; the 169 layout is (6, 12, 32, 32).)
        # Other configurations tried here:
        #   121 layout, growth 32: DenseNet(32, (6, 12, 24, 16), 64, num_classes=10, drop_rate=0)
        #   "DenseNet92":          DenseNet(64, (4, 8, 10, 10), 64, num_classes=10, drop_rate=0)
        model =  DenseNet(24, (6, 12, 24, 16), 64, num_classes=10, drop_rate=0)
        learn = Learner(dbunch, model, opt_func=opt_func,
                        metrics=[accuracy,top_k_accuracy], loss_func=LabelSmoothingCrossEntropy())

        # With `dump` set, hand back the untrained learner of the first run.
        if dump: return learn
        if fp16: learn = learn.to_fp16()
        cbs = MixUp(mixup) if mixup else []
        n_gpu = torch.cuda.device_count()
        if gpu is None and n_gpu: learn.to_parallel()
        if num_distrib()>1: learn.to_distributed(gpu) # Requires `-m fastai.launch`
        learn.fit_flat_cos(epochs, lr, wd=2e-2, cbs=cbs)
        
# Positional arguments for `main`, in the order of its parameters.
args = (
    0, 1,                       # gpu, woof
    2e-3, 128,                  # lr, size
    0.99, 0.95, 1e-8,           # sqrmom, mom, eps
    5, 64, 0,                   # epochs, bs, mixup
    'ranger', 'xse_resnext50',  # opt, arch
    0, 1, 0, 0,                 # sh, sa, sym, beta
    'MishJit', 1, 'MaxPool',    # act_fn, fp16, pool
    0, 5, '',                   # dump, runs, meta
)

main(*args, False)  # trailing False is `blurpool`


lr: 0.002; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-08
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.94931,1.783054,0.422754,0.884194,00:47
1,1.674883,1.646733,0.482311,0.922881,00:44
2,1.488361,1.372764,0.618478,0.941206,00:45
3,1.365454,1.439157,0.588699,0.943752,00:44
4,1.164482,1.144869,0.722067,0.963604,00:44


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.010362,00:28,,,


KeyboardInterrupt: 

In [28]:
# Hand-copied accuracies, five runs per configuration; mean and median are printed.
# NOTE(review): presumably final-epoch accuracies -- the runs these numbers
# came from are not among the outputs kept in this notebook; confirm provenance.
results_densenext = [0.731229, 0.721812, 0.706796, 0.720540, 0.732756]
print(np.mean(results_densenext), np.median(results_densenext))

results_densenext_92 = [0.737592, 0.714177, 0.720794, 0.708068, 0.722576]
print(np.mean(results_densenext_92), np.median(results_densenext_92))


0.7226266000000001 0.721812
0.7206414000000001 0.720794


In [7]:
# MaxPoolBlur training (the trailing True is the `blurpool` argument).
# NOTE(review): none of the `main` definitions in this notebook reads
# `blurpool`, and the recorded output reports lr 0.008 while the `args` tuples
# defined earlier in the notebook use 2e-3 -- this cell presumably ran against
# an earlier notebook state; confirm before comparing numbers.
main(*args, True)

lr: 0.008; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-06
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.003904,2.223005,0.299822,0.81013,01:01
1,1.764573,2.072203,0.338254,0.850853,00:59
2,1.574507,1.853626,0.443879,0.873759,00:59
3,1.417001,1.321541,0.644184,0.955968,00:59
4,1.216009,1.15548,0.731738,0.964877,00:59


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.02447,1.968835,0.36676,0.850344,00:59
1,1.75863,1.911492,0.400865,0.862815,00:59
2,1.578867,2.05731,0.357343,0.815984,00:59
3,1.432039,1.354007,0.639857,0.947569,00:59
4,1.233604,1.156523,0.729193,0.965386,00:59


Run: 2


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.005394,2.010665,0.344362,0.836091,00:59
1,1.738645,1.870527,0.42199,0.877832,00:59
2,1.55427,1.482105,0.569102,0.937389,01:00
3,1.42462,1.54249,0.543141,0.924663,00:59
4,1.199454,1.148754,0.730466,0.968694,00:59


Run: 3


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.005426,2.111513,0.340036,0.832527,00:59
1,1.735521,1.896309,0.393484,0.851871,00:59
2,1.579385,1.692503,0.455841,0.913973,00:59
3,1.405025,1.343927,0.638585,0.943497,00:59
4,1.192734,1.138111,0.740392,0.966404,00:59


Run: 4


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.008575,2.133754,0.254518,0.756427,00:59
1,1.720407,1.742496,0.439298,0.898447,00:59
2,1.543136,1.828868,0.433189,0.840672,00:59
3,1.400129,1.577766,0.576228,0.924408,01:00
4,1.187301,1.140684,0.737083,0.967167,00:59


In [10]:
# Epoch-4 accuracy of each of the five MaxPoolBlur runs logged above.
results_mbp = [0.731738, 0.729193, 0.730466, 0.740392, 0.737083]
print(np.mean(results_mbp), np.median(results_mbp))

0.7337743999999999 0.731738


In [12]:
# Now ImageNette: woof=0 and lr raised to 8e-3; everything else as in the
# Imagewoof runs. Entries are positional arguments for `main`, in the order of
# its parameters.
args = (
    0, 0,                       # gpu, woof
    8e-3, 128,                  # lr, size
    0.99, 0.95, 1e-6,           # sqrmom, mom, eps
    5, 64, 0,                   # epochs, bs, mixup
    'ranger', 'xse_resnext50',  # opt, arch
    0, 1, 0, 0,                 # sh, sa, sym, beta
    'MishJit', 1, 'MaxPool',    # act_fn, fp16, pool
    0, 5, '',                   # dump, runs, meta
)

In [13]:
# Default training, MaxPool (the trailing False is the `blurpool` argument).
# NOTE(review): none of the `main` definitions in this notebook reads
# `blurpool` -- confirm which model code produced the logged numbers.
main(*args, False)

lr: 0.008; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-06
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.55337,1.467474,0.610191,0.929427,00:43
1,1.302615,1.273466,0.684841,0.953376,00:42
2,1.154172,1.793088,0.506242,0.873121,00:42
3,1.085731,1.125555,0.760764,0.97172,00:42
4,0.938625,0.905465,0.852739,0.986497,00:42


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.560764,1.48307,0.586242,0.941656,00:42
1,1.29404,1.302575,0.671592,0.955159,00:42
2,1.152244,1.770406,0.552357,0.907261,00:42
3,1.076044,1.039984,0.787771,0.974268,00:42
4,0.939874,0.910965,0.848917,0.983949,00:43


Run: 2


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.558743,1.762441,0.505987,0.885605,00:42
1,1.298764,1.516917,0.566624,0.940127,00:43
2,1.171672,1.471202,0.620127,0.952611,00:43
3,1.07931,1.089393,0.777325,0.967898,00:43
4,0.943211,0.913316,0.848917,0.981401,00:43


Run: 3


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.556514,1.83305,0.524076,0.917452,00:42
1,1.294437,1.419953,0.637197,0.94242,00:42
2,1.156018,1.481858,0.623694,0.949554,00:43
3,1.071758,1.115519,0.76051,0.974522,00:42
4,0.934624,0.911172,0.841529,0.98293,00:42


Run: 4


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.533444,1.45893,0.620127,0.944204,00:42
1,1.287585,1.248199,0.701656,0.958981,00:42
2,1.145923,1.633679,0.55949,0.944713,00:42
3,1.072981,1.021157,0.794395,0.979873,00:42
4,0.929677,0.905917,0.842548,0.985478,00:42


In [15]:
# Epoch-4 accuracy of each of the five ImageNette MaxPool runs logged above.
results_nette = [0.852739, 0.848917, 0.848917, 0.841529, 0.842548]
print(np.mean(results_nette), np.median(results_nette))

0.8469300000000001 0.848917


In [16]:
# With MaxBlurPool (the trailing True is the `blurpool` argument).
# NOTE(review): none of the `main` definitions in this notebook reads
# `blurpool` -- confirm which model code produced the logged numbers.
main(*args, True)

lr: 0.008; size: 128; sqrmom: 0.99; mom: 0.95; eps: 1e-06
Run: 0


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.510369,1.334672,0.654013,0.951338,01:01
1,1.280501,1.323934,0.684076,0.942166,01:01
2,1.140033,1.654492,0.540382,0.903185,01:01
3,1.076823,1.069148,0.772994,0.980382,01:01
4,0.914368,0.887269,0.846879,0.988025,01:01


Run: 1


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.525621,1.484924,0.611975,0.927134,01:01
1,1.253216,1.373257,0.665733,0.943694,01:01
2,1.155353,1.370966,0.656815,0.917707,01:01
3,1.069552,1.025684,0.789809,0.974777,01:02
4,0.907935,0.88914,0.849682,0.985732,01:01


Run: 2


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.568489,1.83959,0.488408,0.900892,01:01
1,1.294772,1.221399,0.713885,0.961019,01:01
2,1.134202,1.433192,0.610191,0.933248,01:01
3,1.049995,1.031356,0.784968,0.979618,01:01
4,0.909276,0.870828,0.857834,0.985478,01:01


Run: 3


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.505874,1.651442,0.563057,0.938854,01:01
1,1.279572,1.346696,0.658599,0.953631,01:01
2,1.15077,1.077656,0.766369,0.974013,01:01
3,1.058478,1.081523,0.766115,0.977834,01:01
4,0.910908,0.890739,0.847898,0.984968,01:02


Run: 4


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,1.514718,1.462456,0.60051,0.946497,01:01
1,1.269807,1.702702,0.560255,0.934522,01:01
2,1.125594,1.196752,0.725605,0.969427,01:02
3,1.045998,1.039236,0.781656,0.975796,01:01
4,0.898163,0.886169,0.852994,0.987261,01:01


In [17]:
# Epoch-4 accuracy of each of the five ImageNette MaxBlurPool runs logged above.
results_nette_mbp = [0.846879, 0.849682, 0.857834, 0.847898, 0.852994]
print(np.mean(results_nette_mbp), np.median(results_nette_mbp))

0.8510574 0.849682
