From d3cc045a4e52387a9c2b9e005f2f3fc393a84e8c Mon Sep 17 00:00:00 2001 From: Jeremy Howard Date: Sat, 6 Apr 2019 01:02:31 +0000 Subject: [PATCH] experimental xresnet changes --- examples/train_imagenette.py | 3 +- examples/train_imagenette_adv.py | 6 +- fastai/vision/models/xresnet.py | 192 ++++++++++++------------------- 3 files changed, 76 insertions(+), 125 deletions(-) diff --git a/examples/train_imagenette.py b/examples/train_imagenette.py index 9127cef1f0..bac59a1ab6 100644 --- a/examples/train_imagenette.py +++ b/examples/train_imagenette.py @@ -49,10 +49,11 @@ def main( bs_rat = bs/256 lr *= bs_rat - learn = (Learner(data, models.xresnet50(), + learn = (Learner(data, models.resnet50(), metrics=[accuracy,top_k_accuracy], wd=1e-3, opt_func=opt_func, bn_wd=False, true_wd=True, loss_func = LabelSmoothingCrossEntropy()) ) + #print(learn.model); exit() if mixup: learn = learn.mixup(alpha=0.2) learn = learn.to_fp16(dynamic=True) if gpu is None: learn.to_parallel() diff --git a/examples/train_imagenette_adv.py b/examples/train_imagenette_adv.py index fca7c64e4c..4ae85f7171 100644 --- a/examples/train_imagenette_adv.py +++ b/examples/train_imagenette_adv.py @@ -65,7 +65,7 @@ def bn_and_final(m): def on_step(self, p, group, group_idx): st = self.state[p] - alpha = (st['alpha_buffer'].sqrt() + group['eps'] + alpha = ((st['alpha_buffer'] + group['eps']).sqrt() ) if 'alpha_buffer' in st else mom.new_tensor(1.) clip = group['clip'] if 'clip' in group else 1e9 alr = (st['alpha_buffer']).clamp_min_(clip) @@ -79,12 +79,12 @@ def main( debias_mom: Param("Debias statistics", bool)=False, debias_sqr: Param("Debias statistics", bool)=False, opt: Param("Optimizer: 'adam','genopt','rms','sgd'", str)='genopt', - alpha: Param("Alpha", float)=0.9, + alpha: Param("Alpha", float)=0.99, mom: Param("Momentum", float)=0.9, eps: Param("epsilon", float)=1e-7, decay: Param("Decay AvgStatistic (momentum)", bool)=False, epochs: Param("Number of epochs", int)=5, - bs: Param("Batch size", int)=256, + bs: Param("Batch size", int)=128, ): """Distributed training of Imagenette. Fastest multi-gpu speed is if you run with: python -m fastai.launch""" diff --git a/fastai/vision/models/xresnet.py b/fastai/vision/models/xresnet.py index e8c0505d04..0d54dae09f 100644 --- a/fastai/vision/models/xresnet.py +++ b/fastai/vision/models/xresnet.py @@ -3,128 +3,110 @@ import math import torch.utils.model_zoo as model_zoo - __all__ = ['XResNet', 'xresnet18', 'xresnet34', 'xresnet50', 'xresnet101', 'xresnet152'] +def init_cnn(m): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + if getattr(m, 'bias', None) is not None: nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for l in m.children(): init_cnn(l) + +def conv(ni, nf, ks=3, stride=1, bias=False): + return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=ks//2, bias=bias) + +def conv_relu_bn_(ni, nf, ks=3, stride=1, rev=False): + layers = [conv(ni, nf, ks, stride=stride), + nn.ReLU(inplace=True), + nn.BatchNorm2d(ni if rev else nf)] + if rev: layers = reversed(layers) + return layers -def conv3x3(in_planes, out_planes, stride=1): - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) +def conv_bn_relu(ni, nf, ks=3, stride=1): + return nn.Sequential(*conv_relu_bn_(ni, nf, ks=ks, stride=stride)) +def bn_relu_conv(ni, nf, ks=3, stride=1): + return nn.Sequential(*conv_relu_bn_(ni, nf, ks=ks, stride=stride, rev=True)) class BasicBlock(nn.Module): expansion = 1 - def __init__(self, inplanes, planes, stride=1, downsample=None): + def __init__(self, ni, nf, stride=1, downsample=None): super(BasicBlock, self).__init__() - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = nn.BatchNorm2d(planes) + self.conv1 = bn_relu_conv(ni, nf, stride=stride) + self.conv2 = bn_relu_conv(nf, nf) self.downsample = downsample self.stride = stride def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: residual = self.downsample(x) - - out += residual - out = self.relu(out) - - return out - + identity = x if self.downsample is None else self.downsample(x) + x = self.conv1(x) + x = self.conv2(x) + x += identity + return x class Bottleneck(nn.Module): expansion = 4 - def __init__(self, inplanes, planes, stride=1, downsample=None): + def __init__(self, ni, nf, stride=1, downsample=None): super(Bottleneck, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, - padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) + self.conv1 = bn_relu_conv(ni, nf, 1) + self.conv2 = bn_relu_conv(nf, nf, stride=stride) + self.conv3 = bn_relu_conv(nf, nf * self.expansion, 1) self.downsample = downsample self.stride = stride def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: residual = self.downsample(x) - - out += residual - out = self.relu(out) - - return out - -def conv2d(ni, nf, stride): - return nn.Sequential(nn.Conv2d(ni, nf, kernel_size=3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(nf), nn.ReLU(inplace=True)) + identity = x if self.downsample is None else self.downsample(x) + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x += identity + return x class XResNet(nn.Module): def __init__(self, block, layers, num_classes=1000): - self.inplanes = 64 - super(XResNet, self).__init__() - self.conv1 = conv2d(3, 32, 2) - self.conv2 = conv2d(32, 32, 1) - self.conv3 = conv2d(32, 64, 1) + self.ni = 64 + super().__init__() + self.conv1 = conv_bn_relu(3, 32, stride=2) + self.conv2 = conv_bn_relu(32, 32) + self.conv3 = conv(32, 64) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + ni = 512*block.expansion self.avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Linear(512 * block.expansion, num_classes) + self.fc = nn.Sequential( + nn.ReLU(inplace=True), + nn.BatchNorm1d(ni), + nn.Linear(ni, num_classes)) - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) + init_cnn(self) for m in self.modules(): - if isinstance(m, BasicBlock): m.bn2.weight = nn.Parameter(torch.zeros_like(m.bn2.weight)) - if isinstance(m, Bottleneck): m.bn3.weight = nn.Parameter(torch.zeros_like(m.bn3.weight)) + if isinstance(m, BasicBlock): nn.init.constant_(m.conv2[0].weight, 0.) + if isinstance(m, Bottleneck): nn.init.constant_(m.conv3[0].weight, 0.) if isinstance(m, nn.Linear): m.weight.data.normal_(0, 0.01) - def _make_layer(self, block, planes, blocks, stride=1): + def _make_layer(self, block, nf, blocks, stride=1): downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: + if stride != 1 or self.ni != nf*block.expansion: layers = [] if stride==2: layers.append(nn.AvgPool2d(kernel_size=2, stride=2)) layers += [ - nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=1, bias=False), - nn.BatchNorm2d(planes * block.expansion) ] + conv(self.ni, nf*block.expansion, 1), + nn.BatchNorm2d(nf * block.expansion) ] downsample = nn.Sequential(*layers) layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): layers.append(block(self.inplanes, planes)) + layers.append(block(self.ni, nf, stride, downsample)) + self.ni = nf * block.expansion + for i in range(1, blocks): layers.append(block(self.ni, nf)) return nn.Sequential(*layers) def forward(self, x): @@ -144,58 +126,26 @@ def forward(self, x): return x +model_urls = dict(xresnet34='xresnet34', xresnet50='xresnet50') -def xresnet18(pretrained=False, **kwargs): - """Constructs a XResNet-18 model. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = XResNet(BasicBlock, [2, 2, 2, 2], **kwargs) - if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['xresnet18'])) +def xresnet(block, n_layers, name, pre=False, **kwargs): + model = XResNet(block, n_layers, **kwargs) + #if pre: model.load_state_dict(model_zoo.load_url(model_urls[name])) + if pre: model.load_state_dict(torch.load(model_urls[name])) return model +def xresnet18(pretrained=False, **kwargs): + return xresnet(BasicBlock, [2, 2, 2, 2], 'xresnet18', pre=pretrained, **kwargs) def xresnet34(pretrained=False, **kwargs): - """Constructs a XResNet-34 model. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = XResNet(BasicBlock, [3, 4, 6, 3], **kwargs) - if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['xresnet34'])) - return model - + return xresnet(BasicBlock, [3, 4, 6, 3], 'xresnet34', pre=pretrained, **kwargs) def xresnet50(pretrained=False, **kwargs): - """Constructs a XResNet-50 model. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = XResNet(Bottleneck, [3, 4, 6, 3], **kwargs) - if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['xresnet50'])) - return model - + return xresnet(Bottleneck, [3, 4, 6, 3], 'xresnet50', pre=pretrained, **kwargs) def xresnet101(pretrained=False, **kwargs): - """Constructs a XResNet-101 model. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = XResNet(Bottleneck, [3, 4, 23, 3], **kwargs) - if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['xresnet101'])) - return model - + return xresnet(Bottleneck, [3, 4, 23, 3], 'xresnet101', pre=pretrained, **kwargs) def xresnet152(pretrained=False, **kwargs): - """Constructs a XResNet-152 model. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = XResNet(Bottleneck, [3, 8, 36, 3], **kwargs) - if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['xresnet152'])) - return model + return xresnet(Bottleneck, [3, 8, 36, 3], 'xresnet152', pre=pretrained, **kwargs)