In [12]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
import os
import collections
import argparse
import sys
import pickle
import numpy as np
import time, datetime
import copy
from thop import profile
from collections import OrderedDict
import shutil
import torch.utils
import torch.utils.data.distributed
from torchvision import datasets, transforms
from scipy.spatial import distance

In [13]:
import math
import pdb


def conv_bn(inp, oup, stride):
    """Return a 3x3 convolution -> batch norm -> ReLU6 stack.

    Args:
        inp: number of input channels of the convolution.
        oup: number of output channels (also the batch-norm width).
        stride: stride of the convolution.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution uses padding 1 and has no bias term.
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    act = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, act)

def conv_1x1_bn(inp, oup):
    """Return a 1x1 convolution -> batch norm -> ReLU6 stack.

    Args:
        inp: number of input channels of the convolution.
        oup: number of output channels (also the batch-norm width).

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution has stride 1, no padding and no bias term.
    """
    pointwise = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    act = nn.ReLU6(inplace=True)
    return nn.Sequential(pointwise, norm, act)

def make_divisible(x, divisible_by=8):
    """Round ``x`` up to the nearest multiple of ``divisible_by``.

    Args:
        x: value to round (int or float).
        divisible_by: the multiple to round up to; defaults to 8.

    Returns:
        The smallest multiple of ``divisible_by`` that is >= ``x``, as an int.
    """
    # Relies on the notebook-level ``import numpy as np``.
    multiples = np.ceil(x * 1. / divisible_by)
    rounded = multiples * divisible_by
    return int(rounded)


class InvertedResidual(nn.Module):
    """MobileNetV2-style block: optional 1x1 expansion, 3x3 depthwise, 1x1 linear projection.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        stride: stride of the depthwise convolution; must be 1 or 2.
        expand_ratio: multiplier applied to ``inp`` to get the hidden width.
            When it equals 1 the leading 1x1 expansion stage is omitted.

    The input is added to the output only when ``stride == 1`` and
    ``inp == oup``.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        stages = []
        if expand_ratio != 1:
            # pw: 1x1 expansion from inp to hidden_dim channels
            stages += [
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        stages += [
            # dw: one 3x3 filter per channel (groups == channels)
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear: 1x1 projection with no activation afterwards
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the block, adding the input back when the residual path is enabled."""
        if not self.use_res_connect:
            return self.conv(x)
        return x + self.conv(x)


class MobileNetV2(nn.Module):
    """MobileNetV2 whose per-stage output widths can be shrunk by ``compress_rate``.

    Args:
        compress_rate: sequence of pruning ratios. Entry ``k`` (k = 1..7) scales
            the output width of the k-th inverted-residual stage by
            ``1 - compress_rate[k]``; entry 0 is not read here.
        n_class: number of outputs of the final linear layer.
        input_size: only checked to be a multiple of 32.
        width_mult: width multiplier applied to stages whose expansion ratio
            is greater than 1, and to the last channel count when it exceeds 1.0.
    """

    def __init__(self, compress_rate, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        stem_channels = 32
        head_channels = 1280
        # Each row: (t = expansion ratio, c = output channels, n = number of blocks, s = stride of the first block)
        stage_settings = [
            [1, 16, 1, 1],
            [6, 24, 2, 1],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]
        self.compress_rate = compress_rate[:]

        assert input_size % 32 == 0
        # The stem width is never scaled: the first layer always has 32 channels.
        if width_mult > 1.0:
            self.last_channel = make_divisible(head_channels * width_mult)
        else:
            self.last_channel = head_channels

        # Stem convolution followed by the inverted-residual stages.
        layers = [conv_bn(3, stem_channels, 2)]
        in_ch = stem_channels
        for stage_idx, (t, c, n, s) in enumerate(stage_settings, 1):
            base_width = make_divisible(c * width_mult) if t > 1 else c
            out_ch = int((1 - self.compress_rate[stage_idx]) * base_width)
            for rep in range(n):
                # Only the first block of a stage uses the stage stride.
                block_stride = s if rep == 0 else 1
                layers.append(block(in_ch, out_ch, block_stride, expand_ratio=t))
                in_ch = out_ch

        # Final 1x1 convolution up to last_channel, then wrap everything in a Sequential.
        layers.append(conv_1x1_bn(in_ch, self.last_channel))
        self.features = nn.Sequential(*layers)

        # Classifier head: dropout followed by a single linear layer.
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        """Run the feature extractor, average over the two trailing dims, then classify."""
        feats = self.features(x)
        pooled = feats.mean(3).mean(2)
        return self.classifier(pooled)

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear module in place."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Normal init with std = sqrt(2 / (kernel_h * kernel_w * out_channels)).
                fan = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / fan))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()


def mobilenet_v2(compress_rate,n_class=1000):
    """Build a ``MobileNetV2`` with width multiplier 1.

    Args:
        compress_rate: per-stage pruning ratios, forwarded to ``MobileNetV2``.
        n_class: number of classifier outputs.

    Returns:
        The constructed ``MobileNetV2`` instance.
    """
    return MobileNetV2(compress_rate=compress_rate, n_class=n_class, width_mult=1)

In [19]:
# NOTE(review): the message says "resNet_50", but the definitions in this cell
# build CIFAR-style ResNet-56 / ResNet-110 models — confirm the intended wording.
print("prepare resNet_50 model...")
def adapt_channel(compress_rate, num_layers):
    """Translate pruning ratios into per-layer channel counts for a CIFAR ResNet.

    The network has a 16-channel stem followed by three stages of
    ``(num_layers - 2) // 6`` blocks each, with base widths 16, 32 and 64.

    Layout of ``compress_rate``:
        * ``[0]``  - ratio for the stem convolution,
        * ``[1]``, ``[2]`` - ratio for the block outputs of stage 1 and stage 2,
        * ``[3:]`` - one ratio per block for that block's middle convolution.
    Block outputs of the last stage are never pruned (ratio 0), which keeps
    the final width at 64 as the classifier in ``ResNet`` expects.

    Args:
        compress_rate: sequence of floats with at least ``3 + 3 * n`` entries,
            where ``n`` is the number of blocks per stage.
        num_layers: network depth; must have the form ``6n + 2`` with ``n >= 1``
            (56 and 110 give the same result as before).

    Returns:
        ``(overall_channel, mid_channel)``: the stem width followed by every
        block's output width, and every block's middle width.

    Raises:
        ValueError: if ``num_layers`` is not of the form ``6n + 2`` or
            ``compress_rate`` is too short.
    """
    if num_layers < 8 or (num_layers - 2) % 6 != 0:
        raise ValueError(
            'num_layers must be of the form 6n+2 with n >= 1, got %r' % (num_layers,))

    stage_widths = [16, 32, 64]
    blocks_per_stage = (num_layers - 2) // 6
    num_blocks = blocks_per_stage * len(stage_widths)

    required = len(stage_widths) + num_blocks
    if len(compress_rate) < required:
        raise ValueError(
            'compress_rate needs at least %d entries for num_layers=%d, got %d'
            % (required, num_layers, len(compress_rate)))

    # Output ratios: stem, then one shared ratio per stage; the last stage stays unpruned.
    out_rates = [compress_rate[0]]
    for stage_idx in range(len(stage_widths) - 1):
        out_rates += [compress_rate[stage_idx + 1]] * blocks_per_stage
    out_rates += [0.] * blocks_per_stage

    # Middle-convolution ratios: one per block, stored after the per-stage ratios.
    mid_rates = compress_rate[len(stage_widths):]

    # Unpruned width of the stem followed by each block's width.
    block_widths = [w for w in stage_widths for _ in range(blocks_per_stage)]
    base_widths = [16] + block_widths

    overall_channel = [int(w * (1 - r)) for w, r in zip(base_widths, out_rates)]
    mid_channel = [int(w * (1 - r)) for w, r in zip(block_widths, mid_rates)]

    return overall_channel, mid_channel


def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)

def conv1x1(in_planes, out_planes, stride=1):
    """Return a bias-free 1x1 ``nn.Conv2d`` with the given stride."""
    conv_kwargs = dict(kernel_size=1, stride=stride, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


class LambdaLayer(nn.Module):
    """Module wrapper that applies a stored callable to its input.

    Args:
        lambd: callable invoked as ``lambd(x)`` on every forward pass.
    """

    def __init__(self, lambd):
        super(LambdaLayer, self).__init__()
        self.lambd = lambd

    def forward(self, x):
        """Return the result of calling the wrapped callable on ``x``."""
        fn = self.lambd
        return fn(x)


class BasicBlock(nn.Module):
    """Two-convolution residual block with a parameter-free shortcut.

    Args:
        midplanes: channels produced by the first 3x3 convolution.
        inplanes: channels of the block input.
        planes: channels of the block output.
        stride: stride of the first convolution (default 1).

    When ``stride != 1`` or ``inplanes != planes`` the shortcut is a
    ``LambdaLayer`` that pads the channel dimension with zeros by
    ``planes - inplanes`` in total (and, for ``stride != 1``, first keeps
    every second row and column). Otherwise it is an empty ``nn.Sequential``.
    """
    expansion = 1

    def __init__(self, midplanes, inplanes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.inplanes = inplanes
        self.planes = planes

        # First conv carries the stride and maps inplanes -> midplanes.
        self.conv1 = conv3x3(inplanes, midplanes, stride)
        self.bn1 = nn.BatchNorm2d(midplanes)
        self.relu1 = nn.ReLU(inplace=True)

        # Second conv maps midplanes -> planes at stride 1.
        self.conv2 = conv3x3(midplanes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu2 = nn.ReLU(inplace=True)
        self.stride = stride

        self.shortcut = self._build_shortcut(inplanes, planes, stride)

    @staticmethod
    def _build_shortcut(inplanes, planes, stride):
        """Create the shortcut module for the given input/output widths and stride."""
        if stride == 1 and inplanes == planes:
            return nn.Sequential()

        # Split the channel difference between the front and the back of dim 1.
        extra = planes - inplanes
        pad_front = extra // 2
        pad_back = extra - pad_front
        # Any stride other than 1 subsamples the spatial dims by a fixed step of 2.
        step = 2 if stride != 1 else 1
        return LambdaLayer(
            lambda x: F.pad(x[:, :, ::step, ::step],
                            (0, 0, 0, 0, pad_front, pad_back), "constant", 0))

    def forward(self, x):
        """Apply conv-bn-relu, conv-bn, add the shortcut of ``x``, then the final ReLU."""
        out = self.relu1(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        return self.relu2(out)


class ResNet(nn.Module):
    """CIFAR-style ResNet (stem conv + three stages) with per-layer channel pruning.

    Args:
        block: block class instantiated as
            ``block(midplanes, inplanes, planes[, stride])``.
        num_layers: network depth; must satisfy ``(num_layers - 2) % 6 == 0``.
        compress_rate: pruning ratios forwarded to ``adapt_channel``.
        num_classes: number of outputs of the final linear layer (default 100).

    The classifier is stored as ``self.fc`` when ``num_layers == 56`` and as
    ``self.linear`` for every other depth.
    """

    def __init__(self, block, num_layers, compress_rate, num_classes=100):
        super(ResNet, self).__init__()
        assert (num_layers - 2) % 6 == 0, 'depth should be 6n+2'
        blocks_per_stage = (num_layers - 2) // 6

        self.num_layer = num_layers
        self.overall_channel, self.mid_channel = adapt_channel(compress_rate, num_layers)

        # layer_num is a cursor into overall_channel / mid_channel; entry 0 is the stem.
        self.layer_num = 0
        stem_width = self.overall_channel[self.layer_num]
        self.conv1 = nn.Conv2d(3, stem_width, kernel_size=3, stride=1, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(stem_width)
        self.relu = nn.ReLU(inplace=True)
        self.layers = nn.ModuleList()
        self.layer_num += 1

        # Three stages; the second and third start with a stride-2 block.
        self.layer1 = self._make_layer(block, blocks_num=blocks_per_stage, stride=1)
        self.layer2 = self._make_layer(block, blocks_num=blocks_per_stage, stride=2)
        self.layer3 = self._make_layer(block, blocks_num=blocks_per_stage, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        classifier = nn.Linear(64 * BasicBlock.expansion, num_classes)
        if self.num_layer == 56:
            self.fc = classifier
        else:
            self.linear = classifier

    def _make_layer(self, block, blocks_num, stride):
        """Build one stage: a first block with ``stride`` plus ``blocks_num - 1`` default-stride blocks."""

        def next_block(*stride_arg):
            # Channel counts come from the current cursor position, which then advances.
            idx = self.layer_num
            built = block(self.mid_channel[idx - 1], self.overall_channel[idx - 1],
                          self.overall_channel[idx], *stride_arg)
            self.layer_num += 1
            return built

        stage = [next_block(stride)]
        for _ in range(1, blocks_num):
            stage.append(next_block())

        return nn.Sequential(*stage)

    def forward(self, x):
        """Stem -> every block of the three stages in order -> average pool -> linear classifier."""
        x = self.relu(self.bn1(self.conv1(x)))

        for stage in (self.layer1, self.layer2, self.layer3):
            for blk in stage:
                x = blk(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)

        head = self.fc if self.num_layer == 56 else self.linear
        return head(x)


def resnet_56(compress_rate):
    """Build a 56-layer ``ResNet`` of ``BasicBlock``s with the given pruning ratios."""
    depth = 56
    return ResNet(BasicBlock, depth, compress_rate=compress_rate)

def resnet_110(compress_rate):
    """Build a 110-layer ``ResNet`` of ``BasicBlock``s with the given pruning ratios."""
    depth = 110
    return ResNet(BasicBlock, depth, compress_rate=compress_rate)

prepare resNet_50 model...


In [20]:
# Set the CUDA_VISIBLE_DEVICES environment variable to "0" for this process.
# NOTE(review): presumably this must happen before CUDA is first initialised to
# have any effect — confirm if this cell is ever moved.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Enable cuDNN and its benchmark mode.
cudnn.benchmark = True
cudnn.enabled=True
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
# Hyperparameter / configuration cell (the banner below is runtime output text).
print("超参数")
# Number of classes; passed to CrossEntropyLabelSmooth in a later cell.
CLASSES = 10
# Number of iterations of the main train/test loop.
epochs = 400
# Batch size of the training DataLoader (the test loader uses a fixed 100).
batch_size=  256

# NOTE: learning_rate, momentum and weight_decay are assigned again in the cell
# that builds the optimizer, so the optimizer uses those later values.
learning_rate = 0.01
momentum = 0.9
# 0.006  (NOTE(review): looks like a previously tried value — confirm which setting it refers to)
weight_decay = 0.005
# Comma-separated epoch milestones; later split on ',' and converted to ints
# for MultiStepLR.
lr_decay_step = '150,225'

# Best top-1 test accuracy seen so far; updated by test() through `global`.
best_acc = 0

# Path prefix under which test() writes its 'best_<epoch>.t7' checkpoints.
save_dir = "./data/model/Hrank_preTrain/cifar-10/"

超参数


In [22]:
# Build an unpruned MobileNetV2 (every compress rate is 0) and move it to `device`.
# The class count comes from CLASSES, the same constant the label-smoothing
# loss below uses, instead of a second hard-coded 10.
net = mobilenet_v2(compress_rate=[0.]*100, n_class=CLASSES)
net = net.to(device)
print(net)
class CrossEntropyLabelSmooth(nn.Module):
    """Cross-entropy loss with label smoothing.

    Args:
        num_classes: number of classes; the smoothing mass ``epsilon`` is
            spread uniformly over this many classes.
        epsilon: smoothing factor applied to the one-hot targets.
    """

    def __init__(self, num_classes, epsilon):
        super(CrossEntropyLabelSmooth, self).__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        """Return the smoothed cross-entropy, averaged over dim 0 and summed over classes.

        ``targets`` holds integer class indices; it is expanded to one-hot rows
        along dim 1 with ``scatter_`` before smoothing.
        """
        log_probs = self.logsoftmax(inputs)
        one_hot = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1)
        smoothed = (1 - self.epsilon) * one_hot + self.epsilon / self.num_classes
        per_class = (-smoothed * log_probs).mean(0)
        return per_class.sum()

# Optimisation settings actually used for training: these assignments replace
# the learning_rate / momentum / weight_decay values from the hyperparameter cell.
learning_rate = 0.1
momentum = 0.9
weight_decay = 1e-4

# Place the loss modules on `device` (which falls back to the CPU when CUDA is
# unavailable) instead of calling .cuda() unconditionally.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)
criterion_smooth = CrossEntropyLabelSmooth(CLASSES, 0.1)
criterion_smooth = criterion_smooth.to(device)

# lr_decay_step starts as a comma-separated string. Parse it only while it is
# still a string so that re-running this cell does not fail on the list that
# an earlier run left behind.
if isinstance(lr_decay_step, str):
    lr_decay_step = [int(step) for step in lr_decay_step.split(',')]

optimizer = torch.optim.SGD(net.parameters(), learning_rate, momentum=momentum, weight_decay=weight_decay)
# NOTE(review): `lr_scheduler` is created but never stepped in the visible
# notebook; only `scheduler` is handed to train(). Kept so the name stays defined.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.1)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=lr_decay_step, gamma=0.1)

MobileNetV2(
  (features): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
        (3): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (4): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
       

In [23]:
# input_image_size=32
# input_image = torch.randn(1, 3, input_image_size, input_image_size).cuda()
# flops, params = profile(net, inputs=(input_image,))
# print('Params: %.2f' % (params))
# print('Flops: %.2f' % (flops))

In [24]:
print('load training data')

# Normalisation statistics and dataset location shared by both splits.
_CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR_STD = (0.2023, 0.1994, 0.2010)
_CIFAR_ROOT = './data/cifar-10-batches-py/'

# Training pipeline: random 32x32 crop after 4-pixel padding, random horizontal
# flip, tensor conversion, per-channel normalisation.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD),
])
# Test pipeline: tensor conversion and the same normalisation, no augmentation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD),
])

trainset = torchvision.datasets.CIFAR10(
    root=_CIFAR_ROOT, train=True, download=True, transform=transform_train)
testset = torchvision.datasets.CIFAR10(
    root=_CIFAR_ROOT, train=False, download=True, transform=transform_test)

# Training batches are shuffled and sized by `batch_size`; test batches are
# fixed at 100 samples in dataset order. Neither loader drops a final partial batch.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, drop_last=False)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, drop_last=False)

load training data
Files already downloaded and verified
Files already downloaded and verified


In [25]:
class AverageMeter(object):
    """Track the latest value and the running weighted average of a metric.

    Args:
        name: label used when the meter is printed.
        fmt: format spec (including the leading ':') applied to both the
            latest value and the average in ``__str__``.
    """

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Zero the latest value, average, weighted sum and sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted as ``n`` samples."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def __str__(self):
        # Builds e.g. '{name} {val:.2f} ({avg:.2f})' and fills it from the instance attributes.
        template = ''.join(['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
        return template.format(**vars(self))
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy, in percent, for each k in ``topk``.

    Args:
        output: score tensor; the top-k is taken along dim 1.
        target: tensor of class indices, one per row of ``output``.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one 1-element float tensor per k, holding the percentage of
        rows whose target index is among the k highest-scoring entries.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # Indices of the maxk largest scores per row, transposed so that row r
        # holds every sample's rank-r prediction.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape() rather than view(): `correct` comes from a transposed
            # tensor and may be non-contiguous, in which case view(-1) raises a
            # RuntimeError for k > 1 on recent PyTorch versions.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [26]:
def train(epoch,i,net,optimizer,scheduler):
    """Train ``net`` for one epoch over the global ``trainloader``.

    Uses the notebook-level ``trainloader``, ``device`` and ``criterion``.
    Running loss and top-1 / top-5 accuracy are printed every 100 batches
    (except batch 0), and ``scheduler`` is stepped once at the end of the epoch.

    Args:
        epoch: epoch index, used only in the progress message.
        i: unused; kept so existing calls keep working.
        net: model to train (switched to training mode here).
        optimizer: optimizer stepped after every batch.
        scheduler: learning-rate scheduler stepped once per call.
    """
    net.train()

    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')

    num_iter = len(trainloader)
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = net(inputs)
        loss = criterion(outputs, targets)

        # The meters are the only per-epoch statistics; the previous hand-kept
        # loss/correct/total counters were never read and cost extra .item()
        # calls on every batch.
        prec1, prec5 = accuracy(outputs, targets, topk=(1, 5))
        n = inputs.size(0)
        losses.update(loss.item(), n)  # accumulated loss
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0 and batch_idx != 0:
            print(
                'Epoch[{0}]({1}/{2}): '
                'Loss {loss.avg:.4f} '
                'Prec@1(1,5) {top1.avg:.2f}, {top5.avg:.2f}'.format(
                    epoch, batch_idx, num_iter, loss=losses,
                    top1=top1, top5=top5))
    scheduler.step()
    
def test(epoch,net,i):
    """Evaluate ``net`` on the global ``testloader`` and checkpoint on improvement.

    Uses the notebook-level ``testloader``, ``device``, ``criterion``,
    ``save_dir`` and ``best_acc``. When the top-1 accuracy beats ``best_acc``,
    the global is updated and the model's state dict is saved to
    ``save_dir + 'best_<epoch>.t7'``. The top-1 / top-5 accuracy is printed.

    Args:
        epoch: epoch index, used in the checkpoint file name.
        net: model to evaluate (switched to eval mode here).
        i: unused; kept so existing calls keep working.
    """
    global best_acc
    net.eval()

    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            pred1, pred5 = accuracy(outputs, targets, topk=(1, 5))
            n = inputs.size(0)
            losses.update(loss.item(), n)
            # Store plain floats (as train() does) so the averages, best_acc and
            # the checkpointed 'acc' are Python numbers rather than tensors.
            top1.update(pred1.item(), n)
            top5.update(pred5.item(), n)
    valid_top1_acc = top1.avg

    if valid_top1_acc > best_acc:
        best_acc = valid_top1_acc
        state = {
            'net': net.state_dict(),
            'acc': best_acc
        }
        # torch.save does not create missing directories.
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        torch.save(state, save_dir+'best_%d.t7' % (epoch))
    print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
                    .format(top1=top1, top5=top5))


In [27]:
# Main loop: one training pass followed by one evaluation pass per epoch.
# The literal 1 fills the `i` parameter of train()/test(), which neither
# function reads.
for epoch in range(epochs):
    train(epoch,1,net,optimizer,scheduler)
    test(epoch,net,1)

Epoch[0](100/196): Loss 2.1693 Prec@1(1,5) 25.47, 76.69
 * Acc@1 43.870 Acc@5 92.100
Epoch[1](100/196): Loss 1.5062 Prec@1(1,5) 44.82, 91.33
 * Acc@1 51.110 Acc@5 93.590
Epoch[2](100/196): Loss 1.3318 Prec@1(1,5) 51.56, 93.88
 * Acc@1 57.140 Acc@5 95.490
Epoch[3](100/196): Loss 1.1759 Prec@1(1,5) 58.04, 95.29
 * Acc@1 60.040 Acc@5 95.880
Epoch[4](100/196): Loss 1.0780 Prec@1(1,5) 61.47, 96.16
 * Acc@1 64.000 Acc@5 96.470
Epoch[5](100/196): Loss 0.9980 Prec@1(1,5) 64.99, 96.79
 * Acc@1 64.780 Acc@5 96.570
Epoch[6](100/196): Loss 0.9254 Prec@1(1,5) 67.48, 97.25
 * Acc@1 67.370 Acc@5 97.250
Epoch[7](100/196): Loss 0.8570 Prec@1(1,5) 69.87, 97.77
 * Acc@1 71.720 Acc@5 97.650
Epoch[8](100/196): Loss 0.8003 Prec@1(1,5) 72.03, 97.94
 * Acc@1 72.580 Acc@5 97.960
Epoch[9](100/196): Loss 0.7440 Prec@1(1,5) 73.95, 98.38
 * Acc@1 73.880 Acc@5 98.370
Epoch[10](100/196): Loss 0.7007 Prec@1(1,5) 75.58, 98.41
 * Acc@1 75.000 Acc@5 98.200
Epoch[11](100/196): Loss 0.6534 Prec@1(1,5) 77.16, 98.66
 * Acc@

 * Acc@1 85.790 Acc@5 99.370
Epoch[96](100/196): Loss 0.2310 Prec@1(1,5) 91.85, 99.89
 * Acc@1 86.260 Acc@5 99.320
Epoch[97](100/196): Loss 0.2211 Prec@1(1,5) 92.33, 99.89
 * Acc@1 86.470 Acc@5 99.480
Epoch[98](100/196): Loss 0.2281 Prec@1(1,5) 91.99, 99.91
 * Acc@1 87.940 Acc@5 99.470
Epoch[99](100/196): Loss 0.2330 Prec@1(1,5) 91.87, 99.90
 * Acc@1 87.940 Acc@5 99.510
Epoch[100](100/196): Loss 0.2329 Prec@1(1,5) 91.71, 99.90
 * Acc@1 86.590 Acc@5 99.530
Epoch[101](100/196): Loss 0.2166 Prec@1(1,5) 92.30, 99.91
 * Acc@1 87.330 Acc@5 99.470
Epoch[102](100/196): Loss 0.2155 Prec@1(1,5) 92.44, 99.89
 * Acc@1 87.360 Acc@5 99.390
Epoch[103](100/196): Loss 0.2255 Prec@1(1,5) 92.07, 99.87
 * Acc@1 87.370 Acc@5 99.450
Epoch[104](100/196): Loss 0.2183 Prec@1(1,5) 92.35, 99.87
 * Acc@1 86.030 Acc@5 99.410
Epoch[105](100/196): Loss 0.2208 Prec@1(1,5) 92.23, 99.90
 * Acc@1 87.580 Acc@5 99.570
Epoch[106](100/196): Loss 0.2174 Prec@1(1,5) 92.40, 99.91
 * Acc@1 86.540 Acc@5 99.480
Epoch[107](100/196

 * Acc@1 91.530 Acc@5 99.680
Epoch[190](100/196): Loss 0.0182 Prec@1(1,5) 99.38, 100.00
 * Acc@1 91.500 Acc@5 99.660
Epoch[191](100/196): Loss 0.0172 Prec@1(1,5) 99.44, 100.00
 * Acc@1 91.350 Acc@5 99.750
Epoch[192](100/196): Loss 0.0168 Prec@1(1,5) 99.43, 100.00
 * Acc@1 91.340 Acc@5 99.680
Epoch[193](100/196): Loss 0.0167 Prec@1(1,5) 99.45, 100.00
 * Acc@1 91.350 Acc@5 99.580
Epoch[194](100/196): Loss 0.0165 Prec@1(1,5) 99.44, 100.00
 * Acc@1 91.390 Acc@5 99.650
Epoch[195](100/196): Loss 0.0169 Prec@1(1,5) 99.45, 100.00
 * Acc@1 91.260 Acc@5 99.660
Epoch[196](100/196): Loss 0.0191 Prec@1(1,5) 99.39, 100.00
 * Acc@1 91.460 Acc@5 99.660
Epoch[197](100/196): Loss 0.0166 Prec@1(1,5) 99.44, 100.00
 * Acc@1 91.320 Acc@5 99.620
Epoch[198](100/196): Loss 0.0180 Prec@1(1,5) 99.36, 100.00
 * Acc@1 91.370 Acc@5 99.640
Epoch[199](100/196): Loss 0.0151 Prec@1(1,5) 99.52, 100.00
 * Acc@1 91.520 Acc@5 99.690
Epoch[200](100/196): Loss 0.0143 Prec@1(1,5) 99.56, 100.00
 * Acc@1 91.330 Acc@5 99.660
Epo

Epoch[283](100/196): Loss 0.0042 Prec@1(1,5) 99.88, 100.00
 * Acc@1 91.610 Acc@5 99.620
Epoch[284](100/196): Loss 0.0047 Prec@1(1,5) 99.85, 100.00
 * Acc@1 91.660 Acc@5 99.590
Epoch[285](100/196): Loss 0.0054 Prec@1(1,5) 99.82, 100.00
 * Acc@1 91.530 Acc@5 99.640
Epoch[286](100/196): Loss 0.0040 Prec@1(1,5) 99.88, 100.00
 * Acc@1 91.660 Acc@5 99.620
Epoch[287](100/196): Loss 0.0040 Prec@1(1,5) 99.90, 100.00
 * Acc@1 91.660 Acc@5 99.590
Epoch[288](100/196): Loss 0.0051 Prec@1(1,5) 99.83, 100.00
 * Acc@1 91.730 Acc@5 99.570
Epoch[289](100/196): Loss 0.0045 Prec@1(1,5) 99.88, 100.00
 * Acc@1 91.740 Acc@5 99.590
Epoch[290](100/196): Loss 0.0039 Prec@1(1,5) 99.88, 100.00
 * Acc@1 91.810 Acc@5 99.590
Epoch[291](100/196): Loss 0.0040 Prec@1(1,5) 99.90, 100.00
 * Acc@1 91.690 Acc@5 99.590
Epoch[292](100/196): Loss 0.0047 Prec@1(1,5) 99.86, 100.00
 * Acc@1 91.750 Acc@5 99.600
Epoch[293](100/196): Loss 0.0045 Prec@1(1,5) 99.88, 100.00
 * Acc@1 91.790 Acc@5 99.570
Epoch[294](100/196): Loss 0.0048

KeyboardInterrupt: 