In [0]:

import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18():
    return ResNet(BasicBlock, [2,2,2,2])

def ResNet34():
    return ResNet(BasicBlock, [3,4,6,3])

def ResNet50():
    return ResNet(Bottleneck, [3,4,6,3])

def ResNet101():
    return ResNet(Bottleneck, [3,4,23,3])

def ResNet152():
    return ResNet(Bottleneck, [3,8,36,3])


def test():
    net = ResNet18()
    y = net(torch.randn(1,3,32,32))
    print(y.size())



In [44]:
'''Train CIFAR10 with PyTorch.'''
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms

from torch.optim.lr_scheduler import CyclicLR, ReduceLROnPlateau

import os
import argparse

lr = 0.1

device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# Data
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

==> Preparing data..
Files already downloaded and verified
Files already downloaded and verified


In [45]:


classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Model
print('==> Building model..')
net = ResNet18()
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True


==> Building model..


#### Using "ReduceLROnPlateau" as LR scheduler

We are using simple LR scheduler Reduce on Platue, which reduces LR which helps in exploring local deeper bottoms in the visinity. We can see that It takes long time to reach reasonable accuracy of 90%, compared to Cyclical LR.

In subsiquent training changes in this notebook itself, we use CyclicalLR and CLR with long step size (number of iterations per step), we can get better accuracy and in very less number of iterations.

In [48]:
import time

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
scheduler = ReduceLROnPlateau(optimizer, 'min')


pytorch_total_params = sum(p.numel() for p in net.parameters())
pytorch_total_params_trainable = sum(p.numel() for p in net.parameters() if p.requires_grad)
print("pytorch_total_params",pytorch_total_params)
print("pytorch_total_params_trainable",pytorch_total_params_trainable)


# Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    start_time = time.time()
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    print("total train iters ", len(trainloader), '| time: %.3f sec Loss: %.3f | Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), train_loss/(batch_idx+1), 100.*correct/total, correct, total))

start_time = time.time()
def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        scheduler.step(test_loss) 
        print("total test iters ", len(testloader), '| time: %.3f sec Loss: %.3f | Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), test_loss/(batch_idx+1), 100.*correct/total, correct, total))

    

for epoch in range(start_epoch, start_epoch+100):
    train(epoch)
    test(epoch)    

pytorch_total_params 11173962
pytorch_total_params_trainable 11173962

Epoch: 0
total train iters  391 | time: 40.596 sec Loss: 1.601 | Acc: 40.410% (20205/50000)
total test iters  100 | time: 43.436 sec Loss: 1.485 | Acc: 47.780% (4778/10000)

Epoch: 1
total train iters  391 | time: 41.917 sec Loss: 1.093 | Acc: 60.540% (30270/50000)
total test iters  100 | time: 88.056 sec Loss: 1.327 | Acc: 58.300% (5830/10000)

Epoch: 2
total train iters  391 | time: 41.199 sec Loss: 0.849 | Acc: 70.124% (35062/50000)
total test iters  100 | time: 132.054 sec Loss: 1.092 | Acc: 62.800% (6280/10000)

Epoch: 3
total train iters  391 | time: 41.415 sec Loss: 0.701 | Acc: 75.816% (37908/50000)
total test iters  100 | time: 176.223 sec Loss: 0.808 | Acc: 72.640% (7264/10000)

Epoch: 4
total train iters  391 | time: 41.258 sec Loss: 0.617 | Acc: 78.842% (39421/50000)
total test iters  100 | time: 220.250 sec Loss: 0.665 | Acc: 77.330% (7733/10000)

Epoch: 5
total train iters  391 | time: 41.320 sec Loss:

#### Cyclincal learning rate with 2000 iteration per step
Cyclincal learnging rate in a super convergence plicy. Using 2000 iterations per step, we are getting 
- **90% accuracy on validation data at 10th epoch**.
- **validation accuracy of  93.830%** at **51st Epoch**.
- And a maximum **validation accuracy of  94.410%** at **82nd Epoch**.

We also ensure that **cyclic momentum** also works simultaniously, as LR increases and decreases, mometum decreases and increases. In pytorch this option is true by default.

In [41]:
import time

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
scheduler = CyclicLR(optimizer, base_lr=0.0001, max_lr=0.1,step_size_up=2000, step_size_down=None, mode='triangular')


pytorch_total_params = sum(p.numel() for p in net.parameters())
pytorch_total_params_trainable = sum(p.numel() for p in net.parameters() if p.requires_grad)
print("pytorch_total_params",pytorch_total_params)
print("pytorch_total_params_trainable",pytorch_total_params_trainable)


# Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    start_time = time.time()
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    print("total train iters ", len(trainloader), '| time: %.3f sec Loss: %.3f | Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), train_loss/(batch_idx+1), 100.*correct/total, correct, total))

start_time = time.time()
def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
        print("total test iters ", len(testloader), '| time: %.3f sec Loss: %.3f | Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), test_loss/(batch_idx+1), 100.*correct/total, correct, total))

    

for epoch in range(start_epoch, start_epoch+100):
    train(epoch)
    test(epoch)    

pytorch_total_params 11173962
pytorch_total_params_trainable 11173962

Epoch: 0
total train iters  391 | time: 39.969 sec Loss: 1.570 | Acc: 41.814% (20907/50000)
total test iters  100 | time: 42.688 sec Loss: 1.220 | Acc: 56.360% (5636/10000)

Epoch: 1
total train iters  391 | time: 40.647 sec Loss: 1.006 | Acc: 64.150% (32075/50000)
total test iters  100 | time: 86.092 sec Loss: 1.164 | Acc: 62.990% (6299/10000)

Epoch: 2
total train iters  391 | time: 40.981 sec Loss: 0.786 | Acc: 72.308% (36154/50000)
total test iters  100 | time: 129.834 sec Loss: 0.759 | Acc: 73.680% (7368/10000)

Epoch: 3
total train iters  391 | time: 41.118 sec Loss: 0.657 | Acc: 77.186% (38593/50000)
total test iters  100 | time: 173.695 sec Loss: 0.976 | Acc: 70.430% (7043/10000)

Epoch: 4
total train iters  391 | time: 41.187 sec Loss: 0.579 | Acc: 79.912% (39956/50000)
total test iters  100 | time: 217.609 sec Loss: 0.894 | Acc: 72.590% (7259/10000)

Epoch: 5
total train iters  391 | time: 41.175 sec Loss:

#### Cyclincal learning rate with 10000 iteration per step
Cyclincal learnging rate in a super convergence plicy. Using 10000 iterations per step, we are getting **Val Acc: 94.360% accuracy on validation data at 52nd epoch**. Here we crossed 94% human benchmark at quite ealry compared to CLR with step size of 2000, at 81st epoch.
But it took 42 epochs to reach an val accuracy of 90%, which was attained in 9th epoch with small step size cycles. 



In [43]:
import time

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
scheduler = CyclicLR(optimizer, base_lr=0.0001, max_lr=0.1,step_size_up=10000, step_size_down=10000, mode='triangular')


pytorch_total_params = sum(p.numel() for p in net.parameters())
pytorch_total_params_trainable = sum(p.numel() for p in net.parameters() if p.requires_grad)
print("pytorch_total_params",pytorch_total_params)
print("pytorch_total_params_trainable",pytorch_total_params_trainable)


# Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    start_time = time.time()
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    print("total train iters ", len(trainloader), '| time: %.3f sec Loss: %.3f | Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), train_loss/(batch_idx+1), 100.*correct/total, correct, total))

start_time = time.time()
def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
        print("total test iters ", len(testloader), '| time: %.3f sec Loss: %.3f | Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), test_loss/(batch_idx+1), 100.*correct/total, correct, total))

    

for epoch in range(start_epoch, start_epoch+100):
    train(epoch)
    test(epoch)    

pytorch_total_params 11173962
pytorch_total_params_trainable 11173962

Epoch: 0
total train iters  391 | time: 41.249 sec Loss: 1.743 | Acc: 34.648% (17324/50000)
total test iters  100 | time: 44.038 sec Loss: 1.333 | Acc: 51.410% (5141/10000)

Epoch: 1
total train iters  391 | time: 41.677 sec Loss: 1.149 | Acc: 58.546% (29273/50000)
total test iters  100 | time: 88.454 sec Loss: 1.124 | Acc: 61.090% (6109/10000)

Epoch: 2
total train iters  391 | time: 41.393 sec Loss: 0.862 | Acc: 69.476% (34738/50000)
total test iters  100 | time: 132.599 sec Loss: 0.903 | Acc: 70.450% (7045/10000)

Epoch: 3
total train iters  391 | time: 41.542 sec Loss: 0.702 | Acc: 75.276% (37638/50000)
total test iters  100 | time: 176.914 sec Loss: 0.763 | Acc: 74.350% (7435/10000)

Epoch: 4
total train iters  391 | time: 41.406 sec Loss: 0.612 | Acc: 78.758% (39379/50000)
total test iters  100 | time: 221.112 sec Loss: 0.719 | Acc: 76.390% (7639/10000)

Epoch: 5
total train iters  391 | time: 41.573 sec Loss: