In [None]:
# ---------------------------------------------------------------------------- #
# An implementation of https://arxiv.org/pdf/1512.03385.pdf                    #
# See section 4.2 for the model architecture on CIFAR-10                       #
# Some part of the code was referenced from below                              #
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py   #
# ---------------------------------------------------------------------------- #

# [Residual Network](https://arxiv.org/pdf/1512.03385.pdf)

## Problem of deeper layer
### 1. Vanishing / Exploding Gradient
When you update a parameter in CNN, the gradient value is saturated with a too large value or a small value, and it does not move anymore, so the learning effect disappears or the learning speed becomes very slow. 

As the network becomes deeper, this problem becomes more and more serious.

In order to avoid this problem, techniques such as batch normalization and parameter initialization are applied.

However, if the number of layers exceeds a certain number, it still becomes a headache .

### 2. Diffcult to learning 
If the network is deepened, the number of parameters will increase proportionally, and even though it is not a problem of overfitting, a situation occurs in which the error becomes rather large.

## Residual block
In CNN, learning is performed to obtain H(x), but in resnet, learning is performed to obtain H(x)-x (F(x)=H(x)+x). </p> In the optimal case, F(x) must be zero, so the direction to learn is predetermined and this is the pre-conditioning role. 

If learning is performed in a direction in which F(x) becomes almost zero, a small fluctuation of the input can be easily detected. 

In this sense, F(x) is called residual learning in terms of learning a small motion, the residual.

Also, since the same x as the input is connected to the output as it is, there is no effect on the number of parameters, and there is no increase in the computation through the shortcut connection except for the addition. 

Since input and output are connected by skipping several layers, forward and backward paths can be simplified.


As a result, the effect that can be obtained through the connection of the identity shortcut is as follows :
1. Deep layers can be easily optimized.

2. Increased depth can improve accuracy.
![Residual block](https://cdn-images-1.medium.com/max/1200/1*ByrVJspW-TefwlH7OLxNkg.png)


## VGG,Plain,Residual Architecture
![Residual Network](https://cdn-images-1.medium.com/max/1200/1*2ns4ota94je5gSVjrpFq3A.png)

## Deeper Bottleneck Architecture
Considering the training time, the basic structure is slightly modified for 50- / 101- / 152-layer, and the residual function is composed of 1x1, 3x3, 1x1. 

The reason named Bottleneck structure is because it looks like a bottleneck in reducing dimensions and increasing dimensions from the back.

The reason for this configuration is to reduce computation time.

The first 1x1 convolution is intended to reduce the dimension as shown in the Inception structure of NIN (Network-in-Network) or GoogLeNet. 

After reducing the dimension and performing the 3x3 convolution, the final 1x1 convolution enlarges the dimension again. 

It plays a role. As a result, it is possible to reduce the computational complexity as compared with the structure in which two 3x3 convolutions are directly connected.

![bottleneck](https://camo.qiitausercontent.com/1bb6b91505493e7e08fc0c9430f2ce2978e3bb70/68747470733a2f2f71696974612d696d6167652d73746f72652e73332e616d617a6f6e6177732e636f6d2f302f3130303532332f37346438383965662d386462642d363365632d336661302d3631396533663232366537312e706e67)

## Reference
https://laonple.blog.me/220761052425 </BR>
https://laonple.blog.me/220764986252 </BR>

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [2]:
# device configuration
device = torch.device('cuda:1')
torch.cuda.set_device(1)

In [3]:
# Hyper-paramters
num_epochs = 80
learning_rate = 0.001

In [4]:
# Image preprocessing modules
transform = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor()
])

### transforms.Pad()
Pad the given PIL Image on all sides with the given “pad” value. 

### transforms.RandomHorizonFlip()
Horizontally flip the given PIL Image randomly with a given probability.(default p = 0.5)

### transforms.RandomCrop()
Crop the given PIL Image at a random location.

In [5]:
# CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='../../data/',
                                             train=True,
                                             transform=transform,
                                             download=True)
test_dataset = torchvision.datasets.CIFAR10(root='../../data/',
                                            train=False,
                                            transform=transforms.ToTensor())

Files already downloaded and verified


In [6]:
# Dataloader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=100,
                                           shuffle=True) 
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=100,
                                          shuffle=False)

In [7]:
# 3x3 convolution
def conv3x3layer(in_channels,out_channels,stride=1) :
    conv_layer = nn.Conv2d(in_channels,out_channels,kernel_size=3,
                        stride=stride,padding=1,bias=False)
    return conv_layer

In [8]:
# Residual block
class ResidualBlock(nn.Module) :
    def __init__(self,in_channels,out_channels,stride=1,downsample=None) :
        super(ResidualBlock, self).__init__()
        self.conv1 = conv3x3layer(in_channels,out_channels,stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3layer(out_channels,out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample
    def forward(self,x) :
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample :
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out 

## CIFAR-10 14-layer ResNet model
![cifar10_resnet](../../../image/image_54.png)

## CIFAR-10,100 ResNet result
![cifar result](../../../image/image_55.png)

## Reference
https://laonple.blog.me/220770760226

In [14]:
# Resnet
class ResNet(nn.Module) :
    def __init__(self,block,layers,num_classes=10) :
        super(ResNet,self).__init__()
        self.in_channels = 16
        self.conv = conv3x3layer(3,16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block,16,layers[0])
        self.layer2 = self.make_layer(block,32,layers[1],2)
        self.layer3 = self.make_layer(block,64,layers[2],2)
        self.avg_pool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64,num_classes)
    def make_layer(self,block,out_channels,blocks,stride=1) :
        downsample = None
        if (stride != 1) or (self.in_channels != out_channels) :
            downsample = nn.Sequential(
                conv3x3layer(self.in_channels,out_channels,stride=stride),
                nn.BatchNorm2d(out_channels))
        layers = []
        layers.append(block(self.in_channels,out_channels,stride,downsample))
        self.in_channels = out_channels
        for i in range(1,blocks) :
            layers.append(block(out_channels,out_channels))
        return nn.Sequential(*layers)
    def forward(self,x) :
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = out.view(out.size(0),-1)
        out = self.fc(out)
        return out
    
model = ResNet(ResidualBlock,[2,2,2]).to(device)

In [15]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = learning_rate)

In [16]:
# For updating learning rate
def update_lr(optimizer,lr) :
    for param_group in optimizer.param_groups :
        param_group['lr'] = lr

In [17]:
# Train the model
total_step = len(train_loader)
curr_lr = learning_rate
for epoch in range(num_epochs) :
    for i, (images,labels) in enumerate(train_loader) :
        images = images.to(device)
        labels = labels.to(device)
        
        #Forward pass
        outputs = model(images)
        loss = criterion(outputs,labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
        
        # Decay learning rate
        if (epoch+1) % 20 == 0:
            curr_lr /= 3
            update_lr(optimizer,curr_lr)

Epoch [1/80], Step [100/500] Loss: 1.7200
Epoch [1/80], Step [200/500] Loss: 1.4032
Epoch [1/80], Step [300/500] Loss: 1.3187
Epoch [1/80], Step [400/500] Loss: 1.3043
Epoch [1/80], Step [500/500] Loss: 1.1406
Epoch [2/80], Step [100/500] Loss: 1.1065
Epoch [2/80], Step [200/500] Loss: 1.0044
Epoch [2/80], Step [300/500] Loss: 0.9449
Epoch [2/80], Step [400/500] Loss: 1.0289
Epoch [2/80], Step [500/500] Loss: 0.9420
Epoch [3/80], Step [100/500] Loss: 1.0902
Epoch [3/80], Step [200/500] Loss: 1.0721
Epoch [3/80], Step [300/500] Loss: 0.8130
Epoch [3/80], Step [400/500] Loss: 0.9076
Epoch [3/80], Step [500/500] Loss: 0.9649
Epoch [4/80], Step [100/500] Loss: 0.7986
Epoch [4/80], Step [200/500] Loss: 0.7498
Epoch [4/80], Step [300/500] Loss: 0.9079
Epoch [4/80], Step [400/500] Loss: 0.8767
Epoch [4/80], Step [500/500] Loss: 0.7828
Epoch [5/80], Step [100/500] Loss: 0.8237
Epoch [5/80], Step [200/500] Loss: 0.7378
Epoch [5/80], Step [300/500] Loss: 0.8489
Epoch [5/80], Step [400/500] Loss:

In [18]:
# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))

# Save the model checkpoint
torch.save(model.state_dict(), 'resnet.ckpt')

Accuracy of the model on the test images: 83.82 %
