- Copyright 2021. Dongwon Kim All rights reserved.
- File name : DenseNet_CIFAR10.ipynb
- Written by Dongwon Kim
- DenseNet
    - build an advanced CNN model to classify CIFAR-10
- Modification history
    - written by Dongwon Kim on Oct 10, 2021

In [None]:
import numpy as np
import torchvision.datasets as datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
import torch
from google.colab import files
from torchsummary import summary
import math

# Choosing Model to use
- first tried with VGG16, but accuracy was not enough
- try DenseNet instead, referring to the [rank](https://paperswithcode.com/sota/image-classification-on-cifar-10)


# Parameters
- batch size: 128
- epoch: 200
- learning rate: 0.1
- dropout: 0.2
- growth rate: 12
- reduction: 0.5

# Prepare dataset


## Download Data
1. load CIFAR10 dataset from torchvision.datasets
2. convert the dataset to tensors and normalize with mean and std  
[reference](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html)
    - image: 3 x 32 x32
    - total 10 classes
3. download the train and test datasets

In [None]:
# Per-channel mean / std applied by Normalize after the image becomes a tensor.
_CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# The same preprocessing pipeline is shared by the train and test datasets.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD)]
)

# Both splits are stored under the current directory; `train` selects the split.
train_dataset = datasets.CIFAR10(root='./', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./', train=False, transform=transform, download=True)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Sanity check: shape of the raw training image array and the number of labels.
print(train_dataset.data.shape, len(train_dataset.targets))

(50000, 32, 32, 3) 50000


In [None]:
# Sanity check: shape of the raw test image array and the number of labels.
print(test_dataset.data.shape, len(test_dataset.targets))

(10000, 32, 32, 3) 10000


## Shuffle and Split train, validation data
1. by using train_test_split, shuffle the dataset and split data
    - validation size = 10% of train data
    - split data to have similar ratio of targets using stratify option  
2. set samplers for each dataset using SubsetRandomSampler 

In [None]:
# Shuffle the training-set indices and hold out 10% of them for validation,
# stratified on the labels so both parts keep a similar class ratio.
tr_index, val_index = train_test_split(
    list(range(len(train_dataset))),
    test_size=0.1,
    shuffle=True,
    stratify=train_dataset.targets,
)

In [None]:
# One sampler per index subset, so both loaders can draw from train_dataset.
tr_sampler, val_sampler = (
    SubsetRandomSampler(indices) for indices in (tr_index, val_index)
)

## Set DataLoader
1. using the samplers, set train, val loader
2. since the test dataset is neither shuffled nor split, no sampler is needed

In [None]:
batch_size = 128

# Train and validation batches both come from train_dataset; each sampler
# restricts its loader to its own subset of indices.
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=tr_sampler, num_workers=0)
val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler, num_workers=0)

# No sampler for the test set: it is read in dataset order.
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=0)

# Build Model

## Check GPU

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Bottleneck
- use bottleneck block instead of normal dense block
- to reduce feature map insert 1x1 conv layer before 3x3 conv layer
- also increase computational efficiency
- reference
    - [Densely Connected Convolutional Networks](https://arxiv.org/pdf/1608.06993v5.pdf) paper

> BatchNorm(BN) → relu → 1x1 Conv → BN → relu → 3x3 Conv

- following the paper, the 1x1 Conv produces 4 * growth rate feature maps
- add Dropout layer to prevent overfitting


In [None]:
class BottleneckBlock(nn.Module):
    """DenseNet bottleneck layer: BN -> ReLU -> 1x1 conv -> BN -> ReLU -> 3x3 conv.

    The 1x1 convolution produces ``4 * growth_rate`` channels and the 3x3
    convolution produces ``growth_rate`` channels. Dropout is applied after
    each convolution. The new channels are concatenated in front of the
    input along dimension 1, so the output has ``in_plane + growth_rate``
    channels and the same spatial size as the input.

    Args:
        in_plane: number of input channels.
        growth_rate: number of channels this layer adds.
        droprate: dropout probability; dropout is only active in training mode.
    """

    def __init__(self, in_plane, growth_rate, droprate):
        super().__init__()
        inter_plane = 4 * growth_rate  # width of the 1x1 bottleneck
        self.bn1 = nn.BatchNorm2d(in_plane)
        self.conv1 = nn.Conv2d(in_plane, inter_plane, kernel_size=1, stride=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inter_plane)
        self.conv2 = nn.Conv2d(inter_plane, growth_rate, kernel_size=3, padding=1, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.droprate = droprate

    def _dropout(self, features):
        """Apply (non in-place) dropout with this block's rate."""
        return F.dropout(features, p=self.droprate, training=self.training, inplace=False)

    def forward(self, x):
        # 1x1 bottleneck stage
        squeezed = self._dropout(self.conv1(self.relu(self.bn1(x))))
        # 3x3 stage producing the growth_rate new feature maps
        grown = self._dropout(self.conv2(self.relu(self.bn2(squeezed))))
        # dense connectivity: new features first, then the untouched input
        return torch.cat([grown, x], 1)

## TransitionBlock
- in DenseNet, use transition block for **compression**
- therefore, use 2 x 2 average pooling

> BN → relu → 1 x 1 Conv → 2 x 2 Avg Pooling

- add Dropout layer 

In [None]:
class TransitionBlock(nn.Module):
    """DenseNet transition layer: BN -> ReLU -> 1x1 conv -> dropout -> 2x2 avg pool.

    The 1x1 convolution maps ``in_plane`` channels to ``out_plane`` channels
    and the 2x2 average pooling halves the height and width.

    Args:
        in_plane: number of input channels.
        out_plane: number of output channels.
        droprate: dropout probability; dropout is only active in training mode.
    """

    def __init__(self, in_plane, out_plane, droprate):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(in_plane)
        self.relu = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_plane, out_plane, kernel_size=1, stride=1, bias=False)
        self.droprate = droprate

    def forward(self, x):
        activated = self.relu(self.bn1(x))
        compressed = F.dropout(
            self.conv1(activated),
            p=self.droprate,
            training=self.training,
            inplace=False,
        )
        # spatial down-sampling by a factor of two
        return F.avg_pool2d(compressed, 2)

## DenseNet
- make DenseNet using Bottleneck and transition block
- following the paper,
    - set θ as 0.5, growth rate as 12
    - set 1st 3 x 3 Conv layer's output channels as twice the growth rate
    - for Dense121, 
        - no. of Dense block: 6 12 24 16 

> conv → dense1 → transition1 → dense2 → transition2 → dense3 → transition3 → dense4 → classification layer

In [None]:
class DenseNet(nn.Module):
    """DenseNet classifier built from four dense blocks (6, 12, 24, 16 layers).

    Layout: 3x3 conv -> dense1 -> trans1 -> dense2 -> trans2 -> dense3 ->
    trans3 -> dense4 -> BN -> ReLU -> global average pool -> linear.

    Each of the three transition blocks halves the spatial size, so a
    32 x 32 input reaches the classification head as a 4 x 4 feature map.

    Args:
        droprate: dropout probability passed to every dense and transition block.
        block: layer class used inside the dense blocks; it is called as
            ``block(in_plane, growth_rate, droprate)`` and must add
            ``growth_rate`` channels to its input.
        growth_rate: number of channels added by each dense layer.
        num_classes: size of the output layer.
        reduction: channel compression factor applied by each transition block.
    """

    def __init__(self, droprate, block=BottleneckBlock, growth_rate=12, num_classes=10, reduction=0.5):
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate

        # The stem convolution outputs twice the growth rate.
        in_plane = 2 * growth_rate
        self.conv1 = nn.Conv2d(3, in_plane, kernel_size=3, padding=1, bias=False)

        # 1st Dense & Transition
        self.dense1 = self.make_dense_block(block, in_plane, 6, droprate)
        in_plane += 6 * growth_rate
        out_plane = int(math.floor(in_plane * reduction))
        self.trans1 = TransitionBlock(in_plane, out_plane, droprate)
        in_plane = out_plane

        # 2nd Dense & Transition
        self.dense2 = self.make_dense_block(block, in_plane, 12, droprate)
        in_plane += 12 * growth_rate
        out_plane = int(math.floor(in_plane * reduction))
        self.trans2 = TransitionBlock(in_plane, out_plane, droprate)
        in_plane = out_plane

        # 3rd Dense & Transition
        self.dense3 = self.make_dense_block(block, in_plane, 24, droprate)
        in_plane += 24 * growth_rate
        out_plane = int(math.floor(in_plane * reduction))
        self.trans3 = TransitionBlock(in_plane, out_plane, droprate)
        in_plane = out_plane

        # 4th Dense (no transition after the last dense block)
        self.dense4 = self.make_dense_block(block, in_plane, 16, droprate)
        in_plane += 16 * growth_rate

        # Classification head
        self.bn = nn.BatchNorm2d(in_plane)
        self.fc = nn.Linear(in_plane, num_classes)
        self.relu = nn.ReLU()

        # Weight initialisation:
        #   conv   -> normal(0, sqrt(2 / (k_h * k_w * out_channels)))
        #   BN     -> weight 1, bias 0
        #   linear -> bias 0 (the weight keeps its default initialisation)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.bias.data.zero_()

    def make_dense_block(self, block, in_plane, nblock, droprate):
        """Stack ``nblock`` layers of type ``block`` into one ``nn.Sequential``.

        Every layer adds ``self.growth_rate`` channels, so the i-th layer is
        built with ``in_plane + i * self.growth_rate`` input channels.
        """
        layers = []
        for _ in range(nblock):
            layers.append(block(in_plane, self.growth_rate, droprate))
            in_plane += self.growth_rate
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits, shape ``(batch, num_classes)``.

        The spatial sizes in the comments below are for a 32 x 32 input.
        """
        # 32 x 32
        out = self.conv1(x)

        # 32 x 32
        out = self.dense1(out)
        out = self.trans1(out)  # 32 -> 16

        # 16 x 16
        out = self.dense2(out)
        out = self.trans2(out)  # 16 -> 8

        # 8 x 8
        out = self.dense3(out)
        out = self.trans3(out)  # 8 -> 4

        # 4 x 4
        out = self.dense4(out)

        out = self.bn(out)
        out = self.relu(out)
        # Global average pooling. For the 4 x 4 map produced by a 32 x 32
        # input this averages the same 16 values as avg_pool2d(out, 4), but
        # it still yields one value per channel for other input sizes, so the
        # flattened width always matches self.fc.
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(out.size(0), -1)
        out = self.fc(out)

        return out

In [None]:
# Build the network with dropout 0.2 and move its parameters to `device`.
model = DenseNet(droprate=0.2).to(device)

In [None]:
# Display the module tree (the notebook shows the repr of the cell's last expression).
model

DenseNet(
  (conv1): Conv2d(3, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (dense1): Sequential(
    (0): BottleneckBlock(
      (bn1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv1): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(48, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (relu): ReLU(inplace=True)
    )
    (1): BottleneckBlock(
      (bn1): BatchNorm2d(36, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv1): Conv2d(36, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(48, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (relu): ReLU(inplace=True)
    )
    (2): BottleneckBlock(
      (bn1

In [None]:
# Print per-layer output shapes and parameter counts for a 3 x 32 x 32 input.
summary(model, (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 24, 32, 32]             648
       BatchNorm2d-2           [-1, 24, 32, 32]              48
              ReLU-3           [-1, 24, 32, 32]               0
            Conv2d-4           [-1, 48, 32, 32]           1,152
       BatchNorm2d-5           [-1, 48, 32, 32]              96
              ReLU-6           [-1, 48, 32, 32]               0
            Conv2d-7           [-1, 12, 32, 32]           5,184
   BottleneckBlock-8           [-1, 36, 32, 32]               0
       BatchNorm2d-9           [-1, 36, 32, 32]              72
             ReLU-10           [-1, 36, 32, 32]               0
           Conv2d-11           [-1, 48, 32, 32]           1,728
      BatchNorm2d-12           [-1, 48, 32, 32]              96
             ReLU-13           [-1, 48, 32, 32]               0
           Conv2d-14           [-1, 12,

# Training

## Loss function and optimizer
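- following the paper
    - use SGD
    - learning rate: 0.1 (the paper uses weight decay 10^(-4))
- note: the code below sets weight decay to 5e-4 (not 10^(-4)), with momentum 0.9 and a cosine-annealing learning-rate schedule (T_max = 200)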

In [None]:
learning_rate = 0.1

# Cross-entropy on the model outputs.
criterion = nn.CrossEntropyLoss()

# SGD with momentum and weight decay over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=0.9,
    weight_decay=5e-4,
)

# Cosine-annealed learning rate with T_max = 200 scheduler steps.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

In [None]:
# Number of mini-batches per pass over each loader.
train_batches = len(train_loader)
val_batches = len(val_loader)

# Lowest validation loss seen so far. Starting at +inf (instead of an
# arbitrary finite sentinel) means the first comparison `loss < best` always
# succeeds for any finite loss, whatever its scale.
best_valid_loss = float('inf')
# NOTE(review): `patience` is initialised here but no visible cell reads it
# (early stopping is not used); kept so existing references keep working.
patience = 0

## Train
- set epoch to 200
    - although the paper set epoch 300 for CIFAR10, by experiment, because of overfitting, set to 200 
    - train accuracy becomes 1 after 185 epoch and loss of validation set doesn't decrease
- not using early stopping to train the model enough

In [None]:
epochs = 200

# File that receives the best weights. The download and test cells read
# './DenseNet_CIFAR10.pt', so the checkpoint must be written under that name.
checkpoint_path = './DenseNet_CIFAR10.pt'

for epoch in range(epochs):
    # ---- training pass ----
    model.train()

    train_loss = 0
    train_total = 0
    train_correct = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # PyTorch accumulates gradients, so clear them before every step
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # back propagate and update parameters
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # predicted class = index of the largest output
        predicted = outputs.max(dim=1).indices
        train_correct += predicted.eq(labels).sum().item()
        train_total += labels.size(0)

    # mean of the per-batch losses, and accuracy over all seen samples
    train_loss = train_loss / train_batches
    train_acc = train_correct / train_total

    # ---- validation pass (no gradients, dropout/BN in eval mode) ----
    model.eval()

    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item()

            predicted = outputs.max(dim=1).indices
            val_correct += predicted.eq(labels).sum().item()
            val_total += labels.size(0)

    val_loss = val_loss / val_batches
    val_acc = val_correct / val_total

    # save the best model (scored by validation loss)
    if val_loss < best_valid_loss:
        torch.save(model.state_dict(), checkpoint_path)
        best_valid_loss = val_loss

    print(f'[{epoch + 1}/{epochs}] TrainLoss: {train_loss:.3f}, ValLoss: {val_loss:.3f}'
          f' | TrainAcc: {train_acc:.2f}, ValAcc: {val_acc:.2f}')

    # advance the learning-rate schedule once per epoch
    scheduler.step()

[1/200] TrainLoss: 1.681, ValLoss: 1.710 | TrainAcc: 0.37, ValAcc: 0.41
[2/200] TrainLoss: 1.235, ValLoss: 1.408 | TrainAcc: 0.56, ValAcc: 0.52
[3/200] TrainLoss: 0.998, ValLoss: 1.089 | TrainAcc: 0.64, ValAcc: 0.61
[4/200] TrainLoss: 0.876, ValLoss: 0.977 | TrainAcc: 0.69, ValAcc: 0.66
[5/200] TrainLoss: 0.791, ValLoss: 0.972 | TrainAcc: 0.72, ValAcc: 0.67
[6/200] TrainLoss: 0.725, ValLoss: 0.935 | TrainAcc: 0.74, ValAcc: 0.69
[7/200] TrainLoss: 0.673, ValLoss: 0.732 | TrainAcc: 0.76, ValAcc: 0.74
[8/200] TrainLoss: 0.633, ValLoss: 0.718 | TrainAcc: 0.78, ValAcc: 0.77
[9/200] TrainLoss: 0.610, ValLoss: 0.651 | TrainAcc: 0.79, ValAcc: 0.78
[10/200] TrainLoss: 0.589, ValLoss: 0.794 | TrainAcc: 0.80, ValAcc: 0.73
[11/200] TrainLoss: 0.567, ValLoss: 0.652 | TrainAcc: 0.80, ValAcc: 0.77
[12/200] TrainLoss: 0.555, ValLoss: 0.759 | TrainAcc: 0.81, ValAcc: 0.75
[13/200] TrainLoss: 0.542, ValLoss: 0.832 | TrainAcc: 0.81, ValAcc: 0.74
[14/200] TrainLoss: 0.536, ValLoss: 0.634 | TrainAcc: 0.81, 

# Download the model

In [None]:
# Send the saved checkpoint file to the browser via google.colab's file helper.
files.download('./DenseNet_CIFAR10.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Test

## load the model

In [None]:
# Rebuild the architecture, then restore the saved weights into it.
new_model = DenseNet(droprate=0.2)
# map_location remaps the stored tensors onto `device`, so a checkpoint
# written during a GPU run can also be loaded on a CPU-only machine.
new_model.load_state_dict(torch.load('./DenseNet_CIFAR10.pt', map_location=device))
new_model = new_model.to(device)

## Test the model

In [None]:
# Evaluation mode: dropout is disabled and batch norm uses its running statistics.
new_model.eval()

test_batches = len(test_loader)
test_loss = 0
test_correct = 0
test_total = 0

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = new_model(inputs)
        test_loss += criterion(outputs, labels).item()

        # predicted class = index of the largest output
        predicted = outputs.max(dim=1).indices
        test_correct += (predicted == labels).sum().item()
        test_total += labels.size(0)

# mean of the per-batch losses, and accuracy over the whole test set
test_loss = test_loss / test_batches
test_acc = test_correct / test_total

print('TestAcc: %.2f' % (test_acc))



TestAcc: 0.93
