# EfficientNet implementation

[Original video](https://youtu.be/fR_0o25kigM)

[EfficientNet paper](https://arxiv.org/abs/1905.11946)

In [17]:
import os
import urllib.request
from math import ceil

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

In [18]:
# Stage configuration of the unscaled network, consumed by
# EfficientNet.create_features. Each row describes one stage of
# InvertedResidualBlocks; `channels` and `repeats` are scaled per version,
# and `stride` is applied only to the first block of a stage.
base_model = [
    # expand_ratio, channels, repeats, stride, kernel_size
    [1, 16, 1, 1, 3],
    [6, 24, 2, 2, 3],
    [6, 40, 2, 2, 5],
    [6, 80, 3, 2, 3],
    [6, 112, 3, 1, 5],
    [6, 192, 4, 2, 5],
    [6, 320, 1, 1, 3],
]

# Per-version scaling settings, looked up by EfficientNet.calculate_factors:
#   depth_factor = alpha ** phi_value   (multiplies `repeats`)
#   width_factor = beta ** phi_value    (multiplies `channels`)
# With phi_value = 0 ('b0') both factors are 1, i.e. base_model is used as is.
phi_values = {
    # tuple of: (phi_value, resolution, drop_rate)
    # resolution: input image side length (used by test() to build a dummy batch)
    # drop_rate:  dropout probability of the classifier head
    'b0': (0, 224, 0.2),
    'b1': (0.5, 240, 0.2),
    'b2': (1, 260, 0.3),
    'b3': (2, 300, 0.3),
    'b4': (3, 380, 0.4),
    'b5': (4, 456, 0.4),
    'b6': (5, 528, 0.5),
    'b7': (6, 600, 0.5),
}

In [19]:
class CNNBlock(nn.Module):
    """Convolution followed by batch normalization and a SiLU activation.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by the convolution.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: zero padding added to both sides of each spatial dimension.
        groups: number of convolution groups; passing ``groups=in_channels``
            makes the convolution depthwise.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups=1):
        super().__init__()
        # No bias in the convolution: the BatchNorm right after it has its
        # own learnable shift, so a conv bias would be redundant.
        self.cnn = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(num_features=out_channels)
        self.silu = nn.SiLU()  # SiLU is the same function as Swish

    def forward(self, x):
        """Apply conv -> batch norm -> SiLU to ``x`` and return the result."""
        convolved = self.cnn(x)
        normalized = self.bn(convolved)
        return self.silu(normalized)


class SqueezeExcitation(nn.Module):
    """Channel attention: rescale every channel by a learned gate in (0, 1).

    Args:
        in_channels: number of channels of the input (and of the output).
        reduced_dim: number of channels in the bottleneck between the two
            1x1 convolutions.
    """

    def __init__(self, in_channels, reduced_dim):
        super().__init__()
        squeeze = nn.AdaptiveAvgPool2d(1)  # C x H x W --> C x 1 x 1
        reduce = nn.Conv2d(in_channels, reduced_dim, kernel_size=1)  # shrink channels
        restore = nn.Conv2d(reduced_dim, in_channels, kernel_size=1)  # back to C channels
        # The sigmoid turns the restored activations into per-channel gates.
        self.se = nn.Sequential(squeeze, reduce, nn.SiLU(), restore, nn.Sigmoid())

    def forward(self, x):
        """Return ``x`` with each channel multiplied by its attention gate."""
        gates = self.se(x)
        return x * gates


class InvertedResidualBlock(nn.Module):
    """Inverted residual block with squeeze-excitation and stochastic depth.

    Pipeline: optional channel expansion -> depthwise convolution ->
    squeeze-excitation -> 1x1 projection (+ batch norm). When the input and
    output shapes match (same channel count, stride 1) the input is added
    back as a residual and the block body is randomly dropped during training.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        kernel_size: kernel size of the depthwise convolution.
        stride: stride of the depthwise convolution.
        padding: padding of the depthwise convolution.
        expand_ratio: the hidden width is ``in_channels * expand_ratio``;
            a ratio of 1 skips the expansion convolution.
        reduction: squeeze-excitation bottleneck is ``in_channels / reduction``.
        survival_prob: probability of keeping the block body for a sample
            during training (stochastic depth). Must be in (0, 1].

    Raises:
        ValueError: if ``survival_prob`` is not in (0, 1].
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding,
                 expand_ratio,
                 reduction=4,  # for squeeze excitation
                 survival_prob=0.8,  # for stochastic depth
    ):
        super(InvertedResidualBlock, self).__init__()
        # A probability of 0 would divide by zero in stochastic_depth.
        if not 0 < survival_prob <= 1:
            raise ValueError(
                f'survival_prob must be in (0, 1], got {survival_prob}'
            )
        # Honour the constructor argument (it used to be overwritten with 0.8).
        self.survival_prob = survival_prob
        self.use_residual = in_channels == out_channels and stride == 1
        hidden_dim = in_channels * expand_ratio
        self.expand = in_channels != hidden_dim
        # Keep at least one channel in the squeeze-excitation bottleneck.
        reduced_dim = max(1, int(in_channels / reduction))

        if self.expand:
            # NOTE(review): the MBConv block described in the EfficientNet
            # paper expands with a 1x1 convolution. The 3x3 kernel is kept
            # here because changing it would alter the parameter shapes and
            # make previously saved checkpoints unloadable.
            self.expand_conv = CNNBlock(in_channels, hidden_dim, kernel_size=3,
                                        stride=1, padding=1)

        self.conv = nn.Sequential(
            # Depthwise convolution (one group per channel)
            CNNBlock(hidden_dim, hidden_dim, kernel_size, stride, padding, groups=hidden_dim),
            SqueezeExcitation(hidden_dim, reduced_dim),
            # Pointwise projection to out_channels; no activation afterwards
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
        )

    def stochastic_depth(self, x):
        """Randomly zero whole samples of ``x`` during training.

        Each sample is kept with probability ``survival_prob`` and the kept
        ones are divided by ``survival_prob`` so the expected value is
        unchanged. In evaluation mode ``x`` is returned untouched.
        """
        if not self.training:
            return x

        # One keep/drop decision per sample, broadcast over C x H x W
        binary_tensor = torch.rand(x.shape[0], 1, 1, 1, device=x.device) < self.survival_prob
        return torch.div(x, self.survival_prob) * binary_tensor

    def forward(self, inputs):
        """Run the block; adds the residual connection when shapes allow it."""
        x = self.expand_conv(inputs) if self.expand else inputs
        if self.use_residual:
            return self.stochastic_depth(self.conv(x)) + inputs
        else:
            return self.conv(x)


class EfficientNet(nn.Module):
    """EfficientNet classifier built from ``base_model`` and ``phi_values``.

    Args:
        version: key of ``phi_values`` ('b0' ... 'b7') selecting the scaling.
        num_classes: number of output logits.
        in_channels: number of channels of the input images. Defaults to 1
            (grayscale, e.g. MNIST); pass 3 for RGB images.
    """

    def __init__(self, version, num_classes, in_channels=1):
        super(EfficientNet, self).__init__()
        width_factor, depth_factor, dropout_rate = self.calculate_factors(version)
        last_channels = ceil(1280 * width_factor)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.features = self.create_features(width_factor, depth_factor, last_channels,
                                             in_channels=in_channels)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(last_channels, num_classes),
        )

    # alpha - for depth scaling, beta - for width scaling
    # coefficients alpha=1.2 and beta=1.1 are from the paper
    def calculate_factors(self, version, alpha=1.2, beta=1.1):
        """Return ``(width_factor, depth_factor, drop_rate)`` for ``version``.

        Raises:
            KeyError: if ``version`` is not a key of ``phi_values``.
        """
        # The resolution entry is not needed to build the network.
        phi, _, drop_rate = phi_values[version]
        depth_factor = alpha ** phi
        width_factor = beta ** phi
        return width_factor, depth_factor, drop_rate

    def create_features(self, width_factor, depth_factor, last_channels, in_channels=1):
        """Build the convolutional trunk as an ``nn.Sequential``.

        Args:
            width_factor: multiplier applied to every stage's channel count.
            depth_factor: multiplier applied to every stage's repeat count.
            last_channels: output channels of the final 1x1 convolution.
            in_channels: channels of the input images fed to the stem.
        """
        stem_channels = int(32 * width_factor)
        features = [CNNBlock(in_channels, stem_channels, 3, stride=2, padding=1)]
        block_in = stem_channels

        for expand_ratio, channels, repeats, stride, kernel_size in base_model:
            # Round up to a multiple of 4 so the squeeze-excitation
            # bottleneck (reduction=4) divides the channel count evenly.
            out_channels = 4 * ceil(int(channels * width_factor) / 4)
            layers_repeats = ceil(repeats * depth_factor)

            for layer in range(layers_repeats):
                features.append(
                    InvertedResidualBlock(
                        block_in, out_channels,
                        expand_ratio=expand_ratio,
                        # only the first block of a stage downsamples
                        stride=stride if layer == 0 else 1,
                        kernel_size=kernel_size,
                        padding=kernel_size // 2,  # if k=1:pad=0, k=3:pad=1, k=5:pad=2
                    )
                )
                # every following block of the stage starts from this stage's width
                block_in = out_channels

        features.append(
            CNNBlock(block_in, last_channels, kernel_size=1, stride=1, padding=0),
        )

        return nn.Sequential(*features)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for images ``x``."""
        x = self.pool(self.features(x))
        # flatten (batch, C, 1, 1) -> (batch, C) for the linear classifier
        return self.classifier(x.view(x.shape[0], -1))

In [20]:
# Quick check if it works
def test():
    """Smoke test: push a random batch through EfficientNet-b0 and print the output shape."""
    version = 'b0'
    num_examples, num_classes = 4, 10
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # only the input resolution of the chosen version is needed here
    _, res, _ = phi_values[version]

    # single-channel input (MNIST); an RGB batch would use 3 channels instead
    batch = torch.randn((num_examples, 1, res, res)).to(device)
    model = EfficientNet(version=version, num_classes=num_classes).to(device)

    logits = model(batch)
    print(logits.shape)  # expected: (num_examples, num_classes)

In [21]:
# Run the smoke test; with 4 examples and 10 classes it prints torch.Size([4, 10]).
test()

torch.Size([4, 10])


## Train the model

In [22]:
# Hyperparameters
in_channels = 1
num_classes = 10
learning_rate = 1e-4
batch_size = 1024
num_epochs = 12  # 40

# Checkpointing: resume from `filename` when it exists and load_model is True
load_model = True
filename = 'my_checkpoint.pth.tar'

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set transforms
my_transforms = transforms.Compose([
    # For a 3-channel model, convert the images to RGB first:
    # transforms.Lambda(lambda image: image.convert('RGB')),
    transforms.Pad(2),  # 2 pixels on every side (28x28 MNIST digits become 32x32)
    transforms.ToTensor(),
])

# Workaround for "HTTPError: HTTP Error 403: Forbidden" while downloading MNIST:
# send a browser-like User-agent header.
# StackOverflow: https://stackoverflow.com/a/66461122/7550928
# Uses the standard-library urllib.request (imported at the top of the
# notebook) instead of the third-party six.moves shim.
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

# Load data
train_dataset = datasets.MNIST(root='dataset/', train=True, transform=my_transforms, download=True)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = datasets.MNIST(root='dataset/', train=False, transform=my_transforms, download=True)
# Evaluation does not depend on sample order, so the test set is not shuffled.
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Initialize network
model = EfficientNet(version='b0', num_classes=num_classes).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)


def save_checkpoint(state, filename=filename):
    """Announce the save on stdout, then serialize ``state`` to ``filename``.

    Args:
        state: object to store; the training loop passes a dict holding the
            model and optimizer state dicts and the best accuracy.
        filename: destination path (defaults to the notebook's checkpoint file).
    """
    print(' => Saving checkpoint')
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint):
    """Restore ``model`` and ``optimizer`` from a checkpoint dict.

    Args:
        checkpoint: dict with the keys 'state_dict' and 'optimizer', and
            optionally 'acc' (best test accuracy reached so far).

    Returns:
        The accuracy stored under 'acc', or 0 when the key is missing.
        The value is returned rather than assigned here, because an
        assignment inside the function would only create a local variable.
    """
    print(' => Loading checkpoint')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint.get('acc', 0)


# best_acc is always defined, whether or not a checkpoint gets loaded
best_acc = 0
if load_model and os.path.exists(filename):
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    # Only load checkpoint files you trust: torch.load unpickles their content.
    best_acc = load_checkpoint(torch.load(filename, map_location=device))


# Check accuracy on the training or test set to see how good the model is
def check_accuracy(loader, model):
    """Return the classification accuracy of ``model`` on ``loader`` in percent.

    Prints a label without a trailing newline so the caller can append the
    number on the same line. The label depends on ``loader.dataset.train``
    when that attribute exists; otherwise a neutral label is printed.

    Args:
        loader: iterable yielding ``(inputs, labels)`` batches.
        model: module mapping inputs to per-class scores of shape
            (batch, num_classes).

    Returns:
        Accuracy as a float in [0, 100]; 0.0 when the loader yields no samples.

    The model is evaluated in eval mode and afterwards put back into whichever
    mode (train/eval) it was in before the call.
    """
    is_train = getattr(getattr(loader, 'dataset', None), 'train', None)
    if is_train is None:
        print('Checking accuracy', end='')
    elif is_train:
        print('Checking accuracy on training data', end='')
    else:
        print('Checking accuracy on test data', end='')

    # Move batches to wherever the model lives instead of relying on a global;
    # a model without parameters leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval()  # set model to evaluation mode

    try:
        with torch.no_grad():
            for x, y in loader:
                if target_device is not None:
                    x = x.to(device=target_device)
                    y = y.to(device=target_device)

                scores = model(x)  # shape = batch x num_classes
                _, predictions = scores.max(dim=1)  # get index of max value
                num_correct += int((predictions == y).sum())
                num_samples += predictions.size(0)
    finally:
        # restore the caller's mode even if evaluation fails midway
        model.train(was_training)

    if num_samples == 0:
        return 0.0
    return float(num_correct) / float(num_samples) * 100

 => Loading checkpoint


In [23]:
# Train network
# One pass over train_loader per epoch; after each epoch the model is scored
# on the test set and a checkpoint is written whenever the accuracy improves.
for epoch in range(num_epochs):
    losses = []

    for data, targets in train_loader:
        # Move the batch to the training device (GPU if available)
        data, targets = data.to(device=device), targets.to(device=device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass: scores has shape batch x num_classes
        scores = model(data)
        loss = criterion(scores, targets)

        # Backward pass and parameter update (Adam)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    mean_loss = sum(losses) / len(losses)
    acc = check_accuracy(test_loader, model)
    # The leading '\r' overwrites the label that check_accuracy printed
    msg = f'\rLoss at epoch {epoch} is {mean_loss:.5f}. Accuracy is {acc:.2f}'

    if not best_acc < acc:
        # no improvement: just report the epoch
        print(msg)
        continue

    # New best accuracy: report without newline so the "Saving" note follows
    best_acc = acc
    checkpoint = {
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'acc': best_acc,
    }
    print(msg, end='')
    save_checkpoint(checkpoint)

Loss at epoch 0 is 0.02923. Accuracy is 97.76 => Saving checkpoint
Loss at epoch 1 is 0.02904. Accuracy is 97.85 => Saving checkpoint
Loss at epoch 2 is 0.02546. Accuracy is 97.77
Loss at epoch 3 is 0.02399. Accuracy is 97.88 => Saving checkpoint
Loss at epoch 4 is 0.02139. Accuracy is 97.81
Loss at epoch 5 is 0.02120. Accuracy is 97.94 => Saving checkpoint
Loss at epoch 6 is 0.01799. Accuracy is 98.02 => Saving checkpoint
Loss at epoch 7 is 0.01770. Accuracy is 97.95
Loss at epoch 8 is 0.01665. Accuracy is 97.82
Loss at epoch 9 is 0.01514. Accuracy is 98.14 => Saving checkpoint
Loss at epoch 10 is 0.01507. Accuracy is 98.08
Loss at epoch 11 is 0.01252. Accuracy is 98.14


In [24]:
# MNIST test accuracy for:
#   LeNet            = 97.38 %
#   VGG16 pretrained = 97.75 %
#   VGG16            = 98.73 %
#   GoogLeNet        = 99.51 %
#   ResNet101        = 96.64 % (very slow)
#   EfficientNet b0  = 98.14 %
# check_accuracy prints its label without a newline; the f-strings below
# append the accuracy on the same line.
print(f': {check_accuracy(train_loader, model):.2f}')
print(f': {check_accuracy(test_loader, model):.2f}')

Checking accuracy on training data: 99.97
Checking accuracy on test data: 98.14
