## EfficientNet B0
### Reference
Mingxing Tan, Quoc V. Le, EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks, 2019. [link](https://arxiv.org/abs/1905.11946)
### Contents
* Proposes a compound scaling method and a baseline architecture to which the method is applied
* Architecture of baseline EfficientNetB0
  * <img src='../etc/images/Efficientnet-1.png' width=300>
* Squeeze and excitation module
  * <img src='../etc/images/Efficientnet-2.png' width=150>
### Keys
* The basic structure is similar to MobileNetV2
* EfficientNet uses SiLU instead of ReLU6
* EfficientNet inserts an SE (squeeze-and-excitation, [link](https://openaccess.thecvf.com/content_cvpr_2018/html/Hu_Squeeze-and-Excitation_Networks_CVPR_2018_paper.html)) layer between the DW (depthwise) and PW (pointwise) layers
* EfficientNet uses kernel sizes larger than 3 in some stages


In [1]:
import torch
from torch import nn
from torchinfo import summary

In [2]:
class DW_conv(nn.Module):
    """Depthwise convolution block: per-channel conv -> batch norm -> activation.

    Args:
        reduction: If truthy the convolution uses stride 2, otherwise stride 1.
        channels: Number of input channels. The output has the same count
            because the convolution is grouped with ``groups == channels``.
        kernel_size: Side length of the square kernel. Padding is
            ``kernel_size // 2``.
        activation: ``'ReLU6'`` or ``'SiLU'``; any other value selects
            ``nn.ReLU``.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, reduction, channels, kernel_size=3, activation='ReLU6', *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # Resolve the activation class by name; nn.ReLU is the fallback.
        chosen = nn.ReLU
        for label, candidate in (('ReLU6', nn.ReLU6), ('SiLU', nn.SiLU)):
            if activation == label:
                chosen = candidate
                break
        self.activation = chosen

        self.stride = 2 if reduction else 1

        depthwise = nn.Conv2d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=kernel_size,
            stride=self.stride,
            padding=kernel_size // 2,
            groups=channels,  # one filter per channel
            bias=False,  # the following batch norm supplies the shift
        )
        norm = nn.BatchNorm2d(num_features=channels)
        self.DW = nn.Sequential(depthwise, norm, self.activation())

    def forward(self, X):
        """Run ``X`` through the conv -> batch norm -> activation stack."""
        return self.DW(X)

In [3]:
# Smoke test: a stride-2 (reduction=True) depthwise block on a 224x224 input.
model1 = DW_conv(
    reduction=True,
    channels=10,
    kernel_size=5,
    activation='SiLU'
)
model1(torch.randn((10, 10, 224, 224))).shape

torch.Size([10, 10, 112, 112])

In [4]:
class PW_conv(nn.Module):
    """Pointwise (1x1) convolution block: conv -> batch norm -> optional activation.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        activation: ``'ReLU6'`` or ``'SiLU'`` appends that activation; any
            other value (e.g. ``'linear'``) leaves the block without one.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, in_channels, out_channels, activation='ReLU6', *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        stack = [
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
        ]
        # The activation, when present, becomes child '2' of the Sequential.
        if activation == 'ReLU6':
            stack.append(nn.ReLU6())
        elif activation == 'SiLU':
            stack.append(nn.SiLU())
        self.PW = nn.Sequential(*stack)

    def forward(self, X):
        """Apply the pointwise stack to ``X``."""
        return self.PW(X)

In [5]:
class SE_conv(nn.Module):
    """Squeeze-and-excitation gate that rescales each channel of its input.

    The input is globally average-pooled to 1x1, passed through a 1x1 conv
    that shrinks it to ``se_channels``, an activation, a 1x1 conv that
    restores ``in_channels``, and a sigmoid. The result multiplies the input.

    Args:
        in_channels: Number of channels of the tensor being gated.
        r: Squeeze ratio; the squeezed width is ``int(in_channels * r)``,
            clamped to at least 1.
        activation: ``'ReLU6'`` or ``'SiLU'``; any other value selects
            ``nn.ReLU``.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, in_channels, r=0.25, activation='ReLU6', *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Never let the squeeze collapse to zero channels: with 0 the second
        # conv would have no input channels, so the gate could not depend on
        # the input at all.
        self.se_channels = max(1, int(in_channels * r))
        if activation == 'ReLU6':
            self.activation = nn.ReLU6
        elif activation == 'SiLU':
            self.activation = nn.SiLU
        else:
            self.activation = nn.ReLU

        self.SE = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),  # squeeze: one value per channel
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=self.se_channels,
                kernel_size=1,
            ),
            self.activation(),
            nn.Conv2d(
                in_channels=self.se_channels,
                out_channels=in_channels,
                kernel_size=1,
            ),
            nn.Sigmoid()  # per-channel gate in (0, 1)
        )

    def forward(self, X):
        """Return ``X`` scaled channel-wise by the gate computed from ``X``."""
        return X * self.SE(X)


In [6]:
class bottleneck(nn.Module):
    """Inverted-residual block: optional expand -> depthwise -> optional SE -> project.

    Args:
        t: Expansion factor; the inner width is ``in_channels * t``. The
            expanding pointwise conv is only added when ``t > 1``.
        in_channels: Channels of the block input.
        out_channels: Channels of the block output.
        reduction: If truthy the depthwise conv uses stride 2.
        kernel_size: Kernel size of the depthwise conv.
        activation: Activation name passed to the expand, depthwise and SE
            sub-blocks. The projecting pointwise conv is always ``'linear'``.
        se: Ratio handed to ``SE_conv`` (applied to the expanded width);
            the SE sub-block is omitted unless ``se > 0``.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, t, in_channels, out_channels, reduction, kernel_size, activation, se=0.25, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # A skip connection is only valid when the output has the input's shape.
        self.residual_connection = not (reduction or in_channels != out_channels)

        hidden = in_channels * t
        stages = []

        if t > 1:
            stages.append(
                PW_conv(in_channels=in_channels, out_channels=hidden, activation=activation)
            )

        stages.append(
            DW_conv(
                reduction=reduction,
                channels=hidden,
                kernel_size=kernel_size,
                activation=activation,
            )
        )

        if se > 0:
            stages.append(SE_conv(in_channels=hidden, r=se, activation=activation))

        stages.append(
            PW_conv(in_channels=hidden, out_channels=out_channels, activation='linear')
        )

        # Children are numbered '0', '1', ... in the order appended above.
        self.bn = nn.Sequential(*stages)

    def forward(self, X):
        """Apply the block, adding the input back when the skip connection is enabled."""
        out = self.bn(X)
        return X + out if self.residual_connection else out

In [7]:
class bottlenecks(nn.Module):
    """A stage of ``n`` consecutive ``bottleneck`` blocks.

    Only the first block sees ``in_channels`` and the requested
    ``reduction``; every later block maps ``out_channels`` to
    ``out_channels`` without reduction.

    Args:
        t: Expansion factor for every block.
        in_channels: Channels entering the stage.
        out_channels: Channels produced by every block of the stage.
        n: Number of blocks.
        reduction: Whether the first block reduces the spatial size.
        kernel_size: Depthwise kernel size for every block.
        activation: Activation name for every block.
        se: SE ratio for every block.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, t, in_channels, out_channels, n, reduction, kernel_size, activation, se, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        blocks = []
        for idx in range(n):
            is_first = idx == 0
            blocks.append(
                bottleneck(
                    t=t,
                    in_channels=in_channels if is_first else out_channels,
                    out_channels=out_channels,
                    reduction=reduction if is_first else False,
                    kernel_size=kernel_size,
                    activation=activation,
                    se=se,
                )
            )

        # Holds the setting used for the last block built: the caller's value
        # when there is at most one block, False otherwise.
        self.reduction = reduction if n < 2 else False
        self.bottlenecks = nn.Sequential(*blocks)

    def forward(self, X):
        """Run ``X`` through the blocks in order."""
        return self.bottlenecks(X)


In [8]:
class EfficientNetB0(nn.Module):
    """EfficientNet-B0 classifier: stem conv, seven bottleneck stages, head conv, pool, linear.

    Args:
        num_classes: Size of the output logit vector.
        dropout: Dropout probability applied before the final linear layer.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, num_classes, dropout=0.2, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Stem: stride-2 3x3 conv from 3 input channels to 32.
        self.sequence1 = nn.Sequential(
            nn.Conv2d(
                in_channels=3,
                out_channels=32,
                kernel_size=3,
                stride=2,
                padding=1,
                bias=False
            ),
            nn.BatchNorm2d(num_features=32),
            nn.SiLU()
        )

        # One row per stage: (t, in_channels, out_channels, n, reduction, kernel_size, se).
        # The se value is applied to the expanded width (in_channels * t), so
        # 0.0417 (about 0.25 / 6) with t=6 gives a squeeze width of
        # in_channels // 4 for every stage listed here, matching the 0.25
        # used by the t=1 first stage.
        stage_configs = (
            (1, 32, 16, 1, False, 3, 0.25),
            (6, 16, 24, 2, True, 3, 0.0417),
            (6, 24, 40, 2, True, 5, 0.0417),
            (6, 40, 80, 3, True, 3, 0.0417),
            (6, 80, 112, 3, False, 5, 0.0417),
            (6, 112, 192, 4, True, 5, 0.0417),
            (6, 192, 320, 1, False, 3, 0.0417),
        )
        self.sequence2 = nn.Sequential(*[
            bottlenecks(
                t=t,
                in_channels=c_in,
                out_channels=c_out,
                n=n,
                reduction=downsample,
                kernel_size=k,
                activation='SiLU',
                se=se,
            )
            for t, c_in, c_out, n, downsample, k, se in stage_configs
        ])

        # Head: 1x1 conv widening 320 -> 1280 channels.
        self.sequence3 = nn.Sequential(
            nn.Conv2d(
                in_channels=320,
                out_channels=1280,
                kernel_size=1,
                bias=False
            ),
            nn.BatchNorm2d(num_features=1280),
            nn.SiLU()
        )
        # Global average pool to 1x1 whatever the feature-map size. On the
        # 7x7 map produced by a 224x224 input this equals a fixed 7x7 average
        # pool, but it also lets other input resolutions reach the classifier
        # with exactly 1280 features.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.flatten = nn.Flatten()
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(
                in_features=1280,
                out_features=num_classes
            )
        )

    def forward(self, X):
        """Return class logits of shape ``(batch, num_classes)`` for a 3-channel image batch."""
        X = self.sequence1(X)
        X = self.sequence2(X)
        X = self.sequence3(X)
        X = self.avgpool(X)
        X = self.flatten(X)
        X = self.classifier(X)
        return X

In [9]:
# Build the 1000-class model and print its per-layer parameter counts.
model = EfficientNetB0(num_classes=1000)
summary(model)

Layer (type:depth-idx)                                            Param #
EfficientNetB0                                                    --
├─Sequential: 1-1                                                 --
│    └─Conv2d: 2-1                                                864
│    └─BatchNorm2d: 2-2                                           64
│    └─SiLU: 2-3                                                  --
├─Sequential: 1-2                                                 --
│    └─bottlenecks: 2-4                                           --
│    │    └─Sequential: 3-1                                       1,448
│    └─bottlenecks: 2-5                                           --
│    │    └─Sequential: 3-2                                       16,714
│    └─bottlenecks: 2-6                                           --
│    │    └─Sequential: 3-3                                       46,640
│    └─bottlenecks: 2-7                                           --
│    │    └─Seque

In [10]:
# Show the nested module structure of the hand-written model.
model

EfficientNetB0(
  (sequence1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU()
  )
  (sequence2): Sequential(
    (0): bottlenecks(
      (bottlenecks): Sequential(
        (0): bottleneck(
          (bn): Sequential(
            (0): DW_conv(
              (DW): Sequential(
                (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
                (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                (2): SiLU()
              )
            )
            (1): SE_conv(
              (SE): Sequential(
                (0): AdaptiveAvgPool2d(output_size=1)
                (1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
                (2): SiLU()
                (3): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
                (4):

In [11]:
# Shape check of the stem alone on a batch of ten 224x224 RGB images.
model.sequence1(torch.randn((10, 3, 224, 224))).shape

torch.Size([10, 32, 112, 112])

In [12]:
# Shape check of the bottleneck stages, fed a tensor shaped like the stem output.
model.sequence2(torch.randn((10, 32, 112, 112))).shape

torch.Size([10, 320, 7, 7])

In [13]:
# Shape check of the head conv, fed a tensor shaped like the stage output.
model.sequence3(torch.randn((10, 320, 7, 7))).shape

torch.Size([10, 1280, 7, 7])

In [14]:
# End-to-end shape check: image batch in, one logit vector per image out.
model(torch.randn((10, 3, 224, 224))).shape

torch.Size([10, 1000])

In [15]:
# Build torchvision's EfficientNet-B0 and print its per-layer parameter
# counts for comparison with the hand-written model.
import torchvision
model_temp = torchvision.models.efficientnet_b0()
summary(model_temp)

Layer (type:depth-idx)                                  Param #
EfficientNet                                            --
├─Sequential: 1-1                                       --
│    └─Conv2dNormActivation: 2-1                        --
│    │    └─Conv2d: 3-1                                 864
│    │    └─BatchNorm2d: 3-2                            64
│    │    └─SiLU: 3-3                                   --
│    └─Sequential: 2-2                                  --
│    │    └─MBConv: 3-4                                 1,448
│    └─Sequential: 2-3                                  --
│    │    └─MBConv: 3-5                                 6,004
│    │    └─MBConv: 3-6                                 10,710
│    └─Sequential: 2-4                                  --
│    │    └─MBConv: 3-7                                 15,350
│    │    └─MBConv: 3-8                                 31,290
│    └─Sequential: 2-5                                  --
│    │    └─MBConv: 3-9         

In [16]:
# Show the nested module structure of the torchvision model.
model_temp

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat