## GoogLeNet (Inception model)
### Reference
Christian Szegedy, et al., Going deeper with convolutions, arXiv 2014 (published at CVPR 2015). [link](https://arxiv.org/pdf/1409.4842.pdf)
### Contents
* enlarging the network without overfitting or a drastic increase in the number of parameters
* the key idea is ultimately moving from fully connected to sparsely connected architectures
### Keys
* parallel filters and concatenation
* channel reduction and projection for computational efficiency
* every basic convolution includes batch-normalization and ReLU rectification (even the 1-by-1 reductions)
* add auxiliary classifiers at the intermediate layers as well as at the end of the model
* losses from the auxiliary classifiers are weighted by 0.3 and added to the total loss
* <img src='../etc/images/Googlenet-1.png' width='500'>
* <img src='../etc/images/Googlenet-2.png' width='500'>

In [1]:
import torch
from torch import nn
from torchinfo import summary

1. basic_conv_block: n-by-n convolution + batch normalization + ReLU
2. inception_block: 4 branches concatenated
3. auxiliary_block: average pooling + reduction + fully connected + dropout + classifier

In [46]:
class basic_conv_block(nn.Module):
    """Convolution followed by batch normalization and an in-place ReLU.

    The convolution is created without a bias term and the batch-normalization
    layer is built with ``eps=0.001``.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Kernel size handed to ``nn.Conv2d``.
        padding: Padding handed to ``nn.Conv2d``.
        stride: Stride handed to ``nn.Conv2d``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        padding: int = 0,
        stride: int = 1
    ) -> None:
        super().__init__()
        # The attribute names below double as state_dict key prefixes.
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(bn(conv(x)))``."""
        return self.relu(self.bn(self.conv(x)))

In [42]:
class inception_block(nn.Module):
    """Inception module: four parallel branches concatenated along dim 1.

    Every branch receives the same input. The padding/stride choices below
    (1x1 kernels, 3x3 with padding 1, 5x5 with padding 2, 3x3 max pooling with
    padding 1 and stride 1) leave the spatial size unchanged, so the branch
    outputs can be concatenated channel-wise. The output therefore has
    ``channels_1by1 + channels_3by3 + channels_5by5 + channels_proj`` channels.

    Args:
        in_channels: Number of channels of the incoming feature map.
        channels_1by1: Output channels of the 1x1 branch.
        channels_3by3_reduction: Channels of the 1x1 reduction ahead of the 3x3 conv.
        channels_3by3: Output channels of the 3x3 branch.
        channels_5by5_reduction: Channels of the 1x1 reduction ahead of the 5x5 conv.
        channels_5by5: Output channels of the 5x5 branch.
        channels_proj: Output channels of the 1x1 projection after max pooling.
    """

    def __init__(
        self,
        in_channels: int,
        channels_1by1: int,
        channels_3by3_reduction: int,
        channels_3by3: int,
        channels_5by5_reduction: int,
        channels_5by5: int,
        channels_proj: int
    ) -> None:
        super().__init__()

        # Branch 1: a single 1x1 convolution.
        self.branch1 = basic_conv_block(in_channels, channels_1by1, kernel_size=1)

        # Branch 2: 1x1 channel reduction, then a 3x3 convolution.
        self.branch2 = nn.Sequential(
            basic_conv_block(in_channels, channels_3by3_reduction, kernel_size=1),
            basic_conv_block(
                channels_3by3_reduction, channels_3by3, kernel_size=3, padding=1
            ),
        )

        # Branch 3: 1x1 channel reduction, then a 5x5 convolution.
        self.branch3 = nn.Sequential(
            basic_conv_block(in_channels, channels_5by5_reduction, kernel_size=1),
            basic_conv_block(
                channels_5by5_reduction, channels_5by5, kernel_size=5, padding=2
            ),
        )

        # Branch 4: 3x3 max pooling, then a 1x1 projection.
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            basic_conv_block(in_channels, channels_proj, kernel_size=1),
        )

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate the results on dim 1."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        return torch.cat([branch(x) for branch in branches], dim=1)

In [43]:
class auxiliary_block(nn.Module):
    """Auxiliary classifier head attached to an intermediate feature map.

    Pipeline: 5x5 average pooling with stride 3, a 1x1 reduction to 128
    channels, flattening, a 1024-unit fully connected layer with ReLU,
    dropout (p=0.7) and a final linear classifier.

    The first linear layer expects ``4*4*128`` input features, so the pooled
    feature map has to be 4x4.

    Args:
        in_channels: Number of channels of the incoming feature map.
        num_features: Number of outputs of the final linear layer.
    """

    def __init__(self, in_channels: int, num_features: int) -> None:
        super().__init__()
        self.average_pooling = nn.AvgPool2d(kernel_size=5, stride=3)
        self.reduction = basic_conv_block(in_channels, 128, kernel_size=1)
        self.fc1 = nn.Sequential(
            nn.Linear(4 * 4 * 128, 1024),
            nn.ReLU(inplace=True),
        )
        self.dropout = nn.Dropout(p=0.7)
        self.fc2 = nn.Linear(1024, num_features)

    def forward(self, x):
        """Return the auxiliary classifier output for feature map ``x``."""
        pooled = self.average_pooling(x)
        reduced = self.reduction(pooled)
        # Keep dim 0 and collapse the remaining dims, like nn.Flatten().
        flat = reduced.flatten(start_dim=1)
        hidden = self.dropout(self.fc1(flat))
        return self.fc2(hidden)

In [44]:
class GoogLeNet(nn.Module):
    """GoogLeNet classifier with two auxiliary heads.

    Layout: two convolution/pooling stems, inception stages 3 (a-b),
    4 (a-e) and 5 (a-b) separated by max pooling, then 7x7 average pooling,
    dropout (p=0.4) and a linear classifier on 1024 features. Auxiliary
    classifiers branch off after ``inception_layer4a`` (512 channels) and
    ``inception_layer4d`` (528 channels).

    The fixed 7x7 average pooling and the 1024-feature classifier mean the
    final feature map must be 7x7; the notebook exercises the model with
    3x224x224 inputs.

    Args:
        num_features: Number of outputs of the main and auxiliary classifiers.
    """

    def __init__(self, num_features: int) -> None:
        super().__init__()
        # NOTE: attribute names and creation order are kept stable because
        # they determine state_dict keys and parameter initialization order.
        self.conv_pool_layer1 = nn.Sequential(
            basic_conv_block(
                in_channels=3,
                out_channels=64,
                kernel_size=7,
                padding=3,
                stride=2
            ),
            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
        )
        self.conv_pool_layer2 = nn.Sequential(
            basic_conv_block(
                in_channels=64,
                out_channels=64,
                kernel_size=1
            ),
            basic_conv_block(
                in_channels=64,
                out_channels=192,
                kernel_size=3,
                padding=1
            ),
            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
        )

        # Stage 3. Each block's in_channels equals the sum of the previous
        # block's four branch widths.
        self.inception_layer3a = inception_block(
            in_channels=192, channels_1by1=64,
            channels_3by3_reduction=96, channels_3by3=128,
            channels_5by5_reduction=16, channels_5by5=32,
            channels_proj=32
        )
        self.inception_layer3b = inception_block(
            in_channels=256, channels_1by1=128,
            channels_3by3_reduction=128, channels_3by3=192,
            channels_5by5_reduction=32, channels_5by5=96,
            channels_proj=64
        )
        self.max_pool3 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

        # Stage 4.
        self.inception_layer4a = inception_block(
            in_channels=480, channels_1by1=192,
            channels_3by3_reduction=96, channels_3by3=208,
            channels_5by5_reduction=16, channels_5by5=48,
            channels_proj=64
        )
        self.inception_layer4b = inception_block(
            in_channels=512, channels_1by1=160,
            channels_3by3_reduction=112, channels_3by3=224,
            channels_5by5_reduction=24, channels_5by5=64,
            channels_proj=64
        )
        self.inception_layer4c = inception_block(
            in_channels=512, channels_1by1=128,
            channels_3by3_reduction=128, channels_3by3=256,
            channels_5by5_reduction=24, channels_5by5=64,
            channels_proj=64
        )
        self.inception_layer4d = inception_block(
            in_channels=512, channels_1by1=112,
            channels_3by3_reduction=144, channels_3by3=288,
            channels_5by5_reduction=32, channels_5by5=64,
            channels_proj=64
        )
        self.inception_layer4e = inception_block(
            in_channels=528, channels_1by1=256,
            channels_3by3_reduction=160, channels_3by3=320,
            channels_5by5_reduction=32, channels_5by5=128,
            channels_proj=128
        )
        self.max_pool4 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

        # Stage 5.
        self.inception_layer5a = inception_block(
            in_channels=832, channels_1by1=256,
            channels_3by3_reduction=160, channels_3by3=320,
            channels_5by5_reduction=32, channels_5by5=128,
            channels_proj=128
        )
        self.inception_layer5b = inception_block(
            in_channels=832, channels_1by1=384,
            channels_3by3_reduction=192, channels_3by3=384,
            channels_5by5_reduction=48, channels_5by5=128,
            channels_proj=128
        )

        # Main classification head.
        self.average_pool = nn.AvgPool2d(kernel_size=7)
        self.dropout = nn.Dropout(p=0.4)
        self.classifier = nn.Linear(in_features=1024, out_features=num_features)

        # Auxiliary heads fed by inception_layer4a and inception_layer4d.
        self.auxiliary1 = auxiliary_block(in_channels=512, num_features=num_features)
        self.auxiliary2 = auxiliary_block(in_channels=528, num_features=num_features)

    def forward(self, x, aux: bool = True):
        """Compute the main and, optionally, the auxiliary classifier outputs.

        Args:
            x: Input batch; the stem expects 3 input channels.
            aux: When True, also evaluate both auxiliary classifiers.

        Returns:
            A tuple ``(aux1, aux2, out)`` where ``out`` is the main classifier
            output and ``aux1``/``aux2`` are the auxiliary classifier outputs,
            or ``None`` for both when ``aux`` is False.
        """
        x = self.conv_pool_layer1(x)
        x = self.conv_pool_layer2(x)

        x = self.inception_layer3a(x)
        x = self.inception_layer3b(x)
        x = self.max_pool3(x)

        x = self.inception_layer4a(x)
        aux1 = self.auxiliary1(x) if aux else None
        x = self.inception_layer4b(x)
        x = self.inception_layer4c(x)
        x = self.inception_layer4d(x)
        aux2 = self.auxiliary2(x) if aux else None
        x = self.inception_layer4e(x)
        # Use the dedicated stage-4 pool so max_pool4 is no longer a dead
        # module (the earlier code reused max_pool3 here).
        x = self.max_pool4(x)

        x = self.inception_layer5a(x)
        x = self.inception_layer5b(x)
        x = self.average_pool(x)
        x = self.dropout(x)

        # Collapse everything but the batch dimension, as nn.Flatten() does.
        x = torch.flatten(x, 1)
        x = self.classifier(x)

        return (aux1, aux2, x)


In [47]:
# Build the network with 101 outputs; the trailing bare `model` expression
# makes the notebook display the module tree shown in the cell output.
model = GoogLeNet(num_features=101)
model

GoogLeNet(
  (conv_pool_layer1): Sequential(
    (0): basic_conv_block(
      (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  )
  (conv_pool_layer2): Sequential(
    (0): basic_conv_block(
      (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): basic_conv_block(
      (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  )
  (inception

In [48]:
# Parameter counts per top-level child module (depth=1) via torchinfo.
summary(model, depth=1)

Layer (type:depth-idx)                   Param #
GoogLeNet                                --
├─Sequential: 1-1                        9,536
├─Sequential: 1-2                        115,200
├─inception_block: 1-3                   164,064
├─inception_block: 1-4                   389,376
├─MaxPool2d: 1-5                         --
├─inception_block: 1-6                   376,800
├─inception_block: 1-7                   449,808
├─inception_block: 1-8                   510,768
├─inception_block: 1-9                   606,080
├─inception_block: 1-10                  869,376
├─MaxPool2d: 1-11                        --
├─inception_block: 1-12                  1,044,480
├─inception_block: 1-13                  1,445,344
├─AvgPool2d: 1-14                        --
├─Dropout: 1-15                          --
├─Linear: 1-16                           103,525
├─auxiliary_block: 1-17                  2,267,493
├─auxiliary_block: 1-18                  2,269,541
Total params: 10,621,391
Trainable para

In [40]:
# Smoke test: push a random batch of two 3x224x224 inputs through the model
# and inspect the shape of the main classifier output, which is element 2 of
# the (aux1, aux2, out) tuple returned by GoogLeNet.forward.
model(torch.randn((2, 3, 224, 224)))[2].shape

torch.Size([2, 101])

## Training the GoogLeNet model on the Food101 dataset
* number of parameters: 10,621,391

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader

import torchvision
from torchvision import transforms, datasets

from torchinfo import summary

from pathlib import Path

import sys
sys.path.append('..')

from python_scripts import model_builder, engine, data_setup

In [None]:
# Device preference order: CUDA, then MPS, then CPU. The MPS check is only
# evaluated when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

In [None]:
# Training pipeline: random rotation, random resized crop to 224, random
# horizontal flip and the IMAGENET AutoAugment policy, followed by tensor
# conversion and per-channel normalization.
food101_transforms_train = transforms.Compose([
    transforms.RandomRotation(degrees=30),
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.IMAGENET),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Test pipeline: resize to 225, center crop to 224, then the same tensor
# conversion and normalization as the training pipeline (no random steps).
food101_transforms_test = transforms.Compose([
    transforms.Resize(size=225),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [None]:
# Hyper-parameter candidates handed to engine.HP_tune_train further down.
# Each list currently holds a single value.
learning_rate_list = [1e-4]
weight_decay_list = [1e-4]
epochs_list = [5]
batch_size_list = [16]

In [None]:
# Root directory passed to the torchvision Food101 dataset objects.
data_dir = Path('../data/extracted')

# Training split, wired to the augmenting transform pipeline.
train_data_food101 = datasets.Food101(
    root=data_dir,
    split='train',
    transform=food101_transforms_train,
    download=True
)
# Test split, wired to the resize/center-crop transform pipeline.
test_data_food101 = datasets.Food101(
    root=data_dir,
    split='test',
    transform=food101_transforms_test,
    download=True
)

# Class names come from the training dataset; their count is used as the
# model's output size in googlenet_model_generator.
class_names_food101 = train_data_food101.classes

In [None]:
def googlenet_model_generator(weights=None):
    """Create a new GoogLeNet sized for the Food101 class list.

    Args:
        weights: Accepted but not used by this function.

    Returns:
        A ``model_builder.GoogLeNet`` instance whose ``num_features`` equals
        ``len(class_names_food101)``.
    """
    num_classes = len(class_names_food101)
    return model_builder.GoogLeNet(num_features=num_classes)

In [None]:
# With split_size=1 the first split receives every sample (the logged split
# sizes are 75750/0 and 25250/0), so the complete datasets are used below.
train_dataset, _ = data_setup.split_dataset(
    dataset=train_data_food101,
    split_size=1, # lower this to try a small training subset first as a sanity check
    seed=42
)
test_dataset, _ = data_setup.split_dataset(
    dataset=test_data_food101,
    split_size=1,
    seed=42
)

# Launch the run. No pre-built model or weights are supplied, only the
# generator function defined above.
# NOTE(review): presumably HP_tune_train builds a fresh model per
# hyper-parameter combination via model_generator; confirm in python_scripts/engine.
tuning_results = engine.HP_tune_train(
    model=None,
    model_generator=googlenet_model_generator,
    model_weights=None,
    model_name='GoogLeNet_food101',
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    learning_rate_list=learning_rate_list,
    weight_decay_list=weight_decay_list,
    epochs_list=epochs_list,
    batch_size_list=batch_size_list,
    is_tensorboard_writer=False,
    device=device,
    gradient_accumulation_num=1
)

[INFO] Splitting dataset of length 75750 into splits of size: 75750 and 0
[INFO] Splitting dataset of length 25250 into splits of size: 25250 and 0


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 0 | Train_loss: 4.4056, Train_acc: 0.0417 | Test_loss: 3.9164, Test_acc: 0.1005
Epoch: 1 | Train_loss: 4.1334, Train_acc: 0.0781 | Test_loss: 3.5720, Test_acc: 0.1563
Epoch: 2 | Train_loss: 3.9022, Train_acc: 0.1161 | Test_loss: 3.2041, Test_acc: 0.2145
Epoch: 3 | Train_loss: 3.6619, Train_acc: 0.1569 | Test_loss: 2.8273, Test_acc: 0.2963
Epoch: 4 | Train_loss: 3.4637, Train_acc: 0.1954 | Test_loss: 2.5758, Test_acc: 0.3486
