#참조

[Thanks to..](https://deep-learning-study.tistory.com/376)

[AlexNet paper](https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)

#모듈 불러오기

In [1]:
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn.functional as F         #cross_entropy
import torch.nn.init

In [2]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed the CPU generator (and every CUDA generator when running on GPU)
# with the same fixed value.
torch.manual_seed(1109)
if device == 'cuda':
    torch.cuda.manual_seed_all(1109)

#모델 설계

CONV1 - MAX POOL1 - NORM1 - CONV2 - MAXPOOL2 - NORM2 - CONV3 - CONV4 - CONV5 - MAXPOOL3 - FC6 - FC7 - FC8

- NORM1, 2 모두 Response-normalization layer
    - 측면 억제(강한 자극이 주변의 약한 자극을 전달하는 것을 막는 것)의 형태로 구현됨
    - ex) 헤르만 격자..
- 현재는 batch normalization이 쓰인다

구체적인 설명

**Convolution**
1. CONV1 : 96 (11*11) filters with stride 4
    - ReLU()
    - MAX POOL1 : (3*3) filters at stride 2
        - overlapping max pooling
    - NORM1
        - local response normalization


2. Conv2 : 256 (5*5) filters at stride 1, pad 2
    - ReLU()
    - MAX POOL2 : (3*3) filters at stride 2
        - overlapping max pooling
    - NORM2
        - local response normalization

3. CONV3 : 384 (3*3) filters at stride 1, pad 1
    - ReLU()
4. CONV4 : 384 (3*3) filters at stride 1, pad 1
    - ReLU()
5. CONV5 : 256 (3*3) filters at stride 1, pad 1
    - ReLU()
    - MAX POOL3 : (3*3) filters at stride 2
        - overlapping max pooling

**Fully Connected**
6. FC6 : 4096 neurons    (**Use Dropout(0.5)**)
    - ReLU()
7. FC7 : 4096 neurons    (**Use Dropout(0.5)**)
    - ReLU()
8. FC8 : 1000 neurons  (number of classes)
    - softmax()




데이터셋은 ImageNet 데이터의 부분집합으로 하고, 1000개의 클래스를 가져온다

**Weight Initialization**

- $N(0, (0.01)^2)$ 로 초기화
- 2, 4, 5번째 conv와 fc layer는 bias를 1로 초기화 (prevent Dying ReLU)

예전에는 GPU 성능의 한계 때문에 두 개의 GPU를 공유하면서 학습하였다

In [3]:
class AlexNet(nn.Module):
    """AlexNet-style CNN: five convolution stages followed by three FC layers.

    The fully connected head is sized for a (b, 256, 6, 6) feature map, which
    is what the convolution stack produces for (b, 3, 227, 227) inputs.
    ``forward`` returns raw class scores of shape (b, 1000); no softmax is
    applied inside the model.
    """

    def __init__(self):
        super(AlexNet, self).__init__()

        # Shape comments below assume a (b, 3, 227, 227) input.
        self.conv = nn.Sequential(
            # CONV1: (b, 96, 55, 55) -> pool -> (b, 96, 27, 27)
            nn.Conv2d(3, 96, kernel_size=11, stride=4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),    # overlapping max pooling
            # Local response normalization (modern networks use BatchNorm instead).
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),

            # CONV2: (b, 256, 27, 27) -> pool -> (b, 256, 13, 13)
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),    # overlapping max pooling
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),

            # CONV3: (b, 384, 13, 13)
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),

            # CONV4: (b, 384, 13, 13)
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),

            # CONV5: (b, 256, 13, 13) -> pool -> (b, 256, 6, 6)
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),    # overlapping max pooling
        )

        self.fc = nn.Sequential(
            # FC6: the input width must match the flattened CONV5 output
            # (256 channels * 6 * 6), not 384 channels.
            nn.Dropout(0.5, inplace=True),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),

            # FC7
            nn.Dropout(0.5, inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),

            # FC8: one score per class
            nn.Linear(4096, 1000),
        )

        self.init_bias()

    def init_bias(self):
        """Initialize all weights from N(0, 0.01^2) and set the biases.

        The 2nd, 4th and 5th convolution layers and every linear layer get a
        bias of 1 (keeps the following ReLUs active early in training); the
        remaining convolution layers get a bias of 0.
        """
        # Select the convolutions by position among conv layers instead of by
        # hard-coded indices into the Sequential container.
        conv_layers = [m for m in self.conv if isinstance(m, nn.Conv2d)]
        for position, layer in enumerate(conv_layers, start=1):
            nn.init.normal_(layer.weight, mean=0, std=0.01)
            nn.init.constant_(layer.bias, 1 if position in (2, 4, 5) else 0)

        # NOTE(review): this also sets the FC8 (output) bias to 1, as the
        # original code did -- confirm whether only the hidden FC layers
        # should receive the bias of 1.
        for layer in self.fc:
            if isinstance(layer, nn.Linear):
                nn.init.normal_(layer.weight, mean=0, std=0.01)
                nn.init.constant_(layer.bias, 1)

    def forward(self, x):
        """Return class scores of shape (b, 1000) for an image batch ``x``."""
        x = self.conv(x)
        # Flatten everything except the batch dimension, so an unexpected
        # spatial size fails loudly in the first Linear layer instead of being
        # silently folded into the batch dimension.
        x = torch.flatten(x, 1)
        return self.fc(x)

모델 생성

In [4]:
# Build the network, then move its parameters to the selected device.
alexnet = AlexNet()
alexnet = alexnet.to(device)

#Loss, Optimizer 설정

- Loss : cross_entropy
- Optimizer : SGD with momentum 0.9

**하이퍼파라미터 설정**

In [5]:
# Training hyperparameters.
NUM_EPOCHS = 90     # number of passes over the training data
BATCH_SIZE = 128    # mini-batch size
MOMENTUM = 0.9      # SGD momentum
LR_DECAY = 5e-4     # passed to the optimizer as its weight_decay value
LR_INIT = 1e-2      # initial learning rate

**Optimizer 구현**

In [6]:
# SGD with momentum over all model parameters; LR_DECAY is applied as the
# optimizer's weight_decay.
optimizer = optim.SGD(
    alexnet.parameters(),
    lr=LR_INIT,
    momentum=MOMENTUM,
    weight_decay=LR_DECAY,
)

**LR scheduler 구현**

- 30 epoch마다 학습률에 0.1을 곱한다 (StepLR: step_size=30, gamma=0.1)

In [7]:
# Multiply the learning rate by 0.1 after every 30 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=30,
    gamma=0.1,
)

#Training

아래의 dataloader, X, y는 구현되지 않은 상태

In [None]:
# Training loop: one pass over `dataloader` per epoch.
# NOTE: `dataloader` is not defined in this notebook yet; it must yield
# (X, y) batches of input images and integer class labels.
for epoch in range(NUM_EPOCHS):
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)

        output = alexnet(X)                 # class scores (logits)
        loss = F.cross_entropy(output, y)   # softmax + negative log-likelihood

        # Parameter update: clear stale gradients, backpropagate, then step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()    # was a bare `optimizer` expression, so weights never changed

    # Adjust the learning rate once per epoch, after that epoch's optimizer
    # updates; stepping the scheduler first shifts the decay schedule by one.
    lr_scheduler.step()

#혼자 구현 시도해 본 결과..

In [None]:
#초기화하기가 너무 더럽다..

class AlexNet(nn.Module):
    def __init__(self):
        super(AlexNet, self).__init__()
        #CONV1  (b, 96, 55, 55)
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),    #overlapping max pooling
            nn.BatchNorm2d(),         #BN으로 대체
            #nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2)
        )

        #CONV2  (b, 256, 13, 13)
        self.conv2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, stride=1, pad=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),    #overlapping max pooling
            nn.BatchNorm2d(),         #BN으로 대체
            #nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2)
        )

        #CONV3  (b, 384, 13, 13)
        self.conv3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, stride=1, pad=1),
            nn.ReLU(inplace=True),
        )

        #CONV4  (b, 384, 13, 13)
        self.conv4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, stride=1, pad=1),
            nn.ReLU(inplace=True),
        )

        #CONV5  (b, 384, 6, 6)
        self.conv5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size=3, stride=1, pad=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),    #overlapping max pooling
        )

        #FC6
        self.fc6 = nn.Sequential(
            nn.Dropout(0.5, inplace=True),
            nn.Linear(384*6*6, 4096),
            nn.ReLU(inplace=True),
        )
        
        #FC7
        self.fc7 = nn.Sequential(
            nn.Dropout(0.5, inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
        )

        #FC8
        self.fc8 = nn.Linear(4096, 1000)
        
        self.init_bias()
    
    def init_bias(self):
