# 논문 내 설정 요약
- **훈련 목적**: 다중 클래스 분류 문제에서 다항 로지스틱 회귀 목표를 최적화
- **최적화 방법**: mini-batch gradient descent (mini-batch 크기: 256)과 momentum (momentum 값: 0.9) 사용
- **정규화**:
  - L2 패널티(가중치 감소) 활용 (L2 패널티 곱셈 인수: \(5 \times 10^{-4}\))
  - 첫 두 개의 fully-connected layer에 dropout 정규화 적용 (dropout 비율: 0.5)
- **학습률 설정**:
  - 초기 학습률: \(10^{-2}\)
  - 검증 세트 정확도가 개선되지 않을 때마다 학습률을 10배 감소
  - 전체 감속 횟수: 3회
  - 최종 반복 수: 370K 회 (74 epochs)
- **초기화**:
  - 초기화의 중요성: 나쁜 초기화는 깊은 신경망에서 gradient 불안정성을 초래하여 학습을 중단시킬 수 있음
  - 초기화 방법:
    - Configuration A (Table 1)를 무작위 초기화로 훈련
    - 더 깊은 아키텍처를 훈련할 때, 첫 4개의 convolutional layer와 마지막 3개의 fully-connected layer를 Configuration A의 레이어로 초기화
    - 중간 레이어는 무작위 초깃값으로 초기화
  - 가중치 샘플링: 평균 0, 분산 \(10^{-2}\)인 정규 분포에서 샘플링
  - Bias 초기화: 0으로 초기화
  - 참고: Glorot & Bengio(2010) 절차를 통해 사전 학습 없이 초기화 가능
- **입력 이미지 전처리**:
  - 고정 크기 \(224 \times 224\)의 ConvNet 입력 이미지는 재조정된 훈련 이미지에서 무작위로 자름
  - 데이터 증강: 무작위 수평 뒤집기 및 무작위 RGB 색상 변환 적용

# Data prepairing

In [1]:
import torchvision
import torch
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# transforms
transform = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.RandomCrop((224, 224), padding=4),
        transforms.RandomVerticalFlip(0.5),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
    ]
)

train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
valid_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)

# image size check
print(train_set.data.shape)
print(valid_set.data.shape)

Files already downloaded and verified
Files already downloaded and verified
(50000, 32, 32, 3)
(10000, 32, 32, 3)


# Modeling

In [2]:
import torch.nn as nn
import torch.nn.init as init

class VGG16(nn.Module):
    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()

        self.convnet = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=512 * 7 * 7, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=num_classes),
            nn.Softmax(dim=1)
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.convnet(x)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                init.normal_(m.weight, mean=0, std=0.1)  # 분산이 10^-2인 정규 분포에서 샘플링
                if m.bias is not None:
                    init.constant_(m.bias, 0)

# Training

In [10]:
import tqdm, os
import torch.optim as optim
from torch.utils.data import DataLoader


num_epochs = 5
learning_rate = 0.01
batch_size = 32
momentum = 0.9
weight_decay = 5e-4

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VGG16().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

model_path = "vgg16_latest.pt"
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path))
    print(f"Model loaded from {model_path}")

for epoch in range(num_epochs):
    iterator = tqdm.tqdm(train_loader)
    for data, label in iterator:
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()

        preds = model(data)
        loss = criterion(preds, label)
        loss.backward()
        optimizer.step()

        iterator.set_description(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss.item():.4f}")

    # 학습률 감소
    if epoch > 0 and epoch % 25 == 0:
        learning_rate /= 10
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

    torch.save(model.state_dict(), model_path)

Epoch 1/5 - Loss: 2.2737: 100%|██████████| 1563/1563 [09:54<00:00,  2.63it/s]
Epoch 2/5 - Loss: 2.4612: 100%|██████████| 1563/1563 [09:53<00:00,  2.63it/s]
Epoch 3/5 - Loss: 2.4612: 100%|██████████| 1563/1563 [09:51<00:00,  2.64it/s]
Epoch 4/5 - Loss: 2.3987: 100%|██████████| 1563/1563 [09:56<00:00,  2.62it/s]
Epoch 5/5 - Loss: 2.3362: 100%|██████████| 1563/1563 [09:49<00:00,  2.65it/s]


# Testing

In [11]:
num_corr = 0

with torch.no_grad():
    for data, label in valid_loader:
        data, label = data.to(device), label.to(device)
        preds = model(data).data.max(1)[1]
        corr = preds.eq(label.data).sum().item()
        num_corr += corr

print(f"Accuracy: {num_corr/len(valid_set)}")

Accuracy: 0.0992
