In [0]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.utils import save_image
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
import glob
import PIL
from PIL import Image
from torch.utils import data as D
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import _LRScheduler
import random
import torchsummary
from torch.utils.data import DataLoader
import time
from datetime import datetime
from torch.autograd import Variable

CIFAR100_TRAIN_MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
CIFAR100_TRAIN_STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404)
MILESTONES = [20, 40, 80] #learning rate 값을 60 epoch, 120 epoch, 160 epoch 별로 어떻게 하겠따.
CHECKPOINT_PATH = 'checkpoint'
TIME_NOW = datetime.now().isoformat()

In [0]:
num_workers = 4
batch_size = 64 
validation_ratio = 0.1
random_seed = 10 
EPOCH = 100
lr = 0.1
warm = 1
SAVE_EPOCH=10

In [0]:
class depthwise_conv(nn.Module):
    # __init__()에서 모델의 구조와 동작을 정의하는 생성자를 정의(속성값을 초기화하는 역할로, 객체가 생성될 때 자동으호 호출)
    def __init__(self, nin, kernel_size, padding, bias=False, stride=1):
        super(depthwise_conv, self).__init__() # super() 함수를 부르면 여기서 만든 클래스는 nn.Module 클래스의 속성들을 가지고 초기화
        #nn.conv2D 모듈 : 첫번째는 입력 채널 수, 두번째변수는 출력 채널 수 세번째는 커널 사이즈(숫자하나만 지정하면 정사각형으로 간주)
        self.depthwise = nn.Conv2d(nin, nin, kernel_size=kernel_size, stride=stride, padding=padding, groups=nin, bias=bias)
        #self.depthwise는 이제 nin 크기의 받아서 nin 크기의 출력을 하는 conv2D 함수가 됨.

    #foward() 함수는 모델이 학습데이터를 입력받아서 forward 연산을 진행시키는 함수
    def forward(self, x):
        out = self.depthwise(x)  #self.depthwise 실행하고 반환
        return out

In [0]:
class dw_block(nn.Module):
    def __init__(self, nin, kernel_size, padding=1, bias=False, stride=1):
        super(dw_block, self).__init__()
        self.dw_block = nn.Sequential(
            depthwise_conv(nin, kernel_size, padding, bias, stride),
            #BatchNorm2d(배치 정규화): 학습률을 너무 높게 잡으면 기울기가 소실되거나 발산하는 증상을 예방하여 학습과정을 안정화하는 방법
            nn.BatchNorm2d(nin),
            ##distribution을 평균 0, 표준편차 1인 input으로 normalize시키는 방법
            ##Training 할 때는 batch의 평균과 분산으로 normalize 하고, Test 할 때는 계산해놓은 이동 평균(training 때 계산)으로 normalize
            nn.ReLU()
        )
    def forward(self, x):
        out = self.dw_block(x)
        return out

In [0]:
class one_by_one_block(nn.Module):
    def __init__(self, nin, nout, padding=0, bias=False, stride=1):
        super(one_by_one_block, self).__init__()
        self.one_by_one_block = nn.Sequential(
            #커널 사이즈 1x1 로 컨벌루션 진행
            nn.Conv2d(nin, nout, kernel_size=1, stride=stride, padding=padding, bias=bias),
            nn.BatchNorm2d(nout),
            nn.ReLU()
        )
    def forward(self, x):
        out = self.one_by_one_block(x)
        return out

In [0]:
#########################################
#           Original Mobilenet          #
#########################################

class MobileNet(nn.Module):
  
    def __init__(self, input_channel, num_classes=10):
        super(MobileNet, self).__init__()
        
        self.network = nn.Sequential(
            #nn.conv2D 모듈 : 첫번째는 입력 채널 수, 두번째변수는 출력 채널 수 세번째는 커널 사이즈(숫자하나만 지정하면 정사각형으로 간주)
            #BatchNorm2d(배치 정규화): 학습률을 너무 높게 잡으면 기울기가 소실되거나 발산하는 증상을 예방하여 학습과정을 안정화하는 방법
            #계층에 들어가는 입력을 평균과 분산으로 정규화함.
            nn.Conv2d(input_channel, 32, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            
            dw_block(32, kernel_size=3), #depthwise convolution
            one_by_one_block(32, 64), #one_by_one convolution
            
            dw_block(64, kernel_size=3, stride=2),
            one_by_one_block(64, 128),
            
            dw_block(128, kernel_size=3),
            one_by_one_block(128, 128),
            
            dw_block(128, kernel_size=3, stride=2),
            one_by_one_block(128, 256),
            
            dw_block(256, kernel_size=3),
            one_by_one_block(256, 256),
            
            dw_block(256, kernel_size=3, stride=2),
            one_by_one_block(256, 512),
            
            # 5 times 
            dw_block(512, kernel_size=3),
            one_by_one_block(512, 512),
            dw_block(512, kernel_size=3),
            one_by_one_block(512, 512),
            dw_block(512, kernel_size=3),
            one_by_one_block(512, 512),
            dw_block(512, kernel_size=3),
            one_by_one_block(512, 512),
            dw_block(512, kernel_size=3),
            one_by_one_block(512, 512),
            
            dw_block(512, kernel_size=3, padding=4,stride=2),
            one_by_one_block(512, 1024),
            
            dw_block(1024, kernel_size=3, padding=4, stride=2),
            one_by_one_block(1024, 1024),
        )
                
        self.linear = nn.Linear(1024, num_classes)
        
    def forward(self, x):
        body_output = self.network(x)
        
        avg_pool_output = F.adaptive_avg_pool2d(body_output, (1, 1))
        avg_pool_flat = avg_pool_output.view(avg_pool_output.size(0), -1)

        output = self.linear(avg_pool_flat)
        
        return output

In [0]:
net = MobileNet(3, 100)

In [0]:
def get_training_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
    """ return training dataloader
    Args:
        mean: mean of cifar100 training dataset
        std: std of cifar100 training dataset
        path: path to cifar100 training python dataset
        batch_size: dataloader batchsize
        num_workers: dataloader num_works
        shuffle: whether to shuffle 
    Returns: train_data_loader:torch dataloader object
    """

    transform_train = transforms.Compose([
        #transforms.ToPILImage(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    #cifar100_training = CIFAR100Train(path, transform=transform_train)
    cifar100_training = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_train)
    cifar100_training_loader = DataLoader(
        cifar100_training, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)

    return cifar100_training_loader

def get_test_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
    """ return training dataloader
    Args:
        mean: mean of cifar100 test dataset
        std: std of cifar100 test dataset
        path: path to cifar100 test python dataset
        batch_size: dataloader batchsize
        num_workers: dataloader num_works
        shuffle: whether to shuffle 
    Returns: cifar100_test_loader:torch dataloader object
    """

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    #cifar100_test = CIFAR100Test(path, transform=transform_test)
    cifar100_test = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_test)
    cifar100_test_loader = DataLoader(
        cifar100_test, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)
    
    return cifar100_test_loader

class WarmUpLR(_LRScheduler):
    """warmup_training learning rate scheduler
    Args:
        optimizer: optimzier(e.g. SGD)
        total_iters: totoal_iters of warmup phase
    """
    def __init__(self, optimizer, total_iters, last_epoch=-1):
        
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """we will use the first m batches, and set the learning
        rate to base_lr * m / total_iters
        """
        return [base_lr * self.last_epoch / (self.total_iters + 1e-8) for base_lr in self.base_lrs]

In [0]:
###================================ Dataset Resize ================================###
#데이터셋 설정
transform_train = transforms.Compose([
        transforms.Resize(224),
        ### 오버피팅을 방지하기 위해 RandomCrop과 Randam HorizontalFlip같은 노이즈 추가.
        transforms.RandomCrop(224, padding=28), #오버피팅 막으려고 랜덤으로 잘라서 이미지 만든다,,,,(?)
        transforms.RandomHorizontalFlip(), # 오버피팅 막으려고 이미지 반전시켜서 만든다,,,(?)
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))])

#validation이나 test는 그런 것 필요 없음
transform_validation = transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))])


transform_test = transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))])
###================================ Dataset Resize ================================###

cifar100_training_loader = get_training_dataloader(
        CIFAR100_TRAIN_MEAN,
        CIFAR100_TRAIN_STD,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True
    )

cifar100_test_loader = get_test_dataloader(
        CIFAR100_TRAIN_MEAN,
        CIFAR100_TRAIN_STD,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False
    )

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified


In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #GPU있고 cuda를 쓸수 있으면 쿠다를 쓰게 하고 없으면 cpu 쓰게함
print(device)

cuda:0


In [0]:
net.to(device) #이용가능한 device(cpu or Gpu)에 네트워크 전송

MobileNet(
  (network): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): dw_block(
      (dw_block): Sequential(
        (0): depthwise_conv(
          (depthwise): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        )
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
    (4): one_by_one_block(
      (one_by_one_block): Sequential(
        (0): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
    (5): dw_block(
      (dw_block): Sequential(
        (0): depthwise_conv(
          (depthwise): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), 

In [0]:
torchsummary.summary(net, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 112, 112]             864
       BatchNorm2d-2         [-1, 32, 112, 112]              64
              ReLU-3         [-1, 32, 112, 112]               0
            Conv2d-4         [-1, 32, 112, 112]             288
    depthwise_conv-5         [-1, 32, 112, 112]               0
       BatchNorm2d-6         [-1, 32, 112, 112]              64
              ReLU-7         [-1, 32, 112, 112]               0
          dw_block-8         [-1, 32, 112, 112]               0
            Conv2d-9         [-1, 64, 112, 112]           2,048
      BatchNorm2d-10         [-1, 64, 112, 112]             128
             ReLU-11         [-1, 64, 112, 112]               0
 one_by_one_block-12         [-1, 64, 112, 112]               0
           Conv2d-13           [-1, 64, 56, 56]             576
   depthwise_conv-14           [-1, 64,

In [0]:
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2) #learning rate decay
#20, 40, 80
# 20 <= epoch < 40, lr = lr*0.2 
iter_per_epoch = len(cifar100_training_loader)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * warm)
checkpoint_path = os.path.join(CHECKPOINT_PATH, "MobileNet", TIME_NOW)

In [0]:
def train(epoch):

    net.train()
    num_trained_data = 0
    for batch_index, (images, labels) in enumerate(cifar100_training_loader):
        if epoch <= warm:
            warmup_scheduler.step()

        images = Variable(images)
        labels = Variable(labels)

        labels = labels.cuda()
        images = images.cuda()
        
        optimizer.zero_grad()
        outputs = net(images)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        n_iter = (epoch - 1) * len(cifar100_training_loader) + batch_index + 1
        num_trained_data += len(images)
        '''print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
            loss.item(),
            optimizer.param_groups[0]['lr'],
            epoch=epoch,
            trained_samples= num_trained_data,
            total_samples=len(cifar100_training_loader.dataset)
        ))'''

In [0]:
def eval_training(epoch):
    net.eval()

    test_loss = 0.0 # cost function error
    correct = 0.0

    for (images, labels) in cifar100_test_loader:
        images = Variable(images)
        labels = Variable(labels)

        images = images.cuda()
        labels = labels.cuda()

        outputs = net(images)
        loss = loss_function(outputs, labels)
        test_loss += loss.item()
        _, preds = outputs.max(1)
        correct += preds.eq(labels).sum()

    print('[', epoch, 'epoch]')
    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
        test_loss / len(cifar100_test_loader.dataset),
        correct.float() / len(cifar100_test_loader.dataset)
    ))
    print()

    return correct.float() / len(cifar100_test_loader.dataset)

In [0]:
 if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

best_acc = 0.0
for epoch in range(1, EPOCH):
    if epoch > warm:
        train_scheduler.step(epoch)

    train(epoch)
    acc = eval_training(epoch)

    #start to save best performance model after learning rate decay to 0.01 
    if epoch > MILESTONES[1] and best_acc < acc:
        torch.save(net.state_dict(), checkpoint_path.format(net="MobileNet", epoch=epoch, type='best'))
        best_acc = acc
        continue

    if not epoch % SAVE_EPOCH:
        torch.save(net.state_dict(), checkpoint_path.format(net="MobileNet", epoch=epoch, type='regular'))
print('Training Finish')



[ 1 epoch]
Test set: Average loss: 0.0632, Accuracy: 0.0849

[ 2 epoch]
Test set: Average loss: 0.0589, Accuracy: 0.1166

[ 3 epoch]
Test set: Average loss: 0.0580, Accuracy: 0.1243

[ 4 epoch]
Test set: Average loss: 0.0575, Accuracy: 0.1279

[ 5 epoch]
Test set: Average loss: 0.0571, Accuracy: 0.1322

[ 6 epoch]
Test set: Average loss: 0.0568, Accuracy: 0.1360

[ 7 epoch]
Test set: Average loss: 0.0564, Accuracy: 0.1398

[ 8 epoch]
Test set: Average loss: 0.0562, Accuracy: 0.1420

[ 9 epoch]
Test set: Average loss: 0.0557, Accuracy: 0.1441

[ 10 epoch]
Test set: Average loss: 0.0555, Accuracy: 0.1506

[ 11 epoch]
Test set: Average loss: 0.0552, Accuracy: 0.1559

[ 12 epoch]
Test set: Average loss: 0.0549, Accuracy: 0.1568

[ 13 epoch]
Test set: Average loss: 0.0548, Accuracy: 0.1542

[ 14 epoch]
Test set: Average loss: 0.0543, Accuracy: 0.1590

[ 15 epoch]
Test set: Average loss: 0.0540, Accuracy: 0.1617

[ 16 epoch]
Test set: Average loss: 0.0538, Accuracy: 0.1660

[ 17 epoch]
Test 