# RWNN_JiaminRen 기준으로 zero base(빈 폴더)에서 구축

- 실험환경: GPU 1개 (GeForce GTX 1080 Ti)

In [1]:
import os
import time
import logging

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn    # for hardware tunning (cudnn.benchmark = True)

from tensorboardX import SummaryWriter

from model import CNN
from lr_scheduler import LRScheduler

In [2]:
# TRAIN:
#     workers: 3
#     batch_size: 256
#     epochs: 100

#     lr_mode : cosine
#     base_lr: 0.1
#     warmup_epochs: 0
#     warmup_lr: 0.0
#     targetlr : 0.0

#     momentum: 0.9
#     weight_decay: 0.00005

#     net_type : regular
#     channels : 109
#     nodes : 32
#     graph_model : WS
#     K : 4
#     P : 0.75
#     seed : 1

#     print_freq: 100
#     model_dir: checkpoint/regular_c109_n32

#     train_root: data/imagenet/train
#     train_source: data/imagenet/train.txt
#     val_root: data/imagenet/val
#     val_source: data/imagenet/val.txt

# TEST:
#     checkpoint_path : data/pretrained_model/regular_c109_n32.pth


#########################################
# LR scheduler() 에 쓰이는 것들  // 없는 것 : args.step, args.decay_factor, args.power
# args.lr_mode, args.warmup_mode, args.base_lr
# args.targetlr, args.warmup_lr, args.epochs, args.warmup_epochs

In [3]:
###########################
# 1. Configuration
########################### 
class Args():
    """Hyper-parameters and output locations for one training run.

    Instantiating the class has side effects: it creates the checkpoint,
    tensorboard and log directories for ``run_code`` (relative to the current
    working directory) and configures the root logger to write to
    ``<log_path>/logging.log``.

    Args:
        run_code: Identifier of the run; used as the sub-directory name under
            ``./checkpoint``, ``./tensorboard`` and ``./logs``.
    """

    def __init__(self, run_code='default_run_code'):
        self.run_code = run_code

        # Data loading / schedule length.
        self.workers = 2
        self.batch_size = 128
        self.epochs = 100

        # Learning-rate schedule.
        self.lr_mode = 'cosine'
        self.warmup_mode = 'linear'    # default
        self.base_lr = 0.1

        self.warmup_epochs = 0
        self.warmup_lr = 0.0
        self.targetlr = 0.0

        # SGD settings.
        self.momentum = 0.9
        self.weight_decay = 0.00005

        # Network / random-graph settings.
        self.net_type = 'regular'
        self.channels = 109
        self.nodes = 32
        self.graph_model = 'WS'
        self.K = 4
        self.P = 0.75

        self.resume = False    # whether to load and reuse a model saved in a checkpoint

        self.seed = 1

        self.print_freq = 100

        # Output locations. model_dir also holds the graph information
        # (e.g. conv2.yaml).
        self.model_dir = './checkpoint' + '/' + run_code + '/'
        self.tensorboard_path = './tensorboard/' + run_code + '/'
        self.log_path = './logs/' + run_code + '/'

        # self.model_name = temp_date_str + '_model_'

        # exist_ok=True avoids the check-then-create race of a separate
        # os.path.isdir() test and makes repeated instantiation safe.
        for directory in (self.model_dir, self.tensorboard_path, self.log_path):
            os.makedirs(directory, exist_ok=True)

        logging.basicConfig(filename=self.log_path + 'logging.log', level=logging.INFO)
        logging.info('Configuration initialized.')

        # Dataset locations of the reference ImageNet configuration, kept for
        # reference only:
        # train_root: data/imagenet/train
        # train_source: data/imagenet/train.txt
        # val_root: data/imagenet/val
        # val_source: data/imagenet/val.txt

        # self.checkpoint_path = 'data/pretrained_model/regular_c109_n32.pth'

In [4]:
# Build the run configuration; the run code names the output sub-directories.
args = Args(run_code='regular_c109_n32')    # default source
print(args.model_dir)

./checkpoint/regular_c109_n32/


In [5]:
###########################
# 2. Model, scheduler, loss, optimizer
###########################
# Wrap the network in DataParallel (for multi-GPU) and move it to the GPU.
model = nn.DataParallel(CNN(args))
model.cuda()

logging.info('Model is initialized')

# Loss function (criterion) and optimizer.
criterion = nn.CrossEntropyLoss()
criterion.cuda()

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=args.base_lr,
    momentum=args.momentum,
    weight_decay=args.weight_decay,
)

######### For now this is implemented without args.evaluate #########
# if args.evaluate:
#     # In evaluation mode: validate once, then 'return' and finish.
#     print("args.evaluate: ", args.evaluate)
#     validate(val_loader, model, criterion, 0, writer)
#     # return

In [6]:
####################################################
# 3. Load the checkpoint if a saved model exists; otherwise train from scratch.
#    NOTE: resuming is not implemented yet -- the loading code below is
#    commented out, so training always starts at epoch 0.
####################################################
# auto resume from a checkpoint
# start_epoch = 0
# if args.evaluate:
#     load_state_ckpt(args.checkpoint_path, model)
# else:
#     best_prec1, start_epoch = load_state(model_dir, model, optimizer=optimizer)
# if args.rank == 0:
#     writer = SummaryWriter(model_dir)
# else:
#     writer = None

# cudnn.benchmark = True

# First epoch to run and best validation top-1 precision seen so far.
start_epoch  = 0
best_prec1 = 0

# write tensorboard summaries & logger
writer = SummaryWriter(log_dir=args.tensorboard_path)

cudnn.benchmark = True    # This flag allows you to enable the inbuilt cudnn auto-tuner to find the best algorithm to use for your hardware.

In [7]:
###########################
# 4. Dataset & Dataloader
###########################
# Both pipelines resize to 224 (added to match the ImageNet input size) and
# rescale pixel values from 0 ~ 1 to -1 ~ 1; the training pipeline additionally
# applies a random horizontal flip (added).
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

val_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

cifar_root = './data'

train_dataset = torchvision.datasets.CIFAR10(
    root=cifar_root, train=True, transform=train_transform, download=True)

val_dataset = torchvision.datasets.CIFAR10(
    root=cifar_root, train=False, transform=val_transform, download=True)

# The sampler is omitted; only the training loader shuffles.
loader_options = dict(batch_size=args.batch_size, num_workers=args.workers)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)

val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

logging.info('Dataset & DataLoader is ready')

####################### Open question: why is the evaluate branch needed?
# if args.evaluate:
#     validate(val_loader, model, criterion, 0, writer)
#     return

Files already downloaded and verified
Files already downloaded and verified


In [8]:
class AverageMeter:
    """Keep the most recent value of a metric together with its running mean.

    Attributes (all reset to 0 by ``reset``):
        val: value passed to the latest ``update`` call.
        sum: weighted sum of all values seen since the last reset.
        count: total weight seen since the last reset.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

def accuracy(output, target, topk=(1,)):
    """Compute the precision@k for the specified values of k.

    Args:
        output: Score tensor; the top-k search runs along dimension 1, so it
            is indexed as (sample, class).
        target: Tensor with one class index per sample; ``target.size(0)`` is
            used as the batch size.
        topk: Iterable of k values to evaluate.

    Returns:
        A list with one single-element tensor per entry of ``topk`` holding
        the percentage (0-100) of samples whose target is among the k
        highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # The metric is never back-propagated, so skip autograd bookkeeping.
    with torch.no_grad():
        # topk returns (values, indices); only the class indices are needed.
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()  # -> (maxk, batch): row r holds the rank-r guess
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # `correct` is non-contiguous after the transpose, so .view(-1)
            # raises on recent PyTorch versions; reshape copies when needed.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
    return res

In [9]:
def train(train_loader, model, criterion, optimizer, lr_scheduler, epoch, writer):
    """Train ``model`` for one epoch on the GPU.

    For every batch the learning-rate scheduler is updated, a forward and
    backward pass is run and the optimizer takes one step. Every
    ``args.print_freq`` batches (``args`` is the module-level configuration)
    the running statistics are printed, logged and written to ``writer``.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches; must
            support ``len()``.
        model: Network to train; switched to training mode here.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        optimizer: Optimizer whose ``param_groups[0]['lr']`` is reported.
        lr_scheduler: Object exposing ``update(batch_index, epoch)``.
        epoch: Index of the current epoch.
        writer: Summary writer exposing ``add_scalar(tag, value, step)``.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    num_batches = len(train_loader)

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        lr_scheduler.update(i, epoch)

        # `async` became a reserved word in Python 3.7; PyTorch renamed the
        # argument to `non_blocking`. It only has an effect when the source
        # tensor lives in pinned memory.
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        batch_size = images.size(0)

        # compute output (tensors no longer need a Variable wrapper)
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        losses.update(loss.item(), batch_size)
        top1.update(prec1.item(), batch_size)
        top5.update(prec5.item(), batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            # Build the progress line once and send it to stdout and the log.
            message = ('Epoch: [{0}][{1}/{2}]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                       'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                       'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                           epoch, i, num_batches, batch_time=batch_time,
                           data_time=data_time, loss=losses, top1=top1, top5=top5))
            print(message)
            logging.info(message)

            # Global step: number of batches processed since epoch 0.
            niter = epoch * num_batches + i
            writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], niter)
            writer.add_scalar('Train/Avg_Loss', losses.avg, niter)
            writer.add_scalar('Train/Avg_Top1', top1.avg / 100.0, niter)
            writer.add_scalar('Train/Avg_Top5', top5.avg / 100.0, niter)

In [10]:
def validate(val_loader, model, criterion, epoch, writer):
    """Evaluate ``model`` on the validation set on the GPU.

    Runs the whole loader without gradient tracking. Every
    ``args.print_freq`` batches (``args`` is the module-level configuration)
    the running statistics are printed and logged; at the end the averaged
    loss and precisions are printed and written to ``writer`` at step
    ``epoch + 1``.

    Args:
        val_loader: Iterable yielding ``(images, target)`` batches; must
            support ``len()``.
        model: Network to evaluate; switched to evaluation mode here.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        epoch: Index of the epoch that was just trained.
        writer: Summary writer exposing ``add_scalar(tag, value, step)``.

    Returns:
        The average top-1 precision (in percent) over the validation set.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_batches = len(val_loader)

    # torch.no_grad() supersedes the removed Variable(..., volatile=True).
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            # `async` became a reserved word in Python 3.7; PyTorch renamed
            # the argument to `non_blocking`. It only has an effect when the
            # source tensor lives in pinned memory.
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            batch_size = images.size(0)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))

            losses.update(loss.item(), batch_size)
            top1.update(prec1.item(), batch_size)
            top5.update(prec5.item(), batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                # Build the progress line once and send it to stdout and the log.
                message = ('Test: [{0}/{1}]\t'
                           'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                           'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                           'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                           'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                               i, num_batches, batch_time=batch_time, loss=losses,
                               top1=top1, top5=top5))
                print(message)
                logging.info(message)

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    # Validation summaries are indexed by the number of completed epochs.
    niter = (epoch + 1)
    writer.add_scalar('Eval/Avg_Loss', losses.avg, niter)
    writer.add_scalar('Eval/Avg_Top1', top1.avg / 100.0, niter)
    writer.add_scalar('Eval/Avg_Top5', top5.avg / 100.0, niter)

    return top1.avg

In [11]:
####################################################
# 5. Train model
####################################################
# Number of batches per epoch, passed to the LR scheduler.
niters = len(train_loader)

lr_scheduler = LRScheduler(optimizer, niters, args)  # NOTE(review): original note says the defaults are args.step = [30, 60, 90], args.decay_factor = 0.1, args.power = 2.0 -- confirm in lr_scheduler.py

for epoch in range(start_epoch, args.epochs):

    # train for one epoch
    ####################################################
    # One epoch of training and of validation are defined as separate
    # functions, which keeps this loop much cleaner.
    ####################################################
    train(train_loader, model, criterion, optimizer, lr_scheduler, epoch, writer)

    # evaluate on validation set
    prec1 = validate(val_loader, model, criterion, epoch, writer)

    #  write log
    # remember best prec@1 and save checkpoint
    is_best = prec1 > best_prec1
    best_prec1 = max(prec1, best_prec1)

    ########################################################## TODO: checkpoint saving still has to be implemented ###################
    # Save a checkpoint when is_best is true (is_best is currently unused).
    # The run code already identifies the model graph, so saving only
    # epoch, model.state_dict(), best_prec1 and optimizer.state_dict()
    # should be enough.

#     save_checkpoint(model_dir, {
#         'epoch': epoch + 1,
#         'model': args.config.rsplit('/',1)[-1].split('.yaml')[0],
#         'state_dict': model.state_dict(),
#         'best_prec1': best_prec1,
#         'optimizer': optimizer.state_dict(),
#     }, is_best)

Epoch: [0][0/391]	Time 11.467 (11.467)	Data 0.401 (0.401)	Loss 7.0085 (7.0085)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][100/391]	Time 0.635 (0.777)	Data 0.009 (0.013)	Loss 1.6835 (2.1347)	Prec@1 42.188 (25.766)	Prec@5 88.281 (78.473)
Epoch: [0][200/391]	Time 0.661 (0.728)	Data 0.009 (0.011)	Loss 1.7369 (1.9198)	Prec@1 40.625 (31.417)	Prec@5 86.719 (83.357)
Epoch: [0][300/391]	Time 0.663 (0.715)	Data 0.009 (0.010)	Loss 1.5091 (1.7992)	Prec@1 45.312 (35.237)	Prec@5 90.625 (85.878)


  
  from ipykernel import kernelapp as app


Test: [0/79]	Time 0.843 (0.843)	Loss 1.4251 (1.4251)	Prec@1 51.562 (51.562)	Prec@5 89.844 (89.844)
 * Prec@1 47.680 Prec@5 93.090
Epoch: [1][0/391]	Time 3.245 (3.245)	Data 0.452 (0.452)	Loss 1.5748 (1.5748)	Prec@1 39.062 (39.062)	Prec@5 92.969 (92.969)
Epoch: [1][100/391]	Time 0.695 (0.729)	Data 0.009 (0.013)	Loss 1.2795 (1.3143)	Prec@1 53.125 (53.125)	Prec@5 92.969 (93.588)
Epoch: [1][200/391]	Time 0.703 (0.719)	Data 0.008 (0.011)	Loss 1.1226 (1.2535)	Prec@1 61.719 (55.162)	Prec@5 93.750 (94.236)
Epoch: [1][300/391]	Time 0.699 (0.717)	Data 0.013 (0.010)	Loss 1.0199 (1.1941)	Prec@1 56.250 (57.309)	Prec@5 97.656 (94.902)
Test: [0/79]	Time 0.737 (0.737)	Loss 1.0290 (1.0290)	Prec@1 66.406 (66.406)	Prec@5 98.438 (98.438)
 * Prec@1 63.290 Prec@5 96.190
Epoch: [2][0/391]	Time 1.138 (1.138)	Data 0.356 (0.356)	Loss 1.0180 (1.0180)	Prec@1 64.844 (64.844)	Prec@5 97.656 (97.656)
Epoch: [2][100/391]	Time 0.678 (0.713)	Data 0.008 (0.012)	Loss 0.9994 (0.9060)	Prec@1 64.844 (68.301)	Prec@5 96.875 (97

Test: [0/79]	Time 0.731 (0.731)	Loss 0.5012 (0.5012)	Prec@1 86.719 (86.719)	Prec@5 98.438 (98.438)
 * Prec@1 85.390 Prec@5 99.290
Epoch: [14][0/391]	Time 1.137 (1.137)	Data 0.372 (0.372)	Loss 0.0865 (0.0865)	Prec@1 96.875 (96.875)	Prec@5 100.000 (100.000)
Epoch: [14][100/391]	Time 0.691 (0.714)	Data 0.009 (0.012)	Loss 0.0869 (0.1074)	Prec@1 95.312 (96.101)	Prec@5 100.000 (100.000)
Epoch: [14][200/391]	Time 0.735 (0.713)	Data 0.008 (0.011)	Loss 0.1345 (0.1056)	Prec@1 96.094 (96.323)	Prec@5 100.000 (99.988)
Epoch: [14][300/391]	Time 0.741 (0.712)	Data 0.009 (0.010)	Loss 0.1214 (0.1141)	Prec@1 95.312 (95.974)	Prec@5 99.219 (99.977)
Test: [0/79]	Time 0.651 (0.651)	Loss 0.3767 (0.3767)	Prec@1 88.281 (88.281)	Prec@5 99.219 (99.219)
 * Prec@1 84.670 Prec@5 99.430
Epoch: [15][0/391]	Time 1.136 (1.136)	Data 0.426 (0.426)	Loss 0.0547 (0.0547)	Prec@1 98.438 (98.438)	Prec@5 100.000 (100.000)
Epoch: [15][100/391]	Time 0.716 (0.715)	Data 0.010 (0.013)	Loss 0.0776 (0.0974)	Prec@1 96.875 (96.573)	Prec

RuntimeError: CUDA error: unspecified launch failure

In [None]:
# import shutil

# def save_checkpoint(model_dir, state, is_best):
#     epoch = state['epoch']
#     path = os.path.join(model_dir, 'model.pth-' + str(epoch))
#     torch.save(state, path)
#     checkpoint_file = os.path.join(model_dir, 'checkpoint')
#     checkpoint = open(checkpoint_file, 'w+')
#     checkpoint.write('model_checkpoint_path:%s\n' % path)
#     checkpoint.close()
#     if is_best:
#         shutil.copyfile(path, os.path.join(model_dir, 'model-best.pth'))

## Pytorch Distributed 패키지 활용

In [6]:
import torch

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

In [7]:
# Query and print the number of CUDA devices reported by torch.
ngpus_per_node = torch.cuda.device_count()
print(ngpus_per_node)

4
