# module 4. Distributed-Mutigpu Training with CutMix-ScriptMode
---

본 모듈에서는 Amzaon SageMaker API을 효과적으로 이용하기 위해 distributed-multigpu 학습을 위한 PyTorch 프레임워크 자체 구현만으로 모델 훈련을 수행해 봅니다.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import time, datetime

import warnings
warnings.filterwarnings('ignore')

from torch.utils.data import Dataset
from sklearn.metrics import recall_score

import matplotlib.pyplot as plt
import joblib

from sagemaker.pytorch import PyTorch, PyTorchModel

PyTorch 버전을 최신 버전으로 업데이트합니다. 

## 1. Setup

In [2]:
import time

import boto3
import glob
import numpy as np
from PIL import Image
import pandas as pd
import torch, torchvision
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt
from torchvision import datasets, transforms

import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

In [3]:
sess = boto3.Session()
sagemaker_session = sagemaker.Session()
sm = sess.client('sagemaker')
role = get_execution_role()

## 2. Upload dataset to S3 bucket

In [4]:
# create a s3 bucket to hold data, note that your account might already created a bucket with the same name
account_id = sess.client('sts').get_caller_identity()["Account"]
job_bucket = 'sagemaker-experiments-{}-{}'.format(sess.region_name, account_id)
data_bucket = 'sagemaker-{}-{}'.format(sess.region_name, account_id)
try:
    if sess.region_name == "us-east-1":
        sess.client('s3').create_bucket(Bucket=data_bucket)
    else:
        sess.client('s3').create_bucket(Bucket=data_bucket, 
                                        CreateBucketConfiguration={'LocationConstraint': sess.region_name})
except Exception as e:
    print(e)

An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [5]:
local_data_paths = glob.glob('./input/train_image_data_*.feather')
s3_data_path = sagemaker.s3_input(s3_data='s3://{}/{}'.format(data_bucket, 'bangali/train'), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


## 3. Write main_trainer.py

In [6]:
%%writefile ./src/requirements.txt
albumentations
pyarrow

Overwriting ./src/requirements.txt


In [7]:
%%writefile ./src/main_trainer.py

import argparse
import datetime
import gc
import glob
import json
import logging
import logging.handlers
import os
import random
import sys
import time
import warnings
from functools import partial

import numpy as np
import pandas as pd
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset

import dis_util
import joblib
import matplotlib.pyplot as plt
import sagemaker_containers
import util
## augmentation for setting
from albumentations import (CLAHE, Blur, Compose, Flip, GaussNoise,
                            GridDistortion, HorizontalFlip, HueSaturationValue,
                            IAAAdditiveGaussianNoise, IAAEmboss,
                            IAAPerspective, IAAPiecewiseAffine, IAASharpen,
                            MedianBlur, MotionBlur, Normalize, OneOf,
                            OpticalDistortion, RandomBrightnessContrast,
                            RandomRotate90, Rotate, ShiftScaleRotate,
                            Transpose)
from albumentations.pytorch import ToTensor, ToTensorV2
from sklearn.metrics import recall_score

warnings.filterwarnings('ignore')

## Apex import package
try:
    from apex.parallel import DistributedDataParallel as DDP
    from apex.fp16_utils import *
    from apex import amp, optimizers
    from apex.multi_tensor_apply import multi_tensor_applier
except ImportError:
    raise ImportError(
        "Please install apex from https://www.github.com/nvidia/apex to run this example.")




logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


def parser_args():
    parser = argparse.ArgumentParser()

    # Default Setting
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--backend', type=str, default='nccl',
                        help='backend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu)')
    parser.add_argument('--channels-last', type=bool, default=True)
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('-p', '--print-freq', default=10, type=int,
                        metavar='N', help='print frequency (default: 10)')

    # Hyperparameter Setting
    parser.add_argument('--model_name', type=str, default='resnet18')
    parser.add_argument('--height', type=int, default=128)
    parser.add_argument('--width', type=int, default=128)
    parser.add_argument('--num_folds', type=int, default=5)
    parser.add_argument('--vld_fold_idx', type=int, default=4)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--num_epochs', type=int, default=3)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--val-batch-size', type=int, default=200, metavar='N',
                        help='input batch size for testing (default: 200)')

    # APEX Setting for Distributed Training
    parser.add_argument('--apex', type=bool, default=False)
    parser.add_argument('--opt-level', type=str, default='O0')
    parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
    parser.add_argument('--loss-scale', type=str, default=None)
    parser.add_argument('--sync_bn', action='store_false',
                        help='enabling apex sync BN.')
    parser.add_argument('--prof', default=-1, type=int,
                        help='Only run 10 iterations for profiling.')

    # SageMaker Container environment
    parser.add_argument('--hosts', type=list,
                        default=json.loads(os.environ['SM_HOSTS']))
    parser.add_argument('--current-host', type=str,
                        default=os.environ['SM_CURRENT_HOST'])
    parser.add_argument('--model-dir', type=str,
                        default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--data-dir', type=str,
                        default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--num-gpus', type=int,
                        default=os.environ['SM_NUM_GPUS'])
    parser.add_argument('--output_data_dir', type=str,
                        default=os.environ.get('SM_OUTPUT_DATA_DIR'))

    args = parser.parse_args()
    return args
    
    
class BangaliDataset(Dataset):
    def __init__(self, imgs, label_df=None, transform=None):
        self.imgs = imgs
        self.label_df = label_df.reset_index(drop=True)
        self.transform = transform
        
    def __len__(self):
        return len(self.label_df)
    
    def __getitem__(self, idx):
        
        img_idx = self.label_df.iloc[idx].id
        img = (self.imgs[img_idx]).astype(np.uint8)
        img = 255 - img
    
        img = img[:,:,np.newaxis]
        img = np.repeat(img, 3, axis=2)
        
        if self.transform is not None:
            img = self.transform(image=img)['image']        
        
        if self.label_df is not None:
            label_1 = self.label_df.iloc[idx].grapheme_root
            label_2 = self.label_df.iloc[idx].vowel_diacritic
            label_3 = self.label_df.iloc[idx].consonant_diacritic           
            return img, np.array([label_1, label_2, label_3])        
        else:
            return img
        
        

def _rand_bbox(size, lam):
    '''
    CutMix Helper function.
    Retrieved from https://github.com/clovaai/CutMix-PyTorch/blob/master/train.py
    '''
    W = size[2]
    H = size[3]
    # 폭과 높이는 주어진 이미지의 폭과 높이의 beta distribution에서 뽑은 lambda로 얻는다
    cut_rat = np.sqrt(1. - lam)
    
    # patch size 의 w, h 는 original image 의 w,h 에 np.sqrt(1-lambda) 를 곱해준 값입니다.
    cut_w = np.int(W * cut_rat)
    cut_h = np.int(H * cut_rat)

    # patch의 중심점은 uniform하게 뽑힘
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2


def _set_seed(seed=42):
    np.random.seed(seed)
    random.seed(seed)


def _get_images(args, data_type='train'):

    logger.info("=== Getting Labels ===")
    logger.info(args.data_dir)
    
    label_df = pd.read_csv(os.path.join(args.data_dir, 'train_folds.csv'))
     
    trn_fold = [i for i in range(args.num_folds) if i not in [args.vld_fold_idx]]
    vld_fold = [args.vld_fold_idx]

    trn_idx = label_df.loc[label_df['fold'].isin(trn_fold)].index
    vld_idx = label_df.loc[label_df['fold'].isin(vld_fold)].index

    logger.info("=== Getting Images ===")
    # files = [file for file in glob.glob(os.path.join(args.data_dir,'*')) if file.split('.')[-1] == 'parquet']
    files = [f'{args.data_dir}/{data_type}_image_data_{i}.feather' for i in range(4)]
    logger.info(files)
    
    image_df_list = [pd.read_feather(f) for f in files]
    imgs = [df.iloc[:, 1:].values.reshape(-1, args.height, args.width) for df in image_df_list]
    del image_df_list
    gc.collect()
    args.imgs = np.concatenate(imgs, axis=0)
    
    args.trn_df = label_df.loc[trn_idx]
    args.vld_df = label_df.loc[vld_idx]
    
    return args 


def _get_train_data_loader(args, **kwargs):
    logger.info("Get train data loader")
    train_transforms = Compose([
        Rotate(20),
            OneOf([
                IAAAdditiveGaussianNoise(),
                GaussNoise(),
            ], p=0.2),
            OneOf([
                MotionBlur(p=.2),
                MedianBlur(blur_limit=3, p=0.1),
                Blur(blur_limit=3, p=0.1),
            ], p=0.2),
            ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.2),
            OneOf([
                OpticalDistortion(p=0.3),
                GridDistortion(p=.1),
                IAAPiecewiseAffine(p=0.3),
            ], p=0.2),
            OneOf([
                CLAHE(clip_limit=2),
                IAASharpen(),
                IAAEmboss(),
                RandomBrightnessContrast(),            
            ], p=0.3),
            HueSaturationValue(p=0.3),
        Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
        ToTensorV2()
        ], p=1.0)
    
    dataset = BangaliDataset(imgs=args.imgs, label_df=args.trn_df, transform=train_transforms)
    train_sampler = data.distributed.DistributedSampler(
        dataset, num_replicas=int(args.world_size), rank=int(args.rank)) if args.multigpus_distributed else None
    return data.DataLoader(dataset, batch_size=args.batch_size, shuffle=train_sampler is None,
                                       sampler=train_sampler, **kwargs), train_sampler


def _get_val_data_loader(args, **kwargs):
    logger.info("Get val data loader")   
    
    val_transforms = Compose([
        Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
        ToTensorV2()
    ])
    
    dataset = BangaliDataset(imgs=args.imgs, label_df=args.vld_df, transform=val_transforms)
    val_sampler = data.distributed.DistributedSampler(dataset) if args.multigpus_distributed else None
    return data.DataLoader(dataset, batch_size=args.val_batch_size, shuffle=False, 
                           sampler=val_sampler, **kwargs)

def train(current_gpu, args):
    
    _set_seed()
    
    best_acc1 = -1
    model_history = {}
    model_history = util.init_modelhistory(model_history)
    train_start = time.time()

    ## choose model from pytorch model_zoo
    model = util.torch_model(args.model_name, pretrained=True)
    last_hidden_units = model.fc.in_features
    model.fc = torch.nn.Linear(last_hidden_units, 186)
    
    loss_fn = nn.CrossEntropyLoss()
    
    ## distributed_setting 
    model, args = dis_util.dist_setting(current_gpu, model, args)

    ## CuDNN library will benchmark several algorithms and pick that which it found to be fastest
    cudnn.benchmark = False if args.seed else True
    
    optimizer = optim.Adam(model.parameters(), lr=args.lr) 
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                          verbose=True, patience=5, 
                                                          factor=0.5)
    if args.apex:
        model, optimizer = dis_util.apex_init(model, optimizer, args)
   
    args = _get_images(args, data_type='train')
    train_loader, train_sampler = _get_train_data_loader(args, **args.kwargs)
    val_loader = _get_val_data_loader(args, **args.kwargs)

    logger.info("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))

    logger.info("Processes {}/{} ({:.0f}%) of test data".format(
        len(val_loader.sampler), len(val_loader.dataset),
        100. * len(val_loader.sampler) / len(val_loader.dataset)
    ))

    for epoch in range(1, args.num_epochs + 1):
        batch_time = util.AverageMeter('Time', ':6.3f')
        data_time = util.AverageMeter('Data', ':6.3f')
        losses = util.AverageMeter('Loss', ':.4e')
        top1 = util.AverageMeter('Acc@1', ':6.2f')
        top5 = util.AverageMeter('Acc@5', ':6.2f')
        progress = util.ProgressMeter(
            len(train_loader),
            [batch_time, data_time, losses, top1, top5],
            prefix="Epoch: [{}]".format(epoch))
        
        trn_loss = []
        model.train()
        end = time.time()
        running_loss = 0.0
        ## Set epoch count for DistributedSampler
        if args.multigpus_distributed:
            train_sampler.set_epoch(epoch)
            
        for batch_idx, (input, target) in enumerate((train_loader)):
            input = input.cuda()
            target = target.cuda()
            
            batch_idx += 1

            ##### DATA Processing #####
            targets_gra = target[:, 0]
            targets_vow = target[:, 1]
            targets_con = target[:, 2]

            # 50%의 확률로 원본 데이터 그대로 사용    
            if np.random.rand() < 0.5:
                
                logits = model(input)
                grapheme = logits[:,:168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]
            
                loss1 = loss_fn(grapheme, targets_gra)
                loss2 = loss_fn(vowel, targets_vow)
                loss3 = loss_fn(cons, targets_con) 
            else:
                lam = np.random.beta(1.0, 1.0) 
                rand_index = torch.randperm(input.size()[0])
                shuffled_targets_gra = targets_gra[rand_index]
                shuffled_targets_vow = targets_vow[rand_index]
                shuffled_targets_con = targets_con[rand_index]

                bbx1, bby1, bbx2, bby2 = _rand_bbox(input.size(), lam)
                input[:, :, bbx1:bbx2, bby1:bby2] = input[rand_index, :, bbx1:bbx2, bby1:bby2]
                # 픽셀 비율과 정확히 일치하도록 lambda 파라메터 조정  
                lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (input.size()[-1] * input.size()[-2]))

                logits = model(input)
                grapheme = logits[:,:168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]
            
                loss1 = loss_fn(grapheme, targets_gra) * lam + loss_fn(grapheme, shuffled_targets_gra) * (1. - lam)
                loss2 = loss_fn(vowel, targets_vow) * lam + loss_fn(vowel, shuffled_targets_vow) * (1. - lam)
                loss3 = loss_fn(cons, targets_con) * lam + loss_fn(cons, shuffled_targets_con) * (1. - lam)

            loss = 0.5 * loss1 + 0.25 * loss2 + 0.25 * loss3    
            trn_loss.append(loss.item())
            running_loss += loss.item()
            
            #########################################################
            optimizer.zero_grad()         
            if args.apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            
            optimizer.step()
            scheduler.step(loss)
            
            # Printing vital information
            if (batch_idx) % (args.log_interval) == 0:
                s = f'[Epoch {epoch} Batch {batch_idx}/{len(train_loader)}] ' \
                f'loss: {running_loss / args.log_interval:.4f}'
                print(s)
                running_loss = 0 
                # Every log_interval iterations, check the loss, accuracy, and speed.
                # For best performance, it doesn't make sense to print these metrics every
                # iteration, since they incur an allreduce and some host<->device syncs.

                # Measure accuracy
                prec1, prec5 = util.accuracy(logits, target, topk=(1, 5))

                # Average loss and accuracy across processes for logging
                if args.multigpus_distributed:
                    reduced_loss = dis_util.reduce_tensor(loss.data, args)
                    prec1 = dis_util.reduce_tensor(prec1, args)
                    prec5 = dis_util.reduce_tensor(prec5, args)
                else:
                    reduced_loss = loss.data
                
                # to_python_float incurs a host<->device sync
                losses.update(to_python_float(reduced_loss), input.size(0))
                top1.update(to_python_float(prec1), input.size(0))
                top5.update(to_python_float(prec5), input.size(0))
                
                ## Waiting until finishing operations on GPU (Pytorch default: async)
                torch.cuda.synchronize()
                batch_time.update((time.time() - end)/args.log_interval)
                end = time.time()

                if current_gpu == 0:
                    print('Epoch: [{0}][{1}/{2}]  '
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})  '
                          'Speed {3:.3f} ({4:.3f})  '
                          'Loss {loss.val:.10f} ({loss.avg:.4f})  '
                          'Prec@1 {top1.val:.3f} ({top1.avg:.3f})  '
                          'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                              epoch, batch_idx, len(train_loader),
                              args.world_size*args.batch_size/batch_time.val,
                              args.world_size*args.batch_size/batch_time.avg,
                              batch_time=batch_time,
                              loss=losses, top1=top1, top5=top5))
            
        if current_gpu == 0:
            model_history['epoch'].append(epoch)
            model_history['batch_idx'].append(batch_idx)
            model_history['batch_time'].append(batch_time.val)
            model_history['losses'].append(losses.val)
            model_history['top1'].append(top1.val)
            model_history['top5'].append(top5.val)
        
        
        acc1 = validate(val_loader, model, loss_fn, epoch, model_history, trn_loss, args)            

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        
        if not args.multigpus_distributed or (args.multigpus_distributed and args.rank % args.num_gpus == 0):
            util.save_history(os.path.join(args.output_data_dir,
                          'model_history.p'), model_history)

            util.save_model({
                'epoch': epoch + 1,
                'model_name': args.model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict()
            }, is_best, args.model_dir)


def validate(val_loader, model, loss_fn, epoch, model_history, trn_loss, args):
    batch_time = util.AverageMeter('Time', ':6.3f')
    losses = util.AverageMeter('Loss', ':.4e')
    top1 = util.AverageMeter('Acc@1', ':6.2f')
    top5 = util.AverageMeter('Acc@5', ':6.2f')
    progress = util.ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')
    val_loss = []
    val_true = []
    val_pred = []
    
    # switch to evaluate mode
    model.eval()
    end = time.time()
  

    for batch_idx, (input, target) in enumerate((val_loader)):
        batch_idx += 1
        with torch.no_grad():
            input = input.cuda()
            target = target.cuda()

            # compute output
        
            logits = model(input)
            grapheme = logits[:,:168]
            vowel = logits[:, 168:179]
            cons = logits[:, 179:]

            loss= 0.5* loss_fn(grapheme, target[:,0]) + 0.25*loss_fn(vowel, target[:,1]) + \
            0.25*loss_fn(vowel, target[:,2])
            val_loss.append(loss.item())

            grapheme = grapheme.cpu().argmax(dim=1).data.numpy()
            vowel = vowel.cpu().argmax(dim=1).data.numpy()
            cons = cons.cpu().argmax(dim=1).data.numpy()

            val_true.append(target.cpu().numpy())
            val_pred.append(np.stack([grapheme, vowel, cons], axis=1))                
  
                
        # measure accuracy and record loss
        prec1, prec5 = util.accuracy(logits, target, topk=(1, 5))

        if args.multigpus_distributed:
            reduced_loss = dis_util.reduce_tensor(loss.data, args)
            prec1 = dis_util.reduce_tensor(prec1, args)
            prec5 = dis_util.reduce_tensor(prec5, args)
        else:
            reduced_loss = loss.data

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # TODO:  Change timings to mirror train().
        if args.current_gpu == 0 and batch_idx % args.log_interval == 0:
            print('Test: [{0}/{1}]  '
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})  '
                  'Speed {2:.3f} ({3:.3f})  '
                  'val_Loss {loss.val:.4f} ({loss.avg:.4f})  '
                  'val_Prec {top1.val:.3f} ({top1.avg:.3f})  '
                  'val_Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      batch_idx, len(val_loader),
                      args.world_size * args.batch_size / batch_time.val,
                      args.world_size * args.batch_size / batch_time.avg,
                      batch_time=batch_time, loss=losses,
                      top1=top1, top5=top5))
            model_history['val_epoch'].append(epoch)
            model_history['val_batch_idx'].append(batch_idx)
            model_history['val_batch_time'].append(batch_time.val)
            model_history['val_losses'].append(losses.val)
            model_history['val_top1'].append(top1.val)
            model_history['val_top5'].append(top5.val)



    val_true_concat = np.concatenate(val_true)
    val_pred_concat = np.concatenate(val_pred)
    val_loss_mean = np.mean(val_loss)
    trn_loss_mean = np.mean(trn_loss)
  

    score_g = recall_score(val_true_concat[:,0], val_pred_concat[:,0], average='macro')
    score_v = recall_score(val_true_concat[:,1], val_pred_concat[:,1], average='macro')
    score_c = recall_score(val_true_concat[:,2], val_pred_concat[:,2], average='macro')
    final_score = np.average([score_g, score_v, score_c], weights=[2,1,1])

    if args.current_gpu == 0:
        # Printing vital information
        s = f'[Epoch {epoch}] ' \
        f'trn_loss: {trn_loss_mean:.4f}, vld_loss: {val_loss_mean:.4f}, score: {final_score:.4f}, ' \
        f'score_g: {score_g:.4f}, score_v:{score_v:.4f}, score_c: {score_c:.4f}'          
        print(s)  
        
        
    print('  Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))
    model_history['val_avg_epoch'].append(epoch)
    model_history['val_avg_batch_time'].append(batch_time.avg)
    model_history['val_avg_losses'].append(losses.avg)
    model_history['val_avg_top1'].append(top1.avg)
    model_history['val_avg_top5'].append(top5.avg)
    return final_score

## iter() overflowerror: cannot serialize a bytes object larger than 4 gib --> num_worker=0 resolved
def main():
    args = parser_args()
    args.use_cuda = args.num_gpus > 0
    print("args.use_cuda : {} , args.num_gpus : {}".format(
        args.use_cuda, args.num_gpus))
    args.kwargs = {'num_workers': 0,
                   'pin_memory': True} if args.use_cuda else {}
    args.device = torch.device("cuda" if args.use_cuda else "cpu")  
    
    print("main args : {}".format(args))

    dis_util.dist_init(train, args)


if __name__ == '__main__':
    main()


Overwriting ./src/main_trainer.py


In [8]:
%%writefile ./src/dis_util.py

import argparse
import logging
import numpy as np
import os
import random
import sys
import warnings

import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data.distributed

import sagemaker_containers

try:
    from apex.parallel import DistributedDataParallel as DDP
    from apex.fp16_utils import *
    from apex import amp, optimizers
    from apex.multi_tensor_apply import multi_tensor_applier
except ImportError:
    raise ImportError(
        "Please install apex from https://www.github.com/nvidia/apex to run this example.")


logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


def dist_init(fn, args):
    
    
    print("dist_init args : {}".format(args))
    
    
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    args.is_distributed = len(args.hosts) > 1 and args.backend is not None
    args.is_multigpus = args.num_gpus > 1
    args.multigpus_distributed = (args.is_distributed or args.is_multigpus)

    logger.debug("Distributed training - {}".format(args.is_distributed))
    logger.debug("Number of gpus available - {}".format(args.num_gpus))

    args.world_size = 1
    if args.multigpus_distributed:
        # Initialize the distributed environment.
        args.apex = True
        args.world_size = len(args.hosts) * args.num_gpus
        os.environ['WORLD_SIZE'] = str(args.world_size)
        args.host_num = args.hosts.index(args.current_host)
        mp.spawn(fn, nprocs=args.num_gpus, args=(args,))
    else:
        current_gpu = 0
        fn(current_gpu, args)


def dist_setting(current_gpu, model, args):
    print("channels_last : {}".format(args.channels_last))
    if args.channels_last:
        args.memory_format = torch.channels_last
    else:
        args.memory_format = torch.contiguous_format

    if args.apex:
        args.lr = args.lr*float(args.world_size)
        
    args.current_gpu = current_gpu
    if args.current_gpu is not None:
        print("Use GPU: {} for training".format(args.current_gpu))

    if args.multigpus_distributed:
        args.rank = args.num_gpus * args.host_num + args.current_gpu
        dist.init_process_group(backend=args.backend,
                                rank=args.rank, world_size=args.world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format(
            dist.get_rank(), args.num_gpus))

    if args.sync_bn:
        import apex
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    if args.multigpus_distributed:
        if args.current_gpu is not None:
            torch.cuda.set_device(args.current_gpu)
            args.batch_size = int(args.batch_size / args.num_gpus)
            if not args.apex:
                model.cuda(args.current_gpu)
                model = torch.nn.parallel.DistributedDataParallel(
                    model, device_ids=[args.current_gpu])
        else:
            if not args.apex:
                model.cuda()
                model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.current_gpu is not None:
        torch.cuda.set_device(args.current_gpu)
        if not args.apex:
            model = model.cuda(args.current_gpu)
    else:
        if not args.apex:
            model = torch.nn.DataParallel(model).cuda()

    return model, args


def apex_init(model, optimizer, args):
    model = model.cuda().to(memory_format=args.memory_format)
    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=args.opt_level,
                                      keep_batchnorm_fp32=args.keep_batchnorm_fp32,
                                      loss_scale=args.loss_scale
                                      )
    if args.multigpus_distributed:
        model = DDP(model, delay_allreduce=True)
    return model, optimizer


def reduce_tensor(tensor, args):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= args.world_size
    return rt

Overwriting ./src/dis_util.py


In [9]:
%%writefile ./src/util.py


import codecs
import json
import logging
import os
import shutil
import sys
import time


import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data.distributed
from torchvision import models

import sagemaker_containers

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


def torch_model(model_name, pretrained=True):
    model_names = sorted(name for name in models.__dict__
                         if name.islower() and not name.startswith("__")
                         and callable(models.__dict__[name]))

    if(model_name == "inception_v3"):
        raise RuntimeError(
            "Currently, inception_v3 is not supported by this example.")

    # create model
    if pretrained:
        print("=> using pre-trained model '{}'".format(model_name))
        model = models.__dict__[model_name](pretrained=True)
    else:
        print("=> creating model '{}'".format(model_name))
        model = models.__dict__[model_name]()
    return model


def accuracy(logits, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
#         print("batch_size : {}".format(batch_size))

        _, pred1 = logits[:, :168].topk(maxk, 1, True, True)
        _, pred2 = logits[:, 168:179].topk(maxk, 1, True, True)
        _, pred3 = logits[:, 179:].topk(maxk, 1, True, True)

#         print("pred1 : {}, pred2 : {}, pred3 : {}".format(pred1 ,pred2, pred3))
        pred1 = pred1.t()
        pred2 = pred2.t()
        pred3 = pred3.t()
        correct1 = pred1.eq(target[:,0].view(1, -1).expand_as(pred1))
        correct2 = pred2.eq(target[:,1].view(1, -1).expand_as(pred2))
        correct3 = pred3.eq(target[:,2].view(1, -1).expand_as(pred3))
#         print("correct1 : {}, correct2 : {}, correct3 : {}".format(correct1 ,correct2, correct3))
        
        res = []
        for k in topk:
            correct_k1 = correct1[:k].view(-1).float().sum(0, keepdim=True)
            correct_k2 = correct2[:k].view(-1).float().sum(0, keepdim=True)
            correct_k3 = correct3[:k].view(-1).float().sum(0, keepdim=True)
            correct_k = 0.5 * correct_k1 + 0.25 * correct_k2 + 0.25 * correct_k3
            res.append(correct_k.mul_(100.0 / batch_size))
        return res    
    

def save_model(state, is_best, model_dir):
    logger.info("Saving the model.")
    filename = os.path.join(model_dir, 'checkpoint.pth')
    # recommended way from http://pytorch.org/docs/master/notes/serialization.html
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(model_dir, 'model_best.pth'))


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'

        
def adjust_learning_rate(optimizer, epoch, step, len_epoch, args):
    """LR schedule that should yield 76% converged accuracy with batch size 256"""
    factor = epoch // 5

    if epoch >= 10:
        factor = factor + 1

    lr = args.lr*(0.1**factor)

    """Warmup"""
    if epoch < 3:
        lr = lr*float(1 + step + epoch*len_epoch)/(3.*len_epoch)

    if(args.current_gpu == 0):
        print("epoch = {}, step = {}, lr = {}".format(epoch, step, lr))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def save_history(path, history):

    history_for_json = {}
    # transform float values that aren't json-serializable
    for key in history.keys():
        history_for_json[key] = list(map(float, history[key]))

    with codecs.open(path, 'w', encoding='utf-8') as f:
        json.dump(history_for_json, f, separators=(
            ',', ':'), sort_keys=True, indent=4)
        
        

def init_modelhistory(model_history):
    model_history['epoch'] = []
    model_history['batch_idx'] = []
    model_history['batch_time'] = []
    model_history['losses'] = []
    model_history['top1'] = []
    model_history['top5'] = []
    model_history['val_epoch'] = []
    model_history['val_batch_idx'] = []
    model_history['val_batch_time'] = []
    model_history['val_losses'] = []
    model_history['val_top1'] = []
    model_history['val_top5'] = []
    model_history['val_avg_epoch'] = []
    model_history['val_avg_batch_time'] = []
    model_history['val_avg_losses'] = []
    model_history['val_avg_top1'] = []
    model_history['val_avg_top5'] = []
    return model_history


Overwriting ./src/util.py


In [10]:
metrics_definitions = [
     {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
     {'Name': 'train:Prec@1', 'Regex': 'Prec@1: ([0-9\\.]+)'},
     {'Name': 'validation:trn_loss', 'Regex': 'trn_loss: ([0-9\\.]+)'},
     {'Name': 'validation:vld_loss', 'Regex': 'vld_loss: ([0-9\\.]+)'},
     {'Name': 'validation:score_g', 'Regex': 'score_g: ([0-9\\.]+)'},
     {'Name': 'validation:score_v', 'Regex': 'score_v: ([0-9\\.]+)'},
     {'Name': 'validation:score_c', 'Regex': 'score_c: ([0-9\\.]+)'},
     {'Name': 'validation:Prec', 'Regex': 'val_Prec: ([0-9\\.]+)'}
]

In [11]:
hyperparameters = {
        'model_name' : 'resnet18',
        'height' : 137,
        'width' : 236,
        'num_epochs': 5,
        'batch-size' : 256*4,   ## 192 for 8xlarge*2, 100 for 16xlarge*2, 64 for 16xlarge*3
        'backend': 'nccl',
        'lr': 0.0001,  ## 0.01. 0.003
        'opt-level' : 'O0'
    }
train_instance_count=1
train_instance_type='ml.p3.8xlarge'

In [12]:
# all input configurations, parameters, and metrics specified in estimator 
# definition are automatically tracked
estimator = PyTorch(
    entry_point='./main_trainer.py',
    source_dir='./src',
    role=role,
    sagemaker_session=sagemaker.Session(sagemaker_client=sm),
    framework_version='1.5.0',
    train_instance_count=train_instance_count,
    train_instance_type=train_instance_type,
    train_volume_size=400,
    hyperparameters=hyperparameters,
#     train_use_spot_instances=True,  # spot instance 활용
#     train_max_run=12*60*60,
#     train_max_wait=12*60*60,
#     checkpoint_s3_uri=checkpoint_s3_uri,
#     tensorboard_output_config=TensorBoardOutputConfig(tensorboard_output),
    metric_definitions=metrics_definitions,
    enable_sagemaker_metrics=True
)

In [13]:
training_job_name = "training-job-{}".format(int(time.time()))

# Now associate the estimator with the Experiment and Trial
estimator.fit(
    inputs=s3_data_path, 
    job_name=training_job_name,
    logs='All',
    wait=False,
)

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [None]:
sagemaker_session.logs_for_job(estimator.latest_training_job.name, wait=True)

2020-08-20 15:25:49 Starting - Starting the training job...
2020-08-20 15:25:51 Starting - Launching requested ML instances......
2020-08-20 15:26:56 Starting - Preparing the instances for training............
2020-08-20 15:28:51 Downloading - Downloading input data......
2020-08-20 15:30:03 Training - Downloading the training image.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-08-20 15:30:24,959 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-08-20 15:30:25,003 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-08-20 15:30:25,007 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-08-20 15:30:25,398 sagemaker-containers INFO     Module default_user_module_name does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-08-20 15:30:25,399 sa

[34margs.use_cuda : True , args.num_gpus : 4[0m
[34mmain args : Namespace(apex=False, backend='nccl', batch_size=1024, channels_last=True, current_host='algo-1', data_dir='/opt/ml/input/data/training', device=device(type='cuda'), height=137, hosts=['algo-1'], keep_batchnorm_fp32=None, kwargs={'num_workers': 0, 'pin_memory': True}, log_interval=10, loss_scale=None, lr=0.0001, model_dir='/opt/ml/model', model_name='resnet18', num_epochs=5, num_folds=5, num_gpus=4, opt_level='O0', output_data_dir='/opt/ml/output/data', print_freq=10, prof=-1, seed=1, sync_bn=True, use_cuda=True, val_batch_size=200, vld_fold_idx=4, width=236)[0m
[34mdist_init args : Namespace(apex=False, backend='nccl', batch_size=1024, channels_last=True, current_host='algo-1', data_dir='/opt/ml/input/data/training', device=device(type='cuda'), height=137, hosts=['algo-1'], keep_batchnorm_fp32=None, kwargs={'num_workers': 0, 'pin_memory': True}, log_interval=10, loss_scale=None, lr=0.0001, model_dir='/opt/ml/model', 

[34m[Epoch 1 Batch 40/157] loss: 2.3550[0m
[34m[Epoch 1 Batch 40/157] loss: 2.3485[0m
[34m[Epoch 1 Batch 40/157] loss: 2.3570[0m
[34m[Epoch 1 Batch 40/157] loss: 2.3558[0m
[34mEpoch: [1][40/157]  Time 2.890 (2.998)  Speed 354.280 (341.592)  Loss 2.8432884216 (2.4828)  Prec@1 29.736 (43.274)  Prec@5 58.496 (67.480)[0m
[34m[Epoch 1 Batch 50/157] loss: 1.7249[0m
[34m[Epoch 1 Batch 50/157] loss: 1.6832[0m
[34m[Epoch 1 Batch 50/157] loss: 1.7055[0m
[34m[Epoch 1 Batch 50/157] loss: 1.7089[0m
[34mEpoch: [1][50/157]  Time 2.866 (2.971)  Speed 357.242 (344.611)  Loss 1.6157498360 (2.3094)  Prec@1 65.015 (47.622)  Prec@5 89.404 (71.865)[0m
[34mEpoch    55: reducing learning rate of group 0 to 2.0000e-04.[0m
[34m[Epoch 1 Batch 60/157] loss: 1.9123[0m
[34m[Epoch 1 Batch 60/157] loss: 1.9082[0m
[34m[Epoch 1 Batch 60/157] loss: 1.9087[0m
[34m[Epoch 1 Batch 60/157] loss: 1.9225[0m
[34mEpoch: [1][60/157]  Time 2.888 (2.958)  Speed 354.604 (346.237)  Loss 1.2435557842 (2.

[34m[Epoch 2 Batch 10/157] loss: 1.7387[0m
[34m[Epoch 2 Batch 10/157] loss: 1.7066[0m
[34m[Epoch 2 Batch 10/157] loss: 1.7363[0m
[34mEpoch   167: reducing learning rate of group 0 to 2.4414e-08.[0m
[34m[Epoch 2 Batch 10/157] loss: 1.8373[0m
[34mEpoch: [2][10/157]  Time 2.908 (2.908)  Speed 352.108 (352.108)  Loss 0.8650103211 (0.8650)  Prec@1 81.763 (81.763)  Prec@5 96.240 (96.240)[0m
[34mEpoch   171: reducing learning rate of group 0 to 4.8828e-08.[0m
[34mEpoch   172: reducing learning rate of group 0 to 4.8828e-08.[0m
[34mEpoch   173: reducing learning rate of group 0 to 1.2207e-08.[0m
[34mEpoch   175: reducing learning rate of group 0 to 9.7656e-08.[0m
[34m[Epoch 2 Batch 20/157] loss: 1.3963[0m
[34mEpoch   177: reducing learning rate of group 0 to 2.4414e-08.[0m
[34m[Epoch 2 Batch 20/157] loss: 1.4029[0m
[34m[Epoch 2 Batch 20/157] loss: 1.4353[0m
[34m[Epoch 2 Batch 20/157] loss: 1.4653[0m
[34mEpoch: [2][20/157]  Time 2.887 (2.898)  Speed 354.672 (353.3

[34m[Epoch 3 Batch 60/157] loss: 1.6059[0m
[34m[Epoch 3 Batch 60/157] loss: 1.6149[0m
[34m[Epoch 3 Batch 60/157] loss: 1.6469[0m
[34m[Epoch 3 Batch 60/157] loss: 1.7174[0m
[34mEpoch: [3][60/157]  Time 2.845 (2.900)  Speed 359.878 (353.096)  Loss 0.8755669594 (1.7537)  Prec@1 81.445 (59.509)  Prec@5 96.045 (81.694)[0m
[34m[Epoch 3 Batch 70/157] loss: 1.6976[0m
[34m[Epoch 3 Batch 70/157] loss: 1.6742[0m
[34m[Epoch 3 Batch 70/157] loss: 1.7460[0m
[34m[Epoch 3 Batch 70/157] loss: 1.7758[0m
[34mEpoch: [3][70/157]  Time 2.890 (2.899)  Speed 354.276 (353.264)  Loss 2.7622807026 (1.8978)  Prec@1 23.950 (54.429)  Prec@5 51.782 (77.420)[0m
[34m[Epoch 3 Batch 80/157] loss: 1.3236[0m
[34m[Epoch 3 Batch 80/157] loss: 1.3106[0m
[34m[Epoch 3 Batch 80/157] loss: 1.4158[0m
[34m[Epoch 3 Batch 80/157] loss: 1.3933[0m
[34mEpoch: [3][80/157]  Time 2.965 (2.907)  Speed 345.334 (352.253)  Loss 0.8612000942 (1.7682)  Prec@1 81.006 (57.751)  Prec@5 95.874 (79.727)[0m
[34m[Epoch 3

In [None]:
artifacts_dir = estimator.model_data.replace('model.tar.gz', '')
print(artifacts_dir)
!aws s3 ls --human-readable {artifacts_dir}

In [None]:
model_dir = './model'
output_dir = './output'

In [None]:
!rm -rf $model_dir
!rm -rf $output_dir

In [None]:
import json , os

if not os.path.exists(model_dir):
    os.makedirs(model_dir)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
!aws s3 cp {artifacts_dir}model.tar.gz {model_dir}/model.tar.gz
!tar -xzf {model_dir}/model.tar.gz -C {model_dir}
!aws s3 cp {artifacts_dir}output.tar.gz {output_dir}/output.tar.gz
!tar -xzf {output_dir}/output.tar.gz -C {output_dir}

## 9. 결과 확인

In [None]:
import json , os

with open(os.path.join(output_dir, 'model_history.p'), "r") as f:
    model_history = json.load(f)

In [None]:
model_history.keys()

In [None]:
model_history['epoch'], model_history['losses']

In [None]:
import matplotlib.pyplot as plt

def plot_training_curves(history): 
    
    fig, axes = plt.subplots(1, 4, figsize=(18, 4), sharex=True)
    
    ax = axes[0]
    ax.plot(history['epoch'], history['losses'], label='train')
    ax.plot(history['val_avg_epoch'], history['val_avg_losses'], label='validation')
    ax.set(
        title='model loss',
        ylabel='loss',
        xlabel='epoch')
    ax.legend()
    
    ax = axes[1]
    ax.plot(history['epoch'], history['batch_time'], label='train')
    ax.plot(history['val_avg_epoch'], history['val_avg_batch_time'], label='validation')
    ax.set(
        title='model batch_time',
        ylabel='batch_time',
        xlabel='epoch')
    ax.legend()
    
    
    ax = axes[2]
    ax.plot(history['epoch'], history['top1'], label='train')
    ax.plot(history['val_avg_epoch'], history['val_avg_top1'], label='validation')
    ax.set(
        title='top1 accuracy',
        ylabel='accuracy',
        xlabel='epoch')
    ax.legend()
    
    ax = axes[3]
    ax.plot(history['epoch'], history['top5'], label='train')
    ax.plot(history['val_avg_epoch'], history['val_avg_top5'], label='validation')
    ax.set(
        title='top5 accuracy',
        ylabel='accuracy',
        xlabel='epoch')
    ax.legend()
    fig.tight_layout()
    
plot_training_curves(model_history)

In [None]:
%store hyperparameters model_dir output_dir artifacts_dir