# Module 3. Training on Amazon SageMaker
---

본 모듈에서는 Amazon SageMaker API를 호출하여 모델 훈련을 수행합니다. Multi-GPU 분산 훈련에 더 관심이 있거나, SageMaker 기본 용법에 익숙하신 분들은 이 모듈을 건너 뛰고 Module 4로 곧바로 진행하시면 됩니다.

앞의 모듈과 달리 SageMaker notebook instance는 저렴한 인스턴스(예: `ml.t3.medium`)를 사용하시면 되고, 훈련 인스턴스 지정 시 GPU 기반 인스턴스(예: `ml.p2.xlarge`, `ml.p3.2xlarge`)를 선택하시면 됩니다.

<br>

## 1. Training script
---

아래 코드 셀은 `src` 디렉토리에 SageMaker 훈련 스크립트인 `train.py`를 저장합니다.<br>
Module 2를 진행하셨다면 아래 스크립트가 Module 2의 코드와 대부분 일치한다는 점을 알 수 있습니다. 다시 말해, SageMaker 훈련 스크립트 파일은 기존 온프레미스에서 사용했던 Python 스크립트 파일과 크게 다르지 않으며, SageMaker 훈련 컨테이너에서 수행하기 위한 추가적인 환경 변수들만 설정하시면 됩니다.

환경 변수 설정의 code snippet은 아래와 같습니다.

```python
# SageMaker Container environment
parser.add_argument('--train_dir', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
parser.add_argument('--num_gpus', type=int, default=os.environ['SM_NUM_GPUS'])
parser.add_argument('--model_dir', type=str, default=os.environ['SM_MODEL_DIR'])
``` 

In [1]:
%%writefile ./src/train.py

import argparse
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import sys
import time, datetime
import gc

import warnings
warnings.filterwarnings('ignore')

from torch.utils.data import Dataset
from sklearn.metrics import recall_score
import logging
import logging.handlers

import matplotlib.pyplot as plt
import joblib

# Image height; _get_images reshapes each flattened feather row to (HEIGHT, WIDTH).
HEIGHT = 137
# Image width; see HEIGHT.
WIDTH = 236
# Batch size passed to both DataLoaders built in _get_data_loader.
BATCH_SIZE = 256
# Number of DataLoader worker processes used in _get_data_loader.
NUM_WORKERS = 4

# Module logger: emits DEBUG and above to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


class BangaliDataset(Dataset):
    """Dataset serving inverted, 3-channel uint8 images with optional labels.

    Args:
        imgs: Indexable collection of 2-D image arrays.
        label_df: Optional DataFrame with the columns ``id`` (index into
            ``imgs``), ``grapheme_root``, ``vowel_diacritic`` and
            ``consonant_diacritic``. When omitted the dataset is unlabeled:
            it iterates over ``imgs`` directly and yields images only.
        transform: Optional callable invoked as ``transform(image=img)`` that
            returns a mapping whose ``'image'`` entry is the transformed image.
    """

    def __init__(self, imgs, label_df=None, transform=None):
        self.imgs = imgs
        # Re-index so positional lookups line up with dataset indices; guard
        # against the documented default of ``None`` (inference use).
        self.label_df = None if label_df is None else label_df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        if self.label_df is None:
            return len(self.imgs)
        return len(self.label_df)

    def __getitem__(self, idx):
        """Return ``(img, labels)`` when labels exist, otherwise just ``img``."""
        # Fetch the label row once instead of once per column.
        row = None if self.label_df is None else self.label_df.iloc[idx]
        img_idx = idx if row is None else row['id']

        img = (self.imgs[img_idx]).astype(np.uint8)
        img = 255 - img  # invert intensities

        # Replicate the single channel three times -> (H, W, 3).
        img = np.repeat(img[:, :, np.newaxis], 3, axis=2)

        if self.transform is not None:
            img = self.transform(image=img)['image']

        if row is None:
            return img

        labels = np.array([
            row['grapheme_root'],
            row['vowel_diacritic'],
            row['consonant_diacritic'],
        ])
        return img, labels
        
        
def _set_seed(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    mx.random.seed(seed)

def _get_images(train_dir, num_folds=5, vld_fold_idx=4, data_type='train'):
    """Load the fold labels and image arrays stored under ``train_dir``.

    Reads ``train_folds.csv`` plus the four ``{data_type}_image_data_{i}.feather``
    files, and splits the label rows by their ``fold`` column.

    Args:
        train_dir: Directory containing the CSV and feather files.
        num_folds: Total number of folds present in the ``fold`` column.
        vld_fold_idx: Fold held out for validation; all others are training.
        data_type: Prefix of the feather file names.

    Returns:
        Tuple ``(imgs, trn_df, vld_df)``: the concatenated image array of
        shape (N, HEIGHT, WIDTH) and the training / validation label frames.
    """
    logger.info("=== Getting Labels ===")
    logger.info(train_dir)

    labels = pd.read_csv(os.path.join(train_dir, 'train_folds.csv'))

    # Boolean row masks for the two splits.
    training_folds = [fold for fold in range(num_folds) if fold != vld_fold_idx]
    in_training = labels['fold'].isin(training_folds)
    in_validation = labels['fold'].isin([vld_fold_idx])

    logger.info("=== Getting Images ===")
    files = [f'{train_dir}/{data_type}_image_data_{i}.feather' for i in range(4)]
    logger.info(files)

    frames = []
    for path in files:
        frames.append(pd.read_feather(path))

    # Drop the first column of every frame and view each row as one image.
    chunks = []
    for frame in frames:
        chunks.append(frame.iloc[:, 1:].values.reshape(-1, HEIGHT, WIDTH))

    # Release the DataFrames before allocating the concatenated array.
    del frames
    gc.collect()
    imgs = np.concatenate(chunks, axis=0)

    return imgs, labels.loc[in_training], labels.loc[in_validation]
                           
                           
def _get_data_loader(imgs, trn_df, vld_df):
    """Build the training and validation DataLoaders.

    Training samples pass through an albumentations augmentation pipeline;
    validation samples are only converted to tensors. Both loaders use the
    module-level BATCH_SIZE and NUM_WORKERS.

    Args:
        imgs: Image collection shared by both datasets.
        trn_df: Label frame for the training split.
        vld_df: Label frame for the validation split.

    Returns:
        Tuple ``(trn_loader, vld_loader)``; only the training loader shuffles.
    """
    import albumentations as A
    from albumentations import (
        Blur, CLAHE, Compose, Flip, GaussNoise, GridDistortion, HorizontalFlip,
        HueSaturationValue, IAAAdditiveGaussianNoise, IAAEmboss, IAAPerspective,
        IAAPiecewiseAffine, IAASharpen, MedianBlur, MotionBlur, OneOf,
        OpticalDistortion, RandomBrightnessContrast, RandomRotate90, Rotate,
        ShiftScaleRotate, Transpose,
    )
    from albumentations.pytorch import ToTensor, ToTensorV2
    from torch.utils.data import Dataset, DataLoader

    # Each group applies at most one of its members, with the group probability.
    noise_group = OneOf([
        IAAAdditiveGaussianNoise(),
        GaussNoise(),
    ], p=0.2)
    blur_group = OneOf([
        MotionBlur(p=.2),
        MedianBlur(blur_limit=3, p=0.1),
        Blur(blur_limit=3, p=0.1),
    ], p=0.2)
    affine_step = ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.2)
    distortion_group = OneOf([
        OpticalDistortion(p=0.3),
        GridDistortion(p=.1),
        IAAPiecewiseAffine(p=0.3),
    ], p=0.2)
    contrast_group = OneOf([
        CLAHE(clip_limit=2),
        IAASharpen(),
        IAAEmboss(),
        RandomBrightnessContrast(),
    ], p=0.3)

    train_pipeline = A.Compose(
        [
            Rotate(20),
            noise_group,
            blur_group,
            affine_step,
            distortion_group,
            contrast_group,
            HueSaturationValue(p=0.3),
            ToTensor(),
        ],
        p=1.0,
    )
    valid_pipeline = A.Compose([ToTensor()])

    train_set = BangaliDataset(imgs=imgs, label_df=trn_df, transform=train_pipeline)
    valid_set = BangaliDataset(imgs=imgs, label_df=vld_df, transform=valid_pipeline)

    shared_kwargs = dict(num_workers=NUM_WORKERS, batch_size=BATCH_SIZE)
    train_loader = DataLoader(train_set, shuffle=True, **shared_kwargs)
    valid_loader = DataLoader(valid_set, shuffle=False, **shared_kwargs)

    return train_loader, valid_loader


def _rand_bbox(size, lam):
    '''
    CutMix Helper function.
    Retrieved from https://github.com/clovaai/CutMix-PyTorch/blob/master/train.py
    '''
    W = size[2]
    H = size[3]
    # 폭과 높이는 주어진 이미지의 폭과 높이의 beta distribution에서 뽑은 lambda로 얻는다
    cut_rat = np.sqrt(1. - lam)
    
    # patch size 의 w, h 는 original image 의 w,h 에 np.sqrt(1-lambda) 를 곱해준 값입니다.
    cut_w = np.int(W * cut_rat)
    cut_h = np.int(H * cut_rat)

    # patch의 중심점은 uniform하게 뽑힘
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2


def _format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


def _split_logits(logits):
    """Slice the 186 output logits into (grapheme, vowel, consonant) groups."""
    return logits[:, :168], logits[:, 168:179], logits[:, 179:]


def train_model(args):
    """Fine-tune a pretrained ResNet-18 and checkpoint the best-scoring epoch.

    Each training batch is used as-is or CutMix-augmented with equal
    probability. After every epoch the model is validated; the score is the
    2:1:1 weighted average of the macro recalls of the three targets. The
    weights with the best score so far are written to
    ``<args.model_dir>/model.pt``.

    Args:
        args: Namespace providing ``train_dir``, ``num_folds``,
            ``vld_fold_idx``, ``lr``, ``num_epochs``, ``log_interval`` and
            ``model_dir``. An optional ``device`` attribute selects the
            compute device; CUDA is used when it is absent.

    Returns:
        List with one dict of statistics per epoch.
    """
    from torchvision import models

    # Honour the device chosen by the caller; fall back to CUDA otherwise.
    device = getattr(args, 'device', None)
    if device is None:
        device = torch.device('cuda')

    # NOTE(review): args.batch_size is parsed but _get_data_loader takes no
    # batch-size argument, so the module-level BATCH_SIZE is what applies.
    imgs, trn_df, vld_df = _get_images(args.train_dir, args.num_folds, args.vld_fold_idx, data_type='train')
    trn_loader, vld_loader = _get_data_loader(imgs, trn_df, vld_df)

    logger.info("=== Getting Pre-trained model ===")
    model = models.resnet18(pretrained=True)
    last_hidden_units = model.fc.in_features
    # A single 186-way head holds all three targets: 168 + 11 + 7 logits.
    model.fc = torch.nn.Linear(last_hidden_units, 186)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fn = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                          verbose=True, patience=5,
                                                          factor=0.5)

    best_score = -1
    training_stats = []
    logger.info("=== Start Training ===")

    for epoch_id in range(args.num_epochs):

        ################################################################################
        # ==> Training phase
        ################################################################################
        trn_loss = []
        model.train()

        # Measure how long the training epoch takes.
        t0 = time.time()
        running_loss = 0.0

        for batch_id, (inputs, targets) in enumerate(trn_loader):
            inputs = inputs.to(device)
            targets = targets.to(device)
            targets_gra = targets[:, 0]
            targets_vow = targets[:, 1]
            targets_con = targets[:, 2]

            # With probability 0.5 train on the unmodified batch.
            if np.random.rand() < 0.5:
                grapheme, vowel, cons = _split_logits(model(inputs))

                loss1 = loss_fn(grapheme, targets_gra)
                loss2 = loss_fn(vowel, targets_vow)
                loss3 = loss_fn(cons, targets_con)

            else:
                # CutMix: paste a random box from a shuffled copy of the batch
                # and mix the losses of both label sets accordingly.
                lam = np.random.beta(1.0, 1.0)
                rand_index = torch.randperm(inputs.size()[0])
                shuffled_targets_gra = targets_gra[rand_index]
                shuffled_targets_vow = targets_vow[rand_index]
                shuffled_targets_con = targets_con[rand_index]

                bbx1, bby1, bbx2, bby2 = _rand_bbox(inputs.size(), lam)
                inputs[:, :, bbx1:bbx2, bby1:bby2] = inputs[rand_index, :, bbx1:bbx2, bby1:bby2]
                # Recompute lambda so it matches the pasted pixel ratio exactly
                # (the box may have been clipped at the border).
                lam = float(1 - ((bbx2 - bbx1) * (bby2 - bby1) / (inputs.size()[-1] * inputs.size()[-2])))

                grapheme, vowel, cons = _split_logits(model(inputs))

                loss1 = loss_fn(grapheme, targets_gra) * lam + loss_fn(grapheme, shuffled_targets_gra) * (1. - lam)
                loss2 = loss_fn(vowel, targets_vow) * lam + loss_fn(vowel, shuffled_targets_vow) * (1. - lam)
                loss3 = loss_fn(cons, targets_con) * lam + loss_fn(cons, shuffled_targets_con) * (1. - lam)

            loss = 0.5 * loss1 + 0.25 * loss2 + 0.25 * loss3
            loss_value = loss.item()
            trn_loss.append(loss_value)
            running_loss += loss_value
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # Printing vital information
            if (batch_id + 1) % (args.log_interval) == 0:
                s = f'[Epoch {epoch_id} Batch {batch_id+1}/{len(trn_loader)}] ' \
                f'loss: {running_loss / args.log_interval:.4f}'
                print(s)
                running_loss = 0

        # Measure how long this epoch took.
        trn_time = _format_time(time.time() - t0)

        ################################################################################
        # ==> Validation phase
        ################################################################################
        val_loss = []
        val_true = []
        val_pred = []
        model.eval()

        logger.info('=== Start Validation ===')

        with torch.no_grad():
            for inputs, targets in vld_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                grapheme, vowel, cons = _split_logits(model(inputs))

                # Same 0.5 / 0.25 / 0.25 weighting as training. The consonant
                # term uses the consonant logits (it previously reused the
                # vowel logits, which skewed the reported validation loss).
                loss = 0.5 * loss_fn(grapheme, targets[:, 0]) \
                    + 0.25 * loss_fn(vowel, targets[:, 1]) \
                    + 0.25 * loss_fn(cons, targets[:, 2])
                val_loss.append(loss.item())

                grapheme = grapheme.cpu().argmax(dim=1).data.numpy()
                vowel = vowel.cpu().argmax(dim=1).data.numpy()
                cons = cons.cpu().argmax(dim=1).data.numpy()

                val_true.append(targets.cpu().numpy())
                val_pred.append(np.stack([grapheme, vowel, cons], axis=1))

        val_true = np.concatenate(val_true)
        val_pred = np.concatenate(val_pred)
        val_loss = np.mean(val_loss)
        trn_loss = np.mean(trn_loss)

        score_g = recall_score(val_true[:, 0], val_pred[:, 0], average='macro')
        score_v = recall_score(val_true[:, 1], val_pred[:, 1], average='macro')
        score_c = recall_score(val_true[:, 2], val_pred[:, 2], average='macro')
        final_score = np.average([score_g, score_v, score_c], weights=[2, 1, 1])

        # The scheduler runs in 'max' mode, so feed it the validation score;
        # without this call the learning rate would never be reduced.
        scheduler.step(final_score)

        # Printing vital information
        s = f'[Epoch {epoch_id}] ' \
        f'trn_loss: {trn_loss:.4f}, vld_loss: {val_loss:.4f}, score: {final_score:.4f}, ' \
        f'score_each: [{score_g:.4f}, {score_v:.4f}, {score_c:.4f}]'
        print(s)

        ################################################################################
        # ==> Save checkpoint and training stats
        ################################################################################
        if final_score > best_score:
            best_score = final_score
            # Save CPU tensors, then move the model back to the training device.
            state_dict = model.cpu().state_dict()
            model = model.to(device)
            torch.save(state_dict, os.path.join(args.model_dir, 'model.pt'))
            # Log only when a checkpoint was actually written.
            logger.info("Model successfully saved at: {}".format(args.model_dir))

        # Record all statistics from this epoch
        training_stats.append(
            {
                'epoch': epoch_id + 1,
                'trn_loss': trn_loss,
                'trn_time': trn_time,
                'val_loss': val_loss,
                'score': final_score,
                'score_g': score_g,
                'score_v': score_v,
                'score_c': score_c
            }
        )

    return training_stats

        
def parser_args():
    """Parse hyperparameters and SageMaker container settings from the CLI.

    ``--train_dir`` and ``--model_dir`` default to the ``SM_CHANNEL_TRAINING``
    and ``SM_MODEL_DIR`` environment variables. When a variable is not set
    (for example when running outside a SageMaker container) the matching
    flag becomes required instead of failing with ``KeyError`` while the
    parser is being built. ``--num_gpus`` defaults to ``SM_NUM_GPUS`` and
    otherwise to the number of CUDA devices visible to PyTorch.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--num_epochs', type=int, default=1)
    parser.add_argument('--num_folds', type=int, default=5)
    parser.add_argument('--vld_fold_idx', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--log_interval', type=int, default=10)

    # SageMaker Container environment
    train_dir_default = os.environ.get('SM_CHANNEL_TRAINING')
    model_dir_default = os.environ.get('SM_MODEL_DIR')
    # argparse converts string defaults with `type`, so the environment value
    # (a string) and the integer fallback both end up as int.
    num_gpus_default = os.environ.get('SM_NUM_GPUS', torch.cuda.device_count())

    parser.add_argument('--train_dir', type=str, default=train_dir_default,
                        required=train_dir_default is None)
    parser.add_argument('--num_gpus', type=int, default=num_gpus_default)
    parser.add_argument('--model_dir', type=str, default=model_dir_default,
                        required=model_dir_default is None)
    args = parser.parse_args()
    return args
        
    
if __name__ == '__main__':
    # Script entry point: parse arguments, choose the device, start training.
    args = parser_args()

    gpu_available = args.num_gpus > 0
    args.use_cuda = gpu_available
    print("args.use_cuda : {} , args.num_gpus : {}".format(args.use_cuda, args.num_gpus))

    device_name = "cuda" if gpu_available else "cpu"
    args.device = torch.device(device_name)

    train_model(args)

Overwriting ./src/train.py


<br>

## 2. Training on SageMaker
---

훈련 스크립트가 준비되었다면 SageMaker 훈련을 수행하는 법은 매우 간단합니다. SageMaker Python SDK 활용 시, Estimator 인스턴스를 생성하고 해당 인스턴스의 `fit()` 메서드를 호출하는 것이 전부입니다.

#### 1) `Estimator` 인스턴스 생성 
훈련 컨테이너에 필요한 설정들을 지정합니다. 본 핸즈온에서는 훈련 스크립트 파일이 포함된 경로인 소스 경로(`source_dir`)와 훈련 스크립트 Python 파일만 엔트리포인트(`entry_point`)로 지정해 주면 됩니다.

#### 2) `fit()` 메서드 호출
`estimator.fit(YOUR_TRAINING_DATA_URI)` 메서드를 호출하면, 훈련에 필요한 인스턴스를 시작하고 컨테이너 환경을 시작합니다. 
필수 인자값은 훈련 데이터가 존재하는 S3 경로(`s3://`)이며, 로컬 모드로 훈련 시에는 로컬 경로(`file://`)를 지정하시면 됩니다.

인자값 중 wait은 디폴트 값으로 `wait=True`이며, 모든 훈련 작업이 완료될 때까지 코드 셀이 freezing됩니다. 만약 다른 코드 셀을 실행하거나, 다른 훈련 job을 시작하고 싶다면 `wait=False`로 설정하여 Asynchronous 모드로 변경하면 됩니다.

SageMaker 훈련이 끝나면 컨테이너 환경과 훈련 인스턴스는 자동으로 삭제됩니다. 이 때, SageMaker는 자동으로 `SM_MODEL_DIR` 경로에 저장된 최종 모델 아티팩트를 `model.tar.gz`로 압축하여 훈련 컨테이너 환경에서 S3 bucket으로 저장합니다. 당연히, S3 bucket에 저장된 모델 아티팩트를 다운로드받아 로컬 상에서 곧바로 테스트할 수 있습니다.

In [2]:
import boto3
import sagemaker

# Create a boto3 session and wrap it in a SageMaker session.
boto_session = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=boto_session)

In [3]:
from sagemaker.pytorch import PyTorch
# IAM role that the training job will run under.
role = sagemaker.get_execution_role()
# Default S3 bucket of a fresh SageMaker session, and the key prefix of the training data.
bucket = sagemaker.Session().default_bucket()
prefix = 'bangali/train'

In [4]:
# Configure the training job: src/train.py is the entry point and runs on a
# single ml.p3.2xlarge instance. The hyperparameter keys match the
# command-line flags defined in train.py's parser_args().
estimator = PyTorch(entry_point='train.py',
                    source_dir='src',
                    role=role,
                    instance_type='ml.p3.2xlarge',
                    instance_count=1,
                    framework_version='1.6.0',
                    py_version='py3',
                    hyperparameters = {'num_epochs': 1, 
                                       'num_folds': 5,
                                       'vld_fold_idx': 4,
                                       'batch_size': 256,
                                       'lr': 0.001,
                                       'log_interval': 10,
                                      }                       
                   )
# Training input located at s3://<bucket>/<prefix>.
# NOTE(review): content_type is 'csv', but train.py also reads .feather files
# from this location - confirm the content type is only informational here.
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}'.format(bucket, prefix), content_type='csv')    
#s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}'.format(bucket, prefix), content_type='csv') # SDK v1

In [5]:
%%time
# Launch the training job on the S3 training input and time the cell.
estimator.fit(s3_input_train)

2021-03-02 05:08:48 Starting - Starting the training job...
2021-03-02 05:09:12 Starting - Launching requested ML instancesProfilerReport-1614661728: InProgress
.........
2021-03-02 05:10:33 Starting - Preparing the instances for training......
2021-03-02 05:11:37 Downloading - Downloading input data......
2021-03-02 05:12:48 Training - Downloading the training image........[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-03-02 05:14:00,658 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-03-02 05:14:00,695 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-03-02 05:14:03,728 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-03-02 05:14:04,223 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/p

[34m=== Getting Pre-trained model ===[0m
[34m=== Start Training ===[0m
[34m[2021-03-02 05:15:20.589 algo-1:31 INFO json_config.py:90] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-03-02 05:15:20.590 algo-1:31 INFO hook.py:193] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2021-03-02 05:15:20.590 algo-1:31 INFO hook.py:238] Saving to /opt/ml/output/tensors[0m
[34m[2021-03-02 05:15:20.590 algo-1:31 INFO state_store.py:67] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2021-03-02 05:15:20.636 algo-1:31 INFO hook.py:398] Monitoring the collections: losses[0m
[34m[2021-03-02 05:15:20.637 algo-1:31 INFO hook.py:459] Hook is writing from the hook with pid: 31
[0m
[34m[Epoch 0 Batch 10/628] loss: 3.2109[0m
[34m[Epoch 0 Batch 20/628] loss: 2.7456[0m
[34m[Epoch 0 Batch 30/628] loss: 2.7329[0m
[34m[Epoch 0 Batch 40/628] los

In [7]:
# Strip the archive file name from the artifact URI to get its S3 prefix.
s3_model_dir = estimator.model_data.replace('model.tar.gz', '')
print(s3_model_dir)
# List the objects stored under that prefix (IPython shell escape).
!aws s3 ls {s3_model_dir}

s3://sagemaker-us-east-1-143656149352/pytorch-training-2021-03-02-05-08-47-826/output/
2021-03-02 05:27:58   41944025 model.tar.gz


<br>

## 3. Getting Model artifacts
---

훈련이 완료된 모델 아티팩트를 로컬(jupyter notebook 인스턴스 or 온프레미스)로 복사합니다.


In [8]:
# Local directory for the downloaded artifact; remove any previous copy first.
local_model_dir = './model'
!rm -rf $local_model_dir

In [9]:
import json, os

# Recreate the local directory if it does not exist.
if not os.path.exists(local_model_dir):
    os.makedirs(local_model_dir)

# Download the model archive from S3 and unpack it into the local directory.
!aws s3 cp {s3_model_dir}model.tar.gz {local_model_dir}/model.tar.gz
!tar -xzf {local_model_dir}/model.tar.gz -C {local_model_dir}

download: s3://sagemaker-us-east-1-143656149352/pytorch-training-2021-03-02-05-08-47-826/output/model.tar.gz to model/model.tar.gz
