# training script를 이용한 training job
이번 주제를 통해 training script를 통해 머신러닝 모델을 훈련하고 deploy하는 방법을 배웁니다.
fastai와 pytorch를 이용한 두개의 스크립트를 이용하여 각각의 모델을 만들 예정입니다.

## 1. 기본적인 설정
training job을 위한 기본적인 설정을 진행합니다.

In [2]:
import os
import io
import subprocess
import boto3

import PIL

import sagemaker
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.predictor import Predictor

In [3]:
# import random

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
# prefix = "gc-sagemaker"

In [4]:
role = sagemaker.get_execution_role()
print(role)

arn:aws:iam::413929759937:role/service-role/AmazonSageMaker-ExecutionRole-20220330T131248


In [5]:
bucket

'sagemaker-ap-northeast-2-413929759937'

## 3. pytorch
### 3.1 training script 작성

In [9]:
%%writefile ./train_pytorch_resnet18.py

import argparse
import os
import matplotlib.pyplot as plt
import time
import os
import copy
from datetime import datetime
from pytz import timezone
from tqdm import tqdm
import zipfile
import shutil
from pathlib import Path
import logging
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, transforms
import torchvision.models as models

parser = argparse.ArgumentParser()

# Hyperparameters sent by the client are passed as command-line arguments to the script.
parser.add_argument('--num_epochs', type=int, default=1)
parser.add_argument('--batch_size', type=int, default=4)

# SageMaker Container environment
parser.add_argument('--data', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
parser.add_argument('--model_dir', type=str, default=os.environ['SM_MODEL_DIR'])


args, _ = parser.parse_known_args()

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

# 학습을 위해 데이터 증가(augmentation) 및 일반화(normalization)
# 검증을 위한 일반화
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize([224, 224]),
#         transforms.RandomRotation(degrees=(0, 180)),
#         transforms.RandomHorizontalFlip(),
#         transforms.RandomVerticalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

data_dir = args.data
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=args.batch_size,
                                             shuffle=True, num_workers=4)
              for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        start_time = time.time()
        
        start = datetime.now(timezone('Asia/Seoul')
                            ).strftime('%Y-%m-%d %H:%M:%S')
        print('Start = {}'.format(start))

        # 각 에폭(epoch)은 학습 단계와 검증 단계를 갖습니다.
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # 모델을 학습 모드로 설정
            else:
                model.eval()   # 모델을 평가 모드로 설정

            running_loss = 0.0
            running_corrects = 0

            # 데이터를 반복
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # 매개변수 경사도를 0으로 설정
                optimizer.zero_grad()

                # 순전파
                # 학습 시에만 연산 기록을 추적
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # 학습 단계인 경우 역전파 + 최적화
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # 통계
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]*100

            print('{:10}: Loss - {:10.4f} | Acc - {:10.2f}%'.format(
                phase, epoch_loss, epoch_acc))

            # 모델을 깊은 복사(deep copy)함
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
#                 best_model_wts = copy.deepcopy(model.state_dict())
                best_model = model
            
        finish = datetime.now(timezone('Asia/Seoul')
                            ).strftime('%Y-%m-%d %H:%M:%S')
        print('Finish = {}'.format(finish))
        
        time_elapsed = time.time() - start_time
        print('Time: {:10.2f}m'.format(time_elapsed/60))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:10.0f}hr {:10.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600)/60))
    print('Best val Acc: {:10.2f}%'.format(best_acc))

    # 가장 나은 모델 가중치를 불러옴
    torch.save(best_model, os.path.join(args.model_dir, 'model.pth'))
    
    # === Save Model Parameters ===
    logger.info("Model successfully saved at: {}".format(args.model_dir)) 


model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 3)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# 모든 매개변수들이 최적화되었는지 관찰
optimizer_ft = optim.SGD(model_ft.parameters(), lr=1e-1, momentum=0.9)

# 5에폭마다 0.1씩 학습률 감소
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=len(data_loader))
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7,
                                       gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft,
                       exp_lr_scheduler,
                       num_epochs=args.num_epochs)

Overwriting ./train_pytorch_resnet18.py


### 3.2 training job

In [10]:
estimator = PyTorch(entry_point='train_pytorch_resnet18.py',
                    role=role,
#                     instance_type='ml.g4dn.xlarge',
                    instance_type='ml.m5.2xlarge',
                    instance_count=1,
                    framework_version='1.6.0',
                    py_version='py36',
                    max_run = 3*24*60*60,
                    hyperparameters = {'num_epochs': 2, 
                                       'batch_size': 4
                                      }                       
                   )
# s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}'.format(bucket, prefix), content_type='csv')    
#s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}'.format(bucket, prefix), content_type='csv') # SDK v1
estimator.fit('s3://hymenoptera')

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: pytorch-training-2022-04-06-07-05-03-713


2022-04-06 07:05:04 Starting - Starting the training job...
2022-04-06 07:05:30 Starting - Preparing the instances for trainingProfilerReport-1649228703: InProgress
......
2022-04-06 07:06:30 Downloading - Downloading input data...
2022-04-06 07:06:50 Training - Downloading the training image...
2022-04-06 07:07:31 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-04-06 07:07:22,451 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-04-06 07:07:22,454 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-06 07:07:22,463 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-04-06 07:07:22,468 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-04-0

In [11]:
estimator.model_data

's3://sagemaker-ap-northeast-2-413929759937/pytorch-training-2022-04-06-07-05-03-713/output/model.tar.gz'

In [12]:
%%writefile ./inference.py

import json
import logging
import os
import torch
import requests
from PIL import Image
from torchvision import transforms
from torchvision import models
import torch.nn as nn

logger = logging.getLogger(__name__)

def model_fn(model_dir):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(1)
    logger.info('Loading the model')
    print(2)
    model = models.resnet18(pretrained=True)
    print(3)
    num_ftrs = model.fc.in_features
    print(4)
    model.fc = nn.Linear(num_ftrs, 2)
    print(5)
    
    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f, map_location=device), strict=False)
#         model.load_state_dict(torch.load(f))
    print(6)    
    model.to(device).eval()
    print(7)
    logger.info('Done loading model')
    return model


def predict_fn(input_data, model):
    logger.info('Generating prediction based on input parameters.')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    with torch.no_grad():
        return model(input_data.to(device))

Overwriting ./inference.py


In [13]:
model = PyTorchModel(
    model_data=estimator.model_data,
    name=estimator._current_job_name,
    role=role,
    framework_version=estimator.framework_version,
    py_version="py36",
    entry_point='inference.py',
)

predictor = model.deploy(instance_type='ml.m5.xlarge',
                                     initial_instance_count=1)

INFO:sagemaker:Creating model with name: pytorch-training-2022-04-06-07-05-03-713
INFO:sagemaker:Creating endpoint with name pytorch-training-2022-04-06-07-09-21-588


-----!

In [14]:
import torch
from torchvision import datasets, transforms

import numpy as np

test_datasets = datasets.ImageFolder(
    root='hymenoptera_data/val',
    transform= transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]))

test_datasets_loaders = torch.utils.data.DataLoader(test_datasets, batch_size=8, shuffle=True)

In [15]:
pred_label = []
true_label = []
output_of_model = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for i, (inputs, labels) in enumerate(test_datasets_loaders):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = predictor.predict(inputs)
            _, preds = torch.max(torch.from_numpy(outputs), 1)
            print(preds)
            output_of_model.append(outputs)
            pred_label.append(preds.tolist())
            true_label.append(labels.tolist())
pred_label = sum(pred_label, [])
true_label = sum(true_label, [])

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (0) from primary with message "Your invocation timed out while waiting for a response from container primary. Review the latency metrics for each container in Amazon CloudWatch, resolve the issue, and try again.". See https://ap-northeast-2.console.aws.amazon.com/cloudwatch/home?region=ap-northeast-2#logEventViewer:group=/aws/sagemaker/Endpoints/pytorch-training-2022-04-06-07-09-21-588 in account 413929759937 for more information.

### 3.2 confusion matrix

In [16]:
from sklearn.metrics import confusion_matrix

confusion_matrix(true_label, pred_label)

array([], shape=(0, 0), dtype=int64)

### 3.3 accuracy

In [17]:
from sklearn.metrics import accuracy_score

accuracy_score(true_label, pred_label)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


nan

### 3.4 deleting endpoint

In [18]:
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: pytorch-training-2022-04-06-07-09-21-588
INFO:sagemaker:Deleting endpoint with name: pytorch-training-2022-04-06-07-09-21-588
