# [Module 1.5] 로컬 모드 훈련

본 워크샵의 모든 노트북은 `conda_python3` 커널에서 작업합니다.

이 노트북은 아래와 같은 작업을 합니다.
- SageMaker 로컬 모드로 훈련 스크립트를 먼저 실행해 본 뒤, SageMaker 호스트 모드로 훈련하고 모델 아티팩트 경로를 저장합니다.

# PyTorch CIFAR-10 local training  



In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before executing code so edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sagemaker

# S3 key prefix for this demo (not referenced by the cells visible here).
prefix = "sagemaker/DEMO-pytorch-cnn-cifar10"

# One shared session; its default bucket is the upload target used below.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# IAM execution role handed to the estimators further down.
role = sagemaker.get_execution_role()

In [3]:
import os
import subprocess

# Default to CPU local mode; switch to GPU local mode only when
# `nvidia-smi` runs successfully on this host.
instance_type = "local"

try:
    # Exit status 0 means nvidia-smi ran without reporting an error.
    if subprocess.call(["nvidia-smi"]) == 0:
        instance_type = "local_gpu"
except OSError:
    # nvidia-smi is missing or not executable: stay on CPU local mode.
    # Only OSError is caught so that KeyboardInterrupt and genuine bugs
    # are not silently swallowed.
    pass

print("Instance type = " + instance_type)

Instance type = local_gpu


### Upload the data
We use the `sagemaker.Session.upload_data` function to upload our datasets to an S3 location. The return value, `inputs`, identifies that location -- we will use it later when we start the training job.

In [4]:
# Upload the local dataset directory to S3; the returned S3 URI is later
# passed to the estimators' fit() calls.
inputs = sagemaker_session.upload_data(
    path="../data",
    bucket=bucket,
    key_prefix="data/cifar10",
)
print("s3 inputs: ", inputs)

s3 inputs:  s3://sagemaker-ap-northeast-2-057716757052/data/cifar10


# Construct a script for training 
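The full code for the network model and training loop lives under the `source/` directory; `train.py` is the entry point used by the estimators below.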

Instance type = local_gpu


In [11]:
from sagemaker.pytorch import PyTorch
import os
import subprocess


# Local mode on a GPU host; use "local" instead on a CPU-only machine.
instance_type = "local_gpu"

# Do not pass `session=...` here: the estimator's keyword is
# `sagemaker_session`, and local instance types require a LocalSession,
# which the SDK creates on its own when the argument is omitted.
local_cifar10_estimator = PyTorch(
    entry_point="train.py",
    source_dir="source",
    role=role,
    framework_version="1.6.0",
    py_version="py3",
    instance_count=1,
    instance_type=instance_type,
    # Short run: one epoch, enough to check the script before hosted training.
    hyperparameters={
        "epochs": 1,
        "lr": 0.1,
        "batch_size": 16,
    },
)

# Run the training container locally against the uploaded S3 data.
local_cifar10_estimator.fit(inputs)

Creating 3rbonk6gu9-algo-1-oxgd4 ... 
Creating 3rbonk6gu9-algo-1-oxgd4 ... done
Attaching to 3rbonk6gu9-algo-1-oxgd4
[36m3rbonk6gu9-algo-1-oxgd4 |[0m 2021-06-07 13:42:45,709 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36m3rbonk6gu9-algo-1-oxgd4 |[0m 2021-06-07 13:42:45,753 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36m3rbonk6gu9-algo-1-oxgd4 |[0m 2021-06-07 13:42:45,756 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36m3rbonk6gu9-algo-1-oxgd4 |[0m 2021-06-07 13:42:45,912 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
[36m3rbonk6gu9-algo-1-oxgd4 |[0m /opt/conda/bin/python3.6 -m pip install -r requirements.txt
[36m3rbonk6gu9-algo-1-oxgd4 |[0m Collecting torchsummary==1.5.1
[36m3rbonk6gu9-algo-1-oxgd4 |[0m   Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
[36m3rbonk6gu9-algo-1-oxgd4 |[0m Collecting sagema

## SageMaker Host Mode 로 훈련

In [12]:
from sagemaker.pytorch import PyTorch

# Hosted training on one ml.p3.2xlarge instance.
instance_type = "ml.p3.2xlarge"

cifar10_estimator = PyTorch(
    entry_point="train.py",
    source_dir="source",
    role=role,
    framework_version="1.6.0",
    py_version="py3",
    instance_count=1,
    instance_type=instance_type,
    hyperparameters={
        "epochs": 2,
        "lr": 0.01,
        "batch_size": 16,
    },
)

# Launch the training job with the uploaded S3 data as its input.
cifar10_estimator.fit(inputs)

2021-06-07 13:44:58 Starting - Starting the training job...
2021-06-07 13:45:24 Starting - Launching requested ML instancesProfilerReport-1623073498: InProgress
......
2021-06-07 13:46:24 Starting - Preparing the instances for training.........
2021-06-07 13:47:49 Downloading - Downloading input data......
2021-06-07 13:48:44 Training - Downloading the training image......
2021-06-07 13:49:47 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-06-07 13:49:48,830 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-06-07 13:49:48,854 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-06-07 13:49:48,863 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-06-07 13:49:49,241 sagemaker-training-too

## 모델 아티팩트 저장

In [21]:
# S3 URI of the model.tar.gz produced by the hosted training job.
artifact_path = cifar10_estimator.model_data
print("artifact_path: ", artifact_path)

# Persist the variable with IPython's %store magic
# (presumably so later notebooks can restore it -- confirm against them).
%store artifact_path

artifact_path:  s3://sagemaker-ap-northeast-2-057716757052/pytorch-training-2021-06-07-13-44-58-337/output/model.tar.gz
Stored 'artifact_path' (str)


In [17]:
# S3 URI of the model.tar.gz produced by the local-mode training run.
local_artifact_path = local_cifar10_estimator.model_data
print("local_artifact_path: ", local_artifact_path)
# Persist the variable with IPython's %store magic
# (presumably so later notebooks can restore it -- confirm against them).
%store local_artifact_path

local_artifact_path:  s3://sagemaker-ap-northeast-2-057716757052/pytorch-training-2021-06-07-13-42-38-627/model.tar.gz
Stored 'local_artifact_path' (str)


In [20]:
# ! aws s3 ls {local_artifact_path} --recursive