# <B> Training models using the framework (Torch) </B>

## AutoReload

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local `utils` helpers used below) are reloaded before each cell
# runs and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
!pip install --upgrade pip
!pip install --upgrade sagemaker==2.118.0
#!pip install -U boto3 botocore

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [None]:
import sagemaker 

## 1. parameter store 셋팅

In [3]:
import boto3
from utils.ssm import parameter_store

In [4]:
# Region of the default boto3 session; passed to the parameter-store helper.
strRegionName=boto3.Session().region_name
# Helper from utils.ssm; every stored value below is read/written through it.
pm = parameter_store(strRegionName)
# Common prefix that is prepended to every other parameter key in this notebook.
strPrefix = pm.get_params(key="PREFIX")

In [5]:
# Sanity check: display the SageMaker execution role ARN stored in the parameter store.
# NOTE(review): the rendered output contains the AWS account id — consider
# clearing this cell's output before sharing or committing the notebook.
pm.get_params(key=strPrefix + "SAGEMAKER-ROLE-ARN")

'arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436'

## 2. train

### 2.1 with built-in PyTorch image
* https://sagemaker-examples.readthedocs.io/en/latest/aws_sagemaker_studio/frameworks/pytorch_cnn_cifar10/pytorch_cnn_cifar10.html
* https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html
* train, deploy and inference: https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/sagemaker.pytorch.html

In [6]:
import sagemaker
from sagemaker.pytorch import PyTorch

In [7]:
# Train with the built-in SageMaker PyTorch image, running the training
# container locally on this machine's GPU ("local_gpu").
#
# Fix: the estimator keyword is `sagemaker_session`, not `session`. The old
# `session=` keyword was accepted but ignored, so the explicit LocalSession
# was never actually handed to the estimator.
estimator = PyTorch(
    source_dir="./source/train",        # directory that holds the training code
    entry_point="cifar10.py",           # script inside source_dir that is executed
    role=pm.get_params(key=strPrefix + "SAGEMAKER-ROLE-ARN"),
    framework_version="1.12.1",
    py_version="py38",
    instance_count=1,
    # To train on a managed instance instead, use instance_type="ml.g4dn.xlarge".
    # NOTE(review): in that case the LocalSession below should presumably be
    # removed as well — confirm against the SageMaker SDK local-mode docs.
    instance_type="local_gpu",
    sagemaker_session=sagemaker.LocalSession(),  # use a local session
    output_path=f"s3://{pm.get_params(key=strPrefix + 'BUCKET')}/{'byom-model-output'}",
)

# Name of the input channel under which the training data is passed to the job.
strDataChannel = "TR"
estimator.fit(
    inputs={strDataChannel: pm.get_params(key=strPrefix + "DATA-PATH")},
    wait=True,  # block until training has finished
)

Creating 58zyksoz5y-algo-1-cynzs ... 
Creating 58zyksoz5y-algo-1-cynzs ... done
Attaching to 58zyksoz5y-algo-1-cynzs
[36m58zyksoz5y-algo-1-cynzs |[0m 2023-03-02 04:50:11,838 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36m58zyksoz5y-algo-1-cynzs |[0m 2023-03-02 04:50:11,858 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
[36m58zyksoz5y-algo-1-cynzs |[0m 2023-03-02 04:50:11,866 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36m58zyksoz5y-algo-1-cynzs |[0m 2023-03-02 04:50:11,868 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36m58zyksoz5y-algo-1-cynzs |[0m 2023-03-02 04:50:11,872 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36m58zyksoz5y-algo-1-cynzs |[0m 2023-03-02 04:50:11,905 botocore.credentials INFO     Found credentials from IAM Role: BaseNotebookInstanceEc2Instance

### 아래 코드는 학습이 완료된 후 에러없이 동작합니다. 

In [9]:
# Record the trained model's S3 location in the parameter store.
strModelArtifact = estimator.model_data
pm.put_params(key=strPrefix + "S3-MODEL-ARTIFACT", value=strModelArtifact, overwrite=True)

# Record the URI of the training image that was used.
strTrainImageUri = estimator.training_image_uri()
pm.put_params(key=strPrefix + "TRAIN-IMAGE-URI", value=strTrainImageUri, overwrite=True)

# Echo both values for the reader.
print(f"Model artifact: {strModelArtifact}")
print(f"Train image uri: {strTrainImageUri}")

Model artifact: s3://sagemaker-ap-northeast-2-419974056037/byom-model-output/pytorch-training-2023-03-02-04-50-07-688/model.tar.gz
Train image uri: 763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:1.12.1-gpu-py38


### 2.2 with custom image

#### 2.2.1. Build a Docker image and register it to ECR

In [60]:
import boto3
from utils.ecr import ecr_handler

In [61]:
# Helper from utils.ecr; used below to build the docker image and register it to ECR.
ecr = ecr_handler()

### 소스코드는 2가지 방식으로 제공할 수 있음
* 1.docker image 생성 시 도커 내 "/opt/ml/code"에 코드를 복사하는 방법
    - "Dockerfile"에서 "COPY ./code /opt/ml/code" 이용, 아래 copy_tree 사용
* 2.docker image에 넣지 않고, estimator 셋팅 시 소스코드를 전달하는 방법 (예제에서 사용하는 방법)
    - "source_dir", "entry_point" 활용

In [62]:
# Only needed for option 1 (baking the code into the image): uncomment to copy
# the training source into the docker build context. This also requires the
# `COPY ./code /opt/ml/code` line in docker/Dockerfile to be enabled.
# NOTE(review): `copy_tree` is only imported in a later cell
# (`from distutils.dir_util import copy_tree`); import it first if you enable this.
#copy_tree("./source/train", "./docker/code") 

['./docker/code/cifar10.py',
 './docker/code/.ipynb_checkpoints/cifar10-checkpoint.py',
 './docker/code/.ipynb_checkpoints/requirements-checkpoint.txt',
 './docker/code/requirements.txt']

In [63]:
# Print the Dockerfile of the custom training image with syntax highlighting.
! pygmentize docker/Dockerfile

[34mFROM[39;49;00m[37m [39;49;00m[33mpytorch/pytorch:1.4-cuda10.1-cudnn7-devel[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[34mRUN[39;49;00m[37m [39;49;00mpython3[37m [39;49;00m-m[37m [39;49;00mpip[37m [39;49;00minstall[37m [39;49;00m--upgrade[37m [39;49;00mpip[37m[39;49;00m
[34mRUN[39;49;00m[37m [39;49;00mpip3[37m [39;49;00minstall[37m [39;49;00mpyOpenSSL[37m [39;49;00m--upgrade[37m[39;49;00m
[34mRUN[39;49;00m[37m [39;49;00mpip3[37m [39;49;00minstall[37m [39;49;00msagemaker-training[37m[39;49;00m
[37m[39;49;00m
[37m# Copies the training code inside the container[39;49;00m[37m[39;49;00m
[37m#COPY ./code /opt/ml/code[39;49;00m[37m[39;49;00m
[34mWORKDIR[39;49;00m[37m [39;49;00m[33m/opt/ml/code[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Defines train.py as script entrypoint[39;49;00m[37m[39;49;00m
[34mENV[39;49;00m[37m [39;49;00mSAGEMAKER_PROGRAM[37m [39;49;00mcifar10.py[37m[39;49;00m
[34mENV[39;49;00m[37m [3

In [64]:
# Location of the docker build context and the tag suffix of the image.
strDockerDir = "./docker/"
strTag = ":latest"

# Repository name is derived from the common prefix and lower-cased.
strRepositoryName = (strPrefix + "pytorch-container").lower()

# Region and account id are read from the parameter store. This rebinds
# strRegionName, which was set from the boto3 session earlier in the notebook.
strRegionName = pm.get_params(key=strPrefix + "REGION")
strAccountId = pm.get_params(key=strPrefix + "ACCOUNT-ID")

In [65]:
# Build the docker image from the build context in strDockerDir.
# NOTE(review): the image is presumably named after strRepositoryName (the later
# `docker tag` log refers to '<repository name>:latest') — confirm in utils/ecr.
ecr.build_docker(strDockerDir, strRepositoryName)

/home/ec2-user/SageMaker/sagemaker-train-deploy
/home/ec2-user/SageMaker/sagemaker-train-deploy/docker
Sending build context to Docker daemon  22.02kB

Step 1/7 : FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-devel
1.4-cuda10.1-cudnn7-devel: Pulling from pytorch/pytorch
7ddbc47eeb70: Pulling fs layer
c1bbdc448b72: Pulling fs layer
8c3b70e39044: Pulling fs layer
45d437916d57: Pulling fs layer
d8f1569ddae6: Pulling fs layer
85386706b020: Pulling fs layer
ee9b457b77d0: Pulling fs layer
be4f3343ecd3: Pulling fs layer
30b4effda4fd: Pulling fs layer
b398e882f414: Pulling fs layer
64e532b06236: Pulling fs layer
31188d0173e6: Pulling fs layer
4a1386f93f29: Pulling fs layer
87d47d0287c7: Pulling fs layer
7a932c9d3ad4: Pulling fs layer
85386706b020: Waiting
45d437916d57: Waiting
be4f3343ecd3: Waiting
b398e882f414: Waiting
ee9b457b77d0: Waiting
30b4effda4fd: Waiting
64e532b06236: Waiting
4a1386f93f29: Waiting
31188d0173e6: Waiting
7a932c9d3ad4: Waiting
87d47d0287c7: Waiting
d8f1569ddae6: Waiting
8c3b7

In [66]:
# Register the locally built image to ECR and keep the resulting image URI.
# Per the captured log, the helper logs in to ECR, creates the repository,
# tags the image and pushes it.
strEcrRepositoryUri = ecr.register_image_to_ecr(strRegionName, strAccountId, strRepositoryName, strTag)
# Store the image URI in the parameter store; the training cell below reads it back.
# As the last expression, the return value of put_params is shown as the cell output.
pm.put_params(key=strPrefix + "PYTORCH-ECR-URI", value=strEcrRepositoryUri, overwrite=True)

== REGISTER AN IMAGE TO ECR ==
  processing_repository_uri: 419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-byom-pytorch-container:latest
aws ecr get-login --region 'ap-northeast-2' --registry-ids '419974056037' --no-include-email


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded

aws ecr create-repository --repository-name 'sm-byom-pytorch-container'
docker tag 'sm-byom-pytorch-container:latest' '419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-byom-pytorch-container:latest'
docker push '419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-byom-pytorch-container:latest'
== REGISTER AN IMAGE TO ECR ==


'Store suceess'

#### 2.2.2. Train model with the custom image
* https://docs.aws.amazon.com/sagemaker/latest/dg/adapt-training-container.html#:%7E:text=Step%202%3A%20Create%20and%20upload%20the%20Dockerfile%20and%20Python%20training%20scripts

In [67]:
from distutils.dir_util import copy_tree
from sagemaker.estimator import Estimator

In [72]:
# Values looked up from the parameter store for the custom-image training job.
strImageUri = pm.get_params(key=strPrefix + "PYTORCH-ECR-URI")
strRoleArn = pm.get_params(key=strPrefix + "SAGEMAKER-ROLE-ARN")
strOutputPath = f"s3://{pm.get_params(key=strPrefix + 'BUCKET')}/{'byom-model-output'}"

# Generic Estimator that runs the custom ECR image. The training code is not
# baked into the image; it is supplied through source_dir / entry_point.
# No sagemaker_session is passed here.
estimator = Estimator(
    image_uri=strImageUri,
    role=strRoleArn,
    source_dir="./source/train",
    entry_point="cifar10.py",
    instance_count=1,
    # Use instance_type="ml.g4dn.xlarge" to train on a managed instance instead.
    instance_type="local_gpu",
    volume_size=30,
    max_run=360000,
    input_mode="File",
    output_path=strOutputPath,
)

In [74]:
# Channel name under which the training data location is handed to the job.
strDataChannel = "TR"
dictInputs = {strDataChannel: pm.get_params(key=strPrefix + "DATA-PATH")}

# Start training and wait for it to finish (wait=True).
estimator.fit(inputs=dictInputs, wait=True)

Creating nexzhhg804-algo-1-6k6rc ... 
Creating nexzhhg804-algo-1-6k6rc ... done
Attaching to nexzhhg804-algo-1-6k6rc
[36mnexzhhg804-algo-1-6k6rc |[0m 2023-03-02 07:07:17,858 botocore.credentials INFO     Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
[36mnexzhhg804-algo-1-6k6rc |[0m 2023-03-02 07:07:18,018 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
[36mnexzhhg804-algo-1-6k6rc |[0m /opt/conda/bin/python3 -m pip install -r requirements.txt
[36mnexzhhg804-algo-1-6k6rc |[0m Collecting sagemaker
[36mnexzhhg804-algo-1-6k6rc |[0m   Downloading sagemaker-2.135.1.post0.tar.gz (674 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m674.4/674.4 kB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m31m?[0m eta [36m-:--:--[0m
[36mnexzhhg804-algo-1-6k6rc |[0m [?25h  Preparing metadata (setup.py) ... [?25ldone
[36mnexzhhg804-algo-1-6k6rc |[0m Collecting pandas
[36mnexzhhg804-algo-1-6k6rc |[0m   Downloading panda