# <B> Training models using the framework (Torch) </B>

## AutoReload

In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
!pip install --upgrade pip
!pip install --upgrade sagemaker==2.118.0
!pip install -U boto3 botocore

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


## 1. Parameter store setup

In [17]:
import boto3
from utils.ssm import parameter_store

In [18]:
# Region of the default boto3 session, used to open the parameter-store helper.
strRegionName = boto3.Session().region_name
pm = parameter_store(strRegionName)

# Every parameter key below is built as strPrefix + "<NAME>".
strPrefix = pm.get_params(key="PREFIX")

In [19]:
# Look up the SageMaker execution role ARN; as the last expression of the cell
# its value is echoed in the cell output.
pm.get_params(key=strPrefix + "SAGEMAKER-ROLE-ARN")

'arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436'

## 2. train

### 2.1 with built-in PyTorch image
* https://sagemaker-examples.readthedocs.io/en/latest/aws_sagemaker_studio/frameworks/pytorch_cnn_cifar10/pytorch_cnn_cifar10.html
* https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html
* train, deploy and inference: https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/sagemaker.pytorch.html

In [20]:
import sagemaker
from sagemaker.pytorch import PyTorch

In [21]:
# Fetch the parameter-store values first so the constructor call stays readable.
strRoleArn = pm.get_params(key=strPrefix + "SAGEMAKER-ROLE-ARN")
strBucketName = pm.get_params(key=strPrefix + "BUCKET")
strOutputPath = f"s3://{strBucketName}/byom-model-output"

# Training on the built-in PyTorch image: no image_uri is passed, so the SDK
# retrieves one based on the framework settings and instance type.
estimator = PyTorch(
    entry_point="cifar10.py",
    source_dir="./source/train",
    role=strRoleArn,
    framework_version="1.12.1",
    py_version="py38",
    instance_type="ml.g4dn.xlarge",  # switch to "local_gpu" to run in local mode
    instance_count=1,
    output_path=strOutputPath,
)

# Start the job with a single input channel named "TR" and wait for it.
strDataChannel = "TR"
strTrainDataPath = pm.get_params(key=strPrefix + "DATA-PATH")
estimator.fit(inputs={strDataChannel: strTrainDataPath}, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-02-16-01-22-09-857


2023-02-16 01:22:10 Starting - Starting the training job...
2023-02-16 01:22:26 Starting - Preparing the instances for training......
2023-02-16 01:23:37 Downloading - Downloading input data
2023-02-16 01:23:37 Training - Downloading the training image...........................
2023-02-16 01:27:44 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-02-16 01:27:55,361 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-02-16 01:27:55,380 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-02-16 01:27:55,390 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-02-16 01:27:55,394 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-02-16 01:27:55,

### 아래 코드는 학습이 완료된 후 에러없이 동작합니다. 

In [22]:
# Resolve both values once and reuse them: every call to training_image_uri()
# triggers (and logs) a fresh image lookup.
strModelArtifact = estimator.model_data
strTrainImageUri = estimator.training_image_uri()

# Save them in the parameter store, overwriting any existing entries.
pm.put_params(key=strPrefix + "S3-MODEL-ARTIFACT", value=strModelArtifact, overwrite=True)
pm.put_params(key=strPrefix + "TRAIN-IMAGE-URI", value=strTrainImageUri, overwrite=True)

print(f"Model artifact: {strModelArtifact}")
print(f"Train image uri: {strTrainImageUri}")

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Model artifact: s3://sagemaker-ap-northeast-2-419974056037/byom-model-output/pytorch-training-2023-02-16-01-22-09-857/output/model.tar.gz
Train image uri: 763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:1.12.1-gpu-py38


### 2.2 with custom image

#### 2.2.1. Build a Docker image and register it to ECR

In [84]:
import boto3
from utils.ecr import ecr_handler

In [85]:
# Helper from utils.ecr; used below to build the Docker image and register it in ECR.
ecr = ecr_handler()

In [86]:
# Print the Dockerfile of the custom training image with syntax highlighting.
! pygmentize docker/Dockerfile

[34mFROM[39;49;00m[37m [39;49;00m[33mpytorch/pytorch:1.4-cuda10.1-cudnn7-devel[39;49;00m

[34mRUN[39;49;00m[37m [39;49;00mpython3 -m pip install --upgrade pip
[34mRUN[39;49;00m[37m [39;49;00mpip3 install pyOpenSSL --upgrade
[34mRUN[39;49;00m[37m [39;49;00mpip3 install sagemaker-training

[37m# Copies the training code inside the container[39;49;00m
[34mCOPY[39;49;00m[37m [39;49;00m./code /opt/ml/code
[34mWORKDIR[39;49;00m[37m [39;49;00m[33m/opt/ml/code[39;49;00m

[37m# Defines train.py as script entrypoint[39;49;00m
[34mENV[39;49;00m[37m [39;49;00mSAGEMAKER_PROGRAM cifar10.py
[34mENV[39;49;00m[37m [39;49;00mPYTHONUNBUFFERED TRUE


In [87]:
# Repository / local image name: prefix + "pytorch-container", lower-cased.
strRepositoryName = (strPrefix + "pytorch-container").lower()
strDockerDir = "./docker/"  # directory handed to ecr.build_docker
strTag = ":latest"  # note: includes the leading colon

# Region and account id are read from the parameter store.
# This replaces the strRegionName taken from the boto3 session earlier.
strRegionName = pm.get_params(key=strPrefix + "REGION")
strAccountId = pm.get_params(key=strPrefix + "ACCOUNT-ID")

In [88]:
# Stage the training sources into the Docker build context BEFORE building.
# The Dockerfile runs `COPY ./code /opt/ml/code`, so the image only contains
# the current scripts if ./docker/code has been populated first; on a fresh
# Restart & Run All the directory would otherwise be missing or stale.
import os
from distutils.dir_util import copy_tree

copy_tree("./source/train", os.path.join(strDockerDir, "code"))

# Build the image from strDockerDir and tag it with the repository name.
ecr.build_docker(strDockerDir, strRepositoryName)

/home/ec2-user/SageMaker/bring-your-own-model
/home/ec2-user/SageMaker/bring-your-own-model/docker
Sending build context to Docker daemon  22.02kB

Step 1/8 : FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-devel
 ---> 76c152fbfd03
Step 2/8 : RUN python3 -m pip install --upgrade pip
 ---> Using cache
 ---> 3e6cda4f983f
Step 3/8 : RUN pip3 install pyOpenSSL --upgrade
 ---> Using cache
 ---> 9a19fc10d7b0
Step 4/8 : RUN pip3 install sagemaker-training
 ---> Using cache
 ---> 0b5306eea8bd
Step 5/8 : COPY ./code /opt/ml/code
 ---> Using cache
 ---> 33518b6490fd
Step 6/8 : WORKDIR /opt/ml/code
 ---> Using cache
 ---> 470f0a5751a1
Step 7/8 : ENV SAGEMAKER_PROGRAM cifar10.py
 ---> Using cache
 ---> c9f840cf58f4
Step 8/8 : ENV PYTHONUNBUFFERED TRUE
 ---> Using cache
 ---> 7752acff6c9d
Successfully built 7752acff6c9d
Successfully tagged sm-byom-pytorch-container:latest

/home/ec2-user/SageMaker/bring-your-own-model


In [89]:
# Register the locally built image in ECR (the command trace in the output shows
# login, create-repository, docker tag and docker push), then store the returned
# URI under "<prefix>PYTORCH-ECR-URI" so the estimator cell can read it back.
strEcrRepositoryUri = ecr.register_image_to_ecr(strRegionName, strAccountId, strRepositoryName, strTag)
pm.put_params(key=strPrefix + "PYTORCH-ECR-URI", value=strEcrRepositoryUri, overwrite=True)

== REGISTER AN IMAGE TO ECR ==
  processing_repository_uri: 419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-byom-pytorch-container:latest
aws ecr get-login --region 'ap-northeast-2' --registry-ids '419974056037' --no-include-email


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded

aws ecr create-repository --repository-name 'sm-byom-pytorch-container'



An error occurred (RepositoryAlreadyExistsException) when calling the CreateRepository operation: The repository with name 'sm-byom-pytorch-container' already exists in the registry with id '419974056037'


docker tag 'sm-byom-pytorch-container:latest' '419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-byom-pytorch-container:latest'
docker push '419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-byom-pytorch-container:latest'
== REGISTER AN IMAGE TO ECR ==


'Store suceess'

#### 2.2.2. Train model with the custom image
* https://docs.aws.amazon.com/sagemaker/latest/dg/adapt-training-container.html#:%7E:text=Step%202%3A%20Create%20and%20upload%20the%20Dockerfile%20and%20Python%20training%20scripts

In [90]:
from distutils.dir_util import copy_tree
from sagemaker.estimator import Estimator

In [91]:
# Copy the training scripts into ./docker/code, the directory the Dockerfile
# copies into the image as /opt/ml/code. The returned list of copied files is
# echoed as the cell output.
# NOTE(review): the image has to be (re)built after this copy for script
# changes to reach the container.
copy_tree("./source/train", "./docker/code")

['./docker/code/.ipynb_checkpoints/cifar10-checkpoint.py',
 './docker/code/.ipynb_checkpoints/requirements-checkpoint.txt',
 './docker/code/requirements.txt',
 './docker/code/cifar10.py']

In [92]:
# Fetch the parameter-store values up front.
strImageUri = pm.get_params(key=strPrefix + "PYTORCH-ECR-URI")
strRoleArn = pm.get_params(key=strPrefix + "SAGEMAKER-ROLE-ARN")
strBucketName = pm.get_params(key=strPrefix + "BUCKET")

# Generic Estimator that runs the custom image registered in ECR.
estimator = Estimator(
    image_uri=strImageUri,
    role=strRoleArn,
    instance_type="local_gpu",  # use "ml.g4dn.xlarge" to train on a managed instance
    instance_count=1,
    volume_size=30,
    max_run=360000,
    input_mode="File",
    output_path=f"s3://{strBucketName}/byom-model-output",
)

In [None]:
# Train with a single input channel named "TR" and wait for the job to finish.
strDataChannel = "TR"
dictTrainInputs = {strDataChannel: pm.get_params(key=strPrefix + "DATA-PATH")}
estimator.fit(inputs=dictTrainInputs, wait=True)

INFO:sagemaker:Creating training-job with name: sm-byom-pytorch-container-2023-02-07-07-51-20-184
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-axxbt:
    command: train
    container_name: s2ul0rjqso-algo-1-axxbt
    deploy:
      resources:
        reservations:
          devices:
          - capabilities:
            - gpu
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-byom-pytorch-container:latest
    networks:
      sagemaker-local:
        aliases:
        - algo-1-axxbt
    st

Creating s2ul0rjqso-algo-1-axxbt ... 
Creating s2ul0rjqso-algo-1-axxbt ... done
Attaching to s2ul0rjqso-algo-1-axxbt
[36ms2ul0rjqso-algo-1-axxbt |[0m 2023-02-07 07:51:23,120 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
[36ms2ul0rjqso-algo-1-axxbt |[0m /opt/conda/bin/python3 -m pip install -r requirements.txt
[36ms2ul0rjqso-algo-1-axxbt |[0m Collecting pandas
[36ms2ul0rjqso-algo-1-axxbt |[0m   Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m100.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m:--:--[0m
[36ms2ul0rjqso-algo-1-axxbt |[0m Installing collected packages: pandas
[36ms2ul0rjqso-algo-1-axxbt |[0m Successfully installed pandas-1.3.5
[36ms2ul0rjqso-algo-1-axxbt |[0m [0m2023-02-07 07:51:27,331 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
[36ms2ul0rjqso-algo