# <B> Training models using the framework (Torch) </B>

## AutoReload

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `utils` helpers) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## 1. Parameter store setup

In [22]:
import boto3
from utils.ssm import parameter_store

In [23]:
# The region is taken from the ambient boto3 session; it is the only argument
# the project's parameter-store helper is constructed with.
boto_session = boto3.Session()
strRegionName = boto_session.region_name

# `pm` is reused by every later cell to read/write shared settings, and
# `strPrefix` (stored under the "PREFIX" key) is prepended to all other keys.
pm = parameter_store(strRegionName)
strPrefix = pm.get_params(key="PREFIX")

## 2. train

### 2.1 with built-in PyTorch image
* https://sagemaker-examples.readthedocs.io/en/latest/aws_sagemaker_studio/frameworks/pytorch_cnn_cifar10/pytorch_cnn_cifar10.html
* https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html
* train, deploy and inference: https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/sagemaker.pytorch.html

In [24]:
from sagemaker.pytorch import PyTorch

In [None]:
# Read the execution role and the output bucket from the parameter store once,
# so the estimator definition below only contains plain values.
strRoleArn = pm.get_params(key=strPrefix + "SAGEMAKER-ROLE-ARN")
strBucketName = pm.get_params(key=strPrefix + "BUCKET")

# No image_uri is given: the SDK picks the built-in PyTorch training image from
# framework_version / py_version / instance_type (see the INFO log in the output).
estimator = PyTorch(
    entry_point="cifar10.py",        # training script, resolved inside source_dir
    source_dir="./source",
    framework_version="1.10.0",
    py_version="py38",
    role=strRoleArn,
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    output_path=f"s3://{strBucketName}/byom-model-output",
)

# Launch the training job on the dataset location stored under "DATA-PATH".
# wait=True keeps the cell running while the job logs stream into the output.
strDataPath = pm.get_params(key=strPrefix + "DATA-PATH")
estimator.fit(inputs=strDataPath, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-02-03-14-38-26-194


2023-02-03 14:38:26 Starting - Starting the training job...
2023-02-03 14:38:43 Starting - Preparing the instances for training......
2023-02-03 14:39:46 Downloading - Downloading input data...
2023-02-03 14:40:06 Training - Downloading the training image...................

### 2.2 with custom image

#### 2.2.1. Build a Docker image and register it to ECR

In [None]:
import boto3
from utils.ecr import ecr_handler

In [None]:
# Instantiate the project's ECR helper (utils.ecr.ecr_handler); the cells below
# use it to build the Docker image and register it in ECR.
ecr = ecr_handler()

In [None]:
# Print the Dockerfile for the custom training image with syntax highlighting
# (shell escape to the Pygments CLI) so the reader can inspect it before building.
! pygmentize docker/Dockerfile

In [None]:
# Fixed settings for the image build.
strDockerDir = "./docker/"
strTag = ":latest"

# Repository name is derived from the project prefix and lower-cased
# (presumably because ECR repository names must be lowercase -- confirm).
strRepositoryName = (strPrefix + "pytorch-container").lower()

# Region and account id come from the parameter store.
# NOTE: this rebinds strRegionName, which was first set from the boto3 session.
strRegionName = pm.get_params(key=strPrefix + "REGION")
strAccountId = pm.get_params(key=strPrefix + "ACCOUNT-ID")

In [None]:
# Build the image via the project helper, passing the Docker directory and the
# repository name (presumably used as the local image name -- confirm in utils.ecr).
ecr.build_docker(strDockerDir, strRepositoryName)

In [None]:
# Register the image in ECR through the project helper; the value it returns is
# treated as the image URI.
strEcrRepositoryUri = ecr.register_image_to_ecr(
    strRegionName,
    strAccountId,
    strRepositoryName,
    strTag,
)

# Persist the URI under "<prefix>PYTORCH-ECR-URI" so the training cell can read
# it back as image_uri; overwrite=True replaces a value left by an earlier run.
pm.put_params(
    key=strPrefix + "PYTORCH-ECR-URI",
    value=strEcrRepositoryUri,
    overwrite=True,
)

#### 2.2.2. Train model with the custom image
* https://docs.aws.amazon.com/sagemaker/latest/dg/adapt-training-container.html#:%7E:text=Step%202%3A%20Create%20and%20upload%20the%20Dockerfile%20and%20Python%20training%20scripts

In [None]:
from distutils.dir_util import copy_tree
from sagemaker.estimator import Estimator

In [None]:
# Mirror the training sources into ./docker/code, inside the Docker directory
# used for the custom image.
#
# shutil.copytree replaces distutils.dir_util.copy_tree here because:
#   * distutils is deprecated (PEP 632) and removed in Python 3.12;
#   * copy_tree caches the directories it has created, so re-running this cell
#     after deleting ./docker/code fails within the same kernel.
# dirs_exist_ok=True (Python 3.8+) keeps copy_tree's behaviour of merging into
# an already existing destination directory.
import shutil

shutil.copytree("./source", "./docker/code", dirs_exist_ok=True)

In [None]:
# Fetch the stored settings first: the custom image URI registered earlier, the
# execution role, and the bucket that receives the model artifacts.
strImageUri = pm.get_params(key=strPrefix + "PYTORCH-ECR-URI")
strRoleArn = pm.get_params(key=strPrefix + "SAGEMAKER-ROLE-ARN")
strBucketName = pm.get_params(key=strPrefix + "BUCKET")

# Generic Estimator: the training code is expected to live inside the custom
# image, so no entry_point/source_dir is passed here.
estimator = Estimator(
    image_uri=strImageUri,
    role=strRoleArn,
    # An alternative instance_type of "local_gpu" was left commented out in the
    # original cell (presumably for SageMaker local mode -- confirm before use).
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    input_mode="File",
    volume_size=30,
    max_run=360000,
    output_path=f"s3://{strBucketName}/byom-model-output",
)

In [None]:
# Train with the custom image on the dataset location stored under "DATA-PATH".
# wait=True keeps the cell running while the training job executes.
strDataPath = pm.get_params(key=strPrefix + "DATA-PATH")
estimator.fit(inputs=strDataPath, wait=True)