# <B> Processing step examples </B>

## AutoReload

In [129]:
# Load IPython's autoreload extension and select mode 2, which reloads all
# imported modules before each cell executes, so edits to imported modules
# (e.g. the local utils package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. processing job 만들기

#### 1.1. SKLearnProcessor based processing jobs (Framework based)
> Description: https://sagemaker.readthedocs.io/en/stable/amazon_sagemaker_processing.html <br>
> Using requirements.txt in script/sklearnprocessor: https://stackoverflow.com/questions/69046990/how-to-pass-dependency-files-to-sagemaker-sklearnprocessor-and-use-it-in-pipelin

In [156]:
import boto3
from utils.ssm import parameter_store
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [157]:
# Resolve the AWS region from the default boto3 session and hand it to the
# project's parameter_store helper (utils.ssm).
strRegionName = boto3.Session().region_name
pm = parameter_store(strRegionName)

# Prefix stored under the "PREFIX" key; the cells below prepend it to every
# other parameter key they read or write.
strPrefix = pm.get_params(key="PREFIX")

In [158]:
# Framework-based processor: runs the preprocessing script in the scikit-learn
# 0.20.0 container on a single ml.m5.xlarge instance, using the execution role
# stored in the parameter store.
sklearn_processor = SKLearnProcessor(
    role=pm.get_params(key="".join([strPrefix, "SAGEMAKER-ROLE-ARN"])),
    framework_version="0.20.0",
    instance_count=1,
    instance_type="ml.m5.xlarge",  # alternative kept from the original: instance_type="local"
    # Name shown in the S3 bucket. When the job is bundled into a pipeline, the
    # name defined by the pipeline is what appears in the bucket instead.
    base_job_name="preprocessing",
    # Optional, left disabled: sagemaker_session=pipeline_session
)

In [159]:
# Submit the preprocessing job without blocking (wait=False), then capture the
# description of the most recently started job so later cells can read its
# configured output locations.
sklearn_processor.run(
    code="./sources/preprocessing/preprocessing.py",
    inputs=[
        # Review data whose location is stored in the parameter store.
        ProcessingInput(
            source=pm.get_params(key="".join([strPrefix, "REVIEW-DATA-PATH"])),
            destination="/opt/ml/processing/input",
        ),
        # requirements.txt is uploaded as a second, named input so that it is
        # available inside the container under .../input/requirements.
        ProcessingInput(
            input_name="requirements",
            source="./sources/preprocessing/requirements.txt",
            destination="/opt/ml/processing/input/requirements",
        ),
    ],
    # One output per data split: "<split>_data" is read from
    # /opt/ml/processing/output/<split>.
    outputs=[
        ProcessingOutput(
            output_name=f"{strSplit}_data",
            source=f"/opt/ml/processing/output/{strSplit}",
        )
        for strSplit in ("train", "validation", "test")
    ],
    arguments=["--input_name", "reviews.tsv.gz", "--region", strRegionName],
    wait=False,
    # Optional, left disabled: job_name="preprocessing"
)
preprocessing_job_description = sklearn_processor.jobs[-1].describe()


Job Name:  preprocessing-2023-01-12-13-21-11-244
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/reviews-helpfulness-pipeline/data/reviews.tsv.gz', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'requirements', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-21-11-244/input/requirements/requirements.txt', 'LocalPath': '/opt/ml/processing/input/requirements', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-21-11-244/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Pr

In [138]:
# Echo the description dict captured right after the SKLearnProcessor job was
# submitted (inputs, outputs and their S3 locations).
preprocessing_job_description

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/reviews-helpfulness-pipeline/data/reviews.tsv.gz',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'requirements',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-09-09-774/input/requirements/requirements.txt',
    'LocalPath': '/opt/ml/processing/input/requirements',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-09-09-774/input/code/preprocessing.py',
    'LocalPath': '/opt/ml/processing/inpu

##### 1.1.1. output path 저장

In [139]:
# Parameter-store key suffix for each processing output whose S3 location
# should be persisted; outputs with any other name are only printed.
dicParamSuffixByOutput = {
    "train_data": "TRAIN-DATA-PATH",
    "validation_data": "VALID-DATA-PATH",
    "test_data": "TEST-DATA-PATH",
}

for dicOutput in preprocessing_job_description["ProcessingOutputConfig"]["Outputs"]:
    strOutputName, strS3Uri = dicOutput["OutputName"], dicOutput["S3Output"]["S3Uri"]

    # Store the S3 URI under "<prefix><suffix>", overwriting any earlier value.
    strSuffix = dicParamSuffixByOutput.get(strOutputName)
    if strSuffix is not None:
        pm.put_params(key=strPrefix + strSuffix, value=strS3Uri, overwrite=True)

    print(strOutputName, strS3Uri)

train_data s3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-09-09-774/output/train_data
validation_data s3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-09-09-774/output/validation_data
test_data s3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-09-09-774/output/test_data


####    1.2. ScriptProcessor based processing jobs
> https://docs.aws.amazon.com/sagemaker/latest/dg/processing-container-run-scripts.html

##### 1.2.1. Build a Docker image and register it to ECR

In [140]:
import boto3
from utils.ecr import ecr_handler

In [145]:
# Instantiate the project's ECR helper (utils.ecr); the cells below use it to
# build the Docker image and register it to ECR.
ecr = ecr_handler()

In [146]:
# Settings for building and publishing the custom processing image.
strDockerDir = "./docker/"
strTag = ":latest"

# Repository name: project prefix + "processing-container", lower-cased.
strRepositoryName = "".join([strPrefix, "processing-container"]).lower()

# Region and account id are read from the parameter store. Note that this
# rebinds strRegionName, which an earlier cell set from the boto3 session.
strRegionName = pm.get_params(key="".join([strPrefix, "REGION"]))
strAccountId = pm.get_params(key="".join([strPrefix, "ACCOUNT-ID"]))

In [147]:
# Build the Docker image from the Dockerfile directory via the project's ECR
# helper, passing the repository name (presumably used as the local image
# name -- verify in utils.ecr).
ecr.build_docker(strDockerDir, strRepositoryName)

/home/ec2-user/SageMaker/sm-pipelines
Sending build context to Docker daemon  3.584kB

Step 1/6 : FROM python:3.8-slim-buster
 ---> 6ba145ad2ad6
Step 2/6 : RUN python3 -m pip install --upgrade pip
 ---> Running in 08cc68e9f324
Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 56.6 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-22.3.1
[0mRemoving intermediate container 08cc68e9f324
 ---> 4ebee808d2be
Step 3/6 : RUN pip3 install pandas scikit-learn numpy awscli sagemaker spacy boto3 mecab-python natto-py fsspec s3fs boto3
 ---> Running in b5e87dbe4cfd
Collecting pandas
  Downloading pandas-1.5.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.2/12.2 MB 99.7 MB/s eta 0:00:00
Co

In [148]:
# Register the built image to ECR and keep the URI returned by the helper.
strEcrRepositoryUri = ecr.register_image_to_ecr(
    strRegionName,
    strAccountId,
    strRepositoryName,
    strTag,
)

# Persist the image URI under "<prefix>PROCESSING-ECR-URI" so it can be looked
# up by key later; kept as the cell's last expression so its result is shown.
pm.put_params(
    key="".join([strPrefix, "PROCESSING-ECR-URI"]),
    value=strEcrRepositoryUri,
    overwrite=True,
)

== REGISTER AN IMAGE TO ECR ==
  processing_repository_uri: 419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-pipeline-hand-on-processing-container:latest
aws ecr get-login --region 'ap-northeast-2' --registry-ids '419974056037' --no-include-email


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded

aws ecr create-repository --repository-name 'sm-pipeline-hand-on-processing-container'
docker tag 'sm-pipeline-hand-on-processing-container:latest' '419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-pipeline-hand-on-processing-container:latest'
docker push '419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/sm-pipeline-hand-on-processing-container:latest'
== REGISTER AN IMAGE TO ECR ==


'Store suceess'

##### 1.2.2. Define a processing job

In [160]:
from sagemaker.processing import ScriptProcessor

In [161]:
# Container-based processor: runs the script with python3 inside the image
# whose URI is stored under "<prefix>PROCESSING-ECR-URI", on a single
# ml.m5.xlarge instance.
prep_processor = ScriptProcessor(
    image_uri=pm.get_params(key="".join([strPrefix, "PROCESSING-ECR-URI"])),
    command=["python3"],
    role=pm.get_params(key="".join([strPrefix, "SAGEMAKER-ROLE-ARN"])),
    instance_count=1,
    instance_type="ml.m5.xlarge",  # alternative kept from the original: instance_type="local"
    # Name shown in the S3 bucket. When the job is bundled into a pipeline, the
    # name defined by the pipeline is what appears in the bucket instead.
    base_job_name="preprocessing",
    # Optional, left disabled: sagemaker_session=self.pipeline_session
)

In [162]:
# Submit the same preprocessing script through the ScriptProcessor without
# blocking (wait=False). No requirements input is passed in this call.
prep_processor.run(
    code="./sources/preprocessing/preprocessing.py",
    inputs=[
        # Review data whose location is stored in the parameter store.
        ProcessingInput(
            source=pm.get_params(key="".join([strPrefix, "REVIEW-DATA-PATH"])),
            destination="/opt/ml/processing/input",
        ),
    ],
    # One output per data split: "<split>_data" is read from
    # /opt/ml/processing/output/<split>.
    outputs=[
        ProcessingOutput(
            output_name=f"{strSplit}_data",
            source=f"/opt/ml/processing/output/{strSplit}",
        )
        for strSplit in ("train", "validation", "test")
    ],
    arguments=["--input_name", "reviews.tsv.gz", "--region", strRegionName],
    wait=False,
    # Optional, left disabled: job_name="preprocessing"
)


Job Name:  preprocessing-2023-01-12-13-22-16-821
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/reviews-helpfulness-pipeline/data/reviews.tsv.gz', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-22-16-821/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-01-12-13-22-16-821/output/train_data', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'