# <B> # SageMaker pipeline </B>
* Container: codna_python3

## AutoReload

In [1]:
%load_ext autoreload
%autoreload 2

## 0. Install packages

In [2]:
install_needed = False  # should only be True once
# install_needed = False

In [3]:
%%bash
#!/bin/bash

DAEMON_PATH="/etc/docker"
MEMORY_SIZE=10G

FLAG=$(cat $DAEMON_PATH/daemon.json | jq 'has("data-root")')
# echo $FLAG

if [ "$FLAG" == true ]; then
    echo "Already revised"
else
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    sudo cp $DAEMON_PATH/daemon.json $DAEMON_PATH/daemon.json.bak
    sudo cat $DAEMON_PATH/daemon.json.bak | jq '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":"'$MEMORY_SIZE'"}' | sudo tee $DAEMON_PATH/daemon.json > /dev/null
    sudo service docker restart
    echo "Docker Restart"
fi

Already revised


In [4]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker

    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.146.0.tar.gz (718 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.5/718.5 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.146.0-py2.py3-none-any.whl size=964936 sha256=d3a9669420018e3c5e20828dddd14289133eb66b2cdc3c6309016c0923d44bc4
  Stored in directory: /home/ec2-user/.cache/pip/wheels/3a/04/13/2066fc4ef9ed243c9e8710b9c269f29e7711bca655da2eb416
Successfully built sagemaker
Installing collected packages: sagemaker
  Att

## 1. parameter store 설정

In [3]:
import boto3
from utils.ssm import parameter_store

In [4]:
strRegionName=boto3.Session().region_name
pm = parameter_store(strRegionName)
strPrefix = pm.get_params(key="PREFIX")

In [5]:
strBucketName = pm.get_params(key="-".join([strPrefix, "BUCKET"]))
strExecutionRole = pm.get_params(key="-".join([strPrefix, "SAGEMAKER-ROLE-ARN"]))

## 2. Dataset

In [6]:
import os

In [7]:
strS3DataPath = f"s3://{strBucketName}/dataset" 
strLocalDataPath = os.path.join(os.getcwd(), "data")

## 3. MLOps pipeline
* pipeline:
    * https://sagemaker.readthedocs.io/en/stable/amazon_sagemaker_model_building_pipeline.html#id2 
    * [Amazon SageMaker 모델 구축 파이프라인](https://docs.aws.amazon.com/ko_kr/sagemaker/latest/dg/pipelines.html)
    

In [11]:
import os
import time
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.properties import PropertyFile
from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor
from sagemaker.workflow.steps import CacheConfig, ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline_context import PipelineSession, LocalPipelineSession
from sagemaker.workflow.retry import StepRetryPolicy, StepExceptionTypeEnum, SageMakerJobExceptionTypeEnum, SageMakerJobStepRetryPolicy

In [14]:
class pipeline():
    
    def __init__(self, bLocalMode, strPipelineName): 
        
        
        self.strRegionName = boto3.Session().region_name
        self.pm = parameter_store(self.strRegionName)
        self.strPrefix = self.pm.get_params(key="PREFIX")
        
        self.strExecutionRole = self.pm.get_params(key="-".join([self.strPrefix, "SAGEMAKER-ROLE-ARN"]))
        print (self.strExecutionRole)
        self.strBucketName = self.pm.get_params(key="-".join([self.strPrefix, "BUCKET"]))
        self.strPipelineName = strPipelineName
        self.bLocalMode = bLocalMode
     
        if self.bLocalMode:
            self.pipeline_session = LocalPipelineSession()
            self.pipeline_session.config = {'local': {'local_code': True}}
            self.strDataPath = os.path.join(os.getcwd(), "data")
        else:
            self.pipeline_session = PipelineSession()
            self.strDataPath = f"s3://{self.strBucketName}/dataset" 
        
        self.cache_config = CacheConfig(
            enable_caching=True,
            expire_after="T48H"
        )
        
        self.retry_policies=[                
            # retry when resource limit quota gets exceeded
            SageMakerJobStepRetryPolicy(
                exception_types=[SageMakerJobExceptionTypeEnum.RESOURCE_LIMIT],
                expire_after_mins=180,
                interval_seconds=60,
                backoff_rate=1.0
            ),
        ]
        
    def _step_training(self, ):
        
        
        strInstanceType = "ml.m5.xlarge"
        nInstanceCount = 1
        bSpotTraining = False
        
        if self.bLocalMode:
            dicDataChannels = {
                "training": f'file://{os.path.join(self.strDataPath, "train.csv")}',
                "testing": f'file://{os.path.join(self.strDataPath, "test.csv")}',
            }
        else:
            dicDataChannels = {
                "training": os.path.join(self.strDataPath, "train.csv"),
                "testing": os.path.join(self.strDataPath, "test.csv"),
            }
            
        if bSpotTraining:
            nMaxWait = 1*60*60
            nMaxRun = 1*60*60
        else:
            nMaxWait = None
            nMaxRun = 1*60*60
    
        strOutputPath = os.path.join(
            "s3://{}".format(self.strBucketName),
            self.strPipelineName,
            "training",
            "model-output"
        )

        strCodeLocation = os.path.join(
            "s3://{}".format(self.strBucketName),
            self.strPipelineName,
            "training",
            "backup_codes"
        )
        
        dicHyperparameters = {
            "scale_pos_weight" : "19",    
            "max_depth": "2",
            "eta": "0.3",
            "objective": "binary:logistic",
            "num_round": "100",
        }
        
        self.estimator = XGBoost(
            entry_point="xgboost_starter_script.py",
            source_dir="source/train/",
            output_path=strOutputPath,
            code_location=strCodeLocation,
            hyperparameters=dicHyperparameters, ## Contatiner내 env. variable로 들어 감
            role=self.strExecutionRole,
            instance_count=nInstanceCount,
            instance_type=strInstanceType,
            framework_version="1.3-1",
            max_run=nMaxRun,
            use_spot_instances=bSpotTraining,
            max_wait=nMaxWait,
            #keep_alive_period_in_seconds=nKeepAliveSeconds,
            enable_sagemaker_metrics=True,
            volume_size=64, ## GB
            
            sagemaker_session=self.pipeline_session
            
        )
        
        job_name = "-".join([self.strPipelineName, "training-job"])
        step_training_args = self.estimator.fit(
            inputs=dicDataChannels,
            job_name=job_name,
            experiment_config={
              'TrialName': job_name,
              'TrialComponentDisplayName': job_name,
            },
            logs="All",
        )
        
        self.training_process = TrainingStep(
            name="TrainingProcess",
            step_args=step_training_args,
            cache_config=self.cache_config,
            retry_policies=self.retry_policies
        )
            
        print ("  \n== Training Step ==")
        print ("   \nArgs: ", self.training_process.arguments.items())   
        
    
    def _step_evaluation(self, ):
        
        if self.bLocalMode: pipeline_session = LocalPipelineSession() ## processing job에서 local code는 적용되지 않기 때문
        else: pipeline_session=self.pipeline_session 
        
        strInstanceType = "ml.m5.xlarge"
        nInstanceCount = 1
        
        strProcPrefixPath = "/opt/ml/processing"
        strTestDataPath = os.path.join(self.strDataPath, "test.csv")
        
        strOutputPath = os.path.join(
            "s3://{}".format(self.strBucketName),
            self.strPipelineName,
            "evaluation",
            "output"
        )

        strCodeLocation = os.path.join(
            "s3://{}".format(self.strBucketName),
            self.strPipelineName,
            "evaluation",
            "backup_codes"
        )
        
        evaluation_processor = FrameworkProcessor(
            estimator_cls=XGBoost,
            framework_version="1.3-1",
            image_uri=None,
            role=self.strExecutionRole,
            instance_type=strInstanceType,
            instance_count=nInstanceCount,
            base_job_name="evaluation", # bucket에 보이는 이름 (pipeline으로 묶으면 pipeline에서 정의한 이름으로 bucket에 보임)
            
            sagemaker_session=pipeline_session
        )
        
        step_evaluation_args = evaluation_processor.run(
            code="evaluation.py",
            source_dir="source/evaluation/",
            inputs=[
                ProcessingInput(
                    source=strTestDataPath,
                    input_name="test_data",
                    destination=os.path.join(strProcPrefixPath, "test")
                ),
                ProcessingInput(
                    source=self.training_process.properties.ModelArtifacts.S3ModelArtifacts,
                    input_name="model_weight",
                    destination=os.path.join(strProcPrefixPath, "model")
                )
            ],
            outputs=[
                ProcessingOutput(
                    source=os.path.join(strProcPrefixPath, "output"),
                    output_name='evaluation',
                    destination=strOutputPath,
                )
            ],
        )
        
        self.evaluation_report = PropertyFile(
            name="EvaluationReport",
            output_name="evaluation", ## evaluation의 ProcessingOutput의 output_name
            path="evaluation.json", ## evaluate.py 에서 write하는 부분
        )
        
        self.evaluation_process = ProcessingStep(
            name="EvaluationProcess", ## Processing job이름
            step_args=step_evaluation_args,
            depends_on=[self.training_process],
            property_files=[self.evaluation_report],
            cache_config=self.cache_config,
            retry_policies=self.retry_policies
        )
    
        print ("  \n== Evaluation Step ==")
        print ("   \nArgs: ", self.evaluation_process.arguments.items())
        
    
    def _step_deploy(self, ):
        
        if self.bLocalMode: pipeline_session = LocalPipelineSession() ## processing job에서 local code는 적용되지 않기 때문
        else: pipeline_session=self.pipeline_session 
        
        strInstanceType = "ml.m5.xlarge"
        nInstanceCount = 1
        strEndpointName = f"endpoint--{self.strPipelineName}{int(time.time())}"
        strProcPrefixPath = "/opt/ml/processing"
        
        deploy_processor = FrameworkProcessor(
            estimator_cls=XGBoost,
            framework_version="1.3-1",
            image_uri=None,
            role=self.strExecutionRole,
            instance_type=strInstanceType,
            instance_count=nInstanceCount,
            base_job_name="deploy", # bucket에 보이는 이름 (pipeline으로 묶으면 pipeline에서 정의한 이름으로 bucket에 보임)
            sagemaker_session=pipeline_session
        )
        
        step_deploy_args = deploy_processor.run(
            code="deploy.py",
            source_dir="source/deploy/",
            inputs=[
                ProcessingInput(
                    source="source/deploy/inference.py",
                    input_name="inference-py",
                    destination=os.path.join(strProcPrefixPath, "inference")
                ),
            ],
            arguments=[
                "--prefix_deploy", strProcPrefixPath, \
                "--region", self.strRegionName, \
                "--instance_type", strInstanceType, \
                "--model_data", self.training_process.properties.ModelArtifacts.S3ModelArtifacts, \
                "--endpoint_name", strEndpointName, \
                "--execution_role", self.strExecutionRole, \
                "--local_mode", str(self.bLocalMode), \
            ],
            job_name="deploy",
        )
        
        self.pm.put_params(key=self.strPrefix + "ENDPOINT-NAME", value=strEndpointName, overwrite=True)
        
        self.deploy_process = ProcessingStep(
            name="DeployProcess", ## Processing job이름
            step_args=step_deploy_args,
            depends_on=[self.evaluation_process],
            cache_config=self.cache_config,
            retry_policies=self.retry_policies
        )
        
        print ("  \n== Deploy Step ==")
        print ("   \nArgs: ", self.deploy_process.arguments.items())
    
    def _get_pipeline(self, ):
        
        if self.bLocalMode: steps=[self.training_process, self.evaluation_process]
        else: steps=[self.training_process, self.evaluation_process, self.deploy_process]
            
        pipeline = Pipeline(
            name=self.strPipelineName,
            steps=steps,
            sagemaker_session=self.pipeline_session
        )

        return pipeline
                      
    def execution(self, ):
    
        self._step_training()
        self._step_evaluation()
        if not self.bLocalMode: self._step_deploy()
        
        pipeline = self._get_pipeline()
        pipeline.upsert(role_arn=self.strExecutionRole) ## Submit the pipeline definition to the SageMaker Pipelines service 
        execution = pipeline.start()
        execution.describe()
     

In [15]:
pipe = pipeline(
    bLocalMode=True,
    strPipelineName=f'{strPrefix}-PIPELINE'
)
pipe.execution()

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.xlarge.
INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.xlarge.
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436
  
== Training Step ==
   
Args:  dict_items([('AlgorithmSpecification', {'TrainingInputMode': 'File', 'TrainingImage': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1', 'EnableSageMakerMetricsTimeSeries': True}), ('OutputDataConfig', {'S3OutputPath': 's3://sagemaker-us-east-1-419974056037/DJ-SM-IMD-PIPELINE/training/model-output'}), ('StoppingCondition', {'MaxRuntimeInSeconds': 3600}), ('ResourceConfig', {'VolumeSizeInGB': 64, 'InstanceCount': 1, 'InstanceType': 'ml.m5.xlarge'}), ('RoleArn', 'arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436'), ('InputDataConfig', [{'DataSource': {'FileDataSource': {'FileDataDistributionType': 'FullyReplicated', 'FileUri': 'file:///home/ec2-user/SageMaker/sagemaker-immersion-day/data/train.csv'}}, 'ChannelName': 'training'}, {'DataSource': {'FileDataSource': {'FileDataDistributionType': 'FullyReplicate

INFO:sagemaker.processing:Uploaded source/evaluation/ to s3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-15-58-843/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-15-58-843/source/runproc.sh
INFO:sagemaker.processing:Uploaded source/evaluation/ to s3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-15-59-530/source/sourcedir.tar.gz


   
Args:  dict_items([('ProcessingResources', {'ClusterConfig': {'InstanceType': 'ml.m5.xlarge', 'InstanceCount': 1, 'VolumeSizeInGB': 30}}), ('AppSpecification', {'ImageUri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1', 'ContainerEntrypoint': ['/bin/bash', '/opt/ml/processing/input/entrypoint/runproc.sh']}), ('RoleArn', 'arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436'), ('ProcessingInputs', [{'InputName': 'test_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-15-58-843/input/test_data/test.csv', 'LocalPath': '/opt/ml/processing/test', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'model_weight', 'AppManaged': False, 'S3Input': {'S3Uri': <sagemaker.workflow.properties.Properties object at 0x7f7bc1056230>, 'LocalPath': '/opt/ml/processing/model', 'S3DataType': 'S

INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-15-59-530/source/runproc.sh
INFO:sagemaker.processing:Uploaded source/evaluation/ to s3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-15-59-779/source/sourcedir.tar.gz


Starting execution for pipeline DJ-SM-IMD-PIPELINE. Execution ID is 2f0cf5b9-0a04-4484-a95a-4a2403a5af3d


INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-15-59-779/source/runproc.sh
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-4k4pr:
    command: train
    container_name: s215drh6jr-algo-1-4k4pr
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1
    networks:
      sagemaker-local:
        aliases:
        - algo-1-4k4pr
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmpkn09_5qi/algo-1-4k4pr/output:/

Starting pipeline step: 'TrainingProcess'
Creating s215drh6jr-algo-1-4k4pr ... 
Creating s215drh6jr-algo-1-4k4pr ... done
Attaching to s215drh6jr-algo-1-4k4pr
[36ms215drh6jr-algo-1-4k4pr |[0m [2023-04-16 07:16:02.782 0753f1cbd9b3:1 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[36ms215drh6jr-algo-1-4k4pr |[0m [2023-04-16 07:16:02.811 0753f1cbd9b3:1 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
[36ms215drh6jr-algo-1-4k4pr |[0m [2023-04-16:07:16:02:INFO] Imported framework sagemaker_xgboost_container.training
[36ms215drh6jr-algo-1-4k4pr |[0m [2023-04-16:07:16:02:INFO] No GPUs detected (normal if no gpus installed)
[36ms215drh6jr-algo-1-4k4pr |[0m [2023-04-16:07:16:02:INFO] Invoking user training script.
[36ms215drh6jr-algo-1-4k4pr |[0m [2023-04-16:07:16:02:INFO] Installing module with the following command:
[36ms215drh6jr-algo-1-4k4pr |[0m /miniconda3/bin/python3 -m pip install . 
[36ms2

INFO:root:creating /tmp/tmpkn09_5qi/artifacts/output/data
INFO:root:copying /tmp/tmpkn09_5qi/algo-1-4k4pr/output/data/metrics.json -> /tmp/tmpkn09_5qi/artifacts/output/data
INFO:root:copying /tmp/tmpkn09_5qi/model/xgboost-model -> /tmp/tmpkn09_5qi/artifacts/model


[36ms215drh6jr-algo-1-4k4pr exited with code 0
[0mAborting on container exit...


INFO:sagemaker.processing:Uploaded source/evaluation/ to s3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-16-11-398/source/sourcedir.tar.gz


===== Job Complete =====
Pipeline step 'TrainingProcess' SUCCEEDED.
Starting pipeline step: 'EvaluationProcess'


INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-419974056037/evaluation-2023-04-16-07-16-11-398/source/runproc.sh
INFO:sagemaker.local.local_session:Starting processing job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-ele1q:
    container_name: ue08vgn5x8-algo-1-ele1q
    entrypoint:
    - /bin/bash
    - /opt/ml/processing/input/entrypoint/runproc.sh
    environment: []
    image: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1
    networks:
      sagemaker-local:
        aliases:
        - algo-1-ele1q
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmp2n86x_wj/algo-1-ele1q/output:/opt/ml/output
    - /tmp/tmp2n86x_wj/algo-1-ele1q/config:/

Creating ue08vgn5x8-algo-1-ele1q ... 
Creating ue08vgn5x8-algo-1-ele1q ... done
Attaching to ue08vgn5x8-algo-1-ele1q
[36mue08vgn5x8-algo-1-ele1q |[0m [0m
[36mue08vgn5x8-algo-1-ele1q |[0m [1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[36mue08vgn5x8-algo-1-ele1q |[0m [1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[36mue08vgn5x8-algo-1-ele1q |[0m #############################################
[36mue08vgn5x8-algo-1-ele1q |[0m args.model_path: /opt/ml/processing/model/model.tar.gz
[36mue08vgn5x8-algo-1-ele1q |[0m args.test_path: /opt/ml/processing/test/test.csv
[36mue08vgn5x8-algo-1-ele1q |[0m args.output_evaluation_dir: /opt/ml/processing/output
[36mue08vgn5x8-algo-1-ele1q |[0m ****** All folder and files under /opt/ml/processing ****** 
[36mue08vgn5x8-algo-1-ele1q |[0m ('/opt/ml/processing', ['output', 'model', 'test