# SageMaker Pipeline (pipeline.py) 로컬에서 Test 하기



# 1. 환경 설정 및 컨피그 파일 로딩

In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Locations of the two JSON config files, relative to the notebook directory.
code_pipeline_train_config_json_path, sm_pipeline_train_config_json_path = (
    'pipelines/ncf/src/code_pipeline_train_config.json',
    'pipelines/ncf/src/sm_pipeline_train_config.json',
)

In [14]:
from pipelines.ncf.src.common_utils import load_json

import json

# Read both config files into dictionaries.
code_pipeline_train_dict = load_json(code_pipeline_train_config_json_path)
sm_pipeline_train_dict = load_json(sm_pipeline_train_config_json_path)

# Echo each config as pretty-printed JSON under its heading.
print(
    "Code Pipeline Series Params: ",
    json.dumps(code_pipeline_train_dict, indent=2),
    sep="\n",
)
print(
    "SageMaker Pipeline Series Params: ",
    json.dumps(sm_pipeline_train_dict, indent=2),
    sep="\n",
)


Code Pipeline Series Params: 
{
  "code_pipeline_role_arn": "arn:aws:iam::057716757052:role/code-pipeline-gsmoon",
  "code_build_service_arn": "arn:aws:iam::057716757052:role/codebuild-gsmoon",
  "project_prefix": "CodePipeline-Train-NCF",
  "region": "us-east-1",
  "account_id": "057716757052",
  "train_code_repo_name": "ncf-train",
  "code_build_project_name": "ncf-training-sm-pipeline",
  "bucket": "sagemaker-us-east-1-057716757052",
  "code_pipeline_name": "ncf-training-code-pipeline",
  "model_package_group_name": "NCF-Model-CodePipeline",
  "branch_name": "master"
}
SageMaker Pipeline Series Params: 
{
  "project_prefix": "SageMaker-Train-NCF",
  "s3_input_data_uri": "s3://sagemaker-us-east-1-057716757052/NCFModel/data",
  "sm_pipeline_name": "ncf-sm-pipeline",
  "training_instance_type": "ml.p3.2xlarge",
  "training_instance_count": 1,
  "ModelApprovalStatus": "PendingManualApproval",
  "inference_image_uri": "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.8.1-

### 필요한 설정 값 로딩

In [15]:
import boto3
import sagemaker
import os

# Values taken from the CodePipeline config.
region, account_id, bucket, role, model_package_group_name = (
    code_pipeline_train_dict[key]
    for key in (
        "region",
        "account_id",
        "bucket",
        "code_build_service_arn",
        "model_package_group_name",
    )
)

# Values taken from the SageMaker pipeline config.
(
    pipeline_name,
    s3_input_data_uri,
    project_prefix,
    inference_image_uri,
    training_instance_type,
    training_instance_count,
    ModelApprovalStatus,
) = (
    sm_pipeline_train_dict[key]
    for key in (
        "sm_pipeline_name",
        "s3_input_data_uri",
        "project_prefix",
        "inference_image_uri",
        "training_instance_type",
        "training_instance_count",
        "ModelApprovalStatus",
    )
)

# 2. src 코드 S3 업로딩 
- 리패키징 람다 스텝에서 사용.
- code_buildspec.yml 파일의 upload.py 단계에 해당함.

## 2.1. source.tar.gz 로 압축

In [16]:
# Directory that holds the pipeline sources, and the name of the archive built from it.
code_data_dir, code_artifact_name = 'pipelines/ncf/src', 'source.tar.gz'

In [17]:
%%sh -s {code_data_dir} {code_artifact_name}
# Package the pipeline sources into one gzip-compressed tar archive.
#   $1: directory that holds the sources (the archive is created inside it)
#   $2: archive file name
# -e: stop at the first failing command, so a failed cd can never let rm/tar
#     run in the wrong directory; -u: fail on a missing argument.
set -eu

code_data_dir=$1
code_artifact_name=$2

cd "$code_data_dir"
# Remove any previous archive so it is not picked up by the glob below.
rm -rf "$code_artifact_name"
# *.* matches only entries whose name contains a dot.
tar -czvf "$code_artifact_name" *.*

code_pipeline_train_config.json
common_utils.py
config.py
data_utils.py
evaluate.py
iam_change_model_approval.py
iam_create_endpoint.py
iam_helper.py
inference.py
inference_utils.py
model_config.json
model.py
pipeline_util.py
requirements.txt
sm_pipeline_train_config.json
train_lib.py
train.py


## 2.2. S3 에 업로딩

In [18]:
# Base S3 location under which the packaged source code is stored.
source_code_prefix = 'code'
s3_code_uri = "s3://{}/{}".format(bucket, source_code_prefix)

In [19]:
# List what is currently stored under the code location, then delete it
# recursively (presumably so the upload that follows starts from a clean prefix).
! aws s3 ls {s3_code_uri} --recursive
! aws s3 rm {s3_code_uri} --recursive

2022-11-20 06:27:45      12711 code/source.tar.gz
delete: s3://sagemaker-us-east-1-057716757052/code/source.tar.gz


In [20]:
import os
# Local path of the archive: the archive file name joined onto the source directory.
local_code = os.path.join(code_data_dir, code_artifact_name)

In [21]:
# Upload the local archive to the S3 code location, then echo that location.
_ = sagemaker.s3.S3Uploader.upload(local_code, s3_code_uri)
print(s3_code_uri)

s3://sagemaker-us-east-1-057716757052/code


s3_code_uri 끝에 업로드한 파일 이름(source.tar.gz)을 덧붙임

In [22]:
# Full S3 URI of the uploaded archive. It is rebuilt from its parts rather than
# appended to the current value, so re-running this cell cannot produce
# ".../source.tar.gz/source.tar.gz". The file name comes from
# code_artifact_name instead of a repeated literal, and no OS path separator
# is involved in building the URI.
s3_code_uri = f"s3://{bucket}/{source_code_prefix}/{code_artifact_name}"
print("s3_code_uri: \n", s3_code_uri)

s3_code_uri: 
 s3://sagemaker-us-east-1-057716757052/code/source.tar.gz


## 2.3. code_location.json 파일에 위치 명시하고 저장

In [23]:
def store_s3_code_uri_json(s3_code_uri, json_file_name="code_location.json"):
    """Record the S3 location of the source archive in a small JSON file.

    The file holds a single object of the form ``{"s3_code_uri": <value>}``
    and is overwritten if it already exists.

    Args:
        s3_code_uri: The S3 URI to store.
        json_file_name: Path of the JSON file to write. Defaults to
            ``code_location.json`` in the current working directory, which
            is the name this function always used before the parameter
            was added.

    Returns:
        The path of the file that was written (``json_file_name``).
    """
    # Data to be written
    dictionary = {
        "s3_code_uri": s3_code_uri,
    }
    print("dictionary: \n", dictionary)

    # Serialize straight into the file; the explicit encoding keeps the
    # result independent of the platform's default locale.
    with open(json_file_name, "w", encoding="utf-8") as outfile:
        json.dump(dictionary, outfile, indent=4)

    return json_file_name

# Write the archive's S3 URI to the JSON file and display the returned file name.
# NOTE(review): the get_pipeline() log later in this notebook prints a
# "code_location path" of code_location.json, so the pipeline presumably reads
# this file — confirm in pipelines/ncf/pipeline.py.
store_s3_code_uri_json(s3_code_uri)

dictionary: 
 {'s3_code_uri': 's3://sagemaker-us-east-1-057716757052/code/source.tar.gz'}


'code_location.json'

# 3. Pipeline 테스트

## 3.1. 컨피그 파일에서 로딩한 설정 값 확인

In [24]:
# Show every setting as a "<name>: " heading followed by its value on the next line.
print(
    "\n".join(
        f"{label} {value!s}"
        for label, value in (
            ("s3_input_data_uri: \n", s3_input_data_uri),
            ("project_prefix: \n", project_prefix),
            ("region: \n", region),
            ("inference_image_uri: \n", inference_image_uri),
            ("role: \n", role),
            ("bucket: \n", bucket),
            ("model_package_group_name: \n", model_package_group_name),
            ("ModelApprovalStatus: \n", ModelApprovalStatus),
            ("pipeline_name: \n", pipeline_name),
            ("training_instance_type: \n", training_instance_type),
            ("training_instance_count: \n", training_instance_count),
        )
    )
)



s3_input_data_uri: 
 s3://sagemaker-us-east-1-057716757052/NCFModel/data
project_prefix: 
 SageMaker-Train-NCF
region: 
 us-east-1
inference_image_uri: 
 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.8.1-gpu-py3
role: 
 arn:aws:iam::057716757052:role/codebuild-gsmoon
bucket: 
 sagemaker-us-east-1-057716757052
model_package_group_name: 
 NCF-Model-CodePipeline
ModelApprovalStatus: 
 PendingManualApproval
pipeline_name: 
 ncf-sm-pipeline
training_instance_type: 
 ml.p3.2xlarge
training_instance_count: 
 1


In [25]:
from pipelines.ncf.pipeline import get_pipeline


# Build the pipeline object from the settings loaded earlier.
pipeline = get_pipeline(
    region=region,
    role=role,  # SAGEMAKER_PIPELINE_ROLE_ARN is passed in here.
    default_bucket=bucket,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
    project_prefix=project_prefix,
)

######### get_pipeline() input parameter ###############
### BASE_DIR: /home/ec2-user/SageMaker/Neural-Collaborative-Filtering-On-SageMaker/3_MLOps/4_sm-train-codepipeline/codecommit/pipelines/ncf
region: us-east-1
role: arn:aws:iam::057716757052:role/codebuild-gsmoon
default_bucket: sagemaker-us-east-1-057716757052
pipeline_name: ncf-sm-pipeline
role:  arn:aws:iam::057716757052:role/codebuild-gsmoon
code_location path: 
 code_location.json
##### S3 Code Location #########
s3_code_uri:  s3://sagemaker-us-east-1-057716757052/code/source.tar.gz
################################
estimator_output_path: 
 s3://sagemaker-us-east-1-057716757052/SageMaker-Train-NCF/training_jobs
repackage_lambda_script_path: 
 /home/ec2-user/SageMaker/Neural-Collaborative-Filtering-On-SageMaker/3_MLOps/4_sm-train-codepipeline/codecommit/pipelines/ncf/iam_repackage_model_artifact.py
function_name: 
 sagemaker-lambda-step-repackage-model-artifact
bucket prefix: 
 SageMaker-Train-NCF/2022-11-20-06-41-06
NCF-Model-



In [26]:
# Render the pipeline definition and parse it as JSON.
definition = json.loads(pipeline.definition())
# Uncomment the next line to display the parsed definition.
# definition

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


In [27]:
# Create or update the pipeline, then start one execution with the parameter
# values loaded from the config files.
pipeline.upsert(role_arn=role)
execution = pipeline.start(
    parameters={
        "InputData": s3_input_data_uri,
        "training_instance_type": training_instance_type,
        "training_instance_count": training_instance_count,
        "ModelApprovalStatus": ModelApprovalStatus,
        "inference_image_uri": inference_image_uri,
    },
)

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


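We start the pipeline and override its parameters with the values loaded from the config files: the input data URI, the training instance type and count, the model approval status, and the inference image URI.

Any pipeline parameter that is not passed to `pipeline.start()` keeps the default value defined in the pipeline.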

In [30]:
# Wait for the execution, then display its description; the recorded output
# shows PipelineExecutionStatus 'Succeeded'.
execution.wait()
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:057716757052:pipeline/ncf-sm-pipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:057716757052:pipeline/ncf-sm-pipeline/execution/uv9apzhex9on',
 'PipelineExecutionDisplayName': 'execution-1668926468955',
 'PipelineExecutionStatus': 'Succeeded',
 'PipelineExperimentConfig': {'ExperimentName': 'ncf-sm-pipeline',
  'TrialName': 'uv9apzhex9on'},
 'CreationTime': datetime.datetime(2022, 11, 20, 6, 41, 8, 834000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2022, 11, 20, 6, 51, 57, 353000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': '633015db-cb92-44d0-93bd-1039d914cd2a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '633015db-cb92-44d0-93bd-1039d914cd2a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '486',
   'date': 'Sun, 20 Nov 2022 06:52:25 GMT'},
  'RetryAttempts': 0}}

In [29]:
# List the steps of this execution.
# NOTE(review): the recorded output is [] and this cell's counter (In [29]) is
# lower than the wait() cell's (In [30]), so it presumably ran right after
# start(), before any step had been recorded — re-run after wait() to see the steps.
execution.list_steps()

[]