# <B> Monitor - SageMaker pipeline </B>
* Container: conda_python3

## AutoReload

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local `utils` helpers used below) are picked up without a restart.
%load_ext autoreload
%autoreload 2

## 0. Install packages

In [2]:
# Gate for the dependency-install cell below: when True that cell upgrades
# packages and restarts the kernel, so switch it on for one run only.
install_needed = False  # should only be True once

In [3]:
%%bash
#!/bin/bash
# Add "data-root" and "default-shm-size" to Docker's daemon.json (once) and
# restart Docker. The presence of "data-root" is used as the "already done" marker.

DAEMON_PATH="/etc/docker"
DAEMON_JSON="$DAEMON_PATH/daemon.json"
MEMORY_SIZE=10G

# Stop early if the config file is missing: otherwise the else-branch below
# would pipe empty jq output into daemon.json and then restart Docker.
if [ ! -f "$DAEMON_JSON" ]; then
    echo "ERROR: $DAEMON_JSON not found; nothing changed" >&2
    exit 1
fi

# jq prints "true" when the key already exists.
FLAG=$(jq 'has("data-root")' "$DAEMON_JSON")
# echo $FLAG

if [ "$FLAG" == true ]; then
    echo "Already revised"
else
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    # Keep a backup, then rewrite the config from that backup.
    sudo cp "$DAEMON_JSON" "$DAEMON_JSON.bak"
    # --arg passes the size as a JSON string value instead of splicing the
    # shell variable into the jq program text.
    sudo cat "$DAEMON_JSON.bak" | jq --arg shm "$MEMORY_SIZE" '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":$shm}' | sudo tee "$DAEMON_JSON" > /dev/null
    sudo service docker restart
    echo "Docker Restart"
fi

Already revised


In [4]:
import sys
import IPython

# Optional one-time setup, controlled by `install_needed`.
# NOTE(review): the packages are upgraded to whatever is latest (no version pins).
if install_needed:
    print("installing deps and restarting kernel")
    # Run pip through the kernel's own interpreter so packages land in this environment.
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker

    # Restart the kernel (restart=True); every cell has to be re-run afterwards.
    IPython.Application.instance().kernel.do_shutdown(True)

## 1. parameter store 설정

In [8]:
import boto3
from utils.ssm import parameter_store

In [9]:
# Region of the default boto3 session; reused for the helper objects below.
strRegionName=boto3.Session().region_name
# Project helper from utils.ssm (presumably backed by SSM Parameter Store — confirm in utils/ssm.py).
pm = parameter_store(strRegionName)
# Shared name prefix: the other parameter keys are built as "<prefix>-<SUFFIX>".
strPrefix = pm.get_params(key="PREFIX")

In [10]:
# Values stored under "<prefix>-BUCKET" and "<prefix>-SAGEMAKER-ROLE-ARN".
# NOTE(review): strBucketName is not referenced again in the visible cells.
strBucketName = pm.get_params(key="-".join([strPrefix, "BUCKET"]))
strExecutionRole = pm.get_params(key="-".join([strPrefix, "SAGEMAKER-ROLE-ARN"]))

In [11]:
# Sanity check of the looked-up role ARN.
# NOTE(review): the saved output contains the AWS account ID; clear outputs before sharing.
print (f'strExecutionRole: {strExecutionRole}')

strExecutionRole: arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436


## 2. EventBridge - Lambda 설정

In [12]:
import boto3

### 2-1. Lambda function 생성

In [13]:
import shutil
import zipfile
import tempfile
from utils.lambda_func import lambda_handler

In [14]:
# Project helper from utils.lambda_func used below to create the function and add
# permissions. Not to be confused with the `lambda_handler` *function* that the
# %%writefile cell writes into pipeline_monitor_lambda.py (that file is not executed here).
lam_handler = lambda_handler(region_name=strRegionName)

In [15]:
%%writefile ./monitor/lambda/pipeline_monitor_lambda.py

import json
from pprint import pprint

def lambda_handler(event, context):
    """Log the key fields of a SageMaker pipeline step status-change event.

    Args:
        event: Event dict delivered by EventBridge. The fields of interest are
            read from ``event["detail"]``.
        context: Lambda context object (unused).

    Returns:
        dict: ``{'statusCode': 200, 'body': <JSON string>}`` placeholder response.
    """

    pprint (event)
    print ("==")

    # Fields such as failureReason / stepEndTime / metadata are not present on
    # every event, so use tolerant lookups: a missing field is logged as None
    # instead of aborting the invocation with a KeyError.
    dicDetail = event.get("detail") or {}

    strPipelineArn = dicDetail.get("pipelineArn")
    strStepName = dicDetail.get("stepName")
    strCurrentStepStatus = dicDetail.get("currentStepStatus")
    strFailReasion = dicDetail.get("failureReason")
    strEndTime = dicDetail.get("stepEndTime")
    strMetaData = str(dicDetail.get("metadata", {}))

    print (f'strPipelineArn: {strPipelineArn}')
    print (f'strStepName: {strStepName}')
    print (f'strMetaData: {strMetaData}')
    print (f'strCurrentStepStatus: {strCurrentStepStatus}')
    print (f'strFailReasion: {strFailReasion}')
    print (f'strEndTime: {strEndTime}')

    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }


Overwriting ./monitor/lambda/pipeline_monitor_lambda.py


In [16]:
# Settings for the Lambda created below.
# Role ARN stored under "<prefix>-LAMBDA-ROLE-ARN".
strLambdaRoleArn = pm.get_params(key="-".join([strPrefix, "LAMBDA-ROLE-ARN"]))
strLambdaFuncName = "-".join([strPrefix, "LAMBDA"])
# Directory that the %%writefile cell above writes pipeline_monitor_lambda.py into.
strLambdaSrcDir = "./monitor/lambda"
# "<module>.<function>" of the handler inside that file.
strLambdaHandler = "pipeline_monitor_lambda.lambda_handler" 

In [17]:
# Sanity check of the values used to create the Lambda.
# NOTE(review): the saved output contains the AWS account ID.
print (f'strLambdaRoleArn: {strLambdaRoleArn}')
print (f'strRegionName: {strRegionName}')

strLambdaRoleArn: arn:aws:iam::419974056037:role/DJ-SM-PIPELINE-LabmdaRole
strRegionName: us-east-1


In [18]:
import os

# Zip the Lambda source directory and create the function from the archive bytes.
with tempfile.TemporaryDirectory() as tempDirPath:

    # base_name is the archive path without its extension. Passing the temp
    # directory path itself made shutil write "<tempdir>.zip" NEXT TO the
    # directory, so the archive survived the cleanup. Build it inside instead.
    lambda_archive_path = shutil.make_archive(
        base_name=os.path.join(tempDirPath, "lambda_package"),
        format="zip",
        root_dir=strLambdaSrcDir,
    )

    # Read the archive into memory before the temporary directory is removed.
    with open(lambda_archive_path, 'rb') as f:
        zipped_code = f.read()

    # Project helper; the recorded output shows it deleting and re-creating an
    # existing function of the same name.
    strLambdaArn = lam_handler.create_function(
        Code=dict(ZipFile=zipped_code),
        Description='SageMaker IMD: Lambda for Automating Amazon SageMaker with Amazon EventBridge',
        Environment={
           'Variables': {
               'REGION':strRegionName,
           },
        },
        FunctionName=strLambdaFuncName,
        Handler=strLambdaHandler,
        Publish=True,
        Role=strLambdaRoleArn,
        Runtime='python3.9',
    )

print (f'LambdaArn: {strLambdaArn}')
print (f'strLambdaFuncName: {strLambdaFuncName}')
# Record the function name under "<prefix>-LAMBDA-PIPELINE-MONITOR".
pm.put_params(key="-".join([strPrefix, "LAMBDA-PIPELINE-MONITOR"]), value=strLambdaFuncName, overwrite=True)

== CREATE LAMBDA FUNCTION ==
  lambda function: [DJ-SM-PIPELINE-LAMBDA] is already exist!!, so, this will be deleted and re-created.
  lambda function: [DJ-SM-PIPELINE-LAMBDA] is deleted successfully
Argments for lambda below:

{'Architectures': ['x86_64'],
 'CodeSha256': 'HR4Sss2N7/J25ATBRvBoIyMNMb0qn8Pprys1ovVGT8k=',
 'CodeSize': 1099,
 'Description': 'SageMaker IMD: Lambda for Automating Amazon SageMaker with '
                'Amazon EventBridge',
 'Environment': {'Variables': {'REGION': 'us-east-1'}},
 'EphemeralStorage': {'Size': 512},
 'FunctionArn': 'arn:aws:lambda:us-east-1:419974056037:function:DJ-SM-PIPELINE-LAMBDA',
 'FunctionName': 'DJ-SM-PIPELINE-LAMBDA',
 'Handler': 'pipeline_monitor_lambda.lambda_handler',
 'LastModified': '2023-05-14T00:52:38.669+0000',
 'MemorySize': 128,
 'PackageType': 'Zip',
 'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
                                      'content-length': '1350',
                                      'content

'Store suceess'

### 2-2. Event Rule 생성
* [Automating Amazon SageMaker with Amazon EventBridge](https://docs.aws.amazon.com/sagemaker/latest/dg/automating-sagemaker-with-eventbridge.html#eventbridge-pipeline)
* [BOTO3 for eventbridge](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/events.html)

In [19]:
# EventBridge client; no region is passed, so boto3's default session configuration applies.
client_events = boto3.client('events')

In [20]:
# Event pattern matching SageMaker pipeline step status changes whose
# currentStepStatus is "Failed". Despite the `dic` prefix this is a JSON *string*.
dicEventPattern = '''
{
    "source": ["aws.sagemaker"],
    "detail-type": ["SageMaker Model Building Pipeline Execution Step Status Change"],
    "detail": {
        "currentStepStatus": ["Failed"] 
    }
}
'''
# Role ARN stored under "<prefix>-CODE-EVENTBRIDGE-ROLE-ARN".
strEventBridgeRole = pm.get_params(key="-".join([strPrefix, "CODE-EVENTBRIDGE-ROLE-ARN"]))
strEventRuleName = "SAGEMAKER-PIPELINE-STEP-MONITOR"
# Record the rule name under "<prefix>-EVENT-RULE-NAME".
pm.put_params(key="-".join([strPrefix, "EVENT-RULE-NAME"]), value=strEventRuleName, overwrite=True)

'Store suceess'

In [21]:
# Sanity check of the rule settings.
# NOTE(review): the saved output contains the AWS account ID.
print (f'strEventBridgeRole: {strEventBridgeRole}')
print (f'strEventRuleName: {strEventRuleName}')

strEventBridgeRole: arn:aws:iam::419974056037:role/DJ-SM-PIPELINE-EventBridgeRole
strEventRuleName: SAGEMAKER-PIPELINE-STEP-MONITOR


In [22]:
# Create (or update) the EventBridge rule from the event pattern defined above.
rule_response = client_events.put_rule(
    Name=strEventRuleName,
    # No ScheduleExpression: the rule is driven by the event pattern only.
    EventPattern=dicEventPattern,
    State="ENABLED", # accepted values: 'ENABLED' | 'DISABLED'
    Description="Trigger when currentStepStatus is Failed",
    RoleArn=strEventBridgeRole
)
# Display the response; its "RuleArn" is used later when granting invoke permission.
rule_response

{'RuleArn': 'arn:aws:events:us-east-1:419974056037:rule/SAGEMAKER-PIPELINE-STEP-MONITOR',
 'ResponseMetadata': {'RequestId': 'fd72e31d-a1f0-431d-b01f-03831c8a8672',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fd72e31d-a1f0-431d-b01f-03831c8a8672',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Sun, 14 May 2023 00:52:45 GMT'},
  'RetryAttempts': 0}}

### 2-3. target 설정

In [23]:
# Attach the Lambda function as the target of the rule; the function name doubles as the target Id.
target_response = client_events.put_targets(
    Rule=strEventRuleName,
    Targets=[
        {
            'Id': strLambdaFuncName,
            'Arn': strLambdaArn
        }
    ]
)

In [24]:
# Display the put_targets response (the recorded run shows 'FailedEntryCount': 0).
target_response

{'FailedEntryCount': 0,
 'FailedEntries': [],
 'ResponseMetadata': {'RequestId': '52b408c4-40c5-4ab5-b547-cae5fc47193a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '52b408c4-40c5-4ab5-b547-cae5fc47193a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '41',
   'date': 'Sun, 14 May 2023 00:52:47 GMT'},
  'RetryAttempts': 0}}

In [25]:
# Allow the rule to invoke the Lambda: the recorded policy statement grants
# lambda:InvokeFunction to events.amazonaws.com, restricted to this rule's ARN.
add_lambda_permission = lam_handler.add_permission(
    strLambdaArn=strLambdaArn,
    strLambdaFuncName=strLambdaFuncName,
    SourceArn=rule_response["RuleArn"]
)
add_lambda_permission

{'ResponseMetadata': {'RequestId': 'e926ba44-d9a1-43c4-ac7c-d11a502eb65d',
  'HTTPStatusCode': 201,
  'HTTPHeaders': {'date': 'Sun, 14 May 2023 00:52:49 GMT',
   'content-type': 'application/json',
   'content-length': '380',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'e926ba44-d9a1-43c4-ac7c-d11a502eb65d'},
  'RetryAttempts': 0},
 'Statement': '{"Sid":"48344DJ-SM-PIPELINE-LAMBDA","Effect":"Allow","Principal":{"Service":"events.amazonaws.com"},"Action":"lambda:InvokeFunction","Resource":"arn:aws:lambda:us-east-1:419974056037:function:DJ-SM-PIPELINE-LAMBDA","Condition":{"ArnLike":{"AWS:SourceArn":"arn:aws:events:us-east-1:419974056037:rule/SAGEMAKER-PIPELINE-STEP-MONITOR"}}}'}

In [68]:
# Sample "Step Status Change" event (currentStepStatus 'Failed') pasted as a local
# fixture for trying out the field extraction below.
# NOTE(review): scratch cell with an out-of-order execution count; it embeds a real account ID.
aa = {'version': '0', 'id': '317a4633-524f-2ac3-2cbb-f038c637920e', 'detail-type': 'SageMaker Model Building Pipeline Execution Step Status Change', 'source': 'aws.sagemaker', 'account': '419974056037', 'time': '2023-05-04T15:31:33Z', 'region': 'us-east-1', 'resources': ['arn:aws:sagemaker:us-east-1:419974056037:pipeline/dj-sm-imd-pipeline', 'arn:aws:sagemaker:us-east-1:419974056037:pipeline/dj-sm-imd-pipeline/execution/x1xt2felyccl'], 'detail': {'failureReason': 'ClientError: Failed to invoke sagemaker:CreateTrainingJob. Error Details: No S3 objects found under S3 URL "s3://sagemaker-us-east-1-419974056037/dataset/train_.csv" given in input data source. Please ensure that the bucket exists in the selected region (us-east-1), that objects exist under that S3 prefix, and that the role "arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436" has "s3:ListBucket" permissions on bucket "sagemaker-us-east-1-419974056037".', 'metadata': {}, 'stepStartTime': '2023-05-04T15:31:32Z', 'stepEndTime': '2023-05-04T15:31:33Z', 'stepName': 'TrainingProcess', 'stepType': 'Training', 'previousStepStatus': 'Starting', 'currentStepStatus': 'Failed', 'pipelineArn': 'arn:aws:sagemaker:us-east-1:419974056037:pipeline/dj-sm-imd-pipeline', 'pipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:419974056037:pipeline/dj-sm-imd-pipeline/execution/x1xt2felyccl'}}

In [97]:
# Same field extraction as in the Lambda handler, applied to the sample event `aa`.
# NOTE(review): scratch cell; execution count is out of order relative to its neighbours.
strPipelineArn = aa["detail"]["pipelineArn"]
strStepName = aa["detail"]["stepName"]
strCurrentStepStatus = aa["detail"]["currentStepStatus"]
strFailReasion = aa["detail"]["failureReason"]
strEndTime = aa["detail"]["stepEndTime"]

In [69]:
import pprint

In [71]:
# Pretty-print the sample event. Relies on `pprint` being the module here; a later
# cell rebinds the name with `from pprint import pprint`, so re-running this cell
# after that one will fail.
pprint.pprint(aa)

{'account': '419974056037',
 'detail': {'currentStepStatus': 'Failed',
            'failureReason': 'ClientError: Failed to invoke '
                             'sagemaker:CreateTrainingJob. Error Details: No '
                             'S3 objects found under S3 URL '
                             '"s3://sagemaker-us-east-1-419974056037/dataset/train_.csv" '
                             'given in input data source. Please ensure that '
                             'the bucket exists in the selected region '
                             '(us-east-1), that objects exist under that S3 '
                             'prefix, and that the role '
                             '"arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436" '
                             'has "s3:ListBucket" permissions on bucket '
                             '"sagemaker-us-east-1-419974056037".',
            'metadata': {},
            'pipelineArn': 'arn:aws:sagemaker:us-east

In [3]:
# Example metadata payload naming a processing job ARN — presumably what a step's
# "metadata" field looks like (TODO confirm). Not referenced in the visible cells.
meta = {'processingJob': {'arn': 'arn:aws:sagemaker:us-east-1:419974056037:processing-job/pipelines-a1bzloczzdgz-PreprocessingProcess-Z7ov8YlR7l'}}

In [1]:
# Pasted processing-job description (resources, app specification, role, inputs,
# output config) that the following cells explore.
# NOTE(review): scratch data with real account/bucket identifiers.
res = {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.m5.xlarge', 'InstanceCount': 1, 'VolumeSizeInGB': 30}}, 'AppSpecification': {'ImageUri': '419974056037.dkr.ecr.us-east-1.amazonaws.com/mlops-image-prep:latest', 'ContainerArguments': ['--prefix_prep', '/opt/ml/processing/', '--region', 'us-east-1'], 'ContainerEntrypoint': ['/bin/bash', '/opt/ml/processing/input/entrypoint/runproc.sh']}, 'RoleArn': 'arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436', 'ProcessingInputs': [{'InputName': 'input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/DJ-SM-PIPELINE-DATA', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/preprocessing/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/preprocessing/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}], 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'train-data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/DJ-SM-PIPELINE-MODEL-1/preprocessing/output/train-data', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation-data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/DJ-SM-PIPELINE-MODEL-1/preprocessing/output/validation-data', 'LocalPath': '/opt/ml/processing/output/validation', 'S3UploadMode': 
'EndOfJob'}}, {'OutputName': 'test-data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/DJ-SM-PIPELINE-MODEL-1/preprocessing/output/test-data', 'LocalPath': '/opt/ml/processing/output/test', 'S3UploadMode': 'EndOfJob'}}]}}

In [3]:
# List the top-level sections of the job description.
res.keys()

dict_keys(['ProcessingResources', 'AppSpecification', 'RoleArn', 'ProcessingInputs', 'ProcessingOutputConfig'])

In [7]:
# Show the configured outputs (train / validation / test in the recorded run).
res["ProcessingOutputConfig"]["Outputs"]

[{'OutputName': 'train-data',
  'AppManaged': False,
  'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/DJ-SM-PIPELINE-MODEL-1/preprocessing/output/train-data',
   'LocalPath': '/opt/ml/processing/output/train',
   'S3UploadMode': 'EndOfJob'}},
 {'OutputName': 'validation-data',
  'AppManaged': False,
  'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/DJ-SM-PIPELINE-MODEL-1/preprocessing/output/validation-data',
   'LocalPath': '/opt/ml/processing/output/validation',
   'S3UploadMode': 'EndOfJob'}},
 {'OutputName': 'test-data',
  'AppManaged': False,
  'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/DJ-SM-PIPELINE-MODEL-1/preprocessing/output/test-data',
   'LocalPath': '/opt/ml/processing/output/test',
   'S3UploadMode': 'EndOfJob'}}]

In [24]:
from pprint import pprint
# Walk the top-level entries of `res`: print a separator and the key for each
# one, and for list-valued entries also print every element with its type.
for key, value in res.items():
    print ("===========================", f'key: {key}', sep="\n")
    if type(value) is not list:
        continue
    for elem in value:
        print (type(elem), elem)
    

key: ProcessingResources
key: AppSpecification
key: RoleArn
key: ProcessingInputs
<class 'dict'> {'InputName': 'input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/DJ-SM-PIPELINE-DATA', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}
<class 'dict'> {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/preprocessing/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}
<class 'dict'> {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-419974056037/preprocessing/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType'