# <B> Monitor - SageMaker pipeline </B>
* Container: conda_python3

## AutoReload

In [80]:
# Reload imported modules automatically before each cell runs, so edits to the
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0. Install packages

In [48]:
# Set to True only for the very first run: the install cell further down
# upgrades the packages and then restarts the kernel. Keep it False afterwards.
install_needed = False

In [49]:
%%bash
#!/bin/bash
# Point Docker's data-root at a directory under /home/ec2-user/SageMaker and
# raise the default shared-memory size, then restart Docker.
# Nothing is changed when daemon.json already defines "data-root".

DAEMON_PATH="/etc/docker"
DAEMON_JSON="$DAEMON_PATH/daemon.json"
MEMORY_SIZE=10G

# A missing or empty daemon.json makes jq emit nothing, and the rewrite below
# would then leave Docker with an empty config file. Start from an empty object.
if [ ! -s "$DAEMON_JSON" ]; then
    sudo mkdir -p "$DAEMON_PATH"
    echo '{}' | sudo tee "$DAEMON_JSON" > /dev/null
fi

FLAG=$(jq 'has("data-root")' "$DAEMON_JSON")

if [ "$FLAG" == true ]; then
    echo "Already revised"
else
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    sudo cp "$DAEMON_JSON" "$DAEMON_JSON.bak"
    # Build the new config before touching daemon.json: piping jq straight into
    # tee would truncate the live config whenever jq fails.
    # --arg passes the size as a jq string variable instead of splicing it into
    # the jq program text.
    NEW_CONFIG=$(sudo cat "$DAEMON_JSON.bak" | jq --arg shm "$MEMORY_SIZE" '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":$shm}') || {
        echo "Failed to build the new Docker config; $DAEMON_JSON left untouched" >&2
        exit 1
    }
    echo "$NEW_CONFIG" | sudo tee "$DAEMON_JSON" > /dev/null
    sudo service docker restart
    echo "Docker Restart"
fi

Already revised


In [50]:
import sys
import IPython

# One-time setup: upgrade pip and the SageMaker-related packages, then restart
# the kernel. Skipped entirely unless install_needed is True.
if install_needed:
    print("installing deps and restarting kernel")
    # Run pip through sys.executable so the packages go into the interpreter
    # this kernel is running on.
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker

    # The True argument asks for a restart rather than a plain shutdown.
    IPython.Application.instance().kernel.do_shutdown(True)

## 1. parameter store 설정

In [81]:
import boto3
from utils.ssm import parameter_store

In [82]:
# Region of the default boto3 session; it is passed to the parameter-store
# helper here and to the Lambda helper later in the notebook.
strRegionName=boto3.Session().region_name
pm = parameter_store(strRegionName)
# Shared prefix that the other parameter keys and resource names are built from.
strPrefix = pm.get_params(key="PREFIX")

In [83]:
# Each parameter key is the shared prefix and a fixed suffix joined with "-".
strBucketName = pm.get_params(key="-".join((strPrefix, "BUCKET")))
strExecutionRole = pm.get_params(
    key="-".join((strPrefix, "SAGEMAKER-ROLE-ARN"))
)

In [84]:
# Show the resolved SageMaker execution role ARN.
print(f"strExecutionRole: {strExecutionRole}")

strExecutionRole: arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436


## 2. EventBridge - Lambda 설정

In [55]:
import boto3

### 2-1. Lambda function 생성

In [85]:
import os
import shutil
import tempfile
import zipfile

from utils.lambda_func import lambda_handler

In [86]:
# Project helper bound to the session region; the cells below use it to create
# the Lambda function and to add its invoke permission.
lam_handler = lambda_handler(region_name=strRegionName)

In [87]:
# Settings for the pipeline-monitor Lambda function.
strLambdaRoleArn = pm.get_params(
    key="-".join((strPrefix, "LAMBDA-ROLE-ARN"))
)
strLambdaFuncName = "-".join((strPrefix, "LAMBDA"))
# Directory whose contents are zipped into the deployment package.
strLambdaSrcDir = "./lambda/sources"
# Handler entry point passed to create_function.
strLambdaHandler = "pipeline_monitor_lambda.lambda_handler"

In [88]:
# Show the Lambda role and the region that will be used.
print(f"strLambdaRoleArn: {strLambdaRoleArn}")
print(f"strRegionName: {strRegionName}")

strLambdaRoleArn: arn:aws:iam::419974056037:role/DJ-SM-IMD-LabmdaRole
strRegionName: us-east-1


In [89]:
# Zip the Lambda sources, create the function from the archive, and store the
# function name in the parameter store.
with tempfile.TemporaryDirectory() as tempDirPath:

    # make_archive appends ".zip" to base_name. Passing the temporary directory
    # itself would write "<tempdir>.zip" NEXT TO the directory, where it is
    # never cleaned up. Use a base name inside the directory instead.
    lambda_archive_path = shutil.make_archive(
        base_name=os.path.join(tempDirPath, "lambda_package"),
        format="zip",
        root_dir=strLambdaSrcDir,
    )

    with open(lambda_archive_path, "rb") as f:
        zipped_code = f.read()

# The archive bytes are in memory, so the temporary directory (and the zip
# inside it) is already removed by the time the function is created.
strLambdaArn = lam_handler.create_function(
    Code=dict(ZipFile=zipped_code),
    Description='SageMaker IMD: Lambda for Automating Amazon SageMaker with Amazon EventBridge',
    Environment={
        'Variables': {
            'REGION': strRegionName,
        },
    },
    FunctionName=strLambdaFuncName,
    Handler=strLambdaHandler,
    Publish=True,
    Role=strLambdaRoleArn,
    Runtime='python3.9',
)

print(f'LambdaArn: {strLambdaArn}')
print(f'strLambdaFuncName: {strLambdaFuncName}')
# Kept as the last expression so the helper's return value is displayed.
pm.put_params(key="-".join([strPrefix, "LAMBDA-PIPELINE-MONITOR"]), value=strLambdaFuncName, overwrite=True)

== CREATE LAMBDA FUNCTION ==
  lambda function: [DJ-SM-IMD-LAMBDA] is already exist!!, so, this will be deleted and re-created.
  lambda function: [DJ-SM-IMD-LAMBDA] is deleted successfully
Argments for lambda below:

{'Architectures': ['x86_64'],
 'CodeSha256': 'hsjVPvwX9+81Cgr3rpV3wN/ssqG+kapJpzKrFESh4tk=',
 'CodeSize': 1038,
 'Description': 'SageMaker IMD: Lambda for Automating Amazon SageMaker with '
                'Amazon EventBridge',
 'Environment': {'Variables': {'REGION': 'us-east-1'}},
 'EphemeralStorage': {'Size': 512},
 'FunctionArn': 'arn:aws:lambda:us-east-1:419974056037:function:DJ-SM-IMD-LAMBDA',
 'FunctionName': 'DJ-SM-IMD-LAMBDA',
 'Handler': 'pipeline_monitor_lambda.lambda_handler',
 'LastModified': '2023-05-04T15:45:48.672+0000',
 'MemorySize': 128,
 'PackageType': 'Zip',
 'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
                                      'content-length': '1335',
                                      'content-type': 'application

'Store suceess'

### 2-2. Event Rule 생성
* [Automating Amazon SageMaker with Amazon EventBridge](https://docs.aws.amazon.com/sagemaker/latest/dg/automating-sagemaker-with-eventbridge.html#eventbridge-pipeline)
* [BOTO3 for eventbridge](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/events.html)

In [90]:
# EventBridge client ("events" is its boto3 service name). No region is passed,
# so the default session configuration applies.
client_events = boto3.client('events')

In [91]:
# Despite the "dic" prefix this is a JSON *string*; it is passed as-is to
# put_rule's EventPattern argument. It selects SageMaker pipeline step
# status-change events whose currentStepStatus is "Failed".
dicEventPattern = '''
{
    "source": ["aws.sagemaker"],
    "detail-type": ["SageMaker Model Building Pipeline Execution Step Status Change"],
    "detail": {
        "currentStepStatus": ["Failed"] 
    }
}
'''
# Role ARN handed to put_rule, read from the parameter store.
strEventBridgeRole = pm.get_params(key="-".join([strPrefix, "CODE-EVENTBRIDGE-ROLE-ARN"]))
strEventRuleName = "SAGEMAKER-PIPELINE-STEP-MONITOR"
# Store the rule name in the parameter store; as the last expression, the
# helper's return value is displayed.
pm.put_params(key="-".join([strPrefix, "EVENT-RULE-NAME"]), value=strEventRuleName, overwrite=True)

'Store suceess'

In [92]:
# Show the EventBridge role and rule name that will be used.
print(f"strEventBridgeRole: {strEventBridgeRole}")
print(f"strEventRuleName: {strEventRuleName}")

strEventBridgeRole: arn:aws:iam::419974056037:role/DJ-SM-IMD-EventBridgeRole
strEventRuleName: SAGEMAKER-PIPELINE-STEP-MONITOR


In [93]:
# Create (or update) the EventBridge rule that fires on failed pipeline steps.
rule_response = client_events.put_rule(
    Name=strEventRuleName,
    Description="Trigger when currentStepStatus is Failed",
    EventPattern=dicEventPattern,
    RoleArn=strEventBridgeRole,
    State="ENABLED",  # the other accepted value is "DISABLED"
)
# Display the response; its "RuleArn" is needed for the Lambda permission.
rule_response

{'RuleArn': 'arn:aws:events:us-east-1:419974056037:rule/SAGEMAKER-PIPELINE-STEP-MONITOR',
 'ResponseMetadata': {'RequestId': '6a5ff98a-f2b9-4dee-8086-e43f40aeb72c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6a5ff98a-f2b9-4dee-8086-e43f40aeb72c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Thu, 04 May 2023 15:45:59 GMT'},
  'RetryAttempts': 0}}

### 2-3. target 설정

In [94]:
# Attach the Lambda function as the target of the event rule.
target_response = client_events.put_targets(
    Rule=strEventRuleName,
    Targets=[
        {
            'Id': strLambdaFuncName,
            'Arn': strLambdaArn
        }
    ]
)

# put_targets reports per-target failures in the response body instead of
# raising, so fail loudly here rather than leave a rule with no working target.
if target_response["FailedEntryCount"] > 0:
    raise RuntimeError(
        f"put_targets failed for rule {strEventRuleName}: "
        f"{target_response['FailedEntries']}"
    )

In [95]:
# Display the put_targets response; FailedEntryCount should be 0.
target_response

{'FailedEntryCount': 0,
 'FailedEntries': [],
 'ResponseMetadata': {'RequestId': '28a2e133-7f08-4d0f-890b-448b035ee57f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '28a2e133-7f08-4d0f-890b-448b035ee57f',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '41',
   'date': 'Thu, 04 May 2023 15:46:01 GMT'},
  'RetryAttempts': 0}}

In [96]:
# Let the event rule invoke the Lambda function, using the rule ARN returned by
# put_rule as the source.
# NOTE(review): add_permission is a project helper; it presumably wraps the
# Lambda AddPermission API - confirm in utils.lambda_func.
add_lambda_permission = lam_handler.add_permission(
    strLambdaArn=strLambdaArn,
    strLambdaFuncName=strLambdaFuncName,
    SourceArn=rule_response["RuleArn"]
)
# Display the response, which includes the policy statement that was added.
add_lambda_permission

{'ResponseMetadata': {'RequestId': 'dae2dc9c-bc67-4909-9b0e-3c6fb5ae2f1b',
  'HTTPStatusCode': 201,
  'HTTPHeaders': {'date': 'Thu, 04 May 2023 15:46:03 GMT',
   'content-type': 'application/json',
   'content-length': '370',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'dae2dc9c-bc67-4909-9b0e-3c6fb5ae2f1b'},
  'RetryAttempts': 0},
 'Statement': '{"Sid":"11956DJ-SM-IMD-LAMBDA","Effect":"Allow","Principal":{"Service":"events.amazonaws.com"},"Action":"lambda:InvokeFunction","Resource":"arn:aws:lambda:us-east-1:419974056037:function:DJ-SM-IMD-LAMBDA","Condition":{"ArnLike":{"AWS:SourceArn":"arn:aws:events:us-east-1:419974056037:rule/SAGEMAKER-PIPELINE-STEP-MONITOR"}}}'}

In [68]:
# Sample "SageMaker Model Building Pipeline Execution Step Status Change" event
# with currentStepStatus "Failed"; the cells below read fields from its
# "detail" section.
aa = {'version': '0', 'id': '317a4633-524f-2ac3-2cbb-f038c637920e', 'detail-type': 'SageMaker Model Building Pipeline Execution Step Status Change', 'source': 'aws.sagemaker', 'account': '419974056037', 'time': '2023-05-04T15:31:33Z', 'region': 'us-east-1', 'resources': ['arn:aws:sagemaker:us-east-1:419974056037:pipeline/dj-sm-imd-pipeline', 'arn:aws:sagemaker:us-east-1:419974056037:pipeline/dj-sm-imd-pipeline/execution/x1xt2felyccl'], 'detail': {'failureReason': 'ClientError: Failed to invoke sagemaker:CreateTrainingJob. Error Details: No S3 objects found under S3 URL "s3://sagemaker-us-east-1-419974056037/dataset/train_.csv" given in input data source. Please ensure that the bucket exists in the selected region (us-east-1), that objects exist under that S3 prefix, and that the role "arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436" has "s3:ListBucket" permissions on bucket "sagemaker-us-east-1-419974056037".', 'metadata': {}, 'stepStartTime': '2023-05-04T15:31:32Z', 'stepEndTime': '2023-05-04T15:31:33Z', 'stepName': 'TrainingProcess', 'stepType': 'Training', 'previousStepStatus': 'Starting', 'currentStepStatus': 'Failed', 'pipelineArn': 'arn:aws:sagemaker:us-east-1:419974056037:pipeline/dj-sm-imd-pipeline', 'pipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:419974056037:pipeline/dj-sm-imd-pipeline/execution/x1xt2felyccl'}}

In [97]:
# Pull the fields of interest out of the sample event's "detail" section.
(
    strPipelineArn,
    strStepName,
    strCurrentStepStatus,
    strFailReasion,
    strEndTime,
) = (
    aa["detail"][key]
    for key in (
        "pipelineArn",
        "stepName",
        "currentStepStatus",
        "failureReason",
        "stepEndTime",
    )
)

In [69]:
import pprint

In [71]:
# Pretty-print the sample event so its nested structure is readable.
pprint.pprint(aa)

{'account': '419974056037',
 'detail': {'currentStepStatus': 'Failed',
            'failureReason': 'ClientError: Failed to invoke '
                             'sagemaker:CreateTrainingJob. Error Details: No '
                             'S3 objects found under S3 URL '
                             '"s3://sagemaker-us-east-1-419974056037/dataset/train_.csv" '
                             'given in input data source. Please ensure that '
                             'the bucket exists in the selected region '
                             '(us-east-1), that objects exist under that S3 '
                             'prefix, and that the role '
                             '"arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436" '
                             'has "s3:ListBucket" permissions on bucket '
                             '"sagemaker-us-east-1-419974056037".',
            'metadata': {},
            'pipelineArn': 'arn:aws:sagemaker:us-east