# Import Libraries

In [1]:
import json
import os
import shutil
import tempfile
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput, CreateModelInput
from sagemaker.lineage.visualizer import LineageTableVisualizer
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from sagemaker.processing import ProcessingOutput, ProcessingInput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import TrainingStep, CreateModelStep, ProcessingStep, TransformStep

# Set up region, session, and role

In [70]:
# Resolve the AWS region once. AWS_REGION is preferred (the original behaviour);
# fall back to AWS_DEFAULT_REGION and then to the default boto3 configuration so
# the notebook does not die with a bare KeyError when AWS_REGION is unset.
region = (
    os.environ.get("AWS_REGION")
    or os.environ.get("AWS_DEFAULT_REGION")
    or boto3.Session().region_name
)
if not region:
    raise RuntimeError(
        "AWS region is not configured; set the AWS_REGION environment variable."
    )

# Every client below is created from this one session so that they all share
# the same region (several were previously created without an explicit region).
boto_session = boto3.Session(region_name=region)

account_id = boto_session.client("sts").get_caller_identity()["Account"]

s3_client = boto_session.client("s3")
sm_client = boto_session.client("sagemaker")

# EventBridge, Lambda and EventBridge Scheduler clients
event_bridge_client = boto_session.client("events")
lambda_client = boto_session.client("lambda")
event_bridge_scheduler = boto_session.client("scheduler")

# SNS and CloudWatch clients
sns_client = boto_session.client("sns")
cloudwatch = boto_session.client("cloudwatch")

# SageMaker SDK session bound to the same boto3 session and SageMaker client.
sagemaker_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=sm_client,
)

# IAM role used for the training, processing, model and pipeline resources below.
sm_role = get_execution_role(sagemaker_session=sagemaker_session)

# Initialize variables

In [3]:
# --- S3 layout -------------------------------------------------------------
# Bucket holding the training data, pipeline artifacts and uploaded scripts.
bucket = "artwork-content-trial-bucket"
# Base name used for the deployment processing job.
prefix = "mch-artwork-content"
# Folder containing the training CSVs, and the file-name prefix of those CSVs.
train_data_dir_prefix = "data"
training_input_prefix = "ovr_content_data"
# Folder under which the estimator writes its output.
pipeline_dir_prefix = "pipeline-data"

# --- Pipeline parameters ---------------------------------------------------
# Approval status given to the registered model package; overridable per execution.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="Approved",
)


# Get latest ovr data from S3

In [16]:
def get_latest_ovr_content_data(s3_client, bucket, train_data_dir_prefix, training_input_prefix):
    """Return the key of the most recently modified OVR data object in S3.

    Lists every object in ``bucket`` whose key starts with
    ``{train_data_dir_prefix}/{training_input_prefix}`` (following pagination, so
    more than one page of results is handled) and picks the object with the
    newest ``LastModified`` timestamp.

    Args:
        s3_client: boto3 S3 client (any object exposing ``get_paginator``).
        bucket: Name of the bucket to search.
        train_data_dir_prefix: Folder (key prefix) that holds the training data.
        training_input_prefix: File-name prefix of the data files in that folder.

    Returns:
        str: Key of the latest modified object.

    Raises:
        RuntimeError: If listing the bucket fails; the original error is chained.
        FileNotFoundError: If no object matches the prefix.
    """
    content_data_path = f"{train_data_dir_prefix}/{training_input_prefix}"

    try:
        paginator = s3_client.get_paginator("list_objects_v2")
        objects = [
            obj
            for page in paginator.paginate(Bucket=bucket, Prefix=content_data_path)
            # Pages without matches carry no 'Contents' key.
            for obj in page.get("Contents", [])
        ]
    except Exception as e:
        # Surface the real failure instead of an unrelated UnboundLocalError later on.
        raise RuntimeError(
            f"Exception occurred while fetching latest ovr data. Error - {e}"
        ) from e

    if not objects:
        raise FileNotFoundError(
            f"There seems to be no data in the {train_data_dir_prefix}/ folder of {bucket}"
        )

    # max() returns the first object holding the newest timestamp, which is the
    # same element a stable descending sort would place first.
    latest_modified_object = max(objects, key=lambda obj: obj["LastModified"])
    latest_modified_content_data_key = latest_modified_object["Key"]
    latest_modified_date = latest_modified_object["LastModified"]

    print("The latest modified object is:", latest_modified_content_data_key)
    print("The last modified date is:", latest_modified_date)

    return latest_modified_content_data_key


In [None]:
# Check if the data bucket is empty
# result = s3_client.list_objects(Bucket=bucket, Prefix=content_data_path)

# if 'Contents' in result.keys():
#     print('Yes')
# else:
#     print('No data')

In [None]:
# content_data_path = train_data_dir_prefix+'/'+training_input_prefix
# s3_data_result = sagemaker_session.list_s3_files(bucket=bucket, key_prefix=content_data_path)

# s3_data_result

In [17]:
# Key of the newest OVR data file; it becomes the training input further down.
current_ovr_data = get_latest_ovr_content_data(
    s3_client=s3_client,
    bucket=bucket,
    train_data_dir_prefix=train_data_dir_prefix,
    training_input_prefix=training_input_prefix,
)

current_ovr_data

The latest modified object is: data/ovr_content_data_v3.csv
The last modified date is: 2023-02-09 13:23:17+00:00


'data/ovr_content_data_v3.csv'

# Creating Estimator for training step

In [18]:
# Training data: the latest OVR data file found in the bucket.
training_input = f"s3://{bucket}/{current_ovr_data}"

# Location under which the estimator writes its output.
estimator_output_path = f"s3://{bucket}/{pipeline_dir_prefix}/"

# Custom training image in ECR. NOTE(review): the account id and region are
# hard-coded and the ':latest' tag is mutable, so runs are not pinned to a
# specific image build.
train_image_uri = "791574662255.dkr.ecr.us-east-1.amazonaws.com/artwork-content-train-repo:latest"

In [19]:
# Estimator that runs the custom training image on a single ml.m5.xlarge
# instance and writes its output under estimator_output_path.
content_estimator = Estimator(
    image_uri=train_image_uri,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    output_path=estimator_output_path,
    role=sm_role,
    # Use the notebook's own session, as the Model and SKLearnProcessor do,
    # rather than letting the SDK build a separate default session.
    sagemaker_session=sagemaker_session,
)

# Training Step

In [20]:
# Pipeline step that runs content_estimator against the S3 URI held in
# training_input. Later steps read this step's model artifact location via
# step_train.properties.ModelArtifacts.S3ModelArtifacts.
step_train = TrainingStep(
    name="TrainingStep",
    estimator=content_estimator,
    inputs=training_input,
)

# Model creation step

In [21]:
mdl_name = "artwork-content-model"

# Model backed by the training step's artifacts and served from the same image
# the estimator trains with.
model = Model(
    name=mdl_name,
    role=sm_role,
    sagemaker_session=sagemaker_session,
    image_uri=content_estimator.training_image_uri(),
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
)

inputs = CreateModelInput(instance_type="ml.m5.xlarge")

# NOTE(review): step_model_create is defined here but is not part of the
# pipeline_steps list passed to the Pipeline later in this notebook.
step_model_create = CreateModelStep(
    name="CreateModelStep",
    model=model,
    inputs=inputs,
)

# Registering model to Model Registry

In [22]:
# Model package group that collects the registered model versions.
mpg_name = "MCH-Content-Models"

# Register the trained artifacts in the Model Registry. The approval status
# comes from the ModelApprovalStatus pipeline parameter.
step_model_registration = RegisterModel(
    name="RegisterModelStep",
    model_package_group_name=mpg_name,
    approval_status=model_approval_status,
    estimator=content_estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["application/json"],
    response_types=["application/json"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
)

# Pushing files to S3 bucket

In [None]:
# Local paths of the scripts to publish; each is uploaded to
# s3://<bucket>/<primary_prefix>/<local path>.
deploy_file_path = "deploy.py"
inference_file_path = "scoring/inference.py"
inference_prefix = "inference"
primary_prefix = "code"

# One upload call per script; the paths are already strings, so no f-string
# wrapper is needed around them.
for local_path in (deploy_file_path, inference_file_path):
    s3_client.upload_file(
        Filename=local_path,
        Bucket=bucket,
        Key=f"{primary_prefix}/{local_path}",
    )

# Deploying models using inference script as an entry point via ProcessingStep

In [23]:
# S3 location of the deployment script uploaded under the code prefix.
deploy_script_uri = f"s3://{bucket}/{primary_prefix}/{deploy_file_path}"
endpoint_name = "mch-artwork-content-ep-2"

# Processor that runs the deployment script as a processing job.
deployment_processor = SKLearnProcessor(
    framework_version="1.0-1",
    role=sm_role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name=f"{prefix}-deploy",
    sagemaker_session=sagemaker_session,
)

# The inference script is made available inside the processing container.
inference_script_input = ProcessingInput(
    source=f"s3://{bucket}/{primary_prefix}/{inference_file_path}",
    destination="/opt/ml/processing/input",
)

endpoint_arn_output = ProcessingOutput(
    output_name="endpoint_arn",
    source="/opt/ml/processing/endpoint_arn",
)

# deploy.py itself is not shown in this notebook; it receives the values below
# as command-line arguments. Referencing step_train's properties makes this
# step depend on the training step.
step_deploy_model = ProcessingStep(
    name="DeployContentModel",
    processor=deployment_processor,
    code=deploy_script_uri,
    inputs=[inference_script_input],
    outputs=[endpoint_arn_output],
    job_arguments=[
        "--model_data",
        step_train.properties.ModelArtifacts.S3ModelArtifacts,
        "--inference_prefix",
        inference_prefix,
        "--sm_role",
        sm_role,
        "--endpoint_name",
        endpoint_name,
    ],
)

# Creating pipeline object and passing all steps

In [24]:
pipeline_name = "artwork-content-pipeline-demo"

# Steps that make up the pipeline: train, register, deploy.
pipeline_steps = [step_train, step_model_registration, step_deploy_model]

pipeline = Pipeline(
    name=pipeline_name,
    steps=pipeline_steps,
    parameters=[model_approval_status],
    sagemaker_session=sagemaker_session,
)

In [25]:
# Create the pipeline in SageMaker, or update its definition if it already
# exists; the response (including the PipelineArn) is displayed as the cell output.
pipeline.upsert(role_arn=sm_role)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:791574662255:pipeline/artwork-content-pipeline-demo',
 'ResponseMetadata': {'RequestId': '7fade34e-0f0d-48a6-9a8f-25bdf9e6074d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7fade34e-0f0d-48a6-9a8f-25bdf9e6074d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '97',
   'date': 'Thu, 09 Feb 2023 13:38:40 GMT'},
  'RetryAttempts': 0}}

# Start the pipeline

In [None]:
# Start a pipeline execution, wait for it to finish, then display its description.
execution = pipeline.start()
execution.wait()
execution.describe()

In [None]:
# pipeline.delete()

In [None]:
# Show the lineage table for every step of the pipeline execution, walking the
# step list in reverse of the order returned by list_steps().
viz = LineageTableVisualizer(sagemaker_session)
for execution_step in reversed(execution.list_steps()):
    print(execution_step)
    display(viz.show(pipeline_execution_step=execution_step))
    # Pause between lookups; presumably to space out the lineage API calls - TODO confirm.
    time.sleep(5)

# =========================================

In [None]:
# Upload a small sample of OVR data into the training data folder of the bucket.
# S3Uploader.upload returns only after the transfer has completed, so no sleep
# is needed afterwards; the returned S3 URI is displayed as the cell output.
uploaded_data_uri = sagemaker.s3.S3Uploader.upload(
    "./data/ovr_content_data_v3.csv",
    f"s3://{bucket}/{train_data_dir_prefix}",
    # Reuse the notebook's session instead of having the SDK create a default one.
    sagemaker_session=sagemaker_session,
)

uploaded_data_uri