In [76]:
%matplotlib inline
import pandas as pd
import numpy as np
import sagemaker
import json
import boto3
from sagemaker import get_execution_role

# Low-level SageMaker API client; later cells use it to describe the project,
# poll the pipeline execution and list model packages.
sm_client = boto3.client(service_name="sagemaker")

In [77]:
# Create the SageMaker session and read the region and default S3 bucket from it,
# then look up the execution role used by the jobs and the pipeline below.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
bucket = sagemaker_session.default_bucket()
print(region, bucket, sep="\n")
role = get_execution_role()

eu-west-1
sagemaker-eu-west-1-707684582322


In [78]:
# The model package group is named "<project name>-<project id>", so the id of the
# SageMaker project has to be looked up first.
project_name = "mlops-cicd-demo"
project_description = sm_client.describe_project(ProjectName=project_name)
project_id = project_description['ProjectId']

model_package_group_name = '-'.join((project_name, project_id))
print("Model package group name: %s" % model_package_group_name)

Model package group name: mlops-cicd-demo-p-1llgzzxekxpq


#### Pipeline input parameters

In [79]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString

# Compute settings exposed as pipeline parameters (they can be overridden when an
# execution is started).
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
training_instance_count = ParameterInteger(name="TrainingInstanceCount", default_value=1)

# S3 locations, all under the session's default bucket: the raw input data and the
# processed train / test splits.
input_raw_data = ParameterString(
    name="InputRawData",
    default_value="s3://{}/sagemaker/xgboostcontainer/raw-data".format(bucket),
)
input_train_data = ParameterString(
    name="InputDataTrain",
    default_value="s3://{}/sagemaker/xgboostcontainer/processed/train".format(bucket),
)
input_test_data = ParameterString(
    name="InputDataTest",
    default_value="s3://{}/sagemaker/xgboostcontainer/processed/test".format(bucket),
)

### Preprocessing

In [80]:
from sagemaker.sklearn.processing import SKLearnProcessor

# scikit-learn processor for the preprocessing step. Its instance type and count are
# taken from the pipeline parameters, so they can be overridden per execution.
# `role` is the execution role already resolved in the session-setup cell; it is
# reused here rather than being looked up a second time.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_type=training_instance_type,
    instance_count=training_instance_count,
)

In [81]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Raw data location (a pipeline parameter) mapped to the container's input directory.
raw_data_input = ProcessingInput(
    source=input_raw_data,
    destination="/opt/ml/processing/input",
)

# The two directories written by the script are uploaded to the S3 locations given
# by the train / test pipeline parameters.
train_data_output = ProcessingOutput(
    output_name="xgboost_train_data",
    source="/opt/ml/processing/output/train",
    destination=input_train_data,
)
test_data_output = ProcessingOutput(
    output_name="xgboost_test_data",
    source="/opt/ml/processing/output/test",
    destination=input_test_data,
)

# Preprocessing step: runs preprocessing.py with the scikit-learn processor.
step_process = ProcessingStep(
    name="BostonHousingDropColumns",
    processor=sklearn_processor,
    inputs=[raw_data_input],
    outputs=[train_data_output, test_data_output],
    code="preprocessing.py",
)

### Training

In [82]:
from sagemaker.image_uris import retrieve
from sagemaker.session import Session

# Look up the URI of the XGBoost container image (nothing is built here; only the
# image URI is returned).
# The region is the one reported by the SageMaker session created earlier, so the
# image is resolved for the same region as the default bucket instead of whatever
# a freshly created boto3 session happens to report.
# Change `version` to select a different XGBoost container release.
container = retrieve(
    framework='xgboost',
    region=region,
    version='1.0-1',
)
print(container)

141502667606.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3


In [83]:
# XGBoost hyperparameters for the training job; every value is passed as a string.
hyperparameters = dict(
    max_depth="10",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="200",
)

In [84]:
# Generic SageMaker Estimator that runs the XGBoost container image with the
# hyperparameters defined above.
# NOTE(review): instance type and count are hard-coded here, while the
# TrainingInstanceType / TrainingInstanceCount pipeline parameters are wired to the
# preprocessing processor instead - confirm that this is intended.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    hyperparameters=hyperparameters,
)



In [85]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# S3 URIs of the two outputs of the preprocessing step. Referencing the step's
# properties (instead of fixed paths) makes training consume whatever the
# preprocessing step produced.
processing_outputs = step_process.properties.ProcessingOutputConfig.Outputs
train_s3_uri = processing_outputs["xgboost_train_data"].S3Output.S3Uri
validation_s3_uri = processing_outputs["xgboost_test_data"].S3Output.S3Uri

# Training step: the processed train split feeds the "train" channel and the
# processed test split feeds the "validation" channel, both as CSV.
step_train = TrainingStep(
    name="TrainingXgBoost",
    estimator=estimator,
    inputs={
        "train": TrainingInput(s3_data=train_s3_uri, content_type="text/csv"),
        "validation": TrainingInput(s3_data=validation_s3_uri, content_type="text/csv"),
    },
)

### Then, define the Register step, which adds a new model version to the Model Registry

In [86]:
from sagemaker.workflow.step_collections import RegisterModel

# Instance types declared for the registered model package.
inference_instance_types = ["ml.t2.medium", "ml.m5.large"]
transform_instance_types = ["ml.m5.large"]

# Registers the artifacts produced by the training step as a new model package in
# the project's model package group.
# NOTE: model_approval_status is not available as arg in service dsl currently
step_register = RegisterModel(
    name="RegisterXgBoostModel",
    estimator=estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    model_package_group_name=model_package_group_name,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=inference_instance_types,
    transform_instances=transform_instance_types,
)

### Now, we can create the pipeline

In [87]:
from botocore.exceptions import ClientError, ValidationError
from sagemaker.workflow.pipeline import Pipeline

# NOTE:
# condition steps have issues in service so we go straight to step_register
pipeline_name = "XgBoost-Pipelines-2"
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        training_instance_type,
        training_instance_count,
        input_raw_data,
        input_train_data,
        input_test_data
    ],
    steps=[step_process, step_train, step_register],
    sagemaker_session=sagemaker_session,
)

# upsert() creates the pipeline if it does not exist yet and otherwise updates the
# existing one with the definition built above. A plain create() followed by a
# describe() fallback would leave a previously created pipeline untouched, so a
# re-run of this notebook would start an execution of the stale definition.
response = pipeline.upsert(role_arn=role)

pipeline_arn = response["PipelineArn"]
print(pipeline_arn)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


arn:aws:sagemaker:eu-west-1:707684582322:pipeline/xgboost-pipelines-2


### And then, run it

In [88]:
import time

# Statuses for which the execution has not reached a final state yet. 'Stopping'
# is transient (it is followed by 'Stopped'), so it must not end the wait.
IN_PROGRESS_STATUSES = {'Executing', 'Stopping'}
POLL_INTERVAL_SECONDS = 15

# Start an execution, overriding the instance-count pipeline parameter.
start_response = pipeline.start(parameters={
    "TrainingInstanceCount": "1"
})

pipeline_execution_arn = start_response.arn
print(pipeline_execution_arn)

# Poll until the execution reaches a final status, then report that status.
while True:
    resp = sm_client.describe_pipeline_execution(PipelineExecutionArn=pipeline_execution_arn)
    status = resp['PipelineExecutionStatus']
    if status not in IN_PROGRESS_STATUSES:
        print(status, pipeline_execution_arn)
        break
    print('Running...' if status == 'Executing' else '%s...' % status)
    time.sleep(POLL_INTERVAL_SECONDS)

arn:aws:sagemaker:eu-west-1:707684582322:pipeline/xgboost-pipelines-2/execution/mu66d5fgwunc
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Succeeded arn:aws:sagemaker:eu-west-1:707684582322:pipeline/xgboost-pipelines-2/execution/mu66d5fgwunc


### Finally, approve the model to kick off the deployment process

In [None]:
# List the model packages of the group newest-first and select the latest one.
# The ordering is requested from the API: the response is paginated, so sorting a
# single page on the client side could miss the newest version once the group
# holds more versions than fit in one page.
packages = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    SortBy='CreationTime',
    SortOrder='Descending',
)['ModelPackageSummaryList']

# Fail with an explicit message instead of an IndexError when nothing is registered.
if not packages:
    raise RuntimeError(
        "No model packages found in model package group '%s'; "
        "run the pipeline successfully before approving a model." % model_package_group_name
    )

latest_model_package_arn = packages[0]['ModelPackageArn']

In [None]:
# Uncomment the call below to set the approval status of the latest model package
# to "Approved". It is left commented out, presumably so that running the whole
# notebook does not approve a model unintentionally (TODO confirm).
# model_package_update_response = sm_client.update_model_package(
#    ModelPackageArn=latest_model_package_arn,
#    ModelApprovalStatus="Approved",
# )

## Done! :) Let's open the CodePipeline console and get some popcorn to watch