In [27]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.pipeline import PipelineModel
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep

from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.model import Model
from sagemaker.sklearn.processing import SKLearnProcessor

from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.properties import PropertyFile

from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.functions import Join

from sagemaker.tuner import HyperparameterTuner, CategoricalParameter
from sagemaker.workflow.steps import TuningStep

from sagemaker.workflow.functions import Join


In [2]:
#Initialization
role = get_execution_role()
sagemaker_session = PipelineSession()

bucket = 'temp129428'
data = 'bank_clean.csv'
region = region = boto3.Session().region_name
input_folder_name = 'input'
output_folder_name = 'output'

input_data_path = f's3://{bucket}/{input_folder_name}/{data}'
#input_mappings_path = f's3://{bucket}/{input_folder_name}/mappings.csv'
output_train_path = f's3://{bucket}/{output_folder_name}/train/'
output_test_path = f's3://{bucket}/{output_folder_name}/test/'
output_txt_path = f's3://{bucket}/{output_folder_name}/txt/' #Will be used for metadata

preprocessing_script_path = f's3://{bucket}/{input_folder_name}/preprocessing.py'
model_path = f"s3://{bucket}/{output_folder_name}/model"
eval_script_path = f's3://{bucket}/{input_folder_name}/evaluate.py'
deployment_script_path = f"s3://{bucket}/{input_folder_name}/deploy_model.py"

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [3]:
input_data = ParameterString(
    name="InputData",
    default_value=input_data_path,
)

preprocessed_data_train = ParameterString(
    name="PreprocessedDataTrain",
    default_value=output_train_path,
)

preprocessed_data_test = ParameterString(
    name="PreprocessedDataTest",
    default_value=output_test_path,
)

preprocessed_data_txt = ParameterString(
    name="PreprocessedDataTXT",
    default_value=output_txt_path,
)

In [4]:
script_processor = ScriptProcessor(
    role=role,
    image_uri=sagemaker.image_uris.retrieve('sklearn', region=region, version='0.23-1'),  # Example image
    command=['python3'],
    instance_count=1,
    instance_type='ml.m5.4xlarge',
    #instance_type='ml.m5.xlarge',
    sagemaker_session=sagemaker_session
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


In [5]:
processing_step = ProcessingStep(
    name="PreprocessData",
    processor=script_processor,
    inputs=[
        ProcessingInput(
            source=input_data,
            destination='/opt/ml/processing/input'
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name='trainoutput',
            source='/opt/ml/processing/output/train',
            destination=preprocessed_data_train
        ),
        ProcessingOutput(
            output_name='testoutput',
            source='/opt/ml/processing/output/test',
            destination=preprocessed_data_test
        ),
        ProcessingOutput(
            output_name='columnoutput',
            source='/opt/ml/processing/output/txt',
            destination=preprocessed_data_txt
        )
    ],
    code=preprocessing_script_path,
    job_arguments=[
        '--input', f'/opt/ml/processing/input/{data}',
        '--output_train', '/opt/ml/processing/output/train/processed_train.csv',
        '--output_test', '/opt/ml/processing/output/test/processed_test.csv',
        '--output_txt', '/opt/ml/processing/output/txt/'
    ]
)

Tuning Step

In [6]:
# Define the XGBoost container
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")

# Set the static hyperparameters
hyperparameters = {
    "eval_metric": "logloss",
    "objective": "binary:logistic",
    "num_round": 400
}

# Define the XGBoost estimator
xgb_estimator = Estimator(
    image_uri=container,
    hyperparameters=hyperparameters,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    volume_size=5,  # 5 GB 
    output_path=tuning_model_path,
    use_spot_instances=True,
    max_wait=600, 
    max_run=300,
    sagemaker_session=sagemaker_session
)

# Define hyperparameter ranges for grid search
# In this case, tuning the lasso regularization strength parameter
hyperparameter_ranges = {
    'alpha': CategoricalParameter([10, 100, 1000, 5000])
    #Secondthing 1 2 3
    #,Add more
}

# Define objective metric and metric definitions
objective_metric_name = 'validation:logloss'
metric_definitions = [{'Name': 'validation:logloss', 'Regex': '.*\\[[0-9]+\\].*logloss:([0-9\\.]+)'}]

# Create hyperparameter tuner
tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=4,  # Number of total training jobs
    max_parallel_jobs=2,  # Number of parallel training jobs
    strategy='Grid', #Bayesian Optimization (Future)
    objective_type="Minimize"
)


hpo_args = tuner.fit(
    inputs={
        "train": TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["trainoutput"].S3Output.S3Uri,
            content_type="text/csv",
        ),
        "validation": TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                "testoutput"
            ].S3Output.S3Uri,
            content_type="text/csv",
        ),
    }
)

# Define the tuning step
tuning_step = TuningStep(
    name="HPTuning",
    step_args=hpo_args,
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Eval Steps

In [7]:
path_to_best_model = Join(on="/",
        values=[
            f"s3://{bucket}/output/tuning",
            tuning_step.properties.BestTrainingJob.TrainingJobName,
            "output/model.tar.gz"
        ]
    )

In [8]:
# Define the ScriptProcessor for evaluation
evaluation_processor = ScriptProcessor(
    role=role,
    image_uri=container,
    command=['python3'],
    instance_count=1,
    instance_type='ml.m5.xlarge',
    base_job_name="evaluate-model",
    sagemaker_session=sagemaker_session,
)

# Define the evaluation step and property file
evaluation_property_file = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json"
)

evaluation_step = ProcessingStep(
    name="EvaluateModel",
    processor=evaluation_processor,
    inputs=[
        ProcessingInput(
            #source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
            source=path_to_best_# Define the ScriptProcessor for evaluation
evaluation_processor = ScriptProcessor(
    role=role,
    image_uri=container,
    command=['python3'],
    instance_count=1,
    instance_type='ml.m5.xlarge',
    base_job_name="evaluate-model",
    sagemaker_session=sagemaker_session,
)

# Define the evaluation step and property file
evaluation_property_file = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json"
)

evaluation_step = ProcessingStep(
    name="EvaluateModel",
    processor=evaluation_processor,
    inputs=[
        ProcessingInput(
            source=path_to_best_model,
            destination="/opt/ml/processing/model"
        ),
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs['trainoutput'].S3Output.S3Uri,
            destination="/opt/ml/processing/train"
        ),
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs['testoutput'].S3Output.S3Uri,
            destination="/opt/ml/processing/test"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
            destination=f's3://{bucket}/{output_folder_name}/evaluation'
        )
    ],
    code=eval_script_path,
    job_arguments=[
        '--model-path', '/opt/ml/processing/model',
        '--train-path', 'opt/ml/processing/train',
        '--test-path', '/opt/ml/processing/test',
        '--output-path', '/opt/ml/processing/evaluation'
    ],
    property_files=[evaluation_property_file]
)model,
            destination="/opt/ml/processing/model"
        ),
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs['testoutput'].S3Output.S3Uri,
            destination="/opt/ml/processing/test"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
            destination=f's3://{bucket}/{output_folder_name}/evaluation'
        )
    ],
    code=eval_script_path,
    job_arguments=[
        '--model-path', '/opt/ml/processing/model',
        '--test-path', '/opt/ml/processing/test',
        '--output-path', '/opt/ml/processing/evaluation'
    ],
    property_files=[evaluation_property_file]
)

Model Step

In [15]:
#Change here for 1000 row model or full data model
preprocessing_model = Model(
    image_uri=sagemaker.image_uris.retrieve("sklearn", region, "0.23-1"),
    model_data=None,  # No pre-trained model file required for preprocessing
    role=role,
    entry_point='preprocess_inference.py',  # Preprocessing script saved locally
    sagemaker_session=sagemaker_session
)

inference_model = Model(
    image_uri=tuning_step.properties.TrainingJobDefinition.AlgorithmSpecification.TrainingImage,
    model_data=path_to_best_model,
    role=role,
    entry_point='inference.py',
    sagemaker_session=sagemaker_session
)

model = PipelineModel(
    name='pipeline-model',
    role=role,
    models=[preprocessing_model, inference_model],
    sagemaker_session=sagemaker_session
)

# Create the model step
model_step = CreateModelStep(
    name="CreateModel",
    model=model,
    inputs=sagemaker.inputs.CreateModelInput(instance_type="ml.t3.medium")
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


Fail Step


In [16]:
fail_step = FailStep(
    name="AccuracyBelowThreshold",
    error_message=Join(on=" ", values=["Execution failed due to accuracy <", 0.8]),
)

Deployment Step

In [17]:
# Define the SKLearnProcessor for deployment
deploy_model_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    volume_size_in_gb=60,
    base_job_name="deployingtest",
    sagemaker_session=sagemaker_session,
)

# Define the processing step to deploy the model
deploy_step = ProcessingStep(
    name="DeployModel",
    processor=deploy_model_processor,
    job_arguments=[
        "--model-name", model_step.properties.ModelName,
        "--region", region,
        "--endpoint-instance-type", "ml.t2.medium",
        "--endpoint-name", "hpodhubdev",
    ],
    code=deployment_script_path,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


Conditional Step

In [18]:
# Define the condition step
condition_step = ConditionStep(
    name="CheckEvaluation",
    conditions=[
        ConditionGreaterThanOrEqualTo(
            left=JsonGet(
                step_name="EvaluateModel",
                property_file=evaluation_property_file,
                json_path="classification_metrics.BSS.value"
            ),
            right=0  # Set the accuracy threshold
        )
    ],
    if_steps=[model_step, deploy_step],
    else_steps=[fail_step]
)

Pipeline Creation

In [20]:
# Create the pipeline
pipeline = Pipeline(
    name="dhubpipe",
    parameters=[input_data, preprocessed_data_train, preprocessed_data_test, preprocessed_data_txt],
    #steps=[processing_step, training_step, evaluation_step, condition_step],
    steps=[processing_step, tuning_step, evaluation_step, condition_step],
    sagemaker_session=sagemaker_session,
)

Pipeline Execution

In [21]:
# Execute the pipeline
pipeline.upsert(role_arn=role)
execution = pipeline.start()
execution.wait()



### Testing Endpoint

In [22]:
import json
import requests
import time
import boto3
from requests_aws4auth import AWS4Auth
import pandas as pd
import random
import datetime
import numpy as np

# AWS credentials and region
session = boto3.Session()
credentials = session.get_credentials()
region = 'us-east-2'

# Create an AWS4Auth object
auth = AWS4Auth(credentials.access_key, credentials.secret_key, region, 'sagemaker', session_token=credentials.token)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [23]:
# SageMaker endpoint URL
url = 'https://runtime.sagemaker.us-east-2.amazonaws.com/endpoints/dhub2/invocations'

# Define the headers explicitly without charset
headers = {
    "Content-Type": "application/json"
}

d = pd.read_csv('demo_data.csv')

In [25]:
d = d.drop(columns=['result', 'DateClosed', 'DateDifferenceTXDateAndDateClosed'])
d.iloc[:,:] = np.nan

  d.iloc[:,:] = np.nan
  d.iloc[:,:] = np.nan


In [26]:
for i in range(20):
    
    row = random.randint(0, d.shape[0])
    #row = i

    d_obs = d.iloc[row].to_dict()
    print(row)
    data = json.dumps(d_obs)

    # Send the POST request
    start_time = time.time()
    response = requests.post(url, auth=auth, headers=headers, data=data)
    end_time = time.time()

    latency = (end_time - start_time) * 1000

    # Print the latency
    print(f"Request latency: {latency:.0f}ms")

    # Print the response
    print("Status Code:", response.status_code)
    print("Response Body:", response.json())

511
Request latency: 176ms
Status Code: 200
Response Body: {'predictions': [0.7867759466171265], 'shap_values': {'TXDate': -0.3523402810096741, 'TXAmount': 0.005455068778246641, 'CBDate': -0.6847403049468994, 'CBAmount': 0.051228754222393036, 'CancellationDate': 0.0}}
176
Request latency: 147ms
Status Code: 200
Response Body: {'predictions': [0.7867759466171265], 'shap_values': {'TXDate': -0.3523402810096741, 'TXAmount': 0.005455068778246641, 'CBDate': -0.6847403049468994, 'CBAmount': 0.051228754222393036, 'CancellationDate': 0.0}}
847
Request latency: 170ms
Status Code: 200
Response Body: {'predictions': [0.7867759466171265], 'shap_values': {'TXDate': -0.3523402810096741, 'TXAmount': 0.005455068778246641, 'CBDate': -0.6847403049468994, 'CBAmount': 0.051228754222393036, 'CancellationDate': 0.0}}
467
Request latency: 181ms
Status Code: 200
Response Body: {'predictions': [0.7867759466171265], 'shap_values': {'TXDate': -0.3523402810096741, 'TXAmount': 0.005455068778246641, 'CBDate': -0.68

KeyboardInterrupt: 