### Import statements and declare parameters and constants

In [5]:
!pip install --upgrade boto3 s3fs

Collecting boto3
  Downloading boto3-1.34.79-py3-none-any.whl.metadata (6.6 kB)
Collecting s3fs
  Using cached s3fs-2024.3.1-py3-none-any.whl.metadata (1.6 kB)
Collecting botocore<1.35.0,>=1.34.79 (from boto3)
  Downloading botocore-1.34.79-py3-none-any.whl.metadata (5.7 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Using cached s3transfer-0.10.1-py3-none-any.whl.metadata (1.7 kB)
Collecting fsspec==2024.3.1 (from s3fs)
  Using cached fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of aiobotocore to determine which version is compatible with other requirements. This could take a while.
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Using cached aiobotocore-2.12.2-py3-none-any.whl.metadata (21 kB)
  Using cached aiobotocore-2.12.1-py3-none-any.whl.metadata (21 kB)
  Using cached aiobotocore-2.12.0-py3-none-any.whl.metadata (21 kB)
  Using cached aiobotocore-2.11.2-py3-none-any.whl.metadata (21 kB)
  Using cached aiobotocore-2.1

### Pipeline Begin

In [29]:
import boto3 
import pandas as pd 
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession 

s3_client = boto3.resource('s3')
pipeline_name = f"sagemaker-mlops-fraud-pipeline" 

import sagemaker

sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name 
role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()
default_bucket = sagemaker_session.default_bucket()
model_package_group_name = f"FraudModelPackageGroup"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### Collect and prepare data

In [30]:
bucket_name = 'eliezerraj-908671954593-dataset'
prefix_name = 'payment'
file_name = 'payment.csv'

input_dataset = 's3://{}/{}/{}'.format(bucket_name, prefix_name, file_name)
input_requirement = 's3://{}/{}/{}'.format(bucket_name,prefix_name,'requirements.txt')
model_path = f"s3://{bucket_name}/{prefix_name}/output"

print("input_dataset : ",input_dataset)
print("input_requirement : ",input_requirement)
print("model_path : ",model_path)

store_data = pd.read_csv(input_dataset)

from sagemaker.workflow.parameters import ( 
                         ParameterInteger, 
                         ParameterString, 
                         ParameterFloat) 

auc_score_threshold = 0.75
base_job_prefix = "fraud-model"

processing_instance_count = ParameterInteger(name="ProcessingInstanceCount",
                                             default_value=1)
processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                           default_value="ml.m4.xlarge")
training_instance_type = ParameterString(name="TrainingInstanceType",
                                         default_value="ml.m4.xlarge")

model_approval_status = ParameterString(name="ModelApprovalStatus",
                                        default_value="PendingManualApproval")

input_dataset :  s3://eliezerraj-908671954593-dataset/payment/payment.csv
input_requirement :  s3://eliezerraj-908671954593-dataset/payment/requirements.txt
model_path :  s3://eliezerraj-908671954593-dataset/payment/output


In [31]:
#!pygmentize "feat-v02.py"

### Step 1: Define Processing Step

In [32]:
# Define Processing Step for Feature Engineering
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

framework_version = "1.0-1"

sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    instance_type="ml.m4.xlarge",
    instance_count=processing_instance_count,
    base_job_name="mlops-pipeline-fraud-feature-eng",
    role=role,
    sagemaker_session=pipeline_session,
)

processor_args = sklearn_processor.run(
    inputs=[
        ProcessingInput(source=input_dataset,
                        destination="/opt/ml/processing/input"),
        ProcessingInput(source=input_requirement,
                        destination="/opt/ml/processing/input/req/")
    ],
    outputs=[
        ProcessingOutput(output_name="train",
                         source="/opt/ml/processing/train",
                         destination=f"s3://{bucket_name}/{prefix_name}/output/train"),
        ProcessingOutput(output_name="validation",
                         source="/opt/ml/processing/validation",
                         destination=f"s3://{bucket_name}/{prefix_name}/output/validation"),
        ProcessingOutput(output_name="test",
                         source="/opt/ml/processing/test",
                         destination=f"s3://{bucket_name}/{prefix_name}/output/test")
    ],
    code=f"feat-v02.py",
)

step_process = ProcessingStep(
    name="FeaturingEngineering-FraudModel",
    step_args=processor_args
)

print("step_process : ", step_process)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


step_process :  ProcessingStep(name='FeaturingEngineering-FraudModel', display_name=None, description=None, step_type=<StepTypeEnum.PROCESSING: 'Processing'>, depends_on=None)




### Step 2: Define HyperParameter Tuning  Step

In [33]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from sagemaker.workflow.steps import TuningStep

image_uri = sagemaker.image_uris.retrieve(
                                            framework="xgboost",
                                            region=region,
                                            version="1.0-1",
                                            py_version="py3",
                                            instance_type=training_instance_type,
)
fixed_hyperparameters = {
                        "eval_metric": "auc",
                        "objective": "binary:logistic",
                        "num_round": "100",
                        "rate_drop": "0.3",
                        "tweedie_variance_power": "1.4"
}
xgb_train = Estimator(
                        image_uri=image_uri,
                        instance_type=training_instance_type,
                        instance_count=1,
                        hyperparameters=fixed_hyperparameters,
                        output_path=model_path,
                        base_job_name=f"fraud-train",
                        sagemaker_session=pipeline_session,
                        role=role,
)



In [34]:
hyperparameter_ranges = {
                        "eta": ContinuousParameter(0, 1),
                        "min_child_weight": ContinuousParameter(1, 10),
                        "alpha": ContinuousParameter(0, 2),
                        "max_depth": IntegerParameter(1, 10),
                        }

objective_metric_name = "validation:auc"

tuner = HyperparameterTuner(
    xgb_train,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=2,
    max_parallel_jobs=2,
)

hpo_args = tuner.fit(
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri, 
            content_type="text/csv",
        ),
        "validation": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["validation"].S3Output.S3Uri, 
            content_type="text/csv",
        ),
    }
)

step_tuning = TuningStep(name="HyperParameterTuning-FraudModel", step_args=hpo_args)

print("step_tuning : ", step_tuning)

step_tuning :  TuningStep(name='HyperParameterTuning-FraudModel', display_name=None, description=None, step_type=<StepTypeEnum.TUNING: 'Tuning'>, depends_on=None)


### Step 3: Define the evaluation script and model evaluation step

In [35]:
#!pygmentize "evaluate-fraud.py"

In [36]:
# define model evaluation step to evaluate the trained model
from sagemaker.processing import ScriptProcessor
script_eval = ScriptProcessor(
    image_uri=image_uri,
    command=["python3"],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name="script-fraud-eval",
    role=role,
    sagemaker_session=pipeline_session,
)

eval_args = script_eval.run(
     inputs=[
            ProcessingInput(
                source=step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=f"{bucket_name}/{prefix_name}", prefix="output"),
                destination="/opt/ml/processing/model"
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test"
            )
        ],
    outputs=[
            ProcessingOutput(output_name="evaluation", 
                             source="/opt/ml/processing/evaluation",
                             destination=f"s3://{bucket_name}/{prefix_name}/output/evaluation"),
        ],
    code=f"evaluate-fraud.py",
)

from sagemaker.workflow.properties import PropertyFile

evaluation_report = PropertyFile(
    name="FraudModelEvaluationReport", output_name="evaluation", path="evaluation.json"
)

step_eval = ProcessingStep(
    name="EvalModel-FraudModel",
    step_args=eval_args,
    property_files=[evaluation_report],
)

print("step_eval : ", step_eval)

step_eval :  ProcessingStep(name='EvalModel-FraudModel', display_name=None, description=None, step_type=<StepTypeEnum.PROCESSING: 'Processing'>, depends_on=None)


### Step 4: Define a register model step

In [37]:
from sagemaker import Model
from sagemaker.workflow.model_step import ModelStep

model = Model(
    image_uri=image_uri,
    model_data=step_tuning.get_top_model_s3_uri(top_k=0,s3_bucket=f"{bucket_name}/{prefix_name}",prefix="output"),
    sagemaker_session=pipeline_session,
    role=role,
)

from sagemaker.model_metrics import MetricsSource, ModelMetrics
from time import gmtime, strftime

model_name = "xgboost-fraud-v3-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
        ),
        content_type="application/json",
    )
)
register_args = model.register(
                                content_types=["text/csv"],
                                response_types=["text/csv"],
                                inference_instances=["ml.t2.medium", "ml.m5.large"],
                                transform_instances=["ml.m4.xlarge"],
                                model_package_group_name=model_package_group_name,
                                approval_status=model_approval_status,
                                model_metrics=model_metrics,
)

step_register = ModelStep(
    name="RegisterModel-FraudModel",
    step_args=register_args
)

print("step_register : ", step_register)



step_register :  ModelStep(name='RegisterModel-FraudModel', steps=[_RegisterModelStep(name='RegisterModel-FraudModel-RegisterModel', display_name=None, description=None, step_type=<StepTypeEnum.REGISTER_MODEL: 'RegisterModel'>, depends_on=None)])


### Step 5: Define a condition step to check AUC score

In [38]:
from sagemaker.workflow.conditions import ConditionGreaterThan
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
cond_lte = ConditionGreaterThan(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="classification_metrics.auc_score.value",
    ),
    right=auc_score_threshold,
)

step_cond = ConditionStep(
    name="CheckAUCScore-FraudFraud",
    conditions=[cond_lte],
    if_steps=[step_register],
)

### Build and Trigger the pipeline run

In [39]:
import json
from sagemaker.workflow.pipeline import Pipeline

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_count,
        processing_instance_type,
        training_instance_type,
        model_approval_status,
        input_dataset,
        auc_score_threshold,
    ],
    steps=[step_process,
           step_tuning,
           step_eval,
           step_cond],
)

definition = json.loads(pipeline.definition())

print(definition)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml




{'Version': '2020-12-01', 'Metadata': {}, 'Parameters': [{'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1}, {'Name': 'ProcessingInstanceType', 'Type': 'String', 'DefaultValue': 'ml.m4.xlarge'}, {'Name': 'TrainingInstanceType', 'Type': 'String', 'DefaultValue': 'ml.m4.xlarge'}, {'Name': 'ModelApprovalStatus', 'Type': 'String', 'DefaultValue': 'PendingManualApproval'}], 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'}, 'TrialName': {'Get': 'Execution.PipelineExecutionId'}}, 'Steps': [{'Name': 'FeaturingEngineering-FraudModel', 'Type': 'Processing', 'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.m4.xlarge', 'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'}, 'VolumeSizeInGB': 30}}, 'AppSpecification': {'ImageUri': '257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-scikit-learn:1.0-1-cpu-py3', 'ContainerEntrypoint': ['python3', '/opt/ml/processing/input/code/feat-v02.py']}, 'RoleArn': 'arn

In [40]:
# Create a new or update existing Pipeline
pipeline.upsert(role_arn=role)
# start Pipeline execution
pipeline.start()



_PipelineExecution(arn='arn:aws:sagemaker:us-east-2:908671954593:pipeline/sagemaker-mlops-fraud-pipeline/execution/y17j1cxmga28', sagemaker_session=<sagemaker.session.Session object at 0x7f75775d58d0>)