In [None]:
from datetime import datetime

import sagemaker
import secrets
import boto3
import os

In [None]:
def get_sagemaker_session():
    """
    Build the SageMaker handles used throughout this notebook.

    Prints the resolved execution role as a side effect.

    Returns:
        tuple: ``(sagemaker_session, role, bucket)`` -- a new
        ``sagemaker.Session``, the value of ``sagemaker.get_execution_role()``
        and the session's default bucket, in that order.
    """
    session = sagemaker.Session()
    role = sagemaker.get_execution_role()
    default_bucket = session.default_bucket()

    print(f"Role : {role}")
    return session, role, default_bucket

# Resolve the session, role and bucket once; later cells read these globals.
sagemaker_session, role, bucket = get_sagemaker_session()

In [None]:
# Display the default bucket returned by get_sagemaker_session().
bucket

In [None]:
# Key prefix placed under the bucket for the S3 locations built in later cells.
prefix = "projects/mlops"

In [None]:
# Model id layout: <model_id_prefix>-<YYYY-MM-DD>-<32 random hex characters>.
model_id_prefix = "sklearn-dummy"
date_str = datetime.now().strftime("%Y-%m-%d")

# token_hex(nbytes=16) yields 32 hex characters, keeping ids unique within a day.
logs = {
    "model_id": "-".join([model_id_prefix, date_str, secrets.token_hex(nbytes=16)]),
}
print(logs["model_id"])

In [None]:
# Timestamp formatted as %Y/%m/%d to date-partition the S3 output paths below.
# NOTE(review): this is a separate datetime.now() call from the one behind
# `date_str` (embedded in the model id), so the two dates can differ if a day
# boundary falls between the cells -- confirm whether they should share one timestamp.
now = datetime.now()

In [None]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat

# --- Pipeline identity ---
pipeline_name = "TrainingPipelineMLOpsTest"  # SageMaker Pipeline name
model_package_group_name = "MLOpsTestModel"
base_job_prefix = "mlops-test"

# --- Compute sizing per stage (instance type, then count where one is defined) ---
processing_instance_type = "ml.m5.large"
processing_instance_count = 1
training_instance_type = "ml.m5.large"
evaluation_instance_type = "ml.m5.large"
evaluation_instance_count = 1

# --- Workflow parameter: model approval status, with its default value ---
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

In [None]:
# Version tag of the scikit-learn container requested for the processors in this notebook.
sklearn_framework_version = "1.2-1"

# Processor that runs the preprocessing script of the pipeline.
sklearn_processor = SKLearnProcessor(
    framework_version=sklearn_framework_version,
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    volume_size_in_gb=5,
    base_job_name="mlops-test-nb-pipeline-preprocess",
    # Reuse the notebook's session explicitly, as the training estimator does,
    # instead of letting the processor build its own default session.
    sagemaker_session=sagemaker_session,
)

In [None]:
# Root S3 URI for this run's data:
#   s3://<bucket>/<prefix>/challenger/<YYYY/MM/DD>/<model_id>/data
# S3 URIs always use "/" as the separator, so the parts are joined explicitly
# rather than with os.path.join, which uses the host OS separator.
processing_output_root = "/".join(
    [
        f"s3://{bucket}",
        prefix,
        "challenger",
        now.strftime("%Y/%m/%d"),
        logs["model_id"],
        "data",
    ]
)

# Preprocessing step: runs preprocess.py and publishes the train/test splits
# written under /opt/ml/processing/{train,test} to the S3 locations above.
processing_step = ProcessingStep(
    name="MLOpsTestProcessing",
    processor=sklearn_processor,
    outputs=[
        ProcessingOutput(
            output_name="train_data",
            source="/opt/ml/processing/train",
            destination=f"{processing_output_root}/train",
        ),
        ProcessingOutput(
            output_name="test_data",
            source="/opt/ml/processing/test",
            destination=f"{processing_output_root}/test",
        ),
    ],
    # Local path to the script, relative to the notebook's working directory.
    code="../../train/code/preprocess.py",
)

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Estimator that runs train.py from ../../train/code/ as the training job.
sklearn = SKLearn(
    entry_point="train.py",
    source_dir="../../train/code/",
    # Use the shared version variable rather than a second "1.2-1" literal, so
    # the training and processing containers cannot drift apart.
    framework_version=sklearn_framework_version,
    instance_type=training_instance_type,
    instance_count=1,
    role=role,
    sagemaker_session=sagemaker_session,
    # Artifacts and packaged source code go under <prefix>/training/ in the bucket.
    output_path=f"s3://{bucket}/{prefix}/training/output",
    code_location=f"s3://{bucket}/{prefix}/training/code",
)


In [None]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Reference to the S3 URI of the processing step's "train_data" output.
train_data_uri = (
    processing_step.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri
)

# Training step: feeds the preprocessed CSV data to the estimator's "train" channel.
step_train = TrainingStep(
    name="mlops-pipeline-test-model",
    estimator=sklearn,
    inputs={"train": TrainingInput(s3_data=train_data_uri, content_type="text/csv")},
)

In [None]:
from sagemaker.workflow.properties import PropertyFile

# Property file "EvaluationReport": refers to evaluation.json within the step
# output named "evaluation".
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)


In [None]:
# Processor that runs the evaluation script of the pipeline.
evaluation = SKLearnProcessor(
    framework_version=sklearn_framework_version,
    role=role,
    instance_type=evaluation_instance_type,
    instance_count=evaluation_instance_count,
    volume_size_in_gb=5,
    base_job_name="mlops-test-nb-pipeline-evaluate",
    # Reuse the notebook's session explicitly, as the training estimator does,
    # instead of letting the processor build its own default session.
    sagemaker_session=sagemaker_session,
)

In [None]:
# S3 URI receiving the evaluation report for this run:
#   s3://<bucket>/<prefix>/challenger/<YYYY/MM/DD>/<model_id>/data/evaluation_report
# Joined with "/" explicitly: os.path.join would use the host OS separator,
# which is not guaranteed to be "/" and would then produce an invalid S3 URI.
evaluation_report_uri = "/".join(
    [
        f"s3://{bucket}",
        prefix,
        "challenger",
        now.strftime("%Y/%m/%d"),
        logs["model_id"],
        "data",
        "evaluation_report",
    ]
)

# Evaluation step: mounts the trained model artifacts and the held-out test
# split, runs evaluate.py and publishes whatever it writes to
# /opt/ml/processing/evaluation.
step_evaluate = ProcessingStep(
    name="EvaluatePerformance",
    processor=evaluation,
    inputs=[
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs["test_data"].S3Output.S3Uri,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
            destination=evaluation_report_uri,
        ),
    ],
    # Associates the "evaluation" output with the EvaluationReport property file.
    property_files=[evaluation_report],
    code="../../train/code/evaluate.py",
)


In [None]:
from sagemaker.workflow.pipeline import Pipeline

# Assemble the pipeline: preprocess -> train -> evaluate.
pipeline = Pipeline(
    name=pipeline_name,
    # Register the declared workflow parameter so it becomes part of the
    # pipeline definition and can be overridden when an execution is started;
    # previously it was declared but never attached to the pipeline.
    parameters=[model_approval_status],
    steps=[processing_step, step_train, step_evaluate],
)

In [None]:
import json
# Parse the pipeline definition as JSON so the cell displays it as a Python object.
json.loads(pipeline.definition())

In [None]:
# Upsert the pipeline, passing the notebook's execution role as role_arn.
pipeline.upsert(role_arn=role)

In [None]:
# Start the pipeline; no parameter overrides are passed.
pipeline.start()

In [None]:
# boto3 SageMaker client, used here to list pipelines as the cell output.
sm = boto3.client('sagemaker')
sm.list_pipelines()

In [None]:
# Display the execution role returned by get_sagemaker_session().
role