# Create a Training Pipeline with the Step Functions Data Science SDK

![Step Functions SageMaker Pipeline](img/step-functions-sagemaker-pipeline-high-level-no-feature-engineering.png)

In [None]:
from botocore.exceptions import ClientError

import os
import sagemaker
import logging
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, and the execution role of this notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session supplies both the region name and the low-level SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

In [None]:
!pip install -q stepfunctions==1.0.0.8

In [None]:
import stepfunctions
import logging
from stepfunctions.template.pipeline import TrainingPipeline

stepfunctions.set_stream_logger(level=logging.INFO)

# Create an IAM Execution Role for Step Functions
You need a Step Functions workflow execution role so that you can create and execute workflows in Step Functions.

In [None]:
# Low-level IAM and STS clients, each built from its own boto3 session in the notebook's region.
iam, sts = (
    boto3.Session().client(service_name=service, region_name=region)
    for service in ('iam', 'sts')
)

In [None]:
# Name of the IAM role that is created below and later handed to the pipeline as its `role`.
stepfunction_role_name = 'DSOAWS_StepFunctionsExecutionRole'

### Create an AssumeRolePolicyDocument

In [None]:
# Trust policy: lets the Step Functions service principal assume the role.
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        dict(
            Effect="Allow",
            Principal=dict(Service="states.amazonaws.com"),
            Action="sts:AssumeRole",
        ),
    ],
}

### Create `DSOAWS_StepFunctionsExecutionRole`

In [None]:
import json

# Create the role; an already-existing role is reported and tolerated,
# any other client error is printed.
try:
    iam.create_role(
        RoleName=stepfunction_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='DSOAWS Step Function Workflow Execution Role',
    )
except ClientError as err:
    error_code = err.response['Error']['Code']
    if error_code != 'EntityAlreadyExists':
        print("Unexpected error: %s" % err)
    else:
        print("Role already exists. This is OK.")
else:
    print("Role created.")

### Get the Role ARN

In [None]:
# Look the role up by name and keep its ARN; the ARN is what the pipeline receives as `role`.
stepfunction_role = iam.get_role(RoleName=stepfunction_role_name)
stepfunction_role_arn = stepfunction_role['Role']['Arn']
print(stepfunction_role_arn)

# Add a Policy to the Role

## Define permissions

In [None]:
# Permissions policy for the workflow execution role. It has three statements:
#   1. service API actions on any resource,
#   2. iam:PassRole, restricted to roles passed to SageMaker,
#   3. EventBridge rule management for the listed StepFunctionsGetEventsFor* rules.
stepfunction_permissions = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            # Expanded to "<service>:<operation>" strings, in the order listed here.
            "Action": [
                f"{service}:{operation}"
                for service, operations in (
                    ("sagemaker", (
                        "CreateTransformJob",
                        "DescribeTransformJob",
                        "StopTransformJob",
                        "CreateTrainingJob",
                        "DescribeTrainingJob",
                        "StopTrainingJob",
                        "CreateHyperParameterTuningJob",
                        "DescribeHyperParameterTuningJob",
                        "StopHyperParameterTuningJob",
                        "CreateModel",
                        "CreateEndpointConfig",
                        "CreateEndpoint",
                        "DeleteEndpointConfig",
                        "DeleteEndpoint",
                        "UpdateEndpoint",
                        "ListTags",
                    )),
                    ("lambda", ("InvokeFunction",)),
                    ("sqs", ("SendMessage",)),
                    ("sns", ("Publish",)),
                    ("ecs", ("RunTask", "StopTask", "DescribeTasks")),
                    ("dynamodb", ("GetItem", "PutItem", "UpdateItem", "DeleteItem")),
                    ("batch", ("SubmitJob", "DescribeJobs", "TerminateJob")),
                    ("glue", ("StartJobRun", "GetJobRun", "GetJobRuns", "BatchStopJobRun")),
                )
                for operation in operations
            ],
            "Resource": "*",
        },
        {
            "Effect": "Allow",
            "Action": ["iam:PassRole"],
            "Resource": "*",
            "Condition": {
                "StringEquals": {"iam:PassedToService": "sagemaker.amazonaws.com"},
            },
        },
        {
            "Effect": "Allow",
            "Action": [f"events:{operation}" for operation in ("PutTargets", "PutRule", "DescribeRule")],
            "Resource": [
                f"arn:aws:events:*:*:rule/StepFunctionsGetEventsFor{target}Rule"
                for target in (
                    "SageMakerTrainingJobs",
                    "SageMakerTransformJobs",
                    "SageMakerTuningJobs",
                    "ECSTask",
                    "BatchJobs",
                )
            ],
        },
    ],
}

## Turn into Policy Object

In [None]:
# Name under which the permissions document above is registered as an IAM policy.
stepfunction_policy_name = 'DSOAWS_StepFunctionsWorkflowExecutionPolicy'

In [None]:
# Account ID of the calling identity; used further down to assemble ARNs by hand.
account_id = sts.get_caller_identity()['Account']

In [None]:
# Compute the ARN up front so `stepfunction_policy_arn` is defined whichever branch runs below
# (the following cells read it unconditionally).
stepfunction_policy_arn = f'arn:aws:iam::{account_id}:policy/{stepfunction_policy_name}'

try:
    stepfunction_policy = iam.create_policy(
      PolicyName=stepfunction_policy_name,
      PolicyDocument=json.dumps(stepfunction_permissions)
    )
    print("Policy created.")

except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy already exists.")

        # IAM keeps at most five versions of a managed policy. Re-running this cell adds a
        # version every time, so delete the oldest non-default versions until there is room
        # for one more; otherwise create_policy_version fails once the limit is reached.
        existing_versions = iam.list_policy_versions(PolicyArn=stepfunction_policy_arn)['Versions']
        removable_versions = sorted(
            (version for version in existing_versions if not version['IsDefaultVersion']),
            key=lambda version: version['CreateDate']
        )
        surplus = len(existing_versions) - 4
        for stale_version in removable_versions[:max(surplus, 0)]:
            iam.delete_policy_version(
                PolicyArn=stepfunction_policy_arn,
                VersionId=stale_version['VersionId'])

        stepfunction_policy = iam.create_policy_version(
            PolicyArn=stepfunction_policy_arn,
            PolicyDocument=json.dumps(stepfunction_permissions),
            SetAsDefault=True)
        print("Policy updated.")
    else:
        print("Unexpected error: %s" % e)

## Get ARN

In [None]:
# stepfunction_policy_arn = f'arn:aws:iam::{account_id}:policy/{stepfunction_policy_name}'
print(stepfunction_policy_arn)

## Attach Policy To Step Function Workflow Execution Role

In [None]:
# Attach the workflow execution policy to the Step Functions role.
try:
    response = iam.attach_role_policy(
        RoleName=stepfunction_role_name,
        PolicyArn=stepfunction_policy_arn,
    )
except ClientError as err:
    if err.response['Error']['Code'] != 'EntityAlreadyExists':
        print("Unexpected error: %s" % err)
    else:
        print("Policy is already attached. This is OK.")
else:
    print("Done.")


In [None]:
# Attach the AWS-managed AWSLambdaRole policy to the same role.
try:
    response = iam.attach_role_policy(
        RoleName=stepfunction_role_name,
        PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaRole',
    )
except ClientError as err:
    if err.response['Error']['Code'] != 'EntityAlreadyExists':
        print("Unexpected error: %s" % err)
    else:
        print("Policy is already attached. This is OK.")
else:
    print("Done.")

# Specify the S3 Location of the Features

In [None]:
%store -r processed_train_data_s3_uri

In [None]:
print(processed_train_data_s3_uri)

In [None]:
%store -r processed_validation_data_s3_uri

In [None]:
print(processed_validation_data_s3_uri)

In [None]:
%store -r processed_test_data_s3_uri

In [None]:
print(processed_test_data_s3_uri)

In [None]:
print(processed_train_data_s3_uri)
!aws s3 ls $processed_train_data_s3_uri/

In [None]:
print(processed_validation_data_s3_uri)
!aws s3 ls $processed_validation_data_s3_uri/

In [None]:
print(processed_test_data_s3_uri)
!aws s3 ls $processed_test_data_s3_uri/

In [None]:
# Wrap each processed dataset location in an s3_input that uses the ShardedByS3Key distribution.
s3_input_train_data, s3_input_validation_data, s3_input_test_data = (
    sagemaker.s3_input(s3_data=data_s3_uri, distribution='ShardedByS3Key')
    for data_s3_uri in (
        processed_train_data_s3_uri,
        processed_validation_data_s3_uri,
        processed_test_data_s3_uri,
    )
)

# Show the resulting channel configurations in train / validation / test order.
for channel_input in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel_input.config)

# Show Training Script

In [None]:
!pygmentize src/tf_bert_reviews.py

# Setup Hyper-Parameters

In [None]:
%store -r max_seq_length

In [None]:
print(max_seq_length)

In [None]:
# Optimisation settings
epochs = 1
learning_rate = 1e-5
epsilon = 1e-8

# Batch sizes and step counts for the train / validation / test passes
train_batch_size = 128
validation_batch_size = 128
test_batch_size = 128
train_steps_per_epoch = 50
validation_steps = 50
test_steps = 50

# Training infrastructure
train_instance_count = 1
train_instance_type = 'ml.c5.9xlarge'
train_volume_size = 1024
input_mode = 'Pipe'

# Flags forwarded to the training script as hyperparameters
use_xla = True
use_amp = True
freeze_bert_layer = True
enable_sagemaker_debugger = False
enable_checkpointing = False
enable_tensorboard = False
run_validation = True
run_test = True
run_sample_predictions = True

# Endpoint infrastructure
deploy_instance_count = 1
deploy_instance_type = 'ml.m5.large'

# Setup Metrics To Track Model Performance

In [None]:
# One {'Name', 'Regex'} entry per tracked metric; each regex has a single capture group.
metrics_definitions = [
    {'Name': metric_name, 'Regex': log_pattern}
    for metric_name, log_pattern in (
        ('train:loss', 'loss: ([0-9\\.]+)'),
        ('train:accuracy', 'accuracy: ([0-9\\.]+)'),
        ('validation:loss', 'val_loss: ([0-9\\.]+)'),
        ('validation:accuracy', 'val_accuracy: ([0-9\\.]+)'),
    )
]

# Setup Estimator

In [None]:
from sagemaker.tensorflow import TensorFlow

# TensorFlow estimator that runs src/tf_bert_reviews.py with the settings defined above.
# Every value in `hyperparameters` is taken from the notebook variables of the same name.
estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                       source_dir='src',
                       role=role,
                       train_instance_count=train_instance_count, # Make sure you have at least this number of input files, or the ShardedByS3Key distribution strategy will fail the job because no data is available
                       train_instance_type=train_instance_type,
                       train_volume_size=train_volume_size,
                       py_version='py3',
                       framework_version='2.1.0',
                       hyperparameters={'epochs': epochs,
                                        'learning_rate': learning_rate,
                                        'epsilon': epsilon,
                                        'train_batch_size': train_batch_size,
                                        'validation_batch_size': validation_batch_size,
                                        'test_batch_size': test_batch_size,
                                        'train_steps_per_epoch': train_steps_per_epoch,
                                        'validation_steps': validation_steps,
                                        'test_steps': test_steps,
                                        'use_xla': use_xla,
                                        'use_amp': use_amp,
                                        'max_seq_length': max_seq_length,
                                        'freeze_bert_layer': freeze_bert_layer,
                                        'enable_sagemaker_debugger': enable_sagemaker_debugger,
                                        'enable_checkpointing': enable_checkpointing,
                                        'enable_tensorboard': enable_tensorboard,
                                        'run_validation': run_validation,
                                        'run_test': run_test,
                                        'run_sample_predictions': run_sample_predictions},
                       input_mode=input_mode,
                       metric_definitions=metrics_definitions,
#                       train_max_run=7200 # optional cap: 2 hours * 60 minutes per hour * 60 seconds per minute
                      )

# Setup Pipeline with the Step Functions SDK

A typical task for a data scientist is to train a model and deploy that model to an endpoint. Without the Step Functions SDK, this is a four-step process on SageMaker:

1. Training the model
2. Creating the model on SageMaker
3. Creating an endpoint configuration
4. Deploying the trained model to the configured endpoint

The Step Functions SDK provides the [TrainingPipeline](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/pipelines.html#stepfunctions.template.pipeline.train.TrainingPipeline) API to simplify this procedure. The following configures `pipeline` with the necessary parameters to define a training pipeline.

In [None]:
from stepfunctions.steps import TrainingStep, TransformStep, ModelStep, EndpointConfigStep, EndpointStep, Chain, Fail, Catch
from stepfunctions.workflow import Workflow
from stepfunctions.template.pipeline.common import WorkflowTemplate
from stepfunctions.template.pipeline.common import StepId

In [None]:
# Step Functions client; passed to the pipeline below as its `client` argument.
sfn = boto3.client('stepfunctions')

In [None]:
import time

# Suffix the name with the current epoch second.
timestamp = int(time.time())
pipeline_name = f'bert-pipeline-{timestamp}'

print(f'Pipeline name {pipeline_name}')

In [None]:
from __future__ import absolute_import

from sagemaker.utils import base_name_from_image
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel

from stepfunctions.steps import TrainingStep, TransformStep, ModelStep, EndpointConfigStep, EndpointStep, Chain, Fail, Catch
from stepfunctions.workflow import Workflow
from stepfunctions.template.pipeline.common import StepId, WorkflowTemplate


class TrainingPipelineWithDifferentDeployInstanceType(WorkflowTemplate):

    """
    Creates a standard training pipeline with the following steps in order:
        1. Train estimator
        2. Create estimator model
        3. Endpoint configuration
        4. Deploy model

    The endpoint configuration uses the `deploy_instance_count` and `deploy_instance_type`
    given to the constructor rather than the estimator's training instance settings.
    """

    __allowed_kwargs = ('pipeline_name',)

    def __init__(self,
                 estimator,
                 role,
                 inputs,
                 s3_bucket,
                 client,
                 deploy_instance_count,
                 deploy_instance_type,
                 **kwargs):
        """
        Args:
            estimator (sagemaker.estimator.EstimatorBase): The estimator to use for training. Can be a BYO estimator, Framework estimator or Amazon algorithm estimator.
            role (str): An AWS IAM role (either name or full Amazon Resource Name (ARN)). This role is used to create, manage, and execute the Step Functions workflows.
            inputs: Information about the training data. Please refer to the `fit()` method of the associated estimator, as this can take any of the following forms:
                * (str) - The S3 location where training data is saved.
                * (dict[str, str] or dict[str, `sagemaker.session.s3_input`]) - If using multiple channels for training data, you can specify a dict mapping channel names to strings or `sagemaker.session.s3_input` objects.
                * (`sagemaker.session.s3_input`) - Channel configuration for S3 data sources that can provide additional information about the training dataset. See `sagemaker.session.s3_input` for full details.
                * (`sagemaker.amazon.amazon_estimator.RecordSet`) - A collection of Amazon `Record` objects serialized and stored in S3. For use with an estimator for an Amazon algorithm.
                * (list[`sagemaker.amazon.amazon_estimator.RecordSet`]) - A list of `sagemaker.amazon.amazon_estimator.RecordSet` objects, where each instance is a different channel of training data.
            s3_bucket (str): S3 bucket under which the output artifacts from the training job will be stored. The parent path used is built using the format: ``s3://{s3_bucket}/{pipeline_name}/models/{job_name}/``. In this format, `pipeline_name` refers to the keyword argument provided for this pipeline. If a `pipeline_name` argument was not provided, one is auto-generated by the pipeline as `training-pipeline-<timestamp>`. Also, in the format, `job_name` refers to the job name provided when calling the :meth:`execute` method.
            client (SFN.Client): boto3 client to use for creating and interacting with the training pipeline in Step Functions.
            deploy_instance_count (int): Initial number of instances for the endpoint configuration step.
            deploy_instance_type (str): Instance type for the endpoint configuration step.
        Keyword Args:
            pipeline_name (str, optional): Name of the pipeline. This name will be used to name jobs (if not provided when calling execute()), models, endpoints, and S3 objects created by the pipeline. If a `pipeline_name` argument was not provided, one is auto-generated by the pipeline as `training-pipeline-<timestamp>`. (default:None)
        """
        self.estimator = estimator
        self.inputs = inputs

        # Keep the deployment settings on the instance so that build_workflow_definition()
        # uses the values passed to this constructor instead of same-named globals.
        self.deploy_instance_count = deploy_instance_count
        self.deploy_instance_type = deploy_instance_type

        for key in self.__class__.__allowed_kwargs:
            setattr(self, key, kwargs.pop(key, None))

        if not self.pipeline_name:
            self.__pipeline_name_unique = True
            self.pipeline_name = 'training-pipeline-{date}'.format(date=self._generate_timestamp())

        # The definition must be built after the attributes above are set, because
        # build_workflow_definition() reads them.
        self.definition = self.build_workflow_definition()
        self.input_template = self._extract_input_template(self.definition)

        workflow = Workflow(name=self.pipeline_name, definition=self.definition, role=role, format_json=True, client=client)

        super().__init__(s3_bucket=s3_bucket, workflow=workflow, role=role, client=client)

    def build_workflow_definition(self):
        """
        Build the workflow definition for the training pipeline with all the states involved.
        Returns:
            :class:`~stepfunctions.steps.states.Chain`: Workflow definition as a chain of states involved in the training pipeline.
        """
        default_name = self.pipeline_name

        train_instance_type = self.estimator.train_instance_type

        training_step = TrainingStep(
            StepId.Train.value,
            estimator=self.estimator,
            job_name=default_name + '/estimator-source',
            data=self.inputs,
        )

        model = self.estimator.create_model()
        # NOTE(review): the model step is given the training instance type, not the deploy
        # instance type - confirm this is intended.
        model_step = ModelStep(
            StepId.CreateModel.value,
            instance_type=train_instance_type,
            model=model,
            model_name=default_name
        )

        endpoint_config_step = EndpointConfigStep(
            StepId.ConfigureEndpoint.value,
            endpoint_config_name=default_name,
            model_name=default_name,
            initial_instance_count=self.deploy_instance_count,
            instance_type=self.deploy_instance_type
        )

        deploy_step = EndpointStep(
            StepId.Deploy.value,
            endpoint_name=default_name,
            endpoint_config_name=default_name,
        )

        return Chain([training_step, model_step, endpoint_config_step, deploy_step])

    def execute(self, job_name=None, hyperparameters=None):
        """
        Run the training pipeline.

        Args:
            job_name (str, optional): Name for the training job. If one is not provided, a job name will be auto-generated. (default: None)
            hyperparameters (dict, optional): Hyperparameters for the estimator training. Values are converted to strings. (default: None)

        Returns:
            :py:class:`~stepfunctions.workflow.Execution`: Running instance of the training pipeline.
        """
        # NOTE(review): .copy() is presumably a shallow dict copy, in which case the nested
        # per-step entries written below are shared with self.input_template - confirm.
        inputs = self.input_template.copy()

        if hyperparameters is not None:
            inputs[StepId.Train.value]['HyperParameters'] = {
                k: str(v) for k, v in hyperparameters.items()
            }

        if job_name is None:
            job_name = '{base_name}-{timestamp}'.format(base_name='training-pipeline', timestamp=self._generate_timestamp())

        # Configure training and model
        inputs[StepId.Train.value]['TrainingJobName'] = 'estimator-' + job_name
        inputs[StepId.Train.value]['OutputDataConfig']['S3OutputPath'] = 's3://{s3_bucket}/{pipeline_name}/models'.format(
            s3_bucket=self.s3_bucket,
            pipeline_name=self.workflow.name
        )
        inputs[StepId.CreateModel.value]['ModelName'] = job_name

        # Configure endpoint
        inputs[StepId.ConfigureEndpoint.value]['EndpointConfigName'] = job_name
        for variant in inputs[StepId.ConfigureEndpoint.value]['ProductionVariants']:
            variant['ModelName'] = job_name
        inputs[StepId.Deploy.value]['EndpointConfigName'] = job_name
        inputs[StepId.Deploy.value]['EndpointName'] = job_name

        # Configure the path to model artifact
        inputs[StepId.CreateModel.value]['PrimaryContainer']['ModelDataUrl'] = '{s3_uri}/{job}/output/model.tar.gz'.format(
            s3_uri=inputs[StepId.Train.value]['OutputDataConfig']['S3OutputPath'],
            job=inputs[StepId.Train.value]['TrainingJobName']
        )

        return self.workflow.execute(inputs=inputs, name=job_name)

In [None]:
# Channel name -> S3 input for the three datasets handed to the training step.
training_channels = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
    'test': s3_input_test_data,
}

pipeline = TrainingPipelineWithDifferentDeployInstanceType(
    estimator=estimator,
    role=stepfunction_role_arn,
    inputs=training_channels,
    s3_bucket=bucket,
    client=sfn,
    deploy_instance_count=deploy_instance_count,
    deploy_instance_type=deploy_instance_type,
)


# Visualize the pipeline

You can now view the workflow definition, and also visualize it as a graph. This workflow and graph represent your training pipeline. 

## View the workflow definition

In [None]:
print(pipeline.workflow.definition.to_json(pretty=True))

## Visualize the workflow graph
## *Note: This only renders in Jupyter. NOT in JupyterLab.*

In [None]:
pipeline.render_graph()

You should see a graph like this:

<img src="img/pipeline_created.png" width="70%" align="left">

## Create and execute the pipeline on AWS Step Functions

Create the pipeline in AWS Step Functions with [create](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.create).

In [None]:
print(type(pipeline))

In [None]:
# Sleeping to wait for role and policy creations
# Pause for 10 seconds so the role and policy created earlier are available
# before the workflow is created in Step Functions.
import time
time.sleep(10)

pipeline.create()

Run the workflow with [execute](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.execute). A link will be provided after the following cell is executed. Following this link, you can monitor your pipeline execution on Step Functions' console.

In [None]:
# Start an execution. job_name=None makes the pipeline generate a job name, and
# hyperparameters=None leaves the training step's hyperparameters as they are in the template.
execution = pipeline.execute(job_name=None,
                             hyperparameters=None)

## *Note: This only renders in Jupyter. NOT in JupyterLab.*

In [None]:
execution.render_progress()

You should see a graph like this:

<img src="img/pipeline_executed.png" width="90%" align="left">

In [None]:
import time

# Poll every 30 seconds until the execution history holds more than 5 events
# (the next code cell reads events[5]).
# The status is fetched before the event list, so when the execution is no longer
# running, a short event list cannot grow any further. Raising in that case prevents
# the loop from spinning forever after a failed, aborted or timed-out execution.
while True:
    execution_status = execution.describe()['status']
    events = execution.list_events()
    print('Number of events:  {}'.format(len(events)))

    if len(events) > 5:
        break
    if execution_status != 'RUNNING':
        raise RuntimeError(
            'Execution ended with status {} after only {} events'.format(execution_status, len(events)))

    time.sleep(30)

In [None]:
execution.render_progress()

# _Wait for ^^ Number of Events ^^ to Reach At Least 6_

In [None]:
import json

# Decode the output of the task-succeeded event once and read both values from it.
training_task_output = json.loads(events[5]['taskSucceededEventDetails']['output'])

training_job_name = training_task_output['TrainingJobName']
print(f'Training Job Name: {training_job_name}')

print('')

trained_model_s3_uri = training_task_output['ModelArtifacts']['S3ModelArtifacts']
print(f'Trained Model S3 URI: {trained_model_s3_uri}')

# Copy the Model from S3

In [None]:
!aws s3 cp $trained_model_s3_uri ./model.tar.gz

In [None]:
!tar -xvzf ./model.tar.gz

# Show the Model Prediction Signature

In [None]:
!saved_model_cli show --all --dir ./tensorflow/saved_model/0/

In [None]:
# Poll every 30 seconds until the execution history holds more than 18 events
# (a later cell reads events[18]).
# The status is fetched before the event list, so when the execution is no longer
# running, a short event list cannot grow any further. Raising in that case prevents
# the loop from spinning forever after a failed, aborted or timed-out execution.
while True:
    execution_status = execution.describe()['status']
    events = execution.list_events()
    print('Number of events:  {}'.format(len(events)))

    if len(events) > 18:
        break
    if execution_status != 'RUNNING':
        raise RuntimeError(
            'Execution ended with status {} after only {} events'.format(execution_status, len(events)))

    time.sleep(30)

In [None]:
execution.render_progress()

# _Wait for ^^ Number of Events ^^ to Reach At Least 19_

In [None]:
import json

# The parameters of the task-scheduled event are a JSON string that carries the endpoint name.
deploy_task_parameters = json.loads(events[18]['taskScheduledEventDetails']['parameters'])
step_functions_pipeline_endpoint_name = deploy_task_parameters['EndpointName']

print(f'Endpoint Name: {step_functions_pipeline_endpoint_name}')

In [None]:
# Poll every 30 seconds until the execution history holds more than 21 events
# (a later cell reads events[21]).
# The status is fetched before the event list, so when the execution is no longer
# running, a short event list cannot grow any further. Raising in that case prevents
# the loop from spinning forever after a failed, aborted or timed-out execution.
while True:
    execution_status = execution.describe()['status']
    events = execution.list_events()
    print('Number of events:  {}'.format(len(events)))

    if len(events) > 21:
        break
    if execution_status != 'RUNNING':
        raise RuntimeError(
            'Execution ended with status {} after only {} events'.format(execution_status, len(events)))

    time.sleep(30)

In [None]:
execution.render_progress()

# _Wait for ^^ Number of Events ^^ to Reach At Least 22_

In [None]:
# Decode the JSON `output` of the state-exited details on events[21] and show it.
# NOTE(review): the fixed index 21 assumes the event order of this particular pipeline - confirm.
event_details = json.loads(events[21]['stateExitedEventDetails']['output'])

print(event_details)

# Pass Variables to the Next Notebook(s)

In [None]:
print(step_functions_pipeline_endpoint_name)

In [None]:
%store step_functions_pipeline_endpoint_name

In [None]:
# Assemble the state machine ARN from region, account and pipeline name.
stepfunction_arn = f'arn:aws:states:{region}:{account_id}:stateMachine:{pipeline.pipeline_name}'
print(stepfunction_arn)

In [None]:
%store stepfunction_arn

In [None]:
# The pipeline name is also the name given to the Workflow when the pipeline was constructed.
stepfunction_name = pipeline.pipeline_name
print(stepfunction_name)

In [None]:
%store stepfunction_name

In [None]:
%%javascript
Jupyter.notebook.save_checkpoint();
Jupyter.notebook.session.delete();