# Create a Training Pipeline with the Step Functions Data Science SDK

In [1]:
from botocore.exceptions import ClientError

import os
import sagemaker
import logging
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [2]:
!pip install stepfunctions==1.0.0.8

In [3]:
import stepfunctions
import logging
from stepfunctions.template.pipeline import TrainingPipeline

stepfunctions.set_stream_logger(level=logging.INFO)

# Create an IAM Execution Role for Step Functions
We need a StepFunctionsWorkflowExecutionRole so that you can create and execute workflows in Step Functions.

In [4]:
iam = boto3.Session().client(service_name='iam', region_name=region)
sts = boto3.Session().client(service_name='sts', region_name=region)

In [5]:
stepfunction_role_name = 'DSOAWS_StepFunctionsExecutionRole'

### Create an AssumeRolePolicyDocument

In [6]:
assume_role_policy_doc = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "states.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
} 

### Create `DSOAWS_StepFunctionsExecutionRole`

In [7]:
import json

try:
    iam_role_sf = iam.create_role(
        RoleName=stepfunction_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='DSOAWS Step Function Workflow Execution Role'
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Role already exists. This is OK.")
    else:
        print("Unexpected error: %s" % e)

Role already exists. This is OK.


### Get the Role ARN

In [8]:
role = iam.get_role(RoleName=stepfunction_role_name)
stepfunction_role_arn = role['Role']['Arn']
print(stepfunction_role_arn)

arn:aws:iam::806570384721:role/DSOAWS_StepFunctionsExecutionRole


# Add a Policy to the Role

## Define permissions

In [9]:
stepfunction_permissions = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "sagemaker:CreateTransformJob",
                "sagemaker:DescribeTransformJob",
                "sagemaker:StopTransformJob",
                "sagemaker:CreateTrainingJob",
                "sagemaker:DescribeTrainingJob",
                "sagemaker:StopTrainingJob",
                "sagemaker:CreateHyperParameterTuningJob",
                "sagemaker:DescribeHyperParameterTuningJob",
                "sagemaker:StopHyperParameterTuningJob",
                "sagemaker:CreateModel",
                "sagemaker:CreateEndpointConfig",
                "sagemaker:CreateEndpoint",
                "sagemaker:DeleteEndpointConfig",
                "sagemaker:DeleteEndpoint",
                "sagemaker:UpdateEndpoint",
                "sagemaker:ListTags",
                "lambda:InvokeFunction",
                "sqs:SendMessage",
                "sns:Publish",
                "ecs:RunTask",
                "ecs:StopTask",
                "ecs:DescribeTasks",
                "dynamodb:GetItem",
                "dynamodb:PutItem",
                "dynamodb:UpdateItem",
                "dynamodb:DeleteItem",
                "batch:SubmitJob",
                "batch:DescribeJobs",
                "batch:TerminateJob",
                "glue:StartJobRun",
                "glue:GetJobRun",
                "glue:GetJobRuns",
                "glue:BatchStopJobRun"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "iam:PassRole"
            ],
            "Resource": "*",
            "Condition": {
                "StringEquals": {
                    "iam:PassedToService": "sagemaker.amazonaws.com"
                }
            }
        },
        {
            "Effect": "Allow",
            "Action": [
                "events:PutTargets",
                "events:PutRule",
                "events:DescribeRule"
            ],
            "Resource": [
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule"
            ]
        }
    ]
}

## Turn into Policy Object

In [10]:
stepfunction_policy_name = 'DSOAWS_StepFunctionsWorkflowExecutionPolicy'

In [11]:
try:
    stepfunction_policy = iam.create_policy(
      PolicyName=stepfunction_policy_name,
      PolicyDocument=json.dumps(stepfunction_permissions)
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy already exists. This is OK.")
    else:
        print("Unexpected error: %s" % e)

Policy already exists. This is OK.


## Get ARN

In [12]:
account_id = sts.get_caller_identity()['Account']
stepfunction_policy_arn = f'arn:aws:iam::{account_id}:policy/{stepfunction_policy_name}'
print(stepfunction_policy_arn)

arn:aws:iam::806570384721:policy/DSOAWS_StepFunctionsWorkflowExecutionPolicy


## Attach Policy To Step Function Workflow Execution Role

In [13]:
try:
    response = iam.attach_role_policy(
        PolicyArn=stepfunction_policy_arn,
        RoleName=stepfunction_role_name
    )
    print("Done.")
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy is already attached. This is OK.")
    else:
        print("Unexpected error: %s" % e)


Done.


In [14]:
try:
    response = iam.attach_role_policy(
        PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaRole',
        RoleName=stepfunction_role_name
    )
    print("Done.")
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy is already attached. This is OK.")
    else:
        print("Unexpected error: %s" % e)

Done.


# Specify the S3 Location of the Features

In [15]:
%store -r processed_train_data_s3_uri

In [16]:
print(processed_train_data_s3_uri)

s3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-train


In [17]:
%store -r processed_validation_data_s3_uri

In [18]:
print(processed_validation_data_s3_uri)

s3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-validation


In [19]:
%store -r processed_test_data_s3_uri

In [20]:
print(processed_test_data_s3_uri)

s3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-test


In [21]:
print(processed_train_data_s3_uri)
!aws s3 ls $processed_train_data_s3_uri/

s3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-train
2020-06-06 15:26:38      17981 part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord
2020-06-06 15:26:34      19628 part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord


In [22]:
print(processed_validation_data_s3_uri)
!aws s3 ls $processed_validation_data_s3_uri/

s3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-validation
2020-06-06 15:26:38       1185 part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord
2020-06-06 15:26:34       1075 part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord


In [23]:
print(processed_test_data_s3_uri)
!aws s3 ls $processed_test_data_s3_uri/

s3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-test
2020-06-06 15:26:38       1146 part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord
2020-06-06 15:26:35       1088 part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord


In [36]:
s3_input_train_data = sagemaker.s3_input(s3_data=processed_train_data_s3_uri, distribution='ShardedByS3Key')
s3_input_validation_data = sagemaker.s3_input(s3_data=processed_validation_data_s3_uri, distribution='ShardedByS3Key')
s3_input_test_data = sagemaker.s3_input(s3_data=processed_test_data_s3_uri, distribution='ShardedByS3Key')

print(s3_input_train_data.config)
print(s3_input_validation_data.config)
print(s3_input_test_data.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-train', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-validation', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-806570384721/sagemaker-scikit-learn-2020-06-06-15-22-03-905/output/bert-test', 'S3DataDistributionType': 'ShardedByS3Key'}}}


# Show Training Script

In [25]:
!pygmentize src/tf_bert_reviews.py

[34mimport[39;49;00m [04m[36mtime[39;49;00m
[34mimport[39;49;00m [04m[36mrandom[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mfrom[39;49;00m [04m[36mglob[39;49;00m [34mimport[39;49;00m glob
[34mimport[39;49;00m [04m[36mpprint[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36msubprocess[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[37m#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])[39;49;00m
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33

# Setup Hyper-Parameters

In [26]:
%store -r max_seq_length

In [27]:
print(max_seq_length)

128


In [28]:
epochs=1
learning_rate=0.00001
epsilon=0.00000001
train_batch_size=128
validation_batch_size=128
test_batch_size=128
train_steps_per_epoch=50
validation_steps=50
test_steps=50
train_instance_count=1
train_instance_type='ml.c5.4xlarge'
train_volume_size=1024
use_xla=True
use_amp=True
freeze_bert_layer=True
enable_sagemaker_debugger=False
enable_checkpointing=False
enable_tensorboard=False
input_mode='Pipe'
run_validation=True
run_test=True
run_sample_predictions=True

# Setup Metrics To Track Model Performance

In [29]:
metrics_definitions = [
     {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
     {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
     {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
     {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
]

# Setup Estimator

In [30]:
from sagemaker.tensorflow import TensorFlow

estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                       source_dir='src',
                       role=role,
                       train_instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available
                       train_instance_type=train_instance_type,
                       train_volume_size=train_volume_size,
                       py_version='py3',
                       framework_version='2.1.0',
                       hyperparameters={'epochs': epochs,
                                        'learning_rate': learning_rate,
                                        'epsilon': epsilon,
                                        'train_batch_size': train_batch_size,
                                        'validation_batch_size': validation_batch_size,
                                        'test_batch_size': test_batch_size,                                             
                                        'train_steps_per_epoch': train_steps_per_epoch,
                                        'validation_steps': validation_steps,
                                        'test_steps': test_steps,
                                        'use_xla': use_xla,
                                        'use_amp': use_amp,                                             
                                        'max_seq_length': max_seq_length,
                                        'freeze_bert_layer': freeze_bert_layer,
                                        'enable_sagemaker_debugger': enable_sagemaker_debugger,
                                        'enable_checkpointing': enable_checkpointing,
                                        'enable_tensorboard': enable_tensorboard,                                        
                                        'run_validation': run_validation,
                                        'run_test': run_test,
                                        'run_sample_predictions': run_sample_predictions},
                       input_mode=input_mode,
                       metric_definitions=metrics_definitions,
#                       train_max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute
                      )

# Setup Pipeline with the Step Functions SDK

A typical task for a data scientist is to train a model and deploy that model to an endpoint. Without the Step Functions SDK, this is a four step process on SageMaker that includes the following.

1. Training the model
2. Creating the model on SageMaker
3. Creating an endpoint configuration
4. Deploying the trained model to the configured endpoint

The Step Functions SDK provides the [TrainingPipeline](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/pipelines.html#stepfunctions.template.pipeline.train.TrainingPipeline) API to simplify this procedure. The following configures `pipeline` with the necessary parameters to define a training pipeline.

In [63]:
pipeline = TrainingPipeline(
    estimator=estimator,
    role='stepfunction_role_arn',
    inputs={
        'train':s3_input_train_data,
        'validation':s3_input_validation_data,
        'test':s3_input_test_data        
    },
    s3_bucket=bucket
)

### Visualize the pipeline

You can now view the workflow definition, and also visualize it as a graph. This workflow and graph represent your training pipeline.

#### View the workflow definition

In [64]:
print(pipeline.workflow.definition.to_json(pretty=True))

{
    "StartAt": "Training",
    "States": {
        "Training": {
            "Resource": "arn:aws:states:::sagemaker:createTrainingJob.sync",
            "Parameters": {
                "AlgorithmSpecification.$": "$$.Execution.Input['Training'].AlgorithmSpecification",
                "OutputDataConfig.$": "$$.Execution.Input['Training'].OutputDataConfig",
                "StoppingCondition.$": "$$.Execution.Input['Training'].StoppingCondition",
                "ResourceConfig.$": "$$.Execution.Input['Training'].ResourceConfig",
                "RoleArn.$": "$$.Execution.Input['Training'].RoleArn",
                "InputDataConfig.$": "$$.Execution.Input['Training'].InputDataConfig",
                "HyperParameters.$": "$$.Execution.Input['Training'].HyperParameters",
                "TrainingJobName.$": "$$.Execution.Input['Training'].TrainingJobName",
                "DebugHookConfig.$": "$$.Execution.Input['Training'].DebugHookConfig"
            },
            "Type": "Task",
 

#### Visualize the workflow graph

In [65]:
pipeline.render_graph()

### Create and execute the pipeline on AWS Step Functions

Create the pipeline in AWS Step Functions with [create](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.create).

In [66]:
pipeline.create()

[32m[INFO] Workflow created successfully on AWS Step Functions.[0m


'arn:aws:states:us-east-1:806570384721:stateMachine:training-pipeline-2020-06-13-14-44-05'

Run the workflow with [execute](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.execute). A link will be provided after the following cell is executed. Following this link, you can monitor your pipeline execution on Step Functions' console.

In [67]:
response = pipeline.inputs
print(response)

{'train': <sagemaker.inputs.s3_input object at 0x7f38a00d7780>, 'validation': <sagemaker.inputs.s3_input object at 0x7f38a00d77b8>, 'test': <sagemaker.inputs.s3_input object at 0x7f38a00d7828>}


In [68]:
execution = pipeline.execute()

TypeError: Object of type 'datetime' is not JSON serializable

In [69]:
execution.render_progress()

NameError: name 'execution' is not defined

In [None]:
import time

events = execution.list_events()

while len(events) <= 5:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(30)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

# _Wait for ^^ Number of Events ^^ to Reach At Least 5_

In [None]:
import json

training_job_name = json.loads(events[5]['taskSucceededEventDetails']['output'])['TrainingJobName']
print('Training Job Name: {}'.format(training_job_name))

print('')

trained_model_s3_uri = json.loads(events[5]['taskSucceededEventDetails']['output'])['ModelArtifacts']['S3ModelArtifacts']
print('Trained Model S3 URI: {}'.format(trained_model_s3_uri))

# Copy the Model from S3

In [None]:
!aws s3 cp $trained_model_s3_uri ./model.tar.gz

In [None]:
!tar -xvzf ./model.tar.gz

# Show the Model Prediction Signature

In [None]:
!saved_model_cli show --all --dir ./tensorflow/saved_model/0/

In [None]:
events = execution.list_events()

while len(events) <= 18:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(30)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

# _Wait for ^^ Number of Events ^^ to Reach At Least 19_

In [None]:
import json

step_functions_pipeline_endpoint_name = json.loads(events[18]['taskScheduledEventDetails']['parameters'])['EndpointName']

print('Endpoint Name: {}'.format(step_functions_pipeline_endpoint_name))

In [None]:
events = execution.list_events()

while len(events) <= 21:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(30)
    events = execution.list_events()    

print('Number of events:  {}'.format(len(events)))

# _Wait for ^^ Number of Events ^^ to Reach At Least 22_

In [None]:
event_details = json.loads(events[21]['stateExitedEventDetails']['output'])

print(event_details)

# Pass Variables to the Next Notebooks(s)

In [None]:
print(step_functions_pipeline_endpoint_name)

In [None]:
%store step_functions_pipeline_endpoint_name

In [None]:
stepfunction_arn = 'arn:aws:states:{}:{}:stateMachine:{}'.format(region, account_id, pipeline.pipeline_name)
print(stepfunction_arn)

In [None]:
%store stepfunction_arn

In [None]:
stepfunction_name = pipeline.pipeline_name
print(stepfunction_name)

In [None]:
%store stepfunction_name