# Create a `TrainingPipeline` with the Step Functions Data Science SDK

In [1]:
import os
import sagemaker
import logging
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [2]:
!pip install stepfunctions==1.0.0.8

Collecting stepfunctions==1.0.0.8
  Downloading stepfunctions-1.0.0.8.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 8.1 MB/s  eta 0:00:01
Building wheels for collected packages: stepfunctions
  Building wheel for stepfunctions (setup.py) ... [?25ldone
[?25h  Created wheel for stepfunctions: filename=stepfunctions-1.0.0.8-py2.py3-none-any.whl size=69532 sha256=e6480f49062dccb7d80920b460cf31792a701824704eb59f8c6379c3d3b3dd31
  Stored in directory: /home/ec2-user/.cache/pip/wheels/66/0b/2e/d91e6d0948ef2c46b1b7df28ae2fff3085953f01408eceff35
Successfully built stepfunctions
Installing collected packages: stepfunctions
Successfully installed stepfunctions-1.0.0.8


In [3]:
import stepfunctions
import logging

from stepfunctions.template.pipeline import TrainingPipeline
stepfunctions.set_stream_logger(level=logging.INFO)

# Updte IAM Roles to Enable Step Functions to Trigger SageMaker Jobs

## Add Managed Policy SageMaker Notebook Execution Role

1. Open the Amazon [SageMaker console](https://console.aws.amazon.com/sagemaker/). 
2. Select **Notebook instances** and choose the name of your notebook instance
3. Under **Permissions and encryption** select the role ARN to view the role on the IAM console
4. Choose **Attach policies** and search for `AWSStepFunctionsFullAccess`.
5. Select the check box next to `AWSStepFunctionsFullAccess` and choose **Attach policy**

![Attach AWSStepFunctionsFullAccess Policy to Notebook Execution Role](img/attach_policies_with_stepfunctions.png)

## Create an Execution Role for Step Functions

You need a StepFunctionsWorkflowExecutionRole so that you can create and execute workflows in Step Functions.

1. Go to the [IAM console](https://console.aws.amazon.com/iam/)
2. Select **Roles** and then **Create role**.
3. Under **Choose the service that will use this role** select **Step Functions**
4. Choose **Next** until you can enter a **Role name**
5. Enter a name such as `StepFunctionsWorkflowExecutionRole` and then select **Create role**


Attach a policy to the role you created. The following steps attach a policy that provides full access to Step Functions, however as a good practice you should only provide access to the resources you need.  

1. Under the **Permissions** tab, click **Add inline policy**
2. Enter the following in the **JSON** tab

```json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "sagemaker:CreateTransformJob",
                "sagemaker:DescribeTransformJob",
                "sagemaker:StopTransformJob",
                "sagemaker:CreateTrainingJob",
                "sagemaker:DescribeTrainingJob",
                "sagemaker:StopTrainingJob",
                "sagemaker:CreateHyperParameterTuningJob",
                "sagemaker:DescribeHyperParameterTuningJob",
                "sagemaker:StopHyperParameterTuningJob",
                "sagemaker:CreateModel",
                "sagemaker:CreateEndpointConfig",
                "sagemaker:CreateEndpoint",
                "sagemaker:DeleteEndpointConfig",
                "sagemaker:DeleteEndpoint",
                "sagemaker:UpdateEndpoint",
                "sagemaker:ListTags",
                "lambda:InvokeFunction",
                "sqs:SendMessage",
                "sns:Publish",
                "ecs:RunTask",
                "ecs:StopTask",
                "ecs:DescribeTasks",
                "dynamodb:GetItem",
                "dynamodb:PutItem",
                "dynamodb:UpdateItem",
                "dynamodb:DeleteItem",
                "batch:SubmitJob",
                "batch:DescribeJobs",
                "batch:TerminateJob",
                "glue:StartJobRun",
                "glue:GetJobRun",
                "glue:GetJobRuns",
                "glue:BatchStopJobRun"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "iam:PassRole"
            ],
            "Resource": "*",
            "Condition": {
                "StringEquals": {
                    "iam:PassedToService": "sagemaker.amazonaws.com"
                }
            }
        },
        {
            "Effect": "Allow",
            "Action": [
                "events:PutTargets",
                "events:PutRule",
                "events:DescribeRule"
            ],
            "Resource": [
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule"
            ]
        }
    ]
}
```

3. Choose **Review policy** and give the policy a name such as `StepFunctionsWorkflowExecutionPolicy`
4. Choose **Create policy**. You will be redirected to the details page for the role.
5. Copy the **Role ARN** at the top of the **Summary**

In [None]:
stepfunctions_role = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "sagemaker:CreateTransformJob",
                "sagemaker:DescribeTransformJob",
                "sagemaker:StopTransformJob",
                "sagemaker:CreateTrainingJob",
                "sagemaker:DescribeTrainingJob",
                "sagemaker:StopTrainingJob",
                "sagemaker:CreateHyperParameterTuningJob",
                "sagemaker:DescribeHyperParameterTuningJob",
                "sagemaker:StopHyperParameterTuningJob",
                "sagemaker:CreateModel",
                "sagemaker:CreateEndpointConfig",
                "sagemaker:CreateEndpoint",
                "sagemaker:DeleteEndpointConfig",
                "sagemaker:DeleteEndpoint",
                "sagemaker:UpdateEndpoint",
                "sagemaker:ListTags",
                "lambda:InvokeFunction",
                "sqs:SendMessage",
                "sns:Publish",
                "ecs:RunTask",
                "ecs:StopTask",
                "ecs:DescribeTasks",
                "dynamodb:GetItem",
                "dynamodb:PutItem",
                "dynamodb:UpdateItem",
                "dynamodb:DeleteItem",
                "batch:SubmitJob",
                "batch:DescribeJobs",
                "batch:TerminateJob",
                "glue:StartJobRun",
                "glue:GetJobRun",
                "glue:GetJobRuns",
                "glue:BatchStopJobRun"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "iam:PassRole"
            ],
            "Resource": "*",
            "Condition": {
                "StringEquals": {
                    "iam:PassedToService": "sagemaker.amazonaws.com"
                }
            }
        },
        {
            "Effect": "Allow",
            "Action": [
                "events:PutTargets",
                "events:PutRule",
                "events:DescribeRule"
            ],
            "Resource": [
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule"
            ]
        }
    ]
}

## Training Data

In [4]:
%store -r scikit_processing_job_name

In [5]:
print(scikit_processing_job_name)

sagemaker-scikit-learn-2020-05-08-19-54-41-801


In [6]:
print('Previous Scikit Processing Job Name: {}'.format(scikit_processing_job_name))

Previous Scikit Processing Job Name: sagemaker-scikit-learn-2020-05-08-19-54-41-801


In [7]:
prefix_train = '{}/output/bert-train'.format(scikit_processing_job_name)
prefix_validation = '{}/output/bert-validation'.format(scikit_processing_job_name)
prefix_test = '{}/output/bert-test'.format(scikit_processing_job_name)

train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)
validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)
test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)

In [8]:
print(train_s3_uri)
!aws s3 ls $train_s3_uri/

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-05-08-19-54-41-801/output/bert-train
2020-05-08 20:03:54     893792 part-algo-1-amazon_reviews_us_Apparel_v1_00.tfrecord
2020-05-08 20:03:54     810463 part-algo-1-amazon_reviews_us_Books_v1_01.tfrecord
2020-05-08 20:03:54      61503 part-algo-1-amazon_reviews_us_Digital_Music_Purchase_v1_00.tfrecord
2020-05-08 20:03:54     109623 part-algo-1-amazon_reviews_us_Furniture_v1_00.tfrecord
2020-05-08 20:03:54     308128 part-algo-1-amazon_reviews_us_Home_Improvement_v1_00.tfrecord
2020-05-08 20:03:54      45135 part-algo-1-amazon_reviews_us_Luggage_v1_00.tfrecord
2020-05-08 20:03:54     100468 part-algo-1-amazon_reviews_us_Musical_Instruments_v1_00.tfrecord
2020-05-08 20:03:54     378059 part-algo-1-amazon_reviews_us_Pet_Products_v1_00.tfrecord
2020-05-08 20:03:54     572785 part-algo-1-amazon_reviews_us_Toys_v1_00.tfrecord
2020-05-08 20:03:54    1470542 part-algo-1-amazon_reviews_us_Wireless_v1_00.tfrecord
2020-05-08 20:02:4

In [9]:
s3_input_train_data = sagemaker.s3_input(s3_data=train_s3_uri, distribution='ShardedByS3Key') 
s3_input_validation_data = sagemaker.s3_input(s3_data=validation_s3_uri, distribution='ShardedByS3Key')
s3_input_test_data = sagemaker.s3_input(s3_data=test_s3_uri, distribution='ShardedByS3Key')

print(s3_input_train_data.config)
print(s3_input_validation_data.config)
print(s3_input_test_data.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-05-08-19-54-41-801/output/bert-train', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-05-08-19-54-41-801/output/bert-validation', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-05-08-19-54-41-801/output/bert-test', 'S3DataDistributionType': 'ShardedByS3Key'}}}


In [21]:
!pygmentize src/tf_bert_reviews.py

[34mimport[39;49;00m [04m[36mtime[39;49;00m
[34mimport[39;49;00m [04m[36mrandom[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mfrom[39;49;00m [04m[36mglob[39;49;00m [34mimport[39;49;00m glob
[34mimport[39;49;00m [04m[36mpprint[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36msubprocess[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[37m#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])[39;49;00m
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[3

    parser.add_argument([33m'[39;49;00m[33m--run_sample_predictions[39;49;00m[33m'[39;49;00m,
                        [36mtype[39;49;00m=[36meval[39;49;00m,
                        default=[36mFalse[39;49;00m)
    
    args, _ = parser.parse_known_args()
    [34mprint[39;49;00m([33m"[39;49;00m[33mArgs:[39;49;00m[33m"[39;49;00m) 
    [34mprint[39;49;00m(args)
    
    env_var = os.environ 
    [34mprint[39;49;00m([33m"[39;49;00m[33mEnvironment Variables:[39;49;00m[33m"[39;49;00m) 
    pprint.pprint([36mdict[39;49;00m(env_var), width = [34m1[39;49;00m) 

    train_data = args.train_data
    [34mprint[39;49;00m([33m'[39;49;00m[33mtrain_data {}[39;49;00m[33m'[39;49;00m.format(train_data))
    validation_data = args.validation_data
    [34mprint[39;49;00m([33m'[39;49;00m[33mvalidation_data {}[39;49;00m[33m'[39;49;00m.format(validation_data))
    test_data = args.test_data
    [34mprint[39;49;00m([33m'[39;49;00m[33mtest_

# Setup Hyper-Parameters

In [11]:
epochs=1
learning_rate=0.00001
epsilon=0.00000001
train_batch_size=128
validation_batch_size=128
test_batch_size=128
train_steps_per_epoch=100
validation_steps=100
test_steps=100
train_instance_count=1
train_instance_type='ml.p3.2xlarge'
train_volume_size=1024
use_xla=True
use_amp=True
max_seq_length=128
freeze_bert_layer=True
input_mode='Pipe'
run_validation=True
run_test=True
run_sample_predictions=True

# Setup Metrics

In [12]:
metrics_definitions = [
     {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
     {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
     {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
     {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
]

# Setup Estimator

In [13]:
from sagemaker.tensorflow import TensorFlow

estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                       source_dir='src',
                       role=role,
                       train_instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available
                       train_instance_type=train_instance_type,
                       train_volume_size=train_volume_size,
                       py_version='py3',
                       framework_version='2.1.0',
                       hyperparameters={'epochs': epochs,
                                        'learning_rate': learning_rate,
                                        'epsilon': epsilon,
                                        'train_batch_size': train_batch_size,
                                        'validation_batch_size': validation_batch_size,
                                        'test_batch_size': test_batch_size,                                             
                                        'train_steps_per_epoch': train_steps_per_epoch,
                                        'validation_steps': validation_steps,
                                        'test_steps': test_steps,
                                        'use_xla': use_xla,
                                        'use_amp': use_amp,                                             
                                        'max_seq_length': max_seq_length,
                                        'freeze_bert_layer': freeze_bert_layer,
                                        'run_validation': run_validation,
                                        'run_test': run_test,
                                        'run_sample_predictions': run_sample_predictions},
                       input_mode=input_mode,
                       metric_definitions=metrics_definitions,
                       train_max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute
                      )

# Setup Pipeline with the Step Functions SDK

A typical task for a data scientist is to train a model and deploy that model to an endpoint. Without the Step Functions SDK, this is a four step process on SageMaker that includes the following.

1. Training the model
2. Creating the model on SageMaker
3. Creating an endpoint configuration
4. Deploying the trained model to the configured endpoint

The Step Functions SDK provides the [TrainingPipeline](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/pipelines.html#stepfunctions.template.pipeline.train.TrainingPipeline) API to simplify this procedure. The following configures `pipeline` with the necessary parameters to define a training pipeline.

In [14]:
print(train_s3_uri)

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-05-08-19-54-41-801/output/bert-train


In [15]:
# TODO:  Paste the StepFunctionsWorkflowExecutionRole ARN from above

In [16]:
workflow_execution_role='arn:aws:iam::XXX:role/StepFunctionsWorkflowExecutionRole' 

In [17]:
pipeline = TrainingPipeline(
    estimator=estimator,
    role=workflow_execution_role,
    inputs={'train': s3_input_train_data, 
            'validation': s3_input_validation_data,
            'test': s3_input_test_data
    },
    s3_bucket=bucket
)

### Visualize the pipeline

You can now view the workflow definition, and also visualize it as a graph. This workflow and graph represent your training pipeline.

#### View the workflow definition

In [18]:
print(pipeline.workflow.definition.to_json(pretty=True))

{
    "StartAt": "Training",
    "States": {
        "Training": {
            "Resource": "arn:aws:states:::sagemaker:createTrainingJob.sync",
            "Parameters": {
                "AlgorithmSpecification.$": "$$.Execution.Input['Training'].AlgorithmSpecification",
                "OutputDataConfig.$": "$$.Execution.Input['Training'].OutputDataConfig",
                "StoppingCondition.$": "$$.Execution.Input['Training'].StoppingCondition",
                "ResourceConfig.$": "$$.Execution.Input['Training'].ResourceConfig",
                "RoleArn.$": "$$.Execution.Input['Training'].RoleArn",
                "InputDataConfig.$": "$$.Execution.Input['Training'].InputDataConfig",
                "HyperParameters.$": "$$.Execution.Input['Training'].HyperParameters",
                "TrainingJobName.$": "$$.Execution.Input['Training'].TrainingJobName",
                "DebugHookConfig.$": "$$.Execution.Input['Training'].DebugHookConfig"
            },
            "Type": "Task",
 

#### Visualize the workflow graph

In [19]:
pipeline.render_graph()

### Create and execute the pipeline on AWS Step Functions

Create the pipeline in AWS Step Functions with [create](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.create).

In [20]:
pipeline.create()

InvalidArn: An error occurred (InvalidArn) when calling the CreateStateMachine operation: Invalid Role Arn: 'arn:aws:iam::XXX:role/StepFunctionsWorkflowExecutionRole'

Run the workflow with [execute](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.execute). A link will be provided after the following cell is executed. Following this link, you can monitor your pipeline execution on Step Functions' console.

In [None]:
execution = pipeline.execute()

In [None]:
execution.render_progress()

In [None]:
import time

events = execution.list_events()

while len(events) < 5:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(10)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

# Wait for Number of Events to Reach at Least 5 ^^ Above ^^

In [None]:
import json

training_job_name = json.loads(events[5]['taskSucceededEventDetails']['output'])['TrainingJobName']
print('Training Job Name: {}'.format(training_job_name))

print('')

trained_model_s3_uri = json.loads(events[5]['taskSucceededEventDetails']['output'])['ModelArtifacts']['S3ModelArtifacts']
print('Trained Model S3 URI: {}'.format(trained_model_s3_uri))

# TODO:  Copy the Inference Model
TODO:  Update the S3 location to the deploymed model (not trained model)

In [None]:
!rm model.tar.gz

In [None]:
!aws s3 cp s3://$bucket/training-pipeline-2020-04-30-03-15-17/models/estimator-training-pipeline-2020-04-30-03-15-22/output/model.tar.gz ./model.tar.gz
    

In [None]:
!tar -xvzf ./model.tar.gz

In [None]:
!saved_model_cli show --all --dir ./tensorflow/saved_model/0/

In [None]:
events = execution.list_events()

while len(events) < 18:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(10)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

# Wait for Number of Events to Reach at Least 18 ^^ Above ^^

In [None]:
import json

step_functions_pipeline_endpoint_name = json.loads(events[18]['taskScheduledEventDetails']['parameters'])['EndpointName']

print('Endpoint Name: {}'.format(step_functions_pipeline_endpoint_name))

In [None]:
events = execution.list_events()

while len(events) < 21:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(10)
    events = execution.list_events()    

print('Number of events:  {}'.format(len(events)))

# Wait for Number of Events to Reach at Least 21 ^^ Above ^^

In [None]:
event_details = json.loads(events[21]['stateExitedEventDetails']['output'])

print(event_details)

In [None]:
print(step_functions_pipeline_endpoint_name)

In [None]:
%store step_functions_pipeline_endpoint_name