# Create a `TrainingPipeline` with the Step Functions Data Science SDK

In [None]:
import os
import sagemaker
import logging
import boto3
import sagemaker
import pandas as pd

# Create the SageMaker session and look up the default S3 bucket, the
# notebook's execution role and the current region. `role` and `bucket` are
# consumed further down by the estimator and the training pipeline.
sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# SageMaker API client bound to the region resolved above.
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [None]:
!pip install stepfunctions==1.0.0.8

In [None]:
import stepfunctions
import logging
from stepfunctions.template.pipeline import TrainingPipeline

# Configure the Step Functions SDK's stream logger at INFO level.
stepfunctions.set_stream_logger(level=logging.INFO)

# Update IAM Roles to Enable Step Functions to Trigger SageMaker Jobs

# Add Managed Policy SageMaker Notebook Execution Role

# 1. Open the Amazon [SageMaker console](https://console.aws.amazon.com/sagemaker/). 

# 2. Select **Notebook instances** and choose the name of your notebook instance

![](../img/click_notebook_instance.png)

# 3. Click on the IAM role link and navigate to the IAM Management Console.

# 4. Under **Permissions and encryption** select the role ARN to view the role on the IAM console

![](../img/update_iam.png)

# 5. Choose **Attach policies** and search for `AWSStepFunctionsFullAccess`.

![](../img/view_policies.png)

# 6. Select `AWSStepFunctionsFullAccess` and click on `Attach Policy`.

![Attach AWSStepFunctionsFullAccess Policy to Notebook Execution Role](img/attach_policies_with_stepfunctions.png)

# Create an Execution Role for Step Functions
You need a `StepFunctionsWorkflowExecutionRole` so that you can create and execute workflows in Step Functions.

# 1. Go to the [IAM console](https://console.aws.amazon.com/iam/)

# 2. Select **Roles** and then **Create role**.

![](img/create_execution_role_step_functions.png)

# 3. Under **Choose the service that will use this role** select **Step Functions**

# 4. Choose **Next: Permissions** until you can enter a **Role name**

![](img/create_execution_role_step_functions_part_2.png)

# 5. Click **Next: Tags**

![](img/create_execution_role_step_functions_part_3.png)

# 6. Click **Next: Review**

![](img/create_execution_role_step_functions_part_4.png)

# 7. Enter the name `StepFunctionsWorkflowExecutionRole` and select **Create role**

![](img/create_execution_role_step_functions_part_5.png)


# Add a Policy to the Role

# 1. Select `StepFunctionsWorkflowExecutionRole`

![](img/select_step_functions_worflow_execution_role.png)

# 2. Under the **Permissions** tab, click **Add inline policy**

![](img/add_inline_policy.png)

# 3. Add the Following JSON

![](img/create_policy_json.png)

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "sagemaker:CreateTransformJob",
                "sagemaker:DescribeTransformJob",
                "sagemaker:StopTransformJob",
                "sagemaker:CreateTrainingJob",
                "sagemaker:DescribeTrainingJob",
                "sagemaker:StopTrainingJob",
                "sagemaker:CreateHyperParameterTuningJob",
                "sagemaker:DescribeHyperParameterTuningJob",
                "sagemaker:StopHyperParameterTuningJob",
                "sagemaker:CreateModel",
                "sagemaker:CreateEndpointConfig",
                "sagemaker:CreateEndpoint",
                "sagemaker:DeleteEndpointConfig",
                "sagemaker:DeleteEndpoint",
                "sagemaker:UpdateEndpoint",
                "sagemaker:ListTags",
                "lambda:InvokeFunction",
                "sqs:SendMessage",
                "sns:Publish",
                "ecs:RunTask",
                "ecs:StopTask",
                "ecs:DescribeTasks",
                "dynamodb:GetItem",
                "dynamodb:PutItem",
                "dynamodb:UpdateItem",
                "dynamodb:DeleteItem",
                "batch:SubmitJob",
                "batch:DescribeJobs",
                "batch:TerminateJob",
                "glue:StartJobRun",
                "glue:GetJobRun",
                "glue:GetJobRuns",
                "glue:BatchStopJobRun"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "iam:PassRole"
            ],
            "Resource": "*",
            "Condition": {
                "StringEquals": {
                    "iam:PassedToService": "sagemaker.amazonaws.com"
                }
            }
        },
        {
            "Effect": "Allow",
            "Action": [
                "events:PutTargets",
                "events:PutRule",
                "events:DescribeRule"
            ],
            "Resource": [
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule"
            ]
        }
    ]
}

# 4. Name the Policy `StepFunctionsWorkflowExecutionPolicy`

![](img/create_policy.png)

# 5. Copy the **Role ARN** at the top of the **Summary**

![](img/arn.png)

# Configure the `pipeline_role` ARN

In [None]:
# Ask STS which AWS account owns the current credentials.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')

# Build the ARN of the StepFunctionsWorkflowExecutionRole in that account.
role_arn_template = 'arn:aws:iam::{}:role/StepFunctionsWorkflowExecutionRole'
pipeline_role = role_arn_template.format(account_id)
print(pipeline_role)

# Specify the S3 Location of the Features

In [None]:
%store -r processed_train_data_s3_uri

In [None]:
print(processed_train_data_s3_uri)

In [None]:
%store -r processed_validation_data_s3_uri

In [None]:
print(processed_validation_data_s3_uri)

In [None]:
%store -r processed_test_data_s3_uri

In [None]:
print(processed_test_data_s3_uri)

In [None]:
print(processed_train_data_s3_uri)
!aws s3 ls $processed_train_data_s3_uri/

In [None]:
print(processed_validation_data_s3_uri)
!aws s3 ls $processed_validation_data_s3_uri/

In [None]:
print(processed_test_data_s3_uri)
!aws s3 ls $processed_test_data_s3_uri/

In [None]:
# All three channels use the same S3 distribution strategy.
shard_distribution = 'ShardedByS3Key'

# One s3_input per data split, each pointing at its processed-data S3 prefix.
s3_input_train_data = sagemaker.s3_input(
    s3_data=processed_train_data_s3_uri,
    distribution=shard_distribution,
)
s3_input_validation_data = sagemaker.s3_input(
    s3_data=processed_validation_data_s3_uri,
    distribution=shard_distribution,
)
s3_input_test_data = sagemaker.s3_input(
    s3_data=processed_test_data_s3_uri,
    distribution=shard_distribution,
)

# Show the resulting channel configuration for train, validation and test.
for channel_input in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel_input.config)

# Show Training Script

In [None]:
!pygmentize src/tf_bert_reviews.py

# Setup Hyper-Parameters

In [None]:
%store -r max_seq_length

In [None]:
print(max_seq_length)

In [None]:
# Epoch count and learning-rate related values.
epochs = 2
learning_rate = 1e-5
epsilon = 1e-8

# Batch sizes and step counts for the train / validation / test splits.
train_batch_size = 128
validation_batch_size = 128
test_batch_size = 128
train_steps_per_epoch = 2000
validation_steps = 2000
test_steps = 2000

# Training instance settings.
train_instance_count = 1
train_instance_type = 'ml.p3.2xlarge'
train_volume_size = 1024

# Boolean switches.
use_xla = True
use_amp = True
freeze_bert_layer = False
# NOTE(review): not referenced by the estimator cell visible in this notebook --
# confirm whether it is meant to be forwarded as a hyperparameter.
enable_sagemaker_debugger = True

# Input mode handed to the estimator.
input_mode = 'Pipe'

# Which optional phases to run.
run_validation = True
run_test = True
run_sample_predictions = True

# Setup Metrics To Track Model Performance

In [None]:
# Name/regex pairs handed to the estimator as `metric_definitions`; each regex
# captures one numeric value from the training log output.
#
# The train patterns start with a `\b` word boundary so they do not also match
# inside `val_loss:` / `val_accuracy:`. `_` is a word character, so there is no
# boundary between `val_` and `loss`; without the anchor, validation values
# would additionally be captured under the train metric names.
metrics_definitions = [
     {'Name': 'train:loss', 'Regex': '\\bloss: ([0-9\\.]+)'},
     {'Name': 'train:accuracy', 'Regex': '\\baccuracy: ([0-9\\.]+)'},
     {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
     {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
]

# Setup Estimator

In [None]:
from sagemaker.tensorflow import TensorFlow

# Hyper-parameters passed to the estimator for the entry-point script.
training_hyperparameters = {
    'epochs': epochs,
    'learning_rate': learning_rate,
    'epsilon': epsilon,
    'train_batch_size': train_batch_size,
    'validation_batch_size': validation_batch_size,
    'test_batch_size': test_batch_size,
    'train_steps_per_epoch': train_steps_per_epoch,
    'validation_steps': validation_steps,
    'test_steps': test_steps,
    'use_xla': use_xla,
    'use_amp': use_amp,
    'max_seq_length': max_seq_length,
    'freeze_bert_layer': freeze_bert_layer,
    'run_validation': run_validation,
    'run_test': run_test,
    'run_sample_predictions': run_sample_predictions,
}

# TensorFlow estimator that runs src/tf_bert_reviews.py.
# Make sure you have at least `train_instance_count` input files, or the
# ShardedByS3Key distribution strategy will fail the job due to no data
# being available.
estimator = TensorFlow(
    entry_point='tf_bert_reviews.py',
    source_dir='src',
    role=role,
    train_instance_count=train_instance_count,
    train_instance_type=train_instance_type,
    train_volume_size=train_volume_size,
    py_version='py3',
    framework_version='2.1.0',
    hyperparameters=training_hyperparameters,
    input_mode=input_mode,
    metric_definitions=metrics_definitions,
    train_max_run=7200,  # 2 hours * 60 minutes per hour * 60 seconds per minute
)

# Setup Pipeline with the Step Functions SDK

A typical task for a data scientist is to train a model and deploy that model to an endpoint. Without the Step Functions SDK, this is a four step process on SageMaker that includes the following.

1. Training the model
2. Creating the model on SageMaker
3. Creating an endpoint configuration
4. Deploying the trained model to the configured endpoint

The Step Functions SDK provides the [TrainingPipeline](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/pipelines.html#stepfunctions.template.pipeline.train.TrainingPipeline) API to simplify this procedure. The following configures `pipeline` with the necessary parameters to define a training pipeline.

In [None]:
# Input channels for the pipeline, keyed by channel name.
pipeline_inputs = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
    'test': s3_input_test_data,
}

# Training pipeline built from the estimator, run under the Step Functions
# workflow execution role and using the session's default bucket.
pipeline = TrainingPipeline(
    estimator=estimator,
    role=pipeline_role,
    inputs=pipeline_inputs,
    s3_bucket=bucket,
)

### Visualize the pipeline

You can now view the workflow definition, and also visualize it as a graph. This workflow and graph represent your training pipeline.

#### View the workflow definition

In [None]:
print(pipeline.workflow.definition.to_json(pretty=True))

#### Visualize the workflow graph

In [None]:
pipeline.render_graph()

### Create and execute the pipeline on AWS Step Functions

Create the pipeline in AWS Step Functions with [create](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.create).

In [None]:
pipeline.create()

Run the workflow with [execute](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.execute). A link will be provided after the following cell is executed. Following this link, you can monitor your pipeline execution on Step Functions' console.

In [None]:
execution = pipeline.execute()

In [None]:
execution.render_progress()

In [None]:
import time

# Poll the execution history until more than 5 events exist; a later cell
# reads events[5].
events = execution.list_events()

while len(events) <= 5:
    print('Number of events:  {}'.format(len(events)))
    # Stop polling once the execution is no longer running; otherwise a failed
    # or aborted execution would keep this loop spinning forever.
    execution_status = execution.describe()['status']
    if execution_status != 'RUNNING':
        # Pick up events recorded between the last listing and the status check.
        events = execution.list_events()
        if len(events) <= 5:
            raise RuntimeError(
                'Execution ended with status {} after only {} events'.format(
                    execution_status, len(events)))
        break
    time.sleep(30)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

# _Wait for ^^ Number of Events ^^ to Reach At Least 6_

In [None]:
import json

# Parse the output payload of the task-succeeded event once and read both
# values from it.
training_task_output = json.loads(events[5]['taskSucceededEventDetails']['output'])

training_job_name = training_task_output['TrainingJobName']
print('Training Job Name: {}'.format(training_job_name))

print('')

trained_model_s3_uri = training_task_output['ModelArtifacts']['S3ModelArtifacts']
print('Trained Model S3 URI: {}'.format(trained_model_s3_uri))

# Copy the Model from S3

In [None]:
!aws s3 cp $trained_model_s3_uri ./model.tar.gz

In [None]:
!tar -xvzf ./model.tar.gz

# Show the Model Prediction Signature

In [None]:
!saved_model_cli show --all --dir ./tensorflow/saved_model/0/

In [None]:
# Poll the execution history until more than 18 events exist; a later cell
# reads events[18].
events = execution.list_events()

while len(events) <= 18:
    print('Number of events:  {}'.format(len(events)))
    # Stop polling once the execution is no longer running; otherwise a failed
    # or aborted execution would keep this loop spinning forever.
    execution_status = execution.describe()['status']
    if execution_status != 'RUNNING':
        # Pick up events recorded between the last listing and the status check.
        events = execution.list_events()
        if len(events) <= 18:
            raise RuntimeError(
                'Execution ended with status {} after only {} events'.format(
                    execution_status, len(events)))
        break
    time.sleep(30)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

# _Wait for ^^ Number of Events ^^ to Reach At Least 19_

In [None]:
import json

# Decode the parameters of the task-scheduled event and pull out the endpoint name.
scheduled_task_parameters = json.loads(events[18]['taskScheduledEventDetails']['parameters'])
step_functions_pipeline_endpoint_name = scheduled_task_parameters['EndpointName']

print('Endpoint Name: {}'.format(step_functions_pipeline_endpoint_name))

In [None]:
# Poll the execution history until more than 21 events exist; a later cell
# reads events[21].
events = execution.list_events()

while len(events) <= 21:
    print('Number of events:  {}'.format(len(events)))
    # Stop polling once the execution is no longer running; otherwise a failed
    # or aborted execution would keep this loop spinning forever.
    execution_status = execution.describe()['status']
    if execution_status != 'RUNNING':
        # Pick up events recorded between the last listing and the status check.
        events = execution.list_events()
        if len(events) <= 21:
            raise RuntimeError(
                'Execution ended with status {} after only {} events'.format(
                    execution_status, len(events)))
        break
    time.sleep(30)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

# _Wait for ^^ Number of Events ^^ to Reach At Least 22_

In [None]:
# Decode the JSON output attached to the state-exited event and display it.
state_exit_output = events[21]['stateExitedEventDetails']['output']
event_details = json.loads(state_exit_output)

print(event_details)

# Pass Variables to the Next Notebook(s)

In [None]:
print(step_functions_pipeline_endpoint_name)

In [None]:
%store step_functions_pipeline_endpoint_name