# Build a BERT SageMaker Pipeline

https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py

https://github.com/aws-samples/eks-kubeflow-workshop/blob/master/notebooks/05_Kubeflow_Pipeline/05_04_Pipeline_SageMaker.ipynb

## Install AWS Python SDK (`boto3`)

In [None]:
!pip install boto3

## Install Kubeflow Pipelines SDK

In [None]:
# Install the Kubeflow Pipelines SDK from the 0.1.29 release tarball
# (--upgrade replaces any kfp version already installed in this kernel's environment).
!pip install https://storage.googleapis.com/ml-pipeline/release/0.1.29/kfp.tar.gz --upgrade

In [None]:
# # Restart the kernel to pick up pip installed libraries
# from IPython.core.display import HTML
# HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
import boto3

# Ask the instance metadata endpoint for this machine's availability zone and
# strip the trailing zone letter with sed to obtain the region name.
# NOTE(review): this only works where the 169.254.169.254 metadata endpoint is
# reachable without a session token — confirm for the target environment.
AWS_REGION_AS_SLIST=!curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/\(.*\)[a-z]/\1/'
# The `!cmd` capture returns an IPython SList; `.s` gives its content as one string.
AWS_REGION = AWS_REGION_AS_SLIST.s
print('Region: {}'.format(AWS_REGION))

# Account ID of the credentials boto3 is currently using.
AWS_ACCOUNT_ID=boto3.client('sts').get_caller_identity().get('Account')
print('Account ID: {}'.format(AWS_ACCOUNT_ID))

# Bucket name is derived from region and account: sagemaker-<region>-<account-id>.
# NOTE(review): the bucket is assumed to exist already; nothing here creates it.
S3_BUCKET='sagemaker-{}-{}'.format(AWS_REGION, AWS_ACCOUNT_ID)
print('S3 Bucket: {}'.format(S3_BUCKET))

# IAM role handed to the SageMaker pipeline steps; the role name `TeamRole` is hard-coded.
SAGEMAKER_ROLE_ARN='arn:aws:iam::{}:role/TeamRole'.format(AWS_ACCOUNT_ID)
print('SageMaker Role ARN: {}'.format(SAGEMAKER_ROLE_ARN))

# Copy Data from Public S3 to Private S3

In [None]:
# Source prefix (public bucket) that the TSV files are copied from below.
s3_public_path_tsv = 's3://amazon-reviews-pds/tsv'

In [None]:
# Destination prefix inside this account's bucket; mirrors the public key layout.
s3_private_path_tsv = 's3://{}/amazon-reviews-pds/tsv'.format(S3_BUCKET)
print(s3_private_path_tsv)

In [None]:
# Copy three review files from the public prefix to the private prefix.
# `--exclude "*"` followed by `--include <file>` restricts each recursive copy
# to exactly the named file.
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Digital_Software_v1_00.tsv.gz"
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz"
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz"


In [None]:
# Raw-data prefix used as the default `raw_input_data_s3_uri` pipeline argument.
# Same location as `s3_private_path_tsv`, with a trailing slash.
raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(S3_BUCKET)


# Build Pipeline

In [None]:
import kfp
from kfp import components
from kfp import dsl
from kfp.aws import use_aws_secret

In [None]:
# Load the SageMaker "process" component definition, pinned to a fixed
# kubeflow/pipelines commit so the component interface cannot change underneath us.
sagemaker_process_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/process/component.yaml')


In [None]:
# Load the SageMaker "train" component definition from the same pinned
# kubeflow/pipelines commit as the other SageMaker components.
sagemaker_train_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/train/component.yaml')


In [None]:
# Load the SageMaker "model" component definition from the same pinned
# kubeflow/pipelines commit as the other SageMaker components.
sagemaker_model_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/model/component.yaml')


In [None]:
# Load the SageMaker "deploy" component definition from the same pinned
# kubeflow/pipelines commit as the other SageMaker components.
sagemaker_deploy_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/deploy/component.yaml')


# Setup Pre-Processing Code 

In [None]:
# S3 location of the pre-processing script; the pipeline's processing step
# reads this global as its "code" input.
processing_code_s3_uri = 's3://{}/processing_code/preprocess-scikit-text-to-bert.py'.format(S3_BUCKET)
print(processing_code_s3_uri)

# Upload the script from the notebook's working directory to that location.
!aws s3 cp ./preprocess-scikit-text-to-bert.py $processing_code_s3_uri


# Setup Training Code

In [None]:
# Archive the contents of ./code (not the directory itself, because of `-C code .`)
# into a gzip-compressed tarball in the current directory.
!tar -cvzf sourcedir.tar.gz -C code .

In [None]:
# S3 location of the packaged training code; the pipeline passes this global
# to the training job as the `sagemaker_submit_directory` hyperparameter.
training_code_s3_uri = 's3://{}/training_code/sourcedir.tar.gz'.format(S3_BUCKET)
print(training_code_s3_uri)

# Upload the tarball built in the previous cell.
!aws s3 cp sourcedir.tar.gz $training_code_s3_uri

In [None]:
def processing_input(input_name, s3_uri, local_path, s3_data_distribution_type):
    """Build one input definition for a SageMaker processing job.

    Args:
        input_name: Value stored under "InputName".
        s3_uri: S3 location to read from ("S3Uri").
        local_path: Path the data is exposed at ("LocalPath").
        s3_data_distribution_type: Value stored under "S3DataDistributionType".

    Returns:
        A dict with "InputName" and a nested "S3Input" dict. The data type is
        always "S3Prefix" and the input mode is always "File".
    """
    s3_input = dict(
        LocalPath=local_path,
        S3Uri=s3_uri,
        S3DataType="S3Prefix",
        S3DataDistributionType=s3_data_distribution_type,
        S3InputMode="File",
    )
    return dict(InputName=input_name, S3Input=s3_input)

def processing_output(output_name, s3_uri, local_path, s3_upload_mode):
    """Build one output definition for a SageMaker processing job.

    Args:
        output_name: Value stored under "OutputName".
        s3_uri: S3 destination ("S3Uri").
        local_path: Path the output is collected from ("LocalPath").
        s3_upload_mode: Value stored under "S3UploadMode".

    Returns:
        A dict with "OutputName" and a nested "S3Output" dict.
    """
    s3_output = {}
    s3_output["LocalPath"] = local_path
    s3_output["S3Uri"] = s3_uri
    s3_output["S3UploadMode"] = s3_upload_mode

    output_definition = {"OutputName": output_name}
    output_definition["S3Output"] = s3_output
    return output_definition

In [None]:
def training_input(input_name, s3_uri, s3_data_distribution_type):
    """Build one input-channel definition for a SageMaker training job.

    Args:
        input_name: Channel name, stored under "ChannelName".
        s3_uri: S3 location of the channel data ("S3Uri").
        s3_data_distribution_type: Value stored under "S3DataDistributionType".

    Returns:
        A dict with "ChannelName" and a nested "DataSource" -> "S3DataSource"
        dict. The data type is always "S3Prefix".
    """
    s3_data_source = dict(
        S3Uri=s3_uri,
        S3DataType="S3Prefix",
        S3DataDistributionType=s3_data_distribution_type,
    )
    data_source = {"S3DataSource": s3_data_source}
    return {"ChannelName": input_name, "DataSource": data_source}

# Setup Pipeline

In [None]:
@dsl.pipeline(
    name="BERT Pipeline",
    description="BERT Pipeline",
)
def bert_pipeline(role=SAGEMAKER_ROLE_ARN,
                  bucket=S3_BUCKET,
                  region=AWS_REGION,
                  raw_input_data_s3_uri=raw_input_data_s3_uri,):
    """Define the process -> train -> create-model -> deploy SageMaker pipeline.

    Args:
        role: IAM role ARN passed to the processing, training and model steps.
        bucket: S3 bucket name used to build this pipeline's output prefixes.
        region: AWS region passed to every step and used in the image URIs.
        raw_input_data_s3_uri: S3 prefix of the raw data read by the processing step.

    The function also reads the notebook globals `processing_code_s3_uri` and
    `training_code_s3_uri`, so the cells defining them must run first.
    """
    import time
    import json

    # NOTE(review): none of the steps below attach AWS credentials (for example
    # via `kfp.aws.use_aws_secret`, which this notebook imports but never uses).
    # Presumably the cluster's own IAM permissions cover SageMaker — confirm.

    # The processing and training steps use the same TensorFlow training image.
    processing_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.1.0-cpu-py36-ubuntu18.04'.format(region)
    train_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.1.0-cpu-py36-ubuntu18.04'.format(region)
    serve_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.1.0-cpu-py36-ubuntu18.04'.format(region)

    # The timestamp is taken once, when this function body executes.
    # NOTE(review): presumably that is at compile time, so every run of the same
    # compiled package would share these S3 prefixes — confirm.
    pipeline_name = 'kubeflow-pipeline-sagemaker-{}'.format(int(time.time()))

    network_isolation=False

    ########################
    # PROCESS
    ########################

    max_seq_length=64
    train_split_percentage=0.90
    validation_split_percentage=0.05
    test_split_percentage=0.05
    balance_dataset=True
    processing_instance_count=2
    processing_instance_type='ml.c5.2xlarge'

    processed_train_data_s3_uri = 's3://{}/{}/processing/output/bert-train'.format(bucket, pipeline_name)
    processed_validation_data_s3_uri = 's3://{}/{}/processing/output/bert-validation'.format(bucket, pipeline_name)
    processed_test_data_s3_uri = 's3://{}/{}/processing/output/bert-test'.format(bucket, pipeline_name)

    process = sagemaker_process_op(
        role=role,
        region=region,
        image=processing_image,
        network_isolation=network_isolation,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        container_arguments=['--train-split-percentage', str(train_split_percentage),
                             '--validation-split-percentage', str(validation_split_percentage),
                             '--test-split-percentage', str(test_split_percentage),
                             '--max-seq-length', str(max_seq_length),
                             '--balance-dataset', str(balance_dataset)],
        # The script path matches the local_path of the "code" input below.
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocess-scikit-text-to-bert.py",
        ],
        input_config=[
            # Raw data is sharded by S3 key across the processing instances.
            processing_input(
                input_name="raw_input",
                s3_uri=str(raw_input_data_s3_uri),
                local_path="/opt/ml/processing/input/data/",
                s3_data_distribution_type="ShardedByS3Key"
            ),
            # Every instance receives a full copy of the script.
            processing_input(
                input_name="code",
                s3_uri=str(processing_code_s3_uri),
                local_path="/opt/ml/processing/input/code",
                s3_data_distribution_type="FullyReplicated"
            ),
        ],
        output_config=[
            processing_output(
                output_name="bert-train",
                s3_uri=processed_train_data_s3_uri,
                local_path="/opt/ml/processing/output/bert/train",
                s3_upload_mode="EndOfJob"
            ),
            processing_output(
                output_name="bert-validation",
                s3_uri=processed_validation_data_s3_uri,
                local_path="/opt/ml/processing/output/bert/validation",
                s3_upload_mode="EndOfJob"
            ),
            processing_output(
                output_name="bert-test",
                s3_uri=processed_test_data_s3_uri,
                local_path="/opt/ml/processing/output/bert/test",
                s3_upload_mode="EndOfJob"
            ),
        ],
    )


    ########################
    # TRAIN
    ########################

    # The channels point at the S3 prefixes the processing step writes to.
    train_channels = [
        training_input(input_name="train",
                       s3_uri=processed_train_data_s3_uri,
                       s3_data_distribution_type="ShardedByS3Key"
        ),
        training_input(input_name="validation",
                       s3_uri=processed_validation_data_s3_uri,
                       s3_data_distribution_type="ShardedByS3Key"
        ),
        training_input(input_name="test",
                       s3_uri=processed_test_data_s3_uri,
                       s3_data_distribution_type="ShardedByS3Key"
        )
    ]

    train_instance_count=1
    train_instance_type='ml.c5.9xlarge'
    # Must be defined before `hyperparameters`, which reads it for 'model_dir'.
    train_output_location = "s3://{}/{}/output".format(bucket, pipeline_name)

    epochs=3
    learning_rate=0.00001
    epsilon=0.00000001
    train_batch_size=128
    validation_batch_size=128
    test_batch_size=128
    train_steps_per_epoch=100
    validation_steps=100
    test_steps=100
    train_volume_size=1024
    use_xla=True
    use_amp=True
    freeze_bert_layer=False
    enable_sagemaker_debugger=False
    enable_checkpointing=False
    enable_tensorboard=False
    input_mode='Pipe'
    run_validation=True
    run_test=True
    run_sample_predictions=True

    # Every value is stringified before the dict is serialized to JSON.
    hyperparameters={
        'epochs': str(epochs),
        'learning_rate': str(learning_rate),
        'epsilon': str(epsilon),
        'train_batch_size': str(train_batch_size),
        'validation_batch_size': str(validation_batch_size),
        'test_batch_size': str(test_batch_size),
        'train_steps_per_epoch': str(train_steps_per_epoch),
        'validation_steps': str(validation_steps),
        'test_steps': str(test_steps),
        'use_xla': str(use_xla),
        'use_amp': str(use_amp),
        'max_seq_length': str(max_seq_length),
        'freeze_bert_layer': str(freeze_bert_layer),
        'enable_sagemaker_debugger': str(enable_sagemaker_debugger),
        'enable_checkpointing': str(enable_checkpointing),
        'enable_tensorboard': str(enable_tensorboard),
        'run_validation': str(run_validation),
        'run_test': str(run_test),
        'run_sample_predictions': str(run_sample_predictions),
        'model_dir': str(train_output_location),
        'sagemaker_program': 'tf_bert_reviews.py',
        'sagemaker_region': str(region),
        'sagemaker_submit_directory': training_code_s3_uri
    }
    hyperparameters_json = json.dumps(hyperparameters)

    # Built and printed for reference only; the `metric_definitions` argument
    # of the training op below is still commented out.
    # metric_definitions='{"val_acc": "val_accuracy: ([0-9\\\\.]+)"}',
    metrics_definitions = [
        {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
        {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
        {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
        {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
    ]
    metrics_definitions_json = json.dumps(metrics_definitions)
    print(metrics_definitions_json)

    # .after(process) is appended explicitly: training consumes none of
    # process.outputs (the S3 URIs are computed above), so no ordering would
    # otherwise be derived between the two steps.
    training = sagemaker_train_op(
        region=region,
        image=train_image,
        network_isolation=network_isolation,
        instance_type=train_instance_type,
        instance_count=train_instance_count,
        hyperparameters=hyperparameters_json,
        training_input_mode=input_mode,
        channels=train_channels,
        model_artifact_path=train_output_location,
        # metric_definitions=metrics_definitions_json,
        # TODO:  Add rules
        role=role
    ).after(process)


    ########################
    # DEPLOY
    ########################

    # .after(training) is implied because we depend on training.outputs[]
    create_model = sagemaker_model_op(
        region=region,
        model_name=training.outputs["job_name"],
        image=serve_image,
        model_artifact_url=training.outputs["model_artifact_url"],
        role=role
    )

    deploy_instance_count=1
    deploy_instance_type='ml.m5.4xlarge'

    # .after(create_model) is implied because we depend on create_model.outputs
    sagemaker_deploy_op(
        region=region,
        model_name_1=create_model.output,
        instance_type_1=deploy_instance_type,
        initial_instance_count_1=deploy_instance_count
    )


# Compile Kubeflow Pipeline

In [None]:
# Compile the `bert_pipeline` function into a pipeline package written to
# ./bert-pipeline.zip (inspected and submitted in the cells below).
kfp.compiler.Compiler().compile(bert_pipeline, 'bert-pipeline.zip')

In [None]:
!ls -al ./bert-pipeline.zip

In [None]:
# Extract the package so the generated pipeline.yaml can be inspected;
# `-o` overwrites files from a previous extraction without prompting.
!unzip -o ./bert-pipeline.zip

In [None]:
!cat pipeline.yaml

# Launch Pipeline on Kubernetes Cluster

In [None]:
# Kubeflow Pipelines client created with default connection settings.
# NOTE(review): presumably this notebook runs where the client can reach the
# Kubeflow Pipelines API without an explicit host — confirm.
client = kfp.Client()

# Experiment named 'aws' that the run below is filed under.
aws_experiment = client.create_experiment(name='aws')

# Submit the compiled package as a run named 'bert-pipeline'.
my_run = client.run_pipeline(aws_experiment.id, 
                             'bert-pipeline', 
                             'bert-pipeline.zip')

## Training

_Note:  The above training job may take 5-10 minutes.  Please be patient._

In the meantime, open the SageMaker Console to monitor the progress of your training job.

![SageMaker Training Job Console](img/sagemaker-training-job-console.png)

## Get the Name of the Deployed Prediction Endpoint
First, we need to get the endpoint name of our newly-deployed SageMaker Prediction Endpoint.

Open the AWS console, go to the SageMaker service, and find the endpoint name as shown in the following picture.

![download-pipeline](images/sm-endpoint.jpg)

# Make a Prediction

# _YOU MUST COPY/PASTE THE `ENDPOINT_NAME` BEFORE CONTINUING_
Make sure to preserve the single quotes as shown below.

In [None]:
# #################################
# #################################
# # Replace ENDPOINT_NAME with the endpoint name in the SageMaker console.
# # Surround with single quotes.
# ENDPOINT_NAME= # 'Endpoint-<your-endpoint-name>'
# #################################
# #################################

## Clean up

Go to the SageMaker console and delete the `endpoint` and the `model`.