# Create a Kubeflow Pipeline with BERT and Amazon SageMaker

# Install Dependencies

In [None]:
!pip install sagemaker==1.72.0
!pip install https://storage.googleapis.com/ml-pipeline/release/0.1.29/kfp.tar.gz --upgrade

In [None]:
# Pin the AWS CLI version; later cells in this notebook invoke it through `!aws s3 cp`.
!pip install awscli==1.18.140

# _Note: Ignore any pip install warnings or errors above._

In [None]:
# Restart the kernel to pick up pip installed libraries.
# The snippet below is JavaScript that calls the `Jupyter.notebook` object of the
# notebook front end; if your front end does not expose that object, nothing will
# happen and the kernel has to be restarted from the menu instead.
from IPython.display import HTML

HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# Setup Environment Variables

In [None]:
import boto3

aws_region_as_slist=!curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/\(.*\)[a-z]/\1/'
region = aws_region_as_slist.s
print('Region: {}'.format(region))

account_id=boto3.client('sts').get_caller_identity().get('Account')
print('Account ID: {}'.format(account_id))

bucket='sagemaker-{}-{}'.format(region, account_id)
print('S3 Bucket: {}'.format(bucket))

In [None]:
# Find the first IAM role whose name contains "SageMakerExecutionRole".
# list_roles is paginated, so walk every page instead of reading only the first response.
iam_role_pages = boto3.client("iam").get_paginator("list_roles").paginate()

role = next(
    (
        iam_role["Arn"]
        for iam_role_page in iam_role_pages
        for iam_role in iam_role_page["Roles"]
        if "SageMakerExecutionRole" in iam_role["RoleName"]
    ),
    None,
)

# Fail here with a clear message rather than with a NameError on the print below.
if role is None:
    raise RuntimeError('No IAM role with "SageMakerExecutionRole" in its name was found in this account.')
print("Role: {}".format(role))

# Copy Data from Public S3 to Private S3

In [None]:
# Public S3 prefix that the review TSV files are copied from in the cells below.
s3_public_path_tsv = "s3://amazon-reviews-pds/tsv"

In [None]:
# Destination prefix for the copy, inside the bucket computed from region and account id.
s3_private_path_tsv = "s3://{}/amazon-reviews-pds/tsv".format(bucket)
print(s3_private_path_tsv)

In [None]:
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Digital_Software_v1_00.tsv.gz"
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz"
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Gift_Card_v1_00.tsv.gz"

In [None]:
# S3 prefix of the copied TSV files; used as the default raw-data input of the pipeline.
raw_input_data_s3_uri = "s3://{}/amazon-reviews-pds/tsv/".format(bucket)

# Build Pipeline

## Using Amazon SageMaker Components for Kubeflow Pipelines: 
https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker

https://docs.aws.amazon.com/sagemaker/latest/dg/usingamazon-sagemaker-components.html

In [None]:
import kfp
from kfp import components
from kfp import dsl
from kfp.aws import use_aws_secret

In [None]:
# SageMaker Processing component definition, pinned to a specific kubeflow/pipelines commit.
_process_component_url = (
    "https://raw.githubusercontent.com/kubeflow/pipelines/"
    "3ebd075212e0a761b982880707ec497c36a99d80/"
    "components/aws/sagemaker/process/component.yaml"
)
sagemaker_process_op = components.load_component_from_url(_process_component_url)

In [None]:
# SageMaker Training component definition, pinned to a specific kubeflow/pipelines commit.
_train_component_url = (
    "https://raw.githubusercontent.com/kubeflow/pipelines/"
    "3ebd075212e0a761b982880707ec497c36a99d80/"
    "components/aws/sagemaker/train/component.yaml"
)
sagemaker_train_op = components.load_component_from_url(_train_component_url)

In [None]:
# SageMaker Create Model component definition, pinned to a specific kubeflow/pipelines commit.
_model_component_url = (
    "https://raw.githubusercontent.com/kubeflow/pipelines/"
    "3ebd075212e0a761b982880707ec497c36a99d80/"
    "components/aws/sagemaker/model/component.yaml"
)
sagemaker_model_op = components.load_component_from_url(_model_component_url)

In [None]:
# SageMaker Deploy component definition, pinned to a specific kubeflow/pipelines commit.
_deploy_component_url = (
    "https://raw.githubusercontent.com/kubeflow/pipelines/"
    "3ebd075212e0a761b982880707ec497c36a99d80/"
    "components/aws/sagemaker/deploy/component.yaml"
)
sagemaker_deploy_op = components.load_component_from_url(_deploy_component_url)

# Setup Pre-Processing Code 

In [None]:
# S3 location of the pre-processing script; the pipeline mounts it as the "code" processing input.
processing_code_s3_uri = "s3://{}/processing_code/preprocess-scikit-text-to-bert-feature-store.py".format(bucket)
print(processing_code_s3_uri)

# Upload the local script (expected in the notebook's working directory) to that location.
!aws s3 cp ./preprocess-scikit-text-to-bert-feature-store.py $processing_code_s3_uri

# Package and Upload Training Code to S3

In [None]:
# Bundle the contents of ./code into sourcedir.tar.gz; -C makes archive paths relative to ./code.
!tar -cvzf sourcedir.tar.gz -C ./code .

In [None]:
# S3 location of the training source bundle; the pipeline passes it to the training job
# as the "sagemaker_submit_directory" hyperparameter.
training_code_s3_uri = "s3://{}/training_code/sourcedir.tar.gz".format(bucket)
print(training_code_s3_uri)

# Upload the archive created in the previous cell.
!aws s3 cp sourcedir.tar.gz $training_code_s3_uri

In [None]:
def processing_input(input_name, s3_uri, local_path, s3_data_distribution_type):
    """Build one entry for the processing component's ``input_config`` list.

    Args:
        input_name: Value stored under ``InputName``.
        s3_uri: Value stored under ``S3Input.S3Uri``.
        local_path: Value stored under ``S3Input.LocalPath``.
        s3_data_distribution_type: Value stored under ``S3Input.S3DataDistributionType``.

    Returns:
        A dict with ``InputName`` and a nested ``S3Input`` dict. ``S3DataType`` is always
        ``"S3Prefix"`` and ``S3InputMode`` is always ``"File"``.
    """
    s3_input = dict(
        LocalPath=local_path,
        S3Uri=s3_uri,
        S3DataType="S3Prefix",
        S3DataDistributionType=s3_data_distribution_type,
        S3InputMode="File",
    )
    return dict(InputName=input_name, S3Input=s3_input)


def processing_output(output_name, s3_uri, local_path, s3_upload_mode):
    """Build one entry for the processing component's ``output_config`` list.

    Args:
        output_name: Value stored under ``OutputName``.
        s3_uri: Value stored under ``S3Output.S3Uri``.
        local_path: Value stored under ``S3Output.LocalPath``.
        s3_upload_mode: Value stored under ``S3Output.S3UploadMode``.

    Returns:
        A dict with ``OutputName`` and a nested ``S3Output`` dict.
    """
    s3_output = dict(LocalPath=local_path, S3Uri=s3_uri, S3UploadMode=s3_upload_mode)
    return dict(OutputName=output_name, S3Output=s3_output)

In [None]:
def training_input(input_name, s3_uri, s3_data_distribution_type):
    """Build one entry for the training component's ``channels`` list.

    Args:
        input_name: Value stored under ``ChannelName``.
        s3_uri: Value stored under ``DataSource.S3DataSource.S3Uri``.
        s3_data_distribution_type: Value stored under
            ``DataSource.S3DataSource.S3DataDistributionType``.

    Returns:
        A dict with ``ChannelName`` and a nested ``DataSource`` dict. ``S3DataType`` is
        always ``"S3Prefix"``.
    """
    s3_data_source = dict(
        S3Uri=s3_uri,
        S3DataType="S3Prefix",
        S3DataDistributionType=s3_data_distribution_type,
    )
    data_source = dict(S3DataSource=s3_data_source)
    return dict(ChannelName=input_name, DataSource=data_source)

# Setup Pipeline

In [None]:
# Kubeflow pipeline definition with four SageMaker steps:
# processing -> training -> create model -> deploy.
#
# The default argument values (role, bucket, region, raw_input_data_s3_uri) are read from
# notebook globals when this cell runs, so the cells that define them must be executed first.
# The body also reads the notebook globals processing_code_s3_uri and training_code_s3_uri,
# the helpers processing_input / processing_output / training_input, and the loaded
# sagemaker_process_op / sagemaker_train_op / sagemaker_model_op / sagemaker_deploy_op components.
@dsl.pipeline(
    name="BERT Pipeline",
    description="BERT Pipeline",
)
def bert_pipeline(role=role, bucket=bucket, region=region, raw_input_data_s3_uri=raw_input_data_s3_uri):

    import time
    import json

    # Timestamp-suffixed name; used below as the S3 prefix for processing and training outputs.
    pipeline_name = "kubeflow-pipeline-sagemaker-{}".format(int(time.time()))

    # Passed unchanged to the processing, training and model steps.
    network_isolation = False

    ########################
    # FEATURE ENGINEERING
    ########################

    # Values forwarded to the pre-processing script as command-line arguments
    # (max_seq_length is also forwarded to training as a hyperparameter).
    max_seq_length = 64
    train_split_percentage = 0.90
    validation_split_percentage = 0.05
    test_split_percentage = 0.05
    balance_dataset = True

    # One S3 output prefix per data split, all under s3://<bucket>/<pipeline_name>/processing/output/.
    processed_train_data_s3_uri = "s3://{}/{}/processing/output/bert-train".format(bucket, pipeline_name)
    processed_validation_data_s3_uri = "s3://{}/{}/processing/output/bert-validation".format(bucket, pipeline_name)
    processed_test_data_s3_uri = "s3://{}/{}/processing/output/bert-test".format(bucket, pipeline_name)

    processing_instance_type = "ml.c5.2xlarge"
    processing_instance_count = 2

    # Separate timestamp, used only to build the feature store / feature group names below.
    timestamp = int(time.time())

    feature_store_offline_prefix = "reviews-feature-store-" + str(timestamp)
    feature_group_name = "reviews-feature-group-" + str(timestamp)

    # hard-coding to avoid the wrong ECR account id with create_image_uri()
    # NOTE(review): this image URI and the AWS_DEFAULT_REGION passed to the processing step
    # are pinned to us-east-1, while `region` is a pipeline parameter — confirm before
    # running in another region.
    processing_image = "683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3"
    #    import sagemaker
    #    processing_image = sagemaker.fw_utils.create_image_uri(framework='scikit-learn',
    #                                                           framework_version='0.23-1',
    #                                                           py_version='py3',
    #                                                           instance_type='ml.c5.9xlarge',
    #                                                           region='us-east-1') # hard-coding to avoid serialization issue

    # Processing step: runs the uploaded pre-processing script over the raw TSV input and
    # writes the train / validation / test outputs to the three S3 prefixes defined above.
    process = sagemaker_process_op(
        role=role,
        region=region,
        image=processing_image,
        network_isolation=network_isolation,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        container_arguments=[
            "--train-split-percentage",
            str(train_split_percentage),
            "--validation-split-percentage",
            str(validation_split_percentage),
            "--test-split-percentage",
            str(test_split_percentage),
            "--max-seq-length",
            str(max_seq_length),
            "--balance-dataset",
            str(balance_dataset),
            "--feature-store-offline-prefix",
            str(feature_store_offline_prefix),
            "--feature-group-name",
            str(feature_group_name),
        ],
        environment={"AWS_DEFAULT_REGION": "us-east-1"},  # hard-coding to avoid serialization issue
        # The script path matches the local_path of the "code" input plus the uploaded file name.
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocess-scikit-text-to-bert-feature-store.py",
        ],
        input_config=[
            processing_input(
                input_name="raw-input-data",
                s3_uri="{}".format(raw_input_data_s3_uri),
                local_path="/opt/ml/processing/input/data/",
                s3_data_distribution_type="ShardedByS3Key",
            ),
            processing_input(
                input_name="code",
                s3_uri="{}".format(processing_code_s3_uri),
                local_path="/opt/ml/processing/input/code",
                s3_data_distribution_type="FullyReplicated",
            ),
        ],
        output_config=[
            processing_output(
                output_name="bert-train",
                s3_uri="{}".format(processed_train_data_s3_uri),
                local_path="/opt/ml/processing/output/bert/train",
                s3_upload_mode="EndOfJob",
            ),
            processing_output(
                output_name="bert-validation",
                s3_uri="{}".format(processed_validation_data_s3_uri),
                local_path="/opt/ml/processing/output/bert/validation",
                s3_upload_mode="EndOfJob",
            ),
            processing_output(
                output_name="bert-test",
                s3_uri="{}".format(processed_test_data_s3_uri),
                local_path="/opt/ml/processing/output/bert/test",
                s3_upload_mode="EndOfJob",
            ),
        ],
    )

    ########################
    # TRAIN
    ########################

    # Training channels point at the same S3 prefixes the processing step writes to.
    train_channels = [
        training_input(
            input_name="train", s3_uri=processed_train_data_s3_uri, s3_data_distribution_type="ShardedByS3Key"
        ),
        training_input(
            input_name="validation",
            s3_uri=processed_validation_data_s3_uri,
            s3_data_distribution_type="ShardedByS3Key",
        ),
        training_input(
            input_name="test", s3_uri=processed_test_data_s3_uri, s3_data_distribution_type="ShardedByS3Key"
        ),
    ]

    epochs = 1
    learning_rate = 0.00001
    epsilon = 0.00000001
    train_batch_size = 128
    validation_batch_size = 128
    test_batch_size = 128
    train_steps_per_epoch = 100
    validation_steps = 100
    test_steps = 100
    # NOTE(review): train_volume_size is defined here but is not passed to sagemaker_train_op below.
    train_volume_size = 1024
    use_xla = True
    use_amp = True
    freeze_bert_layer = False
    enable_sagemaker_debugger = False
    enable_checkpointing = False
    enable_tensorboard = False
    input_mode = "File"
    run_validation = True
    run_test = True
    run_sample_predictions = True

    train_instance_type = "ml.c5.9xlarge"
    train_instance_count = 1

    # Used both as the "model_dir" hyperparameter and as model_artifact_path of the training step.
    train_output_location = "s3://{}/{}/output".format(bucket, pipeline_name)

    # Every value is formatted to a string before the dict is serialized to JSON.
    hyperparameters = {
        "epochs": "{}".format(epochs),
        "learning_rate": "{}".format(learning_rate),
        "epsilon": "{}".format(epsilon),
        "train_batch_size": "{}".format(train_batch_size),
        "validation_batch_size": "{}".format(validation_batch_size),
        "test_batch_size": "{}".format(test_batch_size),
        "train_steps_per_epoch": "{}".format(train_steps_per_epoch),
        "validation_steps": "{}".format(validation_steps),
        "test_steps": "{}".format(test_steps),
        "use_xla": "{}".format(use_xla),
        "use_amp": "{}".format(use_amp),
        "max_seq_length": "{}".format(max_seq_length),
        "freeze_bert_layer": "{}".format(freeze_bert_layer),
        "enable_sagemaker_debugger": "{}".format(enable_sagemaker_debugger),
        "enable_checkpointing": "{}".format(enable_checkpointing),
        "enable_tensorboard": "{}".format(enable_tensorboard),
        "run_validation": "{}".format(run_validation),
        "run_test": "{}".format(run_test),
        "run_sample_predictions": "{}".format(run_sample_predictions),
        "model_dir": "{}".format(train_output_location),
        "sagemaker_program": "tf_bert_reviews.py",
        "sagemaker_region": "{}".format(region),
        "sagemaker_submit_directory": training_code_s3_uri,
    }
    hyperparameters_json = json.dumps(hyperparameters)

    # metric_definitions='{"val_acc": "val_accuracy: ([0-9\\\\.]+)"}',
    # NOTE(review): metrics_definitions_json is only printed; the metric_definitions argument
    # of the training step below is commented out.
    metrics_definitions = [
        {"Name": "train:loss", "Regex": "loss: ([0-9\\.]+)"},
        {"Name": "train:accuracy", "Regex": "accuracy: ([0-9\\.]+)"},
        {"Name": "validation:loss", "Regex": "val_loss: ([0-9\\.]+)"},
        {"Name": "validation:accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"},
    ]
    metrics_definitions_json = json.dumps(metrics_definitions)
    print(metrics_definitions_json)

    # .after(process) is explicitly appended below
    # (the training arguments are plain S3 URI strings, not outputs of `process`)
    train_image = "763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.3.1-cpu-py37-ubuntu18.04".format(region)
    training = sagemaker_train_op(
        region=region,
        image=train_image,
        network_isolation=network_isolation,
        instance_type=train_instance_type,
        instance_count=train_instance_count,
        hyperparameters=hyperparameters_json,
        training_input_mode=input_mode,
        channels=train_channels,
        model_artifact_path=train_output_location,
        # metric_definitions=metrics_definitions_json,
        # TODO:  Add rules
        role=role,
    ).after(process)

    ########################
    # DEPLOY
    ########################

    # .after(training) is implied because we depend on training.outputs[]
    # The model is named after the training job and uses the artifact URL the job reports.
    serve_image = "763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.3.1-cpu".format(region)
    create_model = sagemaker_model_op(
        region=region,
        model_name=training.outputs["job_name"],
        image=serve_image,
        network_isolation=network_isolation,
        model_artifact_url=training.outputs["model_artifact_url"],
        role=role,
    )

    deploy_instance_type = "ml.c5.9xlarge"
    deploy_instance_count = 1

    # .after(create_model) is implied because we depend on create_model.outputs
    # A single variant ("AllTraffic") is configured for the created model.
    deploy_model = sagemaker_deploy_op(
        region=region,
        variant_name_1="AllTraffic",
        model_name_1=create_model.output,
        instance_type_1=deploy_instance_type,
        initial_instance_count_1=deploy_instance_count,
    )

# Compile Kubeflow Pipeline

In [None]:
# Compile the bert_pipeline function into the package file bert-pipeline.zip.
kfp.compiler.Compiler().compile(bert_pipeline, "bert-pipeline.zip")

In [None]:
# Confirm the compiled package exists and show its size.
!ls -al ./bert-pipeline.zip

In [None]:
# Extract the package into the working directory; -o overwrites files from a previous run.
!unzip -o ./bert-pipeline.zip

In [None]:
# Pretty-print pipeline.yaml (presumably the file extracted from the zip above — confirm).
!pygmentize pipeline.yaml

# Launch Pipeline on Kubernetes Cluster

In [None]:
# No host is passed, so the client relies on kfp's default connection settings.
client = kfp.Client()

# Experiment named "kubeflow" that the run below is attached to.
experiment = client.create_experiment(name="kubeflow")

# Submit the compiled package as a run named "bert-pipeline".
my_run = client.run_pipeline(experiment.id, "bert-pipeline", "bert-pipeline.zip")

# Review Training Job

_Note:  The above training job may take 5-10 minutes.  Please be patient._

In the meantime, open the SageMaker Console to monitor the progress of your training job.

![SageMaker Training Job Console](img/sagemaker-training-job-console.png)

# Review the Endpoint
First, we need to get the endpoint name of our newly-deployed SageMaker Prediction Endpoint.

Open the AWS console, go to the SageMaker service, and find the endpoint name as shown in the following picture.

![download-pipeline](img/sm-endpoint.jpg)

# Make a Prediction

## _YOU MUST COPY/PASTE THE `endpoint_name` BEFORE CONTINUING_
Make sure to preserve the quotes as shown below.

![](img/sm-endpoint-kubeflow.png)

In [None]:
import boto3

# Runtime client used to invoke the endpoint; `region` comes from the
# "Setup Environment Variables" cell at the top of this notebook.
sm_runtime = boto3.Session(region_name=region).client("sagemaker-runtime")

# Placeholder: replace with the real endpoint name before running the next cell.
endpoint_name = "<COPY-AND-PASTE-FROM-KUBEFLOW-PIPELINE-LOGS>"

In [None]:
import json

inputs = [{"features": ["This is great!"]}, {"features": ["This is bad."]}]

# The request is declared as application/jsonlines, so send one JSON document per line
# rather than a single JSON array.
request_body = "\n".join(json.dumps(input_data) for input_data in inputs)

response = sm_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/jsonlines",
    Accept="application/jsonlines",
    Body=request_body.encode("utf-8"),
)
print("response: {}".format(response))

predicted_classes_str = response["Body"].read().decode("utf-8")

# The response is also JSON Lines: split it into lines first and parse each line on its own.
# (Parsing the whole body with json.loads fails as soon as it contains more than one line.)
predicted_classes = [line for line in predicted_classes_str.splitlines() if line.strip()]
print("predicted_classes: {}".format(predicted_classes))

# Responses are matched to inputs by position.
for predicted_class_json, input_data in zip(predicted_classes, inputs):
    predicted_class = json.loads(predicted_class_json)["predicted_label"]
    print('Predicted star_rating: {} for review_body "{}"'.format(predicted_class, input_data["features"][0]))