Based on these posts: 
* https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0
* https://github.com/keisukeirie/Amazon_review_helpfulness_prediction
* https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/
* https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22

In [None]:
# Install the notebook's dependencies: boto3, a pinned XGBoost release, and the
# `stepfunctions` package (Step Functions SDK used for the pipeline below).
!pip install -q boto3
!pip install -q xgboost==0.90
# Upgrade/reinstall `stepfunctions` from scratch, bypassing the pip cache.
!pip install --upgrade --ignore-installed --no-cache stepfunctions

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, and the execution role used by later cells.
sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
# Region configured on the default boto3 session.
region = boto3.Session().region_name

# Low-level SageMaker API client bound to that region.
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [None]:
# Key prefixes (inside the default bucket) of the balanced, header-less TF-IDF CSV splits.
prefix_train = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/train'
prefix_validation = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/validation'
prefix_test = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/test'

# Each split is stored as a single data.csv object under its prefix.
balanced_tfidf_without_header_train_s3_uri = f's3://{bucket}/{prefix_train}/data.csv'
balanced_tfidf_without_header_validation_s3_uri = f's3://{bucket}/{prefix_validation}/data.csv'
balanced_tfidf_without_header_test_s3_uri = f's3://{bucket}/{prefix_test}/data.csv'

# Wrap every URI as a CSV input channel.
s3_input_train_data = sagemaker.s3_input(
    s3_data=balanced_tfidf_without_header_train_s3_uri,
    content_type='text/csv',
)
s3_input_validation_data = sagemaker.s3_input(
    s3_data=balanced_tfidf_without_header_validation_s3_uri,
    content_type='text/csv',
)
s3_input_test_data = sagemaker.s3_input(
    s3_data=balanced_tfidf_without_header_test_s3_uri,
    content_type='text/csv',
)

# Show the resulting channel configurations, one per line.
print(
    s3_input_train_data.config,
    s3_input_validation_data.config,
    s3_input_test_data.config,
    sep='\n',
)

In [None]:
# Print the training script (src/xgboost_reviews.py) for reference.
!cat src/xgboost_reviews.py

In [None]:
from sagemaker.xgboost import XGBoost

# S3 location that receives the artifacts of script-mode training runs.
# TODO: investigate the doubled `s3://s3://` prefix that shows up when this path is used
# with pipelines (it does not seem to occur in plain script mode).
model_output_path = f's3://{bucket}/models/amazon-reviews/script-mode/training-runs'

# Script-mode XGBoost estimator that runs src/xgboost_reviews.py on one ml.m4.xlarge instance.
xgb_estimator = XGBoost(
    entry_point='xgboost_reviews.py',
    source_dir='src/',
    role=role,
    framework_version='0.90-2',
    py_version='py3',
    train_instance_count=1,
    # train_instance_type='local',  # alternative kept for reference
    train_instance_type='ml.m4.xlarge',
    output_path=model_output_path,
    hyperparameters=dict(objective='binary:logistic', num_round=1, max_depth=5),
    enable_cloudwatch_metrics=True,
)

### Build a training pipeline with the Step Functions SDK

A typical task for a data scientist is to train a model and deploy it to an endpoint. Without the Step Functions SDK, this is a four-step process on SageMaker:

1. Training the model
2. Creating the model on SageMaker
3. Creating an endpoint configuration
4. Deploying the trained model to the configured endpoint

The Step Functions SDK provides the [TrainingPipeline](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/pipelines.html#stepfunctions.template.pipeline.train.TrainingPipeline) API to simplify this procedure. The following configures `pipeline` with the necessary parameters to define a training pipeline.


In [None]:
# paste the StepFunctionsWorkflowExecutionRole ARN from above
# NOTE(review): this ARN embeds a specific AWS account ID; replace it with the role ARN
# from your own account before running the pipeline.
workflow_execution_role = "arn:aws:iam::835319576252:role/StepFunctionsWorkflowExecutionRole"
#workflow_execution_role = "XXXX"

In [None]:
from stepfunctions.template.pipeline import TrainingPipeline

# Build the train -> create model -> configure endpoint -> deploy workflow.
# `s3_bucket` expects a bare bucket name: the SDK derives the artifact location as
# s3://{s3_bucket}/{pipeline_name}/models/{job_name}/. Passing the full
# `model_output_path` URI here is what produced the doubled `s3://s3://` prefix.
pipeline = TrainingPipeline(
    estimator=xgb_estimator,
    role=workflow_execution_role,
    inputs={'train': s3_input_train_data,
            'validation': s3_input_validation_data},
    s3_bucket=bucket)

## Visualize the pipeline
You can now view the workflow definition, and also visualize it as a graph. This workflow and graph represent your training pipeline.


### View the pipeline definition

In [None]:
# Dump the workflow definition as pretty-printed JSON.
print(pipeline.workflow.definition.to_json(pretty=True))

### Visualize the pipeline graph

In [None]:
# Draw the pipeline's workflow as a graph.
pipeline.render_graph()

### Create and execute the pipeline on AWS Step Functions

Create the pipeline in AWS Step Functions with [create](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.create).

In [None]:
# Create the workflow in AWS Step Functions; this must run before the pipeline is executed.
pipeline.create()

### Run the pipeline 

A link will be provided after the following cell is executed. Following this link, you can monitor your pipeline execution on Step Functions' console.

In [None]:
# Start a run of the pipeline and keep the execution handle for the following cells.
execution = pipeline.execute()

In [None]:
# Show the progress of the execution started above.
execution.render_progress()

## YOU MUST WAIT FOR THE ABOVE PIPELINE TO COMPLETE BEFORE CONTINUING!

### Review the execution events

In [None]:
import json


def _last_json_field(events, details_key, payload_key, field):
    """Return ``field`` from the last event whose JSON payload contains it.

    The event list is scanned from the end, so no assumption is made about how
    many events the execution produced or at which index a step appears.

    Args:
        events: Sequence of execution-history event dicts.
        details_key: Event-details key to inspect, e.g. ``'stateExitedEventDetails'``.
        payload_key: Key inside the details that holds a JSON string, e.g. ``'output'``.
        field: Top-level field to read from the decoded JSON object.

    Returns:
        The value stored under ``field``.

    Raises:
        LookupError: If no event carries the requested field (for example when
            the pipeline has not finished or has failed).
    """
    for event in reversed(events):
        raw = event.get(details_key, {}).get(payload_key)
        if not raw:
            continue
        try:
            payload = json.loads(raw)
        except ValueError:
            # Not a JSON payload; it cannot hold the field we are looking for.
            continue
        if isinstance(payload, dict) and field in payload:
            return payload[field]
    raise LookupError('no {} event contains {!r}'.format(details_key, field))


events = execution.list_events()

# The exited deploy state reports the endpoint ARN in its output ...
endpoint_arn = _last_json_field(events, 'stateExitedEventDetails', 'output', 'EndpointArn')

# ... and the scheduled endpoint task carries the endpoint name in its parameters.
endpoint_name = _last_json_field(events, 'taskScheduledEventDetails', 'parameters', 'EndpointName')
endpoint_name

In [None]:
# TODO:  Retrieve the predictor from the pipeline/workflow above
# predictor = mnist_estimator.deploy(initial_instance_count=1, instance_type='ml.c5.2xlarge')

# Build a predictor for the endpoint named by `endpoint_name`.
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

# Deploy Endpoint

### From an external application, you can use the following code to make a prediction

In [None]:
import boto3
import sagemaker
import pandas as pd

# Same session/bucket/role/region setup as at the top of the notebook, repeated for the
# prediction section.
sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [None]:
import time

# https://towardsdatascience.com/xgboost-in-amazon-sagemaker-28e5e354dbcd
from sagemaker.predictor import csv_serializer

# Endpoint name = fixed prefix + current UTC timestamp (YYYY-MM-DD-HH-MM-SS).
xgb_endpoint_name = 'xgboost-script-pipeline-' + time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())
xgb_endpoint_name

In [None]:
## Deploy trained XGBoost model endpoint to perform predictions
# NOTE(review): no `xgb_estimator.fit(...)` call is visible in this notebook (training is
# driven by the pipeline), so confirm the estimator has a training job attached before
# relying on `deploy()`.
xgb_predictor = xgb_estimator.deploy(initial_instance_count = 1, 
                                     instance_type = 'ml.m4.xlarge',
                                     endpoint_name=xgb_endpoint_name)

# Requests are serialized as CSV; no deserializer is set for responses.
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

# Runtime client used to call the endpoint directly.
sm_runtime = boto3.client('sagemaker-runtime')

# First 500 test rows as header-less CSV without a trailing newline.
# Assumes `X_test` is a pandas DataFrame defined elsewhere — TODO confirm.
payload_500_samples = X_test[:500].to_csv(index=False, header=False).rstrip()

# Send the CSV payload built above; the body previously referenced an undefined
# `payload` name, which raised NameError.
response_500_samples = sm_runtime.invoke_endpoint(
    EndpointName=xgb_endpoint_name,
    Body=payload_500_samples.encode('utf-8'),
    ContentType='text/csv')['Body'].read()

In [None]:
# Parse the comma-separated scores in the endpoint response ...
predictions_500_samples = np.fromstring(response_500_samples, sep=',')
# ... and turn them into hard 0/1 labels: 1 when the score is strictly above 0.5.
predictions_500_samples_0_or_1 = (predictions_500_samples > 0.5).astype(int)

In [None]:
# Score the thresholded predictions against the first 500 test labels.
print('Test Accuracy: ', accuracy_score(y_test[:500], predictions_500_samples_0_or_1))
# average=None reports precision per class instead of a single averaged value.
print('Test Precision: ', precision_score(y_test[:500], predictions_500_samples_0_or_1, average=None))

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Confusion matrix of the first 500 test labels vs. the thresholded predictions.
# NOTE(review): despite the `df_` prefix this holds whatever confusion_matrix() returns,
# not necessarily a DataFrame — confirm before using DataFrame methods on it.
df_cm_test = confusion_matrix(y_test[:500], predictions_500_samples_0_or_1)
df_cm_test

In [None]:
import itertools

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    The matrix is printed first, then rendered with one text annotation per cell.

    Args:
        cm: 2-D array of integer counts, indexed as ``cm[i, j]``.
        classes: Tick labels, used for both the x and y axes.
        title: Title of the plot.
        cmap: Matplotlib colormap for the heat map (defaults to Greens).
    """
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half the maximum count get white text so the number stays
    # readable; both branches previously produced "black".
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Axis labels and layout are applied once, after all cells are annotated
    # (they used to run inside the loop, once per cell, and never for an empty matrix).
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Plot non-normalized confusion matrix
# plt.subplots() already creates the figure; the bare plt.figure() that used to precede
# it only left an extra, empty figure behind.
fig, ax = plt.subplots(figsize=(6,4))
plot_conf_mat(df_cm_test, classes=['Not Positive Sentiment', 'Positive Sentiment'], 
                          title='Confusion matrix')
plt.show()

In [None]:
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# ROC/AUC are computed from the raw scores (not the thresholded 0/1 labels) of the 500
# rows that were actually sent to the endpoint, so labels and scores have equal length.
# The previous code referenced `preds_test`, which is never defined in this notebook.
auc = round(metrics.roc_auc_score(y_test[:500], predictions_500_samples), 4)
print('AUC is ' + repr(auc))

fpr, tpr, _ = metrics.roc_curve(y_test[:500], predictions_500_samples)

plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % auc)
plt.legend(loc='lower right')
# Diagonal reference line (TPR == FPR).
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# NOTE(review): this sends raw review text and unpacks two return values, but
# `xgb_predictor` was configured earlier with a CSV serializer and no deserializer —
# confirm the endpoint accepts text and that predict() returns a
# (predictions, raw_outputs) pair.
predictions, raw_outputs = xgb_predictor.predict(["""Very funny. A typical mid 50's comedy."""])
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))

In [None]:
# NOTE(review): raw text input and two-value unpacking, while `xgb_predictor` was set up
# with a CSV serializer and no deserializer — confirm this call shape is supported.
predictions, raw_outputs = xgb_predictor.predict(["""That movie was absolutely awful."""])
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))