Based on these posts: 
* https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0
* https://github.com/keisukeirie/Amazon_review_helpfulness_prediction
* https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/
* https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22

In [None]:
# Install the notebook's dependencies quietly (-q); xgboost is pinned to 0.90.
!pip install -q boto3
!pip install -q xgboost==0.90

In [None]:
import boto3
import sagemaker
import pandas as pd

# Execution role handed to the estimator further down.
role = sagemaker.get_execution_role()

# SageMaker session; its default bucket holds the data and model artifacts.
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Region and low-level SageMaker client come from the ambient boto3 config.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

In [None]:
# Key prefixes (inside the default bucket) of the three dataset splits.
prefix_train = 'feature-store/amazon-reviews/balanced-tfidf-without-header/train'
prefix_validation = 'feature-store/amazon-reviews/balanced-tfidf-without-header/validation'
prefix_test = 'feature-store/amazon-reviews/balanced-tfidf-without-header/test'

# Every split is a single data.csv object stored under its prefix.
s3_uri_template = 's3://{}/{}/data.csv'
balanced_tfidf_without_header_train_s3_uri = s3_uri_template.format(bucket, prefix_train)
balanced_tfidf_without_header_validation_s3_uri = s3_uri_template.format(bucket, prefix_validation)
balanced_tfidf_without_header_test_s3_uri = s3_uri_template.format(bucket, prefix_test)

# Wrap each URI as a CSV input channel.
s3_input_train_data = sagemaker.s3_input(
    s3_data=balanced_tfidf_without_header_train_s3_uri,
    content_type='text/csv',
)
s3_input_validation_data = sagemaker.s3_input(
    s3_data=balanced_tfidf_without_header_validation_s3_uri,
    content_type='text/csv',
)
s3_input_test_data = sagemaker.s3_input(
    s3_data=balanced_tfidf_without_header_test_s3_uri,
    content_type='text/csv',
)

# Print each channel's configuration as a quick sanity check.
for s3_input_channel in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(s3_input_channel.config)

In [None]:
# Display the training script that the estimator below uses as its entry point.
!cat src/xgboost_reviews.py

In [None]:
from sagemaker.xgboost import XGBoost

# S3 location under which each training run writes its output.
model_output_path = 's3://{}/sagemaker/xgboost/training-runs'.format(bucket)

# Hyperparameters handed to the training script.
xgb_hyperparameters = {
    'objective': 'binary:logistic',
    'num_round': 1,
    'max_depth': 5,
}

# Estimator that runs src/xgboost_reviews.py as its entry point.
# Alternative kept from the original cell: train_instance_type='local'.
xgb_estimator = XGBoost(
    entry_point='xgboost_reviews.py',
    source_dir='src/',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    framework_version='0.90-2',
    py_version='py3',
    output_path=model_output_path,
    hyperparameters=xgb_hyperparameters,
)

### Train the model

In [None]:
# Start the training job without blocking the notebook (wait=False).
training_channels = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
}
xgb_estimator.fit(inputs=training_channels, wait=False)

In [None]:
# Keep the job name so the estimator can be re-attached in a later cell.
latest_job = xgb_estimator.latest_training_job
training_job_name = latest_job.name
print('training_job_name:  {}'.format(training_job_name))

In [None]:
from sagemaker.xgboost import XGBoost

# Rebuild the estimator object from the named training job.
# NOTE(review): per the SageMaker SDK docs, attach() waits while the job is
# still in progress -- confirm for the SDK version in use.
xgb_estimator = XGBoost.attach(
    training_job_name=training_job_name,
)

# TODO:  Fix prediction

In [None]:
import time

# `prefix` was never defined in this notebook and `time` was never imported,
# so the original cell failed with NameError. The S3 key prefixes defined
# earlier (prefix_train, ...) contain '/' and are not usable in an endpoint
# name, so a dedicated hyphen-only prefix is used instead.
xgb_endpoint_prefix = 'xgboost-reviews-'

# A UTC timestamp suffix gives each deployment a distinct endpoint name.
xgb_endpoint_name = xgb_endpoint_prefix + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Deploy the trained model behind a real-time endpoint on one instance.
xgb_predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name=xgb_endpoint_name)

In [None]:
import boto3
# TODO:  Fix this...
# NOTE(review): `df_validation` is not defined in any visible cell of this
# notebook, so this cell raises NameError as written. A DataFrame with an
# `is_positive_sentiment` column has to be loaded before running it.
# Runtime client used to invoke the deployed endpoint directly.
sm_rt = boto3.client('sagemaker-runtime')

# Predict a sample from the validation set
# Take the first row and drop `is_positive_sentiment` (presumably the label --
# confirm) so that only the remaining columns are sent.
payload = df_validation[:1].drop(['is_positive_sentiment'], axis=1) 
# Serialize to CSV text without header or index; rstrip() removes the trailing newline.
payload = payload.to_csv(header=False, index=False).rstrip()

print(payload)

In [None]:
# Send the CSV payload to the endpoint as UTF-8 bytes.
request_body = payload.encode('utf8')
response = sm_rt.invoke_endpoint(
    EndpointName=xgb_endpoint_name,
    ContentType='text/csv',
    Body=request_body,
)

# The response body is a stream; read it fully and show the raw bytes.
prediction_bytes = response['Body'].read()
print(prediction_bytes)

In [None]:
# Don't forget to delete the endpoint!
# sess.delete_endpoint(endpoint_name=xgb_endpoint_name)  # `sess` is the sagemaker.Session created at the top of the notebook

In [None]:
# NOTE(review): this sends raw review text, while the training inputs above are
# the "balanced-tfidf" feature CSVs -- presumably the text must be vectorized
# the same way before prediction; confirm. Also verify that predict() returns
# a 2-tuple, as the unpacking below requires.
predictions, raw_outputs = xgb_predictor.predict(["""Very funny. A typical mid 50's comedy."""])
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))

In [None]:
# NOTE(review): this sends raw review text, while the training inputs above are
# the "balanced-tfidf" feature CSVs -- presumably the text must be vectorized
# the same way before prediction; confirm. Also verify that predict() returns
# a 2-tuple, as the unpacking below requires.
predictions, raw_outputs = xgb_predictor.predict(["""That movie was absolutely awful."""])
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))