In [1]:
!pip install -q xgboost==0.90

In [12]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, and the execution role
# returned by sagemaker.get_execution_role().
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session supplies both the region name and the
# low-level SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

In [13]:
# Restore the variable previously persisted with %store
# (presumably saved by the Spark processing notebook -- confirm).
%store -r spark_processing_job_s3_output_prefix

In [14]:
# Echo the restored value so the reader can see which processing output is used.
restored_prefix_message = 'Previous Spark Processing Job Name: {}'.format(spark_processing_job_s3_output_prefix)
print(restored_prefix_message)

Previous Spark Processing Job Name: amazon-reviews-spark-processor-2020-03-30-04-12-49


# Specify the S3 Location of the Features

In [15]:
# For each data split: the S3 key prefix under the processing job's output,
# a relative local path with the same layout, and the full S3 URI in `bucket`.

# Train split
prefix_train = '{}/output/tfidf-train'.format(spark_processing_job_s3_output_prefix)
balanced_tfidf_without_header_train_path = './{}'.format(prefix_train)
balanced_tfidf_without_header_train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)

# Validation split
prefix_validation = '{}/output/tfidf-validation'.format(spark_processing_job_s3_output_prefix)
balanced_tfidf_without_header_validation_path = './{}'.format(prefix_validation)
balanced_tfidf_without_header_validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)

# Test split
prefix_test = '{}/output/tfidf-test'.format(spark_processing_job_s3_output_prefix)
balanced_tfidf_without_header_test_path = './{}'.format(prefix_test)
balanced_tfidf_without_header_test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)

In [16]:
def _csv_channel(s3_uri):
    """Wrap an S3 URI as a SageMaker input whose content type is ``text/csv``.

    :param s3_uri: S3 URI of the data for one channel.
    :return: the ``sagemaker.s3_input`` object for that URI.
    """
    return sagemaker.s3_input(s3_data=s3_uri, content_type='text/csv')


s3_input_train_data = _csv_channel(balanced_tfidf_without_header_train_s3_uri)
s3_input_validation_data = _csv_channel(balanced_tfidf_without_header_validation_s3_uri)
s3_input_test_data = _csv_channel(balanced_tfidf_without_header_test_s3_uri)

# Show the channel configuration generated for each split.
for channel in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/amazon-reviews-spark-processor-2020-03-30-04-12-49/output/tfidf-train', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/amazon-reviews-spark-processor-2020-03-30-04-12-49/output/tfidf-validation', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/amazon-reviews-spark-processor-2020-03-30-04-12-49/output/tfidf-test', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}


In [17]:
# Print the training script that the estimator cell below passes as its entry point.
!cat src/xgboost_reviews.py

import os
import argparse
import pickle as pkl
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
import re
import xgboost as xgb
from xgboost import XGBClassifier
import glob


def load_dataset(path, sep, header):
    data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)

    labels = data.iloc[:,0]
    features = data.drop(data.columns[0], axis=1)
    
    if header==None:
        # Adjust the column names after dropped the 0th column above
        # New column names are 0 (inclusive) to len(features.columns) (exclusive)
        new_column_names = list(range(0, len(features.columns)))
        features.columns = new_column_names

    return features, labels


def model_fn(model_dir):
    """
    :param: model_dir The dire

In [18]:
from sagemaker.xgboost import XGBoost

# S3 location passed to the estimator as its output path.
model_output_path = 's3://{}/models/amazon-reviews/script-mode/training-runs'.format(bucket)

# Script-mode estimator: runs src/xgboost_reviews.py on one ml.c5.9xlarge
# instance using the 0.90-2 XGBoost framework container.
#
# The former `enable_cloudwatch_metrics=True` argument was dropped: the
# SageMaker Python SDK documents it as deprecated and ignored, because all
# training jobs already emit CloudWatch metrics.
#
# NOTE(review): num_round=1 trains a single boosting round -- presumably a
# quick smoke-test setting; confirm before using the model for real.
xgb_estimator = XGBoost(entry_point='xgboost_reviews.py',
                        source_dir='src/',
                        role=role,
                        train_instance_count=1,
                        train_instance_type='ml.c5.9xlarge',
                        framework_version='0.90-2',
                        py_version='py3',
                        output_path=model_output_path,
                        hyperparameters={'objective': 'binary:logistic',
                                         'num_round': 1,
                                         'max_depth': 5}
                       )

# Train the Model

In [19]:
# Channel name -> S3 input. wait=False starts the job without blocking this cell.
training_channels = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
}
xgb_estimator.fit(inputs=training_channels, wait=False)

In [20]:
# Keep the job name: later cells use it for the CloudWatch link,
# for re-attaching the estimator, and as the endpoint name.
latest_job = xgb_estimator.latest_training_job
training_job_name = latest_job.name

job_name_line = 'training_job_name:  {}'.format(training_job_name)
print(job_name_line)

training_job_name:  sagemaker-xgboost-2020-03-30-22-14-40-855


In [21]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Console link to the CloudWatch log streams whose prefix is this training job's name.
cloudwatch_logs_url = (
    'https://console.aws.amazon.com/cloudwatch/home?region={}'
    '#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix'
).format(region, training_job_name)

display(HTML(
    '<b>Review <a href="{}">CloudWatch Logs</a> After About 5 Minutes</b>'.format(cloudwatch_logs_url)
))


In [None]:
from sagemaker.xgboost import XGBoost

# Rebind xgb_estimator to the existing training job identified by training_job_name.
attached_estimator = XGBoost.attach(training_job_name=training_job_name)
xgb_estimator = attached_estimator

# Predict with the Model
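Invoke the endpoint that was deployed during the pipeline. The cells below look the endpoint up under the training job's name, so they assume the endpoint name and the training job name are the same.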

In [None]:
# The endpoint is looked up under the training job's name.
endpoint_name = training_job_name
sm.describe_endpoint(EndpointName=endpoint_name)

In [None]:
# Client for the 'sagemaker-runtime' service, used below to invoke the endpoint.
runtime_session = boto3.Session()
sm_runtime = runtime_session.client(service_name='sagemaker-runtime', region_name=region)

In [None]:
payload = 'This item is great!'

# Send the UTF-8 encoded text to the endpoint, declared as text/csv.
invocation = sm_runtime.invoke_endpoint(
    EndpointName=training_job_name,
    Body=payload.encode('utf-8'),
    ContentType='text/csv',
)

# The 'Body' entry of the result is read in full before printing.
response = invocation['Body'].read()

print(response)

In [None]:
import json

def serializer(df):
    """Placeholder request serializer: ignores ``df`` and always returns ``[1]``.

    :param df: the object to serialize; currently unused.
    :return: the constant list ``[1]``.
    """
    # TODO: encode ``df`` into the request format the endpoint expects.
    # NOTE(review): an endpoint request body normally has to be bytes/str,
    # not a list -- confirm before wiring this into a predictor.
    placeholder_payload = [1]
    return placeholder_payload

In [None]:
import json

def deserializer(df, content_type=None):
    """Placeholder response deserializer: ignores the response and always returns ``[1]``.

    The SageMaker Python SDK (v1) predictor calls its deserializer with two
    positional arguments -- the response body stream and the response content
    type -- and leaves closing the stream to the deserializer. The original
    one-argument signature therefore raised ``TypeError`` on ``predict``.

    :param df: response body (a stream when called by the predictor); the
        name is kept for backward compatibility. Its content is not read.
    :param content_type: content type of the response; accepted for
        compatibility with the predictor and currently unused.
    :return: the constant list ``[1]``.
    """
    # TODO: parse the response body into real predictions.
    # Close the stream if there is one; non-stream inputs are left untouched.
    close = getattr(df, 'close', None)
    if callable(close):
        close()
    return [1]

In [None]:
from sagemaker.predictor import RealTimePredictor, json_deserializer

# xgb_predictor was never defined in this notebook, so on a fresh kernel this
# cell failed with NameError. Bind a predictor to the existing endpoint, which
# the cells above address by the training job's name.
xgb_predictor = RealTimePredictor(endpoint=training_job_name, sagemaker_session=sess)

# Requests are sent as text/csv and go through the notebook's own
# serializer / deserializer functions.
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = serializer
xgb_predictor.deserializer = deserializer

In [None]:
# NOTE(review): X_test is not defined in any visible cell of this notebook --
# define or load it before running this cell.
xgb_predictor.predict(X_test)