In [None]:
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
import sagemaker
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, rule_configs

# Create a single SageMaker session and derive the role, region and bucket from
# it, so all of them are guaranteed to refer to the same account/region that the
# estimator and tuner below are launched with.
sagemaker_session = sagemaker.Session()

# IAM role that SageMaker assumes for the training, tuning and hosting calls below.
role = get_execution_role(sagemaker_session)

# Region of the session; used further down to look up the XGBoost container image.
region = sagemaker_session.boto_region_name

# Default bucket of the session, plus the key prefix under which this demo writes.
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/DEMO-xgboost-tripfare'

In [None]:
# List the variables currently held in IPython's %store database.
%store
# Restore every stored variable into this kernel. Later cells read train_path,
# validation_path and test_path, none of which are defined in this notebook
# (presumably they were stored by an earlier notebook - confirm).
%store -r

In [None]:
# Configure the training job whose model artifacts are written under model_output.
training_instance_type = "ml.m5.xlarge"
model_output = f"s3://{bucket}/{prefix}/model"

# XGBoost container image for this region and instance type.
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-2",
    py_version="py3",
    instance_type=training_instance_type,
)

# Debugger rule that generates the XGBoost training report
# see: https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-training-xgboost-report.html
xgboost_report_rule = Rule.sagemaker(rule_configs.create_xgboost_report())
rules = [xgboost_report_rule]

xgb_train = sagemaker.estimator.Estimator(
    role=role,
    sagemaker_session=sagemaker_session,
    image_uri=image_uri,
    instance_count=2,
    instance_type=training_instance_type,
    base_job_name=f"{prefix.split('/')[-1]}-train",
    output_path=model_output,
    rules=rules,  # attach the report rule to the training job
    disable_profiler=False,  # keep profiling of the training job enabled
)

In [None]:
# Hyperparameters passed to the XGBoost training job
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
xgb_hyperparameters = {
    "objective": "reg:squarederror",
    "num_round": 100,
    "early_stopping_rounds": 10,
    "max_depth": 9,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 300,
    "subsample": 0.8,
}
xgb_train.set_hyperparameters(**xgb_hyperparameters)

In [None]:
from sagemaker.inputs import TrainingInput

content_type = "csv"

# Both channels use the same content type and S3 distribution setting.
channel_options = {"content_type": content_type, "distribution": 'ShardedByS3Key'}

train_input = TrainingInput(train_path, **channel_options)
validation_input = TrainingInput(validation_path, **channel_options)

In [None]:
# Start training with the train and validation channels defined above.
training_channels = {'train': train_input, 'validation': validation_input}
xgb_train.fit(training_channels)

In [None]:
# Keep the model artifact location and the name of the job that produced it;
# both are persisted with %store in the next cell.
model_url = xgb_train.model_data
training_job_name = xgb_train.latest_training_job.job_name

In [None]:
# Persist the training job name and model artifact URL in IPython's %store
# database so they can be restored with `%store -r` in another kernel/notebook.
%store training_job_name
%store model_url

In [None]:
# A UTC timestamp suffix gives each run its own endpoint name.
endpoint_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
xgboost_endpoint_name = "xgboost-endpoint-" + endpoint_timestamp

# Deploy the trained estimator to an endpoint with that name.
xgboost_predictor = xgb_train.deploy(
    endpoint_name=xgboost_endpoint_name,
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)

In [None]:
import awswrangler as wr

# Load a small, header-less sample of the test data from S3 for smoke-testing
# the endpoints.
test_df = wr.s3.read_csv(path=test_path, header=None, nrows=5, dataset=True)

In [None]:
from sagemaker.deserializers import CSVDeserializer
from sagemaker.serializers import CSVSerializer

# Exchange CSV with the endpoint in both directions.
xgboost_predictor.serializer = CSVSerializer()
xgboost_predictor.deserializer = CSVDeserializer()

# Send every column except the first (presumably the label - confirm) and
# display the first element of the response.
test_features = test_df.iloc[:, 1:].values
xgboost_predictions = xgboost_predictor.predict(test_features)
xgboost_predictions[0]

## Automatic Model Tuning
Amazon SageMaker automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in a model that performs the best, as measured by a metric that you choose.
For example, suppose that you want to solve a regression problem on this trip fare dataset. Your goal is to minimize the root mean squared error (rmse) metric on the validation data by training an XGBoost Algorithm model. You don't know which values of the eta, alpha, min_child_weight, and max_depth hyperparameters to use to train the best model. To find the best values for these hyperparameters, you can specify ranges of values that Amazon SageMaker hyperparameter tuning searches to find the combination of values that results in the training job that performs the best as measured by the objective metric that you chose. Hyperparameter tuning launches training jobs that use hyperparameter values in the ranges that you specified, and returns the training job with the lowest rmse.


In [None]:
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)

# Search space for the tuning job: one range per hyperparameter to tune.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0, 1),
    min_child_weight=ContinuousParameter(1, 10),
    alpha=ContinuousParameter(0, 2),
    max_depth=IntegerParameter(1, 10),
)

In [None]:
# The tuner compares jobs by validation RMSE and looks for the smallest value.
objective_type = 'Minimize'
objective_metric_name = 'validation:rmse'

In [None]:
# Tuning job built on the XGBoost estimator: 10 training jobs in total,
# at most 3 of them running at the same time.
tuner = HyperparameterTuner(
    estimator=xgb_train,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type=objective_type,
    max_parallel_jobs=3,
    max_jobs=10,
)

In [None]:
# Start the tuning job on the same train/validation channels used for training.
tuning_channels = {'train': train_input, 'validation': validation_input}
tuner.fit(tuning_channels)

In [None]:
# Look up the latest tuning job through the SageMaker API and display its status.
sagemaker_client = boto3.client('sagemaker')
tuning_job_description = sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)
tuning_job_description['HyperParameterTuningJobStatus']

In [None]:
# Display the name of the best training job found by the tuner.
best_training_job_name = tuner.best_training_job()
best_training_job_name

In [None]:
# Deploy the tuner's model to an Amazon SageMaker endpoint. No specific training
# job is named here, so the tuner chooses which one to deploy.
tuner_predictor = tuner.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
# CSV out, CSV in: attach a CSV deserializer and serializer to the tuned endpoint's predictor.
tuner_predictor.deserializer = CSVDeserializer()
tuner_predictor.serializer = CSVSerializer()

In [None]:
# Send every column except the first (presumably the label - confirm) to the
# tuned endpoint and display the first element of the response.
tuned_features = test_df.iloc[:, 1:].values
tuner_predictions = tuner_predictor.predict(tuned_features)
tuner_predictions[0]

### (Optional) Delete the Endpoint
If you're done with this exercise, please run the delete_endpoint lines in the two cells below. This will remove both hosted endpoints (the one for the initial model and the one for the tuned model) and avoid any charges from stray instances being left on.

In [None]:
# Delete the endpoint created from the initial training job, together with its
# endpoint configuration.
xgboost_predictor.delete_endpoint(delete_endpoint_config=True)

In [None]:
# Delete the endpoint created from the tuning job, together with its endpoint
# configuration.
tuner_predictor.delete_endpoint(delete_endpoint_config=True)