In [1]:
# Run this cell if you want to run locally on a laptop
import boto3
# Region name reported by a default boto3 session; reused by the cells below
region = boto3.Session().region_name

def resolve_sm_role():
    """Return the ARN of the first SageMaker execution role found in the account.

    Pages through every IAM role under path '/' and returns the ARN of the
    first role whose name starts with 'AmazonSageMaker-ExecutionRole-'.
    The IAM client is created with the module-level ``region``.

    Returns:
        str: ARN of the matching role.

    Raises:
        RuntimeError: if no role with that name prefix exists.
    """
    client = boto3.client('iam', region_name=region)
    # list_roles returns one page of results per call; the paginator follows
    # the continuation marker so roles beyond the first page are not missed.
    paginator = client.get_paginator('list_roles')
    for page in paginator.paginate(PathPrefix='/'):
        for role in page['Roles']:
            if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
                return role['Arn']
    raise RuntimeError('Could not resolve what should be the SageMaker role to be used')

# Resolve the execution role once; the Estimator cell below passes `role` along
role = resolve_sm_role()
print(role)

arn:aws:iam::603012210694:role/service-role/AmazonSageMaker-ExecutionRole-20210304T123661


### Extra Step
If you ran the previous exercise, you don't need to run this cell,
because the data was already prepared and uploaded to S3 there.
If you didn't run the previous code, or you have deleted that S3 bucket,
you should run this cell.
If you want to know more about these commands, see the previous example.

I'm not going to run the cell below.

In [None]:
# Preparing data
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs scikit-learn < 1.2.
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import sagemaker

boston = load_boston()
data = boston  # alias of the loaded dataset (not used within this cell)

# One frame holding every feature column plus the MEDV target
df = DataFrame(np.concatenate((boston.data, boston.target.reshape(-1, 1)), axis=1),
               columns=np.concatenate((boston.feature_names, ["MEDV"])))

# Drop the 'B' feature without mutating the frame in place
df = df.drop(columns=['B'])

# Move the MEDV target to the first column; the CSV files below are written
# without a header, so the column position is all the training job sees.
df = pd.concat([df['MEDV'], df.drop(['MEDV'], axis=1)], axis=1)

# Splitting data: a fixed random_state makes the 90/10 split reproducible
training_dataset, validation_dataset = train_test_split(df, test_size=0.1, random_state=42)

training_dataset.to_csv('training_dataset.csv', index=False, header=False)
validation_dataset.to_csv('validation_dataset.csv', index=False, header=False)

# Uploading data
sess = sagemaker.Session()
bucket = sess.default_bucket()

prefix = 'boston-housing'
training_data_path = sess.upload_data(path='training_dataset.csv', key_prefix=prefix + '/input/training')
validation_data_path = sess.upload_data(path='validation_dataset.csv', key_prefix=prefix + '/input/validation')

### Regression with XGBoost
Let's train a model on the Boston Housing dataset with the XGBoost algorithm
( https://github.com/dmlc/xgboost ). As we will see in Chapter 7, Using Built-in
Frameworks, SageMaker also supports XGBoost scripts:

1. We reuse the dataset preparation steps from the previous examples.

2. We find the name of the XGBoost container. As several versions are supported,
we select the latest one (1.0-1 at the time of writing):

In [2]:
import boto3
from sagemaker import image_uris
region = boto3.Session().region_name
# Pin the container version explicitly. For the built-in XGBoost algorithm the
# 'latest' alias points at a legacy image rather than the newest release, so
# request version 1.0-1 by name.
container = image_uris.retrieve('xgboost', region, version='1.0-1')

In [3]:
# code from previous example
import sagemaker
from sagemaker.estimator import Estimator
# Session, default bucket and S3 key prefix shared by the cells below
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = 'boston-housing'

3. We configure the Estimator. The code is strictly identical to the one used with LinearLearner:

In [4]:
# Estimator for the XGBoost container: one ml.m4.xlarge training instance,
# with output written under s3://<bucket>/<prefix>/output.
xgb_estimator = Estimator(
    image_uri=container,
    role=role,  # alternative: sagemaker.get_execution_role()
    instance_count=1,
    instance_type='ml.m4.xlarge',  # alternative: 'ml.m5.large'
    output_path=f's3://{bucket}/{prefix}/output',
)

4. Taking a look at the hyperparameters
(https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html), we see that
the only required one is num_round. As it's not obvious which value to set,
we'll go for a large value, and we'll also define the early_stopping_rounds
parameter in order to avoid overfitting. Of course, we need to set the objective for
a regression problem:

In [5]:
# objective: regression; num_round: deliberately large number of rounds;
# early_stopping_rounds: guards against overfitting when num_round is large.
# NOTE(review): recent XGBoost releases call this objective 'reg:squarederror' —
# confirm it is accepted by the container version in use before renaming.
xgb_estimator.set_hyperparameters(
            objective='reg:linear',
            num_round=200,
            early_stopping_rounds=10)

5. We define the training input, just like in the previous example:

In [6]:
# Upload the local CSV files to S3; this fails if they are not in the
# current working directory.
training_data_path = sess.upload_data(
    path='training_dataset.csv',
    key_prefix=f'{prefix}/input/training',
)
validation_data_path = sess.upload_data(
    path='validation_dataset.csv',
    key_prefix=f'{prefix}/input/validation',
)

In [7]:
# Training channel: the uploaded training CSV, declared as text/csv
training_data_channel = sagemaker.TrainingInput(
                        s3_data=training_data_path,
                        content_type='text/csv')

In [8]:
# Validation channel: the uploaded validation CSV, declared as text/csv
validation_data_channel = sagemaker.TrainingInput(
                                    s3_data=validation_data_path,
                                    content_type='text/csv')

6. We then launch the training job:

In [9]:
# Launch the training job with both data channels
xgb_estimator.fit(
    inputs={
        'train': training_data_channel,
        'validation': validation_data_channel,
    }
)

2021-03-28 14:04:06 Starting - Starting the training job...
2021-03-28 14:04:08 Starting - Launching requested ML instancesProfilerReport-1616940245: InProgress
......
2021-03-28 14:05:41 Starting - Preparing the instances for training......
2021-03-28 14:06:55 Downloading - Downloading input data...
2021-03-28 14:07:22 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-03-28:14:07:43:INFO] Running standalone xgboost training.[0m
[34m[2021-03-28:14:07:43:INFO] File size need to be processed in the node: 0.03mb. Available memory size in the node: 8435.86mb[0m
[34m[2021-03-28:14:07:43:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:07:43] S3DistributionType set as FullyReplicated[0m
[34m[14:07:43] 455x12 matrix with 5460 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-03-28:14:07:43:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:07:43] S3DistributionType set as FullyReplicated[0


2021-03-28 14:08:02 Uploading - Uploading generated training model
2021-03-28 14:08:02 Completed - Training job completed
Training seconds: 60
Billable seconds: 60


7. The job only ran for 22 rounds, meaning that early stopping was triggered. Looking
at the training log, we see that round #12 was actually the best one, with a root
mean square error (RMSE) of 2.43126:

8. Deploying still takes one line of code:

In [10]:
from time import strftime, gmtime

# Suffix the endpoint name with the current UTC day-hour-minute-second
timestamp = strftime('%d-%H-%M-%S', gmtime())
endpoint_name = f'xgb-demo-{timestamp}'

In [11]:
# Deploy the trained model to an endpoint named `endpoint_name`,
# backed by a single ml.t2.medium instance
xgb_predictor = xgb_estimator.deploy(
                endpoint_name=endpoint_name,
                initial_instance_count=1,
                instance_type='ml.t2.medium')

----------------!

9. Once the model is deployed, we use the predict() API again to send
it a CSV sample:

In [12]:
# One CSV row of 12 comma-separated values to send to the endpoint
test_sample = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'

In [13]:
# Use CSV (de)serializer objects rather than assigning
# xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()
xgb_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

# Send the sample row to the endpoint and show the parsed CSV response
response = xgb_predictor.predict(test_sample)
print(response)

[['23.949708938598633']]


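The result tells us that this house should cost about $23,950 (the prediction is in thousands of dollars).
The exact figure varies from one training run to the next; an earlier run returned:
[['23.73023223876953']]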

10. Finally, we delete the endpoint when we're done:

In [14]:
# Delete the endpoint now that we are done with it
xgb_predictor.delete_endpoint()