In [1]:
# Run this cell if you wan't to run locally on laptop
import boto3
region = boto3.Session().region_name

def resolve_sm_role():
    client = boto3.client('iam', region_name=region)
    response_roles = client.list_roles(
        PathPrefix='/',
        # Marker='string',
        MaxItems=999
    )
    for role in response_roles['Roles']:
        if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
            #print('Resolved SageMaker IAM Role to: ' + str(role))
            return role['Arn']
    raise Exception('Could not resolve what should be the SageMaker role to be used')

role = resolve_sm_role()
print(role)

arn:aws:iam::603012210694:role/service-role/AmazonSageMaker-ExecutionRole-20210304T123661


### Using the SageMaker SDK with built-in algorithms

## 1.Preparing data
In Book Example we manually downloaded csv from link below

https://github.com/PacktPublishing/Learn-Amazon-SageMaker/blob/master/sdkv2/ch4/housing.csv

But this dataset is also available in sklearn, so i'm just going to import it import it from there. 

In [2]:
from sklearn.datasets import load_boston
from pandas import DataFrame
import pandas as pd
import numpy as np

boston = load_boston()
data = boston
df = DataFrame(np.concatenate((boston.data, boston.target.reshape(-1, 1)), axis=1), 
                            columns=np.concatenate((boston.feature_names, ["MEDV"])))

print("Rows and Columns :",df.shape)
display(df.head(2))

Rows and Columns : (506, 14)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6


1. In book example we don't have Column name B, so we are going to drop it as well.

In [3]:
del df['B']
print("Rows and Columns :",df.shape)
display(df.head(2))

Rows and Columns : (506, 13)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,9.14,21.6


Reading the algorithm documentation ( https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html )
, we see that Amazon SageMaker requires that a CSV file doesn't have a header record and that the target
variable is in the first column. Accordingly, we move the medv column to the front
of the dataframe:

In [4]:
df = pd.concat([df['MEDV'],df.drop(['MEDV'], axis=1)],axis=1)
print("Rows and Columns :",df.shape)
display(df.head(2))

Rows and Columns : (506, 13)


Unnamed: 0,MEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,24.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,4.98
1,21.6,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,9.14


Spliting dataframe into: 90% training, and 10% validation:

In [5]:
from sklearn.model_selection import train_test_split
training_dataset, validation_dataset = train_test_split(df, test_size=0.1)
print(training_dataset.shape)
print(validation_dataset.shape)

(455, 13)
(51, 13)


We save these two splits to individual CSV files, without either an index or a header:

In [6]:
training_dataset.to_csv('training_dataset.csv', index=False, header=False)
validation_dataset.to_csv('validation_dataset.csv', index=False, header=False)

In [7]:
# lets check in our pc, you can skip this step
import os
print(os.listdir("."))

display(pd.read_csv('training_dataset.csv').head(2))
display(pd.read_csv('validation_dataset.csv').head(2))

['.ipynb_checkpoints', '1.Linear Learner on Boston Housing Dataset.ipynb', '2.XGBoost on Boston Housing Dataset.ipynb', 'Boston_Housing Model.ipynb', 'housing.csv', 'Linear Learner on Boston Housing Dataset.ipynb', 'mnist old', 'training_dataset.csv', 'validation_dataset.csv', 'XGBoost on Boston Housing Dataset.ipynb']


Unnamed: 0,24.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,4.98
0,12.3,7.99248,0.0,18.1,0.0,0.7,5.52,100.0,1.5331,24.0,666.0,20.2,24.56
1,5.0,67.9208,0.0,18.1,0.0,0.693,5.683,100.0,1.4254,24.0,666.0,20.2,22.98


Unnamed: 0,16.3,28.6558,0.0,18.1,0.0.1,0.597,5.155,100.0,1.5894,24.0,666.0,20.2,20.08
0,50.0,0.57834,20.0,3.97,0.0,0.575,8.297,67.0,2.4216,5.0,264.0,13.0,7.44
1,36.4,0.08664,45.0,3.44,0.0,0.437,7.178,26.3,6.4798,5.0,398.0,15.2,2.87


## 2.Uploading data
We now need to upload these two files to S3. We could use any bucket, and
here we'll use the default bucket conveniently created by SageMaker in the region
we're running in.

In [14]:
import sagemaker
#print(sagemaker.__version__)
sess = sagemaker.Session()
bucket = sess.default_bucket()

prefix = 'boston-housing'
training_data_path = sess.upload_data(path='training_dataset.csv', key_prefix=prefix + '/input/training')
validation_data_path = sess.upload_data(path='validation_dataset.csv', key_prefix=prefix + '/input/validation')

print(training_data_path)
print(validation_data_path)

s3://sagemaker-us-east-1-603012210694/boston-housing/input/training/training_dataset.csv
s3://sagemaker-us-east-1-603012210694/boston-housing/input/validation/validation_dataset.csv


## 3.Configuring a training job
1. Earlier in this chapter, we learned that SageMaker algorithms are packaged in
Docker containers. Using boto3 and the image_uris.retrieve() API, we
can easily find the name of the Linear Learner algorithm in the region
we're running:

In [8]:
import boto3
from sagemaker import image_uris

region = boto3.Session().region_name    
container = image_uris.retrieve('linear-learner', region)
#print(container)

2. Now that we know the name of the container, we can configure our training job
with the Estimator object. In addition to the container name, we also pass the
IAM role that SageMaker instances will use, the instance type and instance count
to use for training, as well as the output location for the model. Estimator will
generate a training job automatically, and we could also set our own prefix with the
base_job_name parameter:

In [11]:
from sagemaker.estimator import Estimator
ll_estimator = Estimator(
                container,
                role=role,#sagemaker.get_execution_role(),
                instance_count=1,
                instance_type='ml.m4.xlarge',#'ml.m5.large',
                output_path='s3://{}/{}/output'.format(bucket,prefix))

3. Next, we have to set hyperparameters. This step is possibly one of the most obscure
and most difficult parts of any machine learning project.
Let's look at the documentation, and see which hyperparameters are mandatory
( https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html) . As it turns out, there is only one: predictor_type . It defines the type of problem that Linear Learner is training on (regression,
binary classification, or multiclass classification).
Taking a deeper look, we see that the default value for mini_batch_size is 1000:
this isn't going to work well with our 506-sample dataset, so let's set it to 32. We also
learn that the normalize_data parameter is set to true by default, which makes
it unnecessary to normalize data ourselves:

In [12]:
ll_estimator.set_hyperparameters(predictor_type='regressor', mini_batch_size=32)

4. Now, let's define the data channels: a channel is a named source of data passed to
a SageMaker estimator. All built-in algorithms need at least a train channel, and
many also accept additional channels for validation and testing. Here, we have two
channels, which both provide data in CSV format. The TrainingInput() API
lets us define their location, their format, whether they are compressed, and so on:

In [15]:
training_data_channel   = sagemaker.TrainingInput(s3_data=training_data_path, content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(s3_data=validation_data_path, content_type='text/csv')

## 4.Launching a training job
1. We simply pass a Python dictionary containing the two channels to the fit() API:

In [16]:
ll_data = {'train': training_data_channel, 'validation': validation_data_channel}
ll_estimator.fit(ll_data)

2021-03-28 11:16:09 Starting - Starting the training job...
2021-03-28 11:16:32 Starting - Launching requested ML instancesProfilerReport-1616930168: InProgress
......
2021-03-28 11:17:52 Starting - Preparing the instances for training......
2021-03-28 11:18:52 Downloading - Downloading input data...
2021-03-28 11:19:33 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/28/2021 11:19:37 INFO 140028512040768] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigm

[34m#metrics {"StartTime": 1616930380.0319042, "EndTime": 1616930380.0319948, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 3, "model": 0}, "Metrics": {"train_mse_objective": {"sum": 0.3958063785518919, "count": 1, "min": 0.3958063785518919, "max": 0.3958063785518919}}}
[0m
[34m#metrics {"StartTime": 1616930380.0321078, "EndTime": 1616930380.0321286, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 3, "model": 1}, "Metrics": {"train_mse_objective": {"sum": 0.41201661633593695, "count": 1, "min": 0.41201661633593695, "max": 0.41201661633593695}}}
[0m
[34m#metrics {"StartTime": 1616930380.0322192, "EndTime": 1616930380.0322394, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 3, "model": 2}, "Metrics": {"train_mse_objective": {"sum": 0.44800473962511334, "count": 1, "min": 0.44800473962511334, "max": 0.44800473962511334}}}
[0m
[34m#m


2021-03-28 11:20:13 Uploading - Uploading generated training model
2021-03-28 11:20:13 Completed - Training job completed
ProfilerReport-1616930168: NoIssuesFound
Training seconds: 68
Billable seconds: 68


2. Once training is complete, the model is copied automatically to S3.
3. Looking at the output location in our S3 bucket, we see the model artifact:

In [18]:
# command is working in studio but not working locally

#%%bash -s "$ll_estimator.output_path"
#aws s3 ls --recursive $1

## 5.Deploying a model
1.It's good practice to create identifiable and unique endpoint names. We could also
let SageMaker create one for us during deployment:

In [19]:
from time import strftime, gmtime
timestamp = strftime('%d-%H-%M-%S', gmtime())

endpoint_name = 'linear-learner-demo-'+timestamp
print(endpoint_name)

linear-learner-demo-28-11-21-21


2. We deploy the model using the deploy() API.

In [20]:
ll_predictor = ll_estimator.deploy(
                endpoint_name=endpoint_name, 
                initial_instance_count=1, 
                instance_type='ml.t2.medium')

-----------------!

3. A few minutes later, the endpoint is in service. We can use the predict() API
to send it a CSV sample for prediction. We set content type and serialization
accordingly: built-in functions are available, and we use them as is:

In [22]:
test_sample = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'

#ll_predictor.content_type = 'text/csv'
ll_predictor.serializer = sagemaker.serializers.CSVSerializer()
ll_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

response = ll_predictor.predict(test_sample)
print(response)

[['31.62579345703125']]


4. We can also predict multiple samples at a time:

In [23]:
test_samples = ['0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98',
                '0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,9.14']

response = ll_predictor.predict(test_samples)
print(response)

[['31.62579345703125'], ['24.375823974609375']]


In [24]:
# another way to predict
runtime = boto3.Session().client(service_name='runtime.sagemaker') 

response = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                  ContentType='text/csv', 
                                  Body=test_sample)

print(response['Body'].read())

b'{"predictions": [{"score": 31.62579345703125}]}'


## Cleaning up

In [25]:
ll_predictor.delete_endpoint()

Note:

we didn't delete the data stored in s3, if you don't want to be charged, delete it as well.

Also we are going to use same data in next example, better not to delete it for now.