In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('salary.csv')

In [3]:
import sagemaker
#boto3 use to interface with services from AWS such as S3 or EC2
import boto3

In [4]:
#create session to put in model later
sagemaker_session= sagemaker.Session()
#to use sage maker locally, need to install CLI in terminal

In [5]:
#Define the S3 bucket and prefix that we want to use in this session
bucket = 'sagemaker-bichinh' #created in AWS
prefix = 'linear_learner' #prefix is subfolder within bucket

In [6]:
# Note that AWS Identity and Access Management (IAM) role that Amazon SageMaker can assume to perform tasks on your behalf (for example, reading training results, called model artifacts, from the S3 bucket and writing training results to Amazon S3). 
role = sagemaker.get_execution_role()
print(role)

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20210524T215690 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


arn:aws:iam::715120690714:role/service-role/AmazonSageMaker-ExecutionRole-20210524T215690


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X=df[['YearsExperience']]
y=df[['Salary']]

In [9]:
X.head()

Unnamed: 0,YearsExperience
0,1.1
1,1.3
2,1.5
3,2.0
4,2.2


In [10]:
#the model only supports float32 tensor
X = np.array(X).astype('float32')
y = np.array(y).astype('float32')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [12]:
X_train.shape

(28, 1)

In [13]:
#Labels must be a Vector
y_train=y_train[:,0]

In [14]:
y_train

array([ 66029.,  81363., 135675.,  54445.,  60150., 113812.,  83088.,
        64445.,  43525., 112635.,  55794., 127345., 105582.,  39343.,
        57081.,  91738.,  39891.,  93940., 128765.,  57189.,  46205.,
       139465., 101302.,  56642.,  67938., 121872.,  98273.,  37731.],
      dtype=float32)

In [15]:
#the input support recordIO, and the io module allows for dealing with various type of IO
import io
import sagemaker.amazon.common as smac #sage maker common library

In [16]:
#convert the data in numpy array format to RecordIO format
#This is the format required by SageMaker

buf = io.BytesIO() #create an in-memory byte array
smac.write_numpy_to_dense_tensor(buf, X_train, y_train)

# When you write to in-memory byte arrays, it increments 1 every time you write to it
# Let's reset that back to zero 
buf.seek(0)

0

In [17]:
#os module include many functions to interact with the file system.
import os

# Code to upload RecordIO data to S3
 
# Key refers to the name of the file    
key = 'linear-train-data'

# The following code uploads the data in record-io format to S3 bucket to be accessed later for training
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sagemaker-bichinh/linear_learner/train/linear-train-data


In [18]:
# Make sure that the target label is a vector
y_test = y_test[:,0]

In [19]:
y_test

array([116969.,  61111., 122391., 126756.,  56957., 109431.,  63218.],
      dtype=float32)

In [20]:
# create an output placeholder in S3 bucket to store the linear learner output

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Training artifacts will be uploaded to: {}'.format(output_location))

Training artifacts will be uploaded to: s3://sagemaker-bichinh/linear_learner/output


In [21]:
# This code is used to get the training container of sagemaker built-in algorithms
# all we have to do is to specify the name of the algorithm, that we want to use

# Let's obtain a reference to the linearLearner container image
# Note that all regression models are named estimators
# You don't have to specify (hardcode) the region, get_image_uri will get the current region name using boto3.Session

from sagemaker.amazon.amazon_estimator import image_uris

container = sagemaker.image_uris.retrieve('linear-learner',boto3.Session().region_name )

In [22]:
# We have pass in the container, the type of instance that we would like to use for training 
# output path and sagemaker session into the Estimator. 
# We can also specify how many instances we would like to use for training
# sagemaker_session = sagemaker.Session()

linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       instance_count = 1, 
                                       instance_type = 'ml.c4.xlarge',
                                       output_path = output_location,
                                       sagemaker_session = sagemaker_session)


# We can tune parameters like the number of features that we are passing in, type of predictor like 'regressor' or 'classifier', mini batch size, epochs
# Train 32 different versions of the model and will get the best out of them (built-in parameters optimization!)

linear.set_hyperparameters(feature_dim = 1,
                           predictor_type = 'regressor',
                           mini_batch_size = 5,
                           epochs = 5,
                           num_models = 32,
                           loss = 'absolute_loss') #for regression, absolute_loss is the best)

# Now we are ready to pass in the training data from S3 to train the linear learner model


In [23]:
linear.fit({'train':s3_train_data})

2021-05-28 01:54:02 Starting - Starting the training job...ProfilerReport-1622166841: InProgress
...
2021-05-28 01:55:01 Starting - Launching requested ML instances......
2021-05-28 01:56:01 Starting - Preparing the instances for training......
2021-05-28 01:57:02 Downloading - Downloading input data...
2021-05-28 01:57:31 Training - Downloading the training image...
2021-05-28 01:58:02 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/28/2021 01:57:56 INFO 140155614271296] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_metho

In [28]:
# Deploying the model to perform inference 

linear_regressor = linear.deploy(initial_instance_count = 1,
                                          instance_type = 'm1.m5.large')

ClientError: An error occurred (ValidationException) when calling the CreateEndpointConfig operation: 1 validation error detected: Value 'm1.m5.large' at 'productionVariants.1.member.instanceType' failed to satisfy constraint: Member must satisfy enum value set: [ml.r5d.12xlarge, ml.r5.12xlarge, ml.p2.xlarge, ml.m5.4xlarge, ml.m4.16xlarge, ml.r5d.24xlarge, ml.r5.24xlarge, ml.p3.16xlarge, ml.m5d.xlarge, ml.m5.large, ml.t2.xlarge, ml.p2.16xlarge, ml.m5d.12xlarge, ml.inf1.2xlarge, ml.m5d.24xlarge, ml.c4.2xlarge, ml.c5.2xlarge, ml.c4.4xlarge, ml.inf1.6xlarge, ml.c5d.2xlarge, ml.c5.4xlarge, ml.g4dn.xlarge, ml.g4dn.12xlarge, ml.c5d.4xlarge, ml.g4dn.2xlarge, ml.c4.8xlarge, ml.c4.large, ml.c5d.xlarge, ml.c5.large, ml.g4dn.4xlarge, ml.c5.9xlarge, ml.g4dn.16xlarge, ml.c5d.large, ml.c5.xlarge, ml.c5d.9xlarge, ml.c4.xlarge, ml.inf1.xlarge, ml.g4dn.8xlarge, ml.inf1.24xlarge, ml.m5d.2xlarge, ml.t2.2xlarge, ml.c5d.18xlarge, ml.m5d.4xlarge, ml.t2.medium, ml.c5.18xlarge, ml.r5d.2xlarge, ml.r5.2xlarge, ml.p3.2xlarge, ml.m5d.large, ml.m5.xlarge, ml.m4.10xlarge, ml.t2.large, ml.r5d.4xlarge, ml.r5.4xlarge, ml.m5.12xlarge, ml.m4.xlarge, ml.m5.24xlarge, ml.m4.2xlarge, ml.p2.8xlarge, ml.m5.2xlarge, ml.r5d.xlarge, ml.r5d.large, ml.r5.xlarge, ml.r5.large, ml.p3.8xlarge, ml.m4.4xlarge]

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Content type overrides the data that will be passed to the deployed model, since the deployed model expects data in text/csv format.

# Serializer accepts a single argument, the input data, and returns a sequence of bytes in the specified content type

# Deserializer accepts two arguments, the result data and the response content type, and return a sequence of bytes in the specified content type.

# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html

linear_regressor.content_type = 'text/csv'
linear_regressor.serializer = csv_serializer
linear_regressor.deserializer = json_deserializer