In [2]:
# Note that we are using AWS SageMaker 2.72.1
# We will be using the new SageMaker 2.x SDK 
!pip list

Package                              Version
------------------------------------ -----------------
aiobotocore                          2.3.4
aiohttp                              3.8.1
aioitertools                         0.10.0
aiosignal                            1.2.0
alabaster                            0.7.12
anaconda-client                      1.7.2
anaconda-project                     0.8.3
ansi2html                            1.8.0
argh                                 0.26.2
argon2-cffi                          21.3.0
argon2-cffi-bindings                 21.2.0
asn1crypto                           1.3.0
astroid                              2.12.5
astropy                              4.0
async-timeout                        4.0.2
asynctest                            0.13.0
atomicwrites                         1.3.0
attrs                                21.4.0
autopep8                             1.4.4
autovizwidget                        0.20.0
awscli                           

In [3]:
# install seaborn library
!pip install --upgrade Seaborn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


Collecting Seaborn
  Downloading seaborn-0.12.0-py3-none-any.whl (285 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.1/285.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: Seaborn
  Attempting uninstall: Seaborn
    Found existing installation: seaborn 0.10.0
    Uninstalling seaborn-0.10.0:
      Successfully uninstalled seaborn-0.10.0
Successfully installed Seaborn-0.12.0
[0m

In [4]:
# Load the ice cream sales data (Temperature and Revenue columns) into a pandas DataFrame
icecream_sales_df = pd.read_csv('IceCreamData.csv')

In [5]:
# View the DataFrame
icecream_sales_df

Unnamed: 0,Temperature,Revenue
0,24.566884,534.799028
1,26.005191,625.190122
2,27.790554,660.632289
3,20.595335,487.706960
4,11.503498,316.240194
...,...,...
495,22.274899,524.746364
496,32.893092,755.818399
497,12.588157,306.090719
498,22.362402,566.217304


In [6]:
icecream_sales_df.head()

Unnamed: 0,Temperature,Revenue
0,24.566884,534.799028
1,26.005191,625.190122
2,27.790554,660.632289
3,20.595335,487.70696
4,11.503498,316.240194


In [7]:
icecream_sales_df.tail()

Unnamed: 0,Temperature,Revenue
495,22.274899,524.746364
496,32.893092,755.818399
497,12.588157,306.090719
498,22.362402,566.217304
499,28.957736,655.660388


In [8]:
# Input X (temperature) and output y (revenue).
# The list selectors keep both as one-column DataFrames rather than Series.
X = icecream_sales_df.loc[:, ['Temperature']]
y = icecream_sales_df.loc[:, ['Revenue']]

In [9]:
X

Unnamed: 0,Temperature
0,24.566884
1,26.005191
2,27.790554
3,20.595335
4,11.503498
...,...
495,22.274899
496,32.893092
497,12.588157
498,22.362402


In [10]:
y

Unnamed: 0,Revenue
0,534.799028
1,625.190122
2,660.632289
3,487.706960
4,316.240194
...,...
495,524.746364
496,755.818399
497,306.090719
498,566.217304


In [11]:
# Check out the shape of the input
X.shape

(500, 1)

In [12]:
# Check out the shape of the output
y.shape

(500, 1)

In [13]:
# Turn X and y into float32 NumPy arrays
X = np.array(X, dtype = 'float32')
y = np.array(y, dtype = 'float32')

In [15]:
# Only take the numerical variables and scale them
# X 

In [17]:
# y

In [18]:
# Hold out 20% of the samples for testing, using scikit-learn's train_test_split.
# random_state is fixed so that the split is the same on every run of the notebook.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [19]:
# Train learner
# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python.
# It lets Python developers write software that uses services such as Amazon S3 and Amazon EC2.

import sagemaker
import boto3

# Create the SageMaker session that the estimator below is given
sagemaker_session = sagemaker.Session()

# S3 bucket and prefix used for this session's data and training output
bucket = 'aws-ml-engineer-training001' # the bucket needs to be created beforehand
prefix = 'linear_learner' # prefix is the sub-folder within the bucket

# Get the execution role of the notebook instance.
# This is the IAM role that was chosen when the notebook instance was created; it is passed to the training job.
# SageMaker assumes this AWS Identity and Access Management (IAM) role to perform tasks on your behalf,
# for example reading the training data from the S3 bucket and writing the training results (model artifacts) to Amazon S3.
role = sagemaker.get_execution_role()
print(role)

arn:aws:iam::852800376493:role/service-role/AmazonSageMaker-ExecutionRole-20221006T193158


In [20]:
X_train.shape

(400, 1)

In [21]:
# The RecordIO writer used below needs the labels as a 1-D vector, not an (n, 1) column.
# reshape(-1) flattens the column and leaves an already flat array unchanged, so this cell
# can be re-run safely (y_train[:, 0] raises an IndexError on the second run).
y_train = y_train.reshape(-1)

In [22]:
y_train.shape

(400,)

In [23]:
import io # The io module allows for dealing with various types of I/O (text I/O, binary I/O and raw I/O). 
import numpy as np
import sagemaker.amazon.common as smac # sagemaker common libary

# Code below converts the training data from NumPy arrays to RecordIO format
# This is the format required by Sagemaker Linear Learner (one of many available options!)

buf = io.BytesIO() # in-memory byte buffer that the encoded records are written to
smac.write_numpy_to_dense_tensor(buf, X_train, y_train)
buf.seek(0) 
# Writing moves the buffer's position forward to the end of the written data,
# so rewind it to the start before the buffer is read for the upload


0

In [24]:
import os

# Upload the RecordIO-encoded training data to S3

# key is the name of the object (file) stored under <prefix>/train/
key = 'linear-train-data'

# S3 object keys always use '/' as the separator, so the key is built with plain string
# formatting instead of os.path.join (which uses the operating system's separator and
# could then disagree with the s3:// URI built below).
train_object_key = '{}/train/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buf)

# Location of the uploaded training data in S3; the training job reads from here
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://aws-ml-engineer-training001/linear_learner/train/linear-train-data


In [25]:
X_test.shape

(100, 1)

In [26]:
y_test.shape

(100, 1)

In [27]:
# Make sure that the target label is a 1-D vector.
# reshape(-1) flattens the (n, 1) column and leaves an already flat array unchanged, so this
# cell can be re-run safely (y_test[:, 0] raises an IndexError on the second run).
y_test = y_test.reshape(-1)


In [28]:
# Convert the test data from NumPy arrays to RecordIO format (the upload happens in the next cell)

buf = io.BytesIO() # in-memory byte buffer that the encoded records are written to
smac.write_numpy_to_dense_tensor(buf, X_test, y_test)
buf.seek(0) 
# Writing moves the buffer's position forward to the end of the written data,
# so rewind it to the start before the buffer is read for the upload


0

In [29]:
# key is the name of the object (file) stored under <prefix>/test/
key = 'linear-test-data'

# Upload the RecordIO-encoded test data to S3.
# S3 object keys always use '/' as the separator, so the key is built with plain string
# formatting instead of os.path.join (which uses the operating system's separator and
# could then disagree with the s3:// URI built below).
test_object_key = '{}/test/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(test_object_key).upload_fileobj(buf)

# Location of the uploaded testing data in S3
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, prefix, key)
print('uploaded testing data location: {}'.format(s3_test_data))

uploaded training data location: s3://aws-ml-engineer-training001/linear_learner/test/linear-test-data


In [30]:
# S3 location under which the linear learner's training output is stored

output_location = f's3://{bucket}/{prefix}/output'
print(f'Training artifacts will be uploaded to: {output_location}')

Training artifacts will be uploaded to: s3://aws-ml-engineer-training001/linear_learner/output


In [31]:
# Note that this code uses the SageMaker SDK 2.x API

# Look up the training container image of a SageMaker built-in algorithm;
# only the name of the algorithm has to be given.

# The region is not hard-coded: it is read from the current boto3 session.
container = sagemaker.image_uris.retrieve(framework = "linear-learner",
                                          region = boto3.Session().region_name)


# With the old SageMaker SDK 1.x, use get_image_uri instead:
# from sagemaker.amazon.amazon_estimator import get_image_uri
# container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [32]:
# Note that this code uses the SageMaker SDK 2.x API

# The Estimator is given the container image, the IAM role, the number and type of
# training instances, the S3 output path and the SageMaker session.
linear = sagemaker.estimator.Estimator(
    image_uri = container,
    role = role,
    instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    output_path = output_location,
    sagemaker_session = sagemaker_session,
)

# Hyperparameters: number of input features, predictor type ('regressor' or 'classifier'),
# mini-batch size and number of epochs.
# num_models = 32 trains 32 versions of the model and keeps the best one (built-in parameter optimization).
linear_hyperparameters = {
    'feature_dim': 1,
    'predictor_type': 'regressor',
    'mini_batch_size': 5,
    'epochs': 5,
    'num_models': 32,
    'loss': 'absolute_loss',
}
linear.set_hyperparameters(**linear_hyperparameters)

# Train the linear learner on the training data stored in S3.
# The progress of the training job is shown in the cell output.
linear.fit(inputs = {'train': s3_train_data})

2022-10-10 12:54:38 Starting - Starting the training job...ProfilerReport-1665406478: InProgress
...
2022-10-10 12:55:23 Starting - Preparing the instances for training......
2022-10-10 12:56:24 Downloading - Downloading input data...
2022-10-10 12:57:03 Training - Downloading the training image...........[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[10/10/2022 12:58:49 INFO 139696714217280] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.

In [33]:
# Spot instances are spare EC2 capacity offered at a lower price than On-Demand instances
# (discounts of up to ~90% compared to On-Demand prices).

# use_spot_instances (bool): whether to use SageMaker Managed Spot instances for training.
# max_run (int): timeout in seconds for training (default: 24 * 60 * 60). After this amount of time SageMaker terminates the job regardless of its current status.
# max_wait (int): timeout in seconds for the whole Spot training job, i.e. the time spent waiting for Spot capacity plus the training time (default: None). It must not be smaller than max_run.

linear = sagemaker.estimator.Estimator(
    image_uri = container,
    role = role,
    instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    output_path = output_location,
    sagemaker_session = sagemaker_session,
    use_spot_instances = True,
    max_run = 300,
    max_wait = 600,
)

# Hyperparameters: number of input features, predictor type ('regressor' or 'classifier'),
# mini-batch size and number of epochs.
# num_models = 32 trains 32 versions of the model and keeps the best one (built-in parameter optimization).
linear_hyperparameters = {
    'feature_dim': 1,
    'predictor_type': 'regressor',
    'mini_batch_size': 5,
    'epochs': 5,
    'num_models': 32,
    'loss': 'absolute_loss',
}
linear.set_hyperparameters(**linear_hyperparameters)

# Train the linear learner on the training data stored in S3.
# The progress of the training job is shown in the cell output.
linear.fit(inputs = {'train': s3_train_data})

2022-10-10 13:30:34 Starting - Starting the training job...
2022-10-10 13:30:59 Starting - Preparing the instances for trainingProfilerReport-1665408634: InProgress
.........
2022-10-10 13:32:23 Downloading - Downloading input data...
2022-10-10 13:32:57 Training - Downloading the training image............
2022-10-10 13:34:58 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[10/10/2022 13:34:58 INFO 139627997730624] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'i

In [34]:
# Deploy the trained model to an endpoint so that it can be used for inference.
# serializer: encodes the data sent to the inference endpoint (CSV here).
# deserializer: decodes the data returned by the inference endpoint (JSON here).
# The endpoint stays up until it is removed with delete_endpoint() (last cell of this notebook).

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer


linear_regressor = linear.deploy(initial_instance_count = 1,
                                 instance_type = 'ml.m4.xlarge',
                                 serializer = CSVSerializer(),
                                 deserializer = JSONDeserializer())

---------!

In [None]:
# Use code lines below if you're using AWS SDK 1.0
# from sagemaker.predictor import csv_serializer, json_deserializer
# linear_regressor.content_type = 'text/csv' # This will need to be enabled for AWS SageMaker SDK 1.0
# linear_regressor.serializer = csv_serializer
# linear_regressor.deserializer = json_deserializer

In [35]:
# Send the test inputs to the deployed endpoint and keep its response

result = linear_regressor.predict(X_test)

In [39]:
result # results are in Json format

{'predictions': [{'score': 389.88690185546875},
  {'score': 509.239501953125},
  {'score': 127.8633041381836},
  {'score': 758.1600341796875},
  {'score': 351.9227600097656},
  {'score': 441.7303771972656},
  {'score': 231.46726989746094},
  {'score': 603.9263916015625},
  {'score': 478.20269775390625},
  {'score': 492.3080749511719},
  {'score': 448.81396484375},
  {'score': 409.36553955078125},
  {'score': 654.7429809570312},
  {'score': 525.6947021484375},
  {'score': 1011.6722412109375},
  {'score': 650.9990844726562},
  {'score': 720.1375122070312},
  {'score': 539.11083984375},
  {'score': 798.2227172851562},
  {'score': 662.3613891601562},
  {'score': 564.3772583007812},
  {'score': 367.7423095703125},
  {'score': 467.65557861328125},
  {'score': 349.32513427734375},
  {'score': 446.5513610839844},
  {'score': 630.5269165039062},
  {'score': 702.7984008789062},
  {'score': 379.4493408203125},
  {'score': 775.9596557617188},
  {'score': 652.5621337890625},
  {'score': 483.1778259

In [40]:
# The response is JSON: result['predictions'] holds one {'score': ...} entry per sample.
# Collect the scores into a NumPy array.

predictions = np.array([entry['score'] for entry in result['predictions']])

In [43]:
predictions[:20]

array([ 389.88690186,  509.23950195,  127.86330414,  758.16003418,
        351.92276001,  441.7303772 ,  231.4672699 ,  603.9263916 ,
        478.20269775,  492.30807495,  448.81396484,  409.36553955,
        654.74298096,  525.69470215, 1011.67224121,  650.99908447,
        720.13751221,  539.11083984,  798.22271729,  662.36138916])

In [44]:
predictions.shape

(100,)

In [45]:
# VISUALIZE TEST SET RESULTS
# The prediction line is drawn through the samples in ascending temperature order, so it is
# traced once from left to right instead of jumping back and forth between unsorted points.
order = np.argsort(X_test[:, 0])

plt.figure(figsize = (10, 6))
plt.scatter(X_test, y_test, color = 'blue')
plt.plot(X_test[order, 0], predictions[order], color = 'red')
plt.xlabel('Temperature [DegC]')
plt.ylabel('Revenue [$]')
plt.title('Temperature vs. Revenue (Testing Dataset)')
plt.grid()

In [None]:
# Delete the endpoint once no more predictions are needed; it stays deployed until this cell is run
linear_regressor.delete_endpoint()