Goal: with Iris dataset, run a successful training job

Flex Goal:
1. train the job using pipemode.

Research:

Notes:
1. Notebook adapted from LinearLearner QuickStart

In [2]:
import os
import boto3
import re
import sagemaker
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# SageMaker Config.


role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket = sagemaker.Session().default_bucket()
prefix = 'iris'



In [4]:
iris_data = f"s3://{bucket}/{prefix}/Iris.csv"
iris_data

's3://sagemaker-us-east-1-258532878709/iris/Iris.csv'

In [5]:
# Removing header because that's the format sagemaker expects.
# In sagemaker the target is the first column
# Columns: Id	SepalLengthCm	SepalWidthCm	PetalLengthCm	PetalWidthCm	Species

iris_df = pd.read_csv(iris_data, header = None)

# Prep data into S3
shuffled = iris_df[1:].sample(frac=1, random_state=1).reset_index()
shuffled = shuffled.iloc[2:,2:-1]

train_df = shuffled[:110]
test_df = shuffled[110:120]
validation_df = shuffled[120:]


In [6]:
shuffled 

Unnamed: 0,1,2,3,4
2,6.6,3.0,4.4,1.4
3,5.4,3.9,1.3,0.4
4,7.9,3.8,6.4,2.0
5,6.3,3.3,4.7,1.6
6,6.9,3.1,5.1,2.3
...,...,...,...,...
145,6.3,2.8,5.1,1.5
146,6.4,3.1,5.5,1.8
147,6.3,2.5,4.9,1.5
148,6.7,3.1,5.6,2.4


In [7]:
FILE_TRAIN = "iris_train.csv"
FILE_TEST = "iris_test.csv"
FILE_VALIDATION = "iris_validation.csv"

# Create CSV files locally in SageMaker Instance/EFS
train_df.to_csv(FILE_TRAIN, index = False)
test_df.to_csv(FILE_TEST, index = False)
validation_df.to_csv(FILE_VALIDATION, index = False)

In [8]:
# Load onto S3

s3 = boto3.client("s3")


s3_train_data = f"s3://{bucket}/{prefix}/train"
s3_validation_data = f"s3://{bucket}/{prefix}/validation"
output_location = f"s3://{bucket}/{prefix}/output"


s3.upload_file(FILE_TRAIN, bucket, f"{prefix}/train/{FILE_TRAIN}")
s3.upload_file(FILE_TEST, bucket, f"{prefix}/test/{FILE_TEST}")
s3.upload_file(FILE_VALIDATION, bucket, f"{prefix}/validation/{FILE_VALIDATION}")

In [9]:

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

## Training the Linear Learner model


In [10]:
# getting the linear learner image according to the region
from sagemaker.image_uris import retrieve

container = retrieve("linear-learner", boto3.Session().region_name, version="1")
print(container)

382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1


In [11]:
%%time
import boto3
import sagemaker
from time import gmtime, strftime

sess = sagemaker.Session()

job_name = "linear-learner-iris-regression-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
print("Training job", job_name)

linear = sagemaker.estimator.Estimator(
    container,
    role,
    input_mode="File",
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=output_location,
    sagemaker_session=sess,
)


linear.set_hyperparameters(
    feature_dim=3, # Make sure this accurate to the training data. Don't include the target in this count.  ## https://stackoverflow.com/questions/49303648/sagemaker-clienterror-rows-1-5000-have-more-fields-than-expected-size-3
    epochs=5,
    wd=0.01,
    loss="absolute_loss",
    predictor_type="regressor",
    normalize_data=True,
    optimizer="adam",
    mini_batch_size=25,
    lr_scheduler_step=100,
    lr_scheduler_factor=0.99,
    lr_scheduler_minimum_lr=0.0001,
    learning_rate=0.1,
)

Training job linear-learner-iris-regression-20221104-14-45-15
CPU times: user 28.8 ms, sys: 116 µs, total: 29 ms
Wall time: 28.4 ms


In [None]:
%%time
linear.fit(inputs={"train": train_data, "validation": validation_data}, job_name=job_name)

2022-11-04 14:45:15 Starting - Starting the training job...
2022-11-04 14:45:40 Starting - Preparing the instances for trainingProfilerReport-1667573115: InProgress
.........
2022-11-04 14:47:15 Downloading - Downloading input data
2022-11-04 14:47:15 Training - Downloading the training image..........

In [None]:
%%time
# creating the endpoint out of the trained model
linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")
print(f"\ncreated endpoint: {linear_predictor.endpoint_name}")

In [None]:
# configure the predictor to accept to serialize csv input and parse the reposne as json
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

linear_predictor.serializer = CSVSerializer()
linear_predictor.deserializer = JSONDeserializer()

Inferences/Prediction

In [None]:
# getting testing sample from our test file
import random 
test_data = [l for l in open(FILE_TEST, "r")]
sample = random.choice(test_data).split(",")
actual_age = sample[0]
payload = sample[1:]  
payload = ",".join(map(str, payload))

# Invoke the predicor and analyise the result
result = linear_predictor.predict(payload)

# extracting the prediction value
result = round(float(result["predictions"][0]["score"]), 2)


In [None]:
# Given SepalWidthCm, PetalLengthCm, and PetalWidthCm, the SepalLengthCm is: 
result

## Delete the Endpoint
Having an endpoint running will incur some costs. Therefore as a clean-up job, we should delete the endpoint.

In [None]:
sagemaker.Session().delete_endpoint(linear_predictor.endpoint_name)
print(f"deleted {linear_predictor.endpoint_name} successfully!")