In [1]:
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston


sm_boto3 = boto3.client("sagemaker")

sess = sagemaker.Session()

region = sess.boto_session.region_name

bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

print("Using bucket " + bucket)



Using bucket sagemaker-us-east-1-183492708471


In [2]:
# we use the Boston housing dataset
data = load_boston()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.25, random_state=42
)

trainX = pd.DataFrame(X_train, columns=data.feature_names)
trainX["target"] = y_train

testX = pd.DataFrame(X_test, columns=data.feature_names)
testX["target"] = y_test



In [4]:
trainX.head()

trainX.to_csv("boston_train.csv")
testX.to_csv("boston_test.csv")

# send data to S3. SageMaker will take training data from s3
trainpath = sess.upload_data(
    path="boston_train.csv", bucket=bucket, key_prefix="sagemaker/sklearncontainer"
)

testpath = sess.upload_data(
    path="boston_test.csv", bucket=bucket, key_prefix="sagemaker/sklearncontainer"
)



In [5]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf


if __name__ == "__main__":

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="boston_train.csv")
    parser.add_argument("--test-file", type=str, default="boston_test.csv")
    parser.add_argument(
        "--features", type=str
    )  # in this script we ask user to explicitly name features
    parser.add_argument(
        "--target", type=str
    )  # in this script we ask user to explicitly name the target

    args, _ = parser.parse_known_args()

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    X_train = train_df[args.features.split()]
    X_test = test_df[args.features.split()]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)
    print(args.min_samples_leaf)



Writing script.py


In [6]:
! python script.py --n-estimators 100 \
                   --min-samples-leaf 2 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --features 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT' \
                   --target target

extracting arguments
reading data
building training and testing datasets
training model
validating model
AE-at-10th-percentile: 0.20591714285714255
AE-at-50th-percentile: 1.6314446969697016
AE-at-90th-percentile: 4.079971111111117
model persisted at ./model.joblib
2


In [7]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",#"ml.c5.xlarge",#"ml.m4.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}],
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        "features": "CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT",
        "target": "target",
    },
)

import time
tic = time.clock()

# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=False)

toc = time.clock()
print(toc - tic)

0.10000000000000009


In [8]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)

from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

predictor = model.deploy(instance_type="ml.c5.large", initial_instance_count=1)




2021-12-12 09:25:52 Starting - Starting the training job
2021-12-12 09:25:55 Starting - Launching requested ML instances................
2021-12-12 09:27:21 Starting - Preparing the instances for training...........................
2021-12-12 09:29:42 Downloading - Downloading input data...
2021-12-12 09:30:02 Training - Downloading the training image.....
2021-12-12 09:30:31 Training - Training image download completed. Training in progress..
2021-12-12 09:30:43 Uploading - Uploading generated training model.
2021-12-12 09:30:50 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-183492708471/rf-scikit-2021-12-12-09-25-52-106/output/model.tar.gz
------!

In [9]:
## invoke endpoint with lots of request
n = 0
while n < 400:
    print(predictor.predict(testX[data.feature_names]))
    n = n+1

[22.98372976 31.11700292 17.3172241  23.50900664 17.21059405 21.4631301
 19.76100545 16.03384964 21.15855476 20.8417535  20.14908048 19.79905263
  7.94297233 21.88845296 19.44965537 26.13429246 18.7758869   8.70837688
 44.09919286 15.40885711 24.182289   23.93984784 15.04170372 23.60216165
 14.86919696 15.26518932 21.56212532 14.09224567 19.78239268 21.0179653
 19.73208016 23.70115963 29.40328929 20.2604123  14.59666999 16.13537369
 34.9858952  19.48045    21.5309518  23.89891468 19.73006696 28.77203521
 44.34723095 19.60533477 22.94970501 14.206604   15.6597877  24.37670155
 18.70765916 29.1331631  20.86911406 33.74913626 17.32561151 25.86898561
 45.93018849 21.81132424 15.79706147 32.69590271 22.3638544  20.69828085
 25.43320516 34.11378889 30.61793611 19.01387269 27.41775534 16.95365938
 13.87222796 23.10976631 28.81123387 15.10678481 20.61733167 27.0461619
 10.3777017  21.83536378 22.26843429  7.23535711 20.27028254 45.38150141
 11.19164307 13.70591432 21.57696064 10.84992973 19.85

In [10]:
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint_name)

{'ResponseMetadata': {'RequestId': 'ac62fbba-9cc3-46f6-b41b-7c3c4b11440c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ac62fbba-9cc3-46f6-b41b-7c3c4b11440c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 12 Dec 2021 09:36:17 GMT'},
  'RetryAttempts': 0}}