In [2]:
import sagemaker
import boto3
import os
import numpy as np
import seaborn as sns

# Project bucket used below for the model-input data and model output paths.
BUCKET = 'project05-capstone-vexenta'

# Execution role passed to the estimator and models created in this notebook.
role = sagemaker.get_execution_role()

# Use seaborn's "talk" plotting context.
sns.set_context("talk")

# Train Model

Using best hyperparameters from `notebook/sagemaker/00-sm-hp-tuning.ipynb`.

In [3]:
# S3 locations of the train/test CSVs and of the tuned model artifact.
input_train = f"s3://{BUCKET}/data/model-input/train/df_train_rfe.csv"
input_test = f"s3://{BUCKET}/data/model-input/test/df_test_rfe.csv"
model_output_dir = f"s3://{BUCKET}/model/hp-tuning/model.tar.gz"

# Publish the same locations under the SM_* environment variable names.
os.environ.update(
    {
        "SM_MODEL_DIR": model_output_dir,
        "SM_CHANNEL_TRAIN": input_train,
        "SM_CHANNEL_TEST": input_test,
    }
)

In [19]:
# Hyperparameters of the best tuning job; values arrive as JSON-quoted strings.
hyperparameters = {
    '_tuning_objective_metric': '"cv f1-score"',
    'max_depth': '"30"',
    'min_samples_split': '"2"',
    'n_estimators': '"300"',
    'sagemaker_container_log_level': '20',
    'sagemaker_estimator_class_name': '"SKLearn"',
    'sagemaker_estimator_module': '"sagemaker.sklearn.estimator"',
    'sagemaker_job_name': '"sagemaker-scikit-learn-2022-01-05-01-33-08-441"',
    'sagemaker_program': '"train-rf.py"',
    'sagemaker_region': '"us-east-1"',
    'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-567220378588/sagemaker-scikit-learn-2022-01-05-01-33-08-441/source/sourcedir.tar.gz"'
}

## Cast the numeric model hyperparameters from quoted strings to int
for name in ("min_samples_split", "n_estimators", "max_depth"):
    raw_value = hyperparameters[name]
    if isinstance(raw_value, int):
        # Already converted, e.g. when this cell is run a second time.
        continue
    hyperparameters[name] = int(raw_value.replace('"', ''))

hyperparameters

{'_tuning_objective_metric': '"cv f1-score"',
 'max_depth': 30,
 'min_samples_split': 2,
 'n_estimators': 300,
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"SKLearn"',
 'sagemaker_estimator_module': '"sagemaker.sklearn.estimator"',
 'sagemaker_job_name': '"sagemaker-scikit-learn-2022-01-05-01-33-08-441"',
 'sagemaker_program': '"train-rf.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-567220378588/sagemaker-scikit-learn-2022-01-05-01-33-08-441/source/sourcedir.tar.gz"'}

In [20]:
## Build the SKLearn estimator that runs train-rf.py with the hyperparameters above
from sagemaker.sklearn.estimator import SKLearn

estimator = SKLearn(
    entry_point="../../src/modelling/train-rf.py",
    framework_version="0.20.0",
    py_version='py3',
    role=role,
    instance_type="ml.m5.2xlarge",
    instance_count=1,
    hyperparameters=hyperparameters,
)

## Start the training job on the train/test channels and block until it completes
training_channels = {"train": input_train, "test": input_test}
estimator.fit(inputs=training_channels, wait=True)

2022-01-05 02:18:32 Starting - Starting the training job...
2022-01-05 02:18:56 Starting - Launching requested ML instancesProfilerReport-1641349111: InProgress
......
2022-01-05 02:19:56 Starting - Preparing the instances for training......
2022-01-05 02:21:01 Downloading - Downloading input data...
2022-01-05 02:21:17 Training - Downloading the training image....[34m2022-01-05 02:22:06,464 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-01-05 02:22:06,467 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-01-05 02:22:06,476 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-01-05 02:22:06,754 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-01-05 02:22:13,024 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-01-05 02:22:13,036 sagemaker-training-toolkit 

In [21]:
## Show the S3 URI of the model artifact (model.tar.gz) written by the training job
estimator.model_data

's3://sagemaker-us-east-1-567220378588/sagemaker-scikit-learn-2022-01-05-02-18-31-464/output/model.tar.gz'

# Deploy model as a Sagemaker Endpoint

Relevant readings:
- https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb
- https://dev.to/aws-builders/running-custom-algorithm-in-aws-sagemaker-4jdf
- https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_inference_pipeline/Inference%20Pipeline%20with%20Scikit-learn%20and%20Linear%20Learner.ipynb

## Pass a JSON input

In [53]:
## Live endpoint that takes JSON requests.
## The preprocessing logic lives in the inference script, which MUST be provided
## (with model_fn, etc.) for the endpoint to work.
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.serializers import JSONSerializer

model_location = 's3://sagemaker-us-east-1-567220378588/sagemaker-scikit-learn-2022-01-05-02-18-31-464/output/model.tar.gz'

sklearn_model = SKLearnModel(
    entry_point='../../src/modelling/inference.py',
    model_data=model_location,
    framework_version='0.20.0',
    py_version='py3',
    role=role,
)

## Requests sent through this predictor are serialized as JSON
json_serializer = JSONSerializer()
predictor = sklearn_model.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
    serializer=json_serializer,
)

predictor

-----!

<sagemaker.sklearn.model.SKLearnPredictor at 0x7fee396accd0>

In [54]:
import json

## First sample session, wrapped under the "data" key of the request body
session_features = {
    "sessionNo": 101,
    "startHour": 4,
    "startWeekday": 7,
    "duration": 0,
    "cCount": 2,
    "cMinPrice": 30,
    "cMaxPrice": 40,
    "cSumPrice": 70,
    "bCount": 1,
    "bMinPrice": 30,
    "bMaxPrice": 30,
    "bSumPrice": 30,
    "bStep": "?",
    "onlineStatus": "?",
    "availability": "?",
    "customerNo": 39,
    "maxVal": 200,
    "customerScore": 65,
    "accountLifetime": 30,
    "payments": 2,
    "age": 39,
    "address": 1,
    "lastOrder": 30,
}
payload = {"data": session_features}

print(json.dumps(payload))

## The dict is passed as-is: the predictor was deployed with JSONSerializer(),
## so calling json.dumps(payload) before predict() is not needed.
request_args = {"ContentType": "application/json"}
response = predictor.predict(payload, initial_args=request_args)

print(response)

{"data": {"sessionNo": 101, "startHour": 4, "startWeekday": 7, "duration": 0, "cCount": 2, "cMinPrice": 30, "cMaxPrice": 40, "cSumPrice": 70, "bCount": 1, "bMinPrice": 30, "bMaxPrice": 30, "bSumPrice": 30, "bStep": "?", "onlineStatus": "?", "availability": "?", "customerNo": 39, "maxVal": 200, "customerScore": 65, "accountLifetime": 30, "payments": 2, "age": 39, "address": 1, "lastOrder": 30}}
[0]


In [55]:
## Second sample session; same fields as the previous request with different values
session_features = {
    "sessionNo": 15,
    "startHour": 6,
    "startWeekday": 5,
    "duration": 10,
    "cCount": 2,
    "cMinPrice": 30,
    "cMaxPrice": 40,
    "cSumPrice": 70,
    "bCount": 1,
    "bMinPrice": 30,
    "bMaxPrice": 30,
    "bSumPrice": 30,
    "bStep": "?",
    "onlineStatus": "?",
    "availability": "?",
    "customerNo": 39,
    "maxVal": 200,
    "customerScore": 65,
    "accountLifetime": 30,
    "payments": 2,
    "age": 19,
    "address": 2,
    "lastOrder": 75,
}
payload = {"data": session_features}

print(json.dumps(payload))

## Send the dict to the JSON endpoint and show the prediction
request_args = {"ContentType": "application/json"}
response = predictor.predict(payload, initial_args=request_args)

print(response)

{"data": {"sessionNo": 15, "startHour": 6, "startWeekday": 5, "duration": 10, "cCount": 2, "cMinPrice": 30, "cMaxPrice": 40, "cSumPrice": 70, "bCount": 1, "bMinPrice": 30, "bMaxPrice": 30, "bSumPrice": 30, "bStep": "?", "onlineStatus": "?", "availability": "?", "customerNo": 39, "maxVal": 200, "customerScore": 65, "accountLifetime": 30, "payments": 2, "age": 19, "address": 2, "lastOrder": 75}}
[1]


In [56]:
## Delete the JSON endpoint now that the test requests above are done
predictor.delete_endpoint()

Yay, it works!

## Pass CSV input

**not working yet**.

In [41]:
## Live endpoint that takes CSV requests.
## The preprocessing logic lives in the inference script, which MUST be provided
## (with model_fn, etc.) for the endpoint to work.
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.serializers import CSVSerializer

model_location = 's3://sagemaker-us-east-1-567220378588/sagemaker-scikit-learn-2022-01-05-02-18-31-464/output/model.tar.gz'

sklearn_model = SKLearnModel(
    entry_point='../../src/modelling/inference.py',
    model_data=model_location,
    framework_version='0.20.0',
    py_version='py3',
    role=role,
)

## Requests are serialized as CSV; no deserializer is set
## (JSONDeserializer / CSVDeserializer were considered but left out).
csv_serializer = CSVSerializer()
predictor = sklearn_model.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
    serializer=csv_serializer,
)

predictor

-----!

<sagemaker.sklearn.model.SKLearnPredictor at 0x7fee3a2b7310>

In [45]:
import pandas as pd

## Sample input used to test the endpoint: one session as a flat mapping of feature -> value
payload = dict(
    sessionNo=101,
    startHour=4,
    startWeekday=7,
    duration=0,
    cCount=2,
    cMinPrice=30,
    cMaxPrice=40,
    cSumPrice=70,
    bCount=1,
    bMinPrice=30,
    bMaxPrice=30,
    bSumPrice=30,
    bStep="?",
    onlineStatus="?",
    availability="?",
    customerNo=39,
    maxVal=200,
    customerScore=65,
    accountLifetime=30,
    payments=2,
    age=39,
    address=1,
    lastOrder=30,
)

## Turn the mapping into a single-row DataFrame (one column per feature)
df_input = pd.json_normalize(payload)
df_input

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,...,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
0,101,4,7,0,2,30,40,70,1,30,...,?,?,39,200,65,30,2,39,1,30


In [51]:
## Send the sample row to the CSV endpoint.
## Pass the raw row values (ints and plain strings) and let the predictor's
## CSVSerializer build the request body as ONE comma-separated line.
## Do not convert the values to str first: CSVSerializer treats a list whose
## elements are themselves sequence-like (str included) as multiple rows, so a
## list of strings is sent as one value per line instead of a single CSV row.
## Manually wrapping "?" in quotes is not needed either; the serializer takes
## care of CSV quoting.
## NOTE(review): the endpoint may still answer 500 until inference.py parses
## text/csv input; check the endpoint's CloudWatch logs to confirm.
input_csv = df_input.values[0].tolist()

response = predictor.predict(
    data=input_csv,
    initial_args={
        "ContentType": "text/csv"
    }
)

print(response)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/sagemaker-scikit-learn-2022-01-06-02-30-27-382 in account 567220378588 for more information.

In [52]:
## Delete the CSV endpoint deployed for the test above
predictor.delete_endpoint()

# Batch transform

**Not working at the moment**; we should adjust `inference.py` to accept the `text/csv` content type properly - this may require hard-coding the feature names there.

In [25]:
## Batch transform: run the trained model over the test CSV in S3 and write the output to S3
from sagemaker.sklearn.model import SKLearnModel

model_location = 's3://sagemaker-us-east-1-567220378588/sagemaker-scikit-learn-2022-01-05-02-18-31-464/output/model.tar.gz'
batch_transform_input_path = f"s3://{BUCKET}/data/model-input/test/df_test_rfe.csv"
batch_transform_output_path = f"s3://{BUCKET}/data/model-output/batch-transform/"

sklearn_model = SKLearnModel(
    entry_point='../../src/modelling/inference.py',
    model_data=model_location,
    framework_version='0.20.0',
    py_version='py3',
    role=role,
)

transformer = sklearn_model.transformer(
    instance_type='ml.m5.large',
    instance_count=1,
    output_path=batch_transform_output_path,
)

## Input is sent as text/csv, split line by line
transformer.transform(
    data=batch_transform_input_path,
    content_type='text/csv',
    split_type='Line',
    data_type='S3Prefix',
)

## Block until the transform job has finished
transformer.wait()