In [None]:
import os
import sagemaker
from sagemaker import get_execution_role

def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        EnvironmentError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise EnvironmentError(
            "Environment variable {} must be set before running this notebook.".format(name)
        )
    return value


# Fail fast on missing configuration. With a plain os.getenv() an unset
# BUCKET_NAME would be formatted into the S3 output path as the text "None",
# and an unset ENDPOINT_NAME would be passed as None to the deploy, predictor
# and delete_endpoint calls further down -- i.e. the error would only surface
# after the (long) training job has already run.
bucket = _require_env('BUCKET_NAME')
endpoint_name = _require_env('ENDPOINT_NAME')
sagemaker_session = sagemaker.Session(default_bucket=bucket)

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()
region = sagemaker_session.boto_session.region_name

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Directory and script name that are later handed to the estimator as
# source_dir / entry_point.
source_dir = 'pipeline'
entry_point = 'sklearn_featureizer.py'

# S3 location for the training output: s3://<bucket>/<prefix>/<model name>.
s3_output_key_prefix = "training_output"
model_name = 'sgd'  # rename based on model
model_output_path = f's3://{bucket}/{s3_output_key_prefix}/{model_name}'

In [None]:
# S3 URI of the training CSV. Set the TRAIN_INPUT environment variable to point
# at your own data (same mechanism as BUCKET_NAME / ENDPOINT_NAME). The fallback
# is the hard-coded URI this notebook shipped with, which you will need to
# change or override for your own bucket.
train_input = os.getenv(
    'TRAIN_INPUT',
    's3://beularnotebookstack-beularsagemakerapibucket1198e-xck265jh9uop/training_output/train/train.csv'
)

# Train the Model

In [None]:
# Upper bound on training time: 48 hours, expressed in seconds.
# The training job is terminated once this limit is reached.
train_max_run = 60 * 60 * 48

# Collect the estimator settings in one place, then build the SKLearn estimator.
estimator_settings = dict(
    entry_point=entry_point,
    source_dir=source_dir,
    framework_version='0.23-1',
    role=role,
    sagemaker_session=sagemaker_session,
    train_instance_type="ml.c5.18xlarge",
    train_max_run=train_max_run,
    output_path=model_output_path,
)
grid_search = SKLearn(**estimator_settings)

In [None]:
# Fit the estimator on the 'train' channel with log output enabled.
# Note that this will take a while.
training_channels = {'train': train_input}
grid_search.fit(training_channels, logs=True)

# Deploy the Model
We now have a fitted model (i.e. the best estimator from the grid search) in our S3 bucket. We can now deploy this model behind a single endpoint. When this is done, you'll be able to see this endpoint under Endpoints in the SageMaker console.

In [None]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel

# Turn the fitted estimator into a deployable model, using the notebook's role.
model = grid_search.create_model(role=role)

# Host the model on a single instance behind the configured endpoint name.
hosting_config = {
    'endpoint_name': endpoint_name,
    'instance_type': 'ml.c5.xlarge',
    'initial_instance_count': 1,
}
model.deploy(**hosting_config)

# Request inferences from the endpoint
With our model deployed behind a REST API, we'll now make some requests to it in order to get inferences from our validation set. We can then use these inferences to see how well the trained model performs on out-of-sample data.

Note that we need to make our request with the payload in text/csv format, since that is what our script currently supports (see input_fn() in our entrypoint file). If other formats need to be supported, they would have to be added to that input_fn() function. Note, however, that we set accept to application/json so that the output, i.e. the inferences, is returned as JSON. We do this because our output_fn() function returns JSON.

In [None]:
import pandas as pd
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON

# Column labels applied to the loaded CSV, in file order.
clause_columns = ['Clause ID', 'Clause Text', 'Classification']

# Load the CSV referenced by train_input, then replace whatever header
# pandas inferred with the labels above.
df = pd.read_csv(train_input)
df.columns = clause_columns

In [None]:
import json
from sklearn.metrics import f1_score, brier_score_loss

# Client for the deployed endpoint: requests are serialized as CSV and sent
# with a text/csv content type; responses are requested as JSON.
predictor = RealTimePredictor(
    endpoint=endpoint_name,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON,
    sagemaker_session=sagemaker_session,
)

In [None]:
import base64

# Send one sample text to the endpoint and parse the JSON response.
pred = predictor.predict("This is a test")
prediction = json.loads(pred)

# The first instance's 'expl' field holds base64-encoded UTF-8 text;
# decode it and show the result as the cell output.
encoded_expl = prediction['instances'][0]['expl'].encode('utf-8')
base64.b64decode(encoded_expl).decode('utf-8')

In [None]:
# Clean up: delete the endpoint through a boto3 SageMaker client created
# from the session used throughout this notebook.
boto_session = sagemaker_session.boto_session
sm_client = boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)