In [None]:
import os
import sagemaker
from sagemaker import get_execution_role

def _require_env(name):
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's non-empty string value.

    Raises:
        RuntimeError: If the variable is unset or empty. Failing here avoids
            carrying ``None`` forward into the S3 URIs and endpoint name that
            later cells build from these values.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} must be set before running this notebook."
        )
    return value


# Notebook configuration comes from the environment; both values are required.
bucket = _require_env('BUCKET_NAME')
endpoint_name = _require_env('ENDPOINT_NAME')
sagemaker_session = sagemaker.Session(default_bucket=bucket)

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()
region = sagemaker_session.boto_session.region_name

# Create a SageMaker Scikit estimator
Here we create an instance of a `sagemaker.sklearn.estimator.SKLearn` estimator. The constructor accepts several arguments:

- source_dir: Path (absolute or relative) to the directory with our custom model training source code.
- entry_point: The path to the Python script SageMaker runs for training and prediction within source_dir.
- role: Role ARN, which is provided by get_execution_role()
- train_instance_type (optional): The type of SageMaker instances for training. Note: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types. Also, note that you may need to request an EC2 quota increase for these ml ec2 instance types.
- sagemaker_session (optional): The session used to train on Sagemaker, as returned by sagemaker.Session().
- output_path (optional): s3 location where you want the training result (model artifacts and optional output files) saved. If not specified, results are stored to a default bucket. If the bucket with the specific name does not exist, the estimator creates the bucket during execution of the fit() method.
- train_max_run (optional): Timeout in seconds for training (default: `24 * 60 * 60`, i.e. 24 hours). After this amount of time Amazon SageMaker terminates the job regardless of its current status.

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Location of the custom training code: `entry_point` is the script inside
# `source_dir` that SageMaker runs.
source_dir = 'pipeline'
entry_point = 'sklearn_featureizer.py'

# Model artifacts are written below s3://<bucket>/training_output/model.
s3_output_key_prefix = "training_output"
model_output_path = 's3://{}/{}/{}'.format(bucket, s3_output_key_prefix, 'model')

# CSV in the bucket that is passed to fit() as the training input.
train_input = f's3://{bucket}/AI_ML_Challenge_Training_Data_Set_1_v1.csv'

# terminate model training after 48 hours (value is in seconds)
train_max_run = 48 * 60 * 60

# Collect the constructor arguments in one place, then build the estimator.
estimator_settings = dict(
    framework_version='0.23-1',
    source_dir=source_dir,
    entry_point=entry_point,
    role=role,
    train_instance_type="ml.c5.4xlarge",
    sagemaker_session=sagemaker_session,
    output_path=model_output_path,
    train_max_run=train_max_run,
)
grid_search = SKLearn(**estimator_settings)

In [None]:
# Start training on the 'train' channel, with logs=True so the job's logs are
# shown while it runs. Note that this will take a while.
training_channels = {'train': train_input}
grid_search.fit(training_channels, logs=True)

# Deploy the Model
We now have a fitted model (i.e. the best estimator from the Grid Search) in our s3 bucket. We can now deploy this model behind a single endpoint. When this is done, you'll be able to see this endpoint under Endpoints in the SageMaker console.

In [None]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel

# Turn the fitted estimator into a deployable model object.
model = grid_search.create_model(role=role)

# Host the model on a single instance behind the endpoint named by
# `endpoint_name`.
deploy_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.c5.xlarge',
    'endpoint_name': endpoint_name,
}
model.deploy(**deploy_settings)

# Request inferences from the endpoint
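With our model deployed behind a REST API, we'll now make some requests to it in order to get inferences from our validation set. We can then use these inferences to see how well the trained model performs on out-of-sample data. (Caveat: as written, the cells below reload the training CSV (`train_input`) and score its first 10 rows, so the reported metrics are in-sample; substitute a held-out file to measure out-of-sample performance.)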

Note that we need to make our request with the payload in text/csv format, since that is what our script currently supports (see input_fn() in our entrypoint file). If other formats need to be supported, they would have to be added to that input_fn() function. Note, however, that we set the accept type to application/json to receive our output, i.e. the inferences, as JSON. We do this because our output_fn() function returns JSON.

In [None]:
import pandas as pd
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON

# Column labels applied to the loaded frame, replacing whatever header the
# CSV provides.
column_names = ['Clause ID', 'Clause Text', 'Classification']

# Load the same CSV that was used as the training input.
df = pd.read_csv(train_input)
df.columns = column_names

In [None]:
import json
from sklearn.metrics import f1_score, brier_score_loss

# Client for the deployed endpoint: requests are sent as text/csv and the
# response is requested as application/json.
predictor = RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON
)

# Number of leading rows of `df` that are sent to the endpoint and scored.
n_eval_samples = 10

preds = []
# Positions (within the evaluated slice) whose request failed; these are
# excluded from y_true below so labels and predictions stay aligned.
samples_to_drop = []
clauses = df['Clause Text'].iloc[:n_eval_samples].tolist()
for i, clause in enumerate(clauses):
    try:
        pred = predictor.predict(clause)
    except Exception as e:
        # Report the failure and skip this sample. Without the `continue`, a
        # stale (or undefined) `pred` from a previous iteration would be
        # appended and silently corrupt the evaluation.
        print(e)
        print(clause)
        samples_to_drop.append(i)
        continue
    preds.append(pred)

# Each response is a JSON document; the first entry of "instances" carries
# the predicted label and the score used for the Brier loss.
y_pred = []
pred_probs = []
for p in preds:
    instance = json.loads(p)["instances"][0]
    y_pred.append(instance.get('prediction'))
    pred_probs.append(instance.get('decision boundary'))

# Keep only the labels of samples that actually produced a prediction.
failed_positions = set(samples_to_drop)
labels = df['Classification'].iloc[:n_eval_samples].tolist()
y_true = [label for i, label in enumerate(labels) if i not in failed_positions]

if not y_true:
    raise RuntimeError("No predictions were returned by the endpoint; nothing to score.")

f1 = f1_score(y_true, y_pred)
# NOTE(review): brier_score_loss expects positive-class probabilities; confirm
# that the endpoint's 'decision boundary' field is a probability in [0, 1].
bs = brier_score_loss(y_true, pred_probs)
print(f"f1: {f1}\nbs: {bs}")

In [None]:
# Clean up: delete the endpoint through a low-level SageMaker client created
# from the session's boto session.
boto_session = sagemaker_session.boto_session
sm_client = boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)