# Deploying on AWS SageMaker for scheduled Batch Transform

Notebook version

by [Daniel Marostica](https://www.linkedin.com/in/danielmarostica/)

In [18]:
import os
import sys

import boto3, sagemaker

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.model import SKLearnModel

## Get a role with SageMaker and S3 permissions

In [3]:
# Look up the IAM role by name and keep only its ARN, which is what the
# SageMaker processor, estimator and model below take as `role`.
iam = boto3.client('iam')
role_details = iam.get_role(RoleName='datascience-sagemaker-s3')
role = role_details['Role']['Arn']

## Start session

Your AWS credentials file (under /home/your.name/.aws/) must be correctly set up with your CLI keys and tokens.

In [4]:
# Session object reused below for the default-bucket lookup, the CSV upload and the estimator.
sagemaker_session = sagemaker.Session()

In [5]:
# Default bucket for this session; SageMaker creates it, named after your region and account ID.
bucket = sagemaker_session.default_bucket()
# S3 key prefix ("folder" name) under which this example's files are stored.
prefix = "titanic_example"

## Upload csv to S3

In [12]:
def upload(sagemaker_session, bucket, prefix, file_path):
    """Upload a local file to S3 through a SageMaker session.

    Args:
        sagemaker_session: object exposing
            ``upload_data(path=..., bucket=..., key_prefix=...)``.
        bucket: name of the destination S3 bucket.
        prefix: key prefix ("folder") the file is stored under.
        file_path: local path of the file to upload.

    Returns:
        Whatever ``upload_data`` returns for the uploaded file.
    """
    # format(x) is what "{}".format(x) does: it renders the argument as text.
    local_path = format(file_path)
    key_prefix = format(prefix)

    uploaded = sagemaker_session.upload_data(
        path=local_path,
        bucket=bucket,
        key_prefix=key_prefix,
    )
    print('Data has been stored in the following bucket:', bucket)
    return uploaded

# Upload the local CSV; the returned value is used as the preprocessing job's input source.
s3_data_uri = upload(sagemaker_session, bucket, prefix, file_path='data/data.csv')

Data has been stored in the following bucket: sagemaker-us-east-1-296025910508


## Preprocessing

### Define the instance to be created

In [13]:
# Processor that will run the preprocessing script on a single instance.
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type='ml.t3.medium',
    instance_count=1,
    base_job_name='sm-preprocessing',  # leading part of the generated job name
)

### Start the job

You can check the job's status in the AWS web console under SageMaker > Processing Jobs.

In [14]:
# The uploaded CSV is made available to the script under /opt/ml/processing/input.
raw_csv_input = ProcessingInput(
    source=s3_data_uri,
    destination='/opt/ml/processing/input',
)

# Two outputs are collected from the container: the train and the test split.
split_outputs = [
    ProcessingOutput(output_name='train_data', source='/opt/ml/processing/train'),
    ProcessingOutput(output_name='test_data', source='/opt/ml/processing/test'),
]

sklearn_processor.run(
    code='modules/preprocessing.py',
    inputs=[raw_csv_input],
    outputs=split_outputs,
    arguments=['--train-test-split-ratio', '0.2'],
)

# Keep the description of the job that was just launched; the training cell
# reads 'ProcessingJobName' from it to locate the outputs.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()


Job Name:  sm-preprocessing-2021-11-30-19-47-11-855
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-296025910508/titanic_example/data.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-296025910508/sm-preprocessing-2021-11-30-19-47-11-855/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-296025910508/sm-preprocessing-2021-11-30-19-47-11-855/output/train_data', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test_data', 'AppManaged

## Training

In [15]:
# You can pass hyperparameters to the algorithm through the estimator.
training_hyperparameters = {"max_leaf_nodes": 30}

# Estimator that runs modules/model.py as the training entry point.
sklearn = SKLearn(
    entry_point='modules/model.py',
    framework_version='0.23-1',
    instance_type='ml.m5.large',
    role=role,
    sagemaker_session=sagemaker_session,
    hyperparameters=training_hyperparameters,
    base_job_name='sm-training',
)

In [16]:
# Locate the train/test files written by the preprocessing job.
# S3 URIs always use '/', so they are built with string formatting:
# os.path.join uses the local OS separator and would produce '\' on Windows.
processing_job_name = preprocessing_job_description['ProcessingJobName']
processing_output_prefix = 's3://{}/{}/output'.format(bucket, processing_job_name)
train_file = '{}/train_data/train.csv'.format(processing_output_prefix)
test_file = '{}/test_data/test.csv'.format(processing_output_prefix)

# Each key names an input channel handed to the training script.
sklearn.fit({'train': train_file, 'test': test_file})

# Save the name of the training job (public accessor instead of the private
# `_current_job_name` attribute); it is needed to locate the model artifact.
training_job_name = sklearn.latest_training_job.name

2021-11-30 20:11:30 Starting - Starting the training job...
2021-11-30 20:12:01 Starting - Launching requested ML instancesProfilerReport-1638303088: InProgress
......
2021-11-30 20:13:02 Starting - Preparing the instances for training......
2021-11-30 20:14:22 Downloading - Downloading input data...
2021-11-30 20:14:47 Training - Downloading the training image..[34m2021-11-30 20:15:07,998 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-11-30 20:15:08,001 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-11-30 20:15:08,012 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-11-30 20:15:15,413 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-11-30 20:15:15,425 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-11-30 20:15:15,438 sagemaker-training-toolkit IN

## Load model for inference

In [19]:
# Archive written by the training job under its output prefix.
# Built with string formatting rather than os.path.join, which would insert
# the local OS path separator ('\' on Windows) into the S3 URI.
model_artifact = 's3://{}/{}/output/model.tar.gz'.format(bucket, training_job_name)

# Wrap the trained artifact so it can be used for inference; the same script
# that trained the model is used as the entry point.
model = SKLearnModel(
    model_data=model_artifact,
    role=role,
    framework_version='0.23-1',
    entry_point='modules/model.py',
)

### Load and transform data for inference


In [24]:
# Raw CSV uploaded earlier. Reuses `prefix` instead of repeating the folder
# name, and formats the URI directly: os.path.join would use the local OS
# separator ('\' on Windows) and break the S3 URI.
data = 's3://{}/{}/data.csv'.format(bucket, prefix)

# Processor for the inference-time preprocessing run (no base_job_name, so
# SageMaker generates the job name).
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type='ml.t3.medium',
    instance_count=1,
)

sklearn_processor.run(
    code='modules/preprocessing.py',
    inputs=[ProcessingInput(source=data,
                            destination='/opt/ml/processing/input')],
    outputs=[ProcessingOutput(output_name='processed_data',
                              source='/opt/ml/processing/data')],
    arguments=['--inference', 'true'],  # this will prevent train/test split
)

# NOTE: this overwrites the description of the training preprocessing job.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

# Retrieve the dumped file from the job's 'processed_data' output.
processed_data = 's3://{}/{}/output/processed_data/data.csv'.format(
    bucket, preprocessing_job_description['ProcessingJobName'])


Job Name:  sagemaker-scikit-learn-2021-11-30-20-45-53-997
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-296025910508/titanic_example/data.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-296025910508/sagemaker-scikit-learn-2021-11-30-20-45-53-997/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'processed_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-296025910508/sagemaker-scikit-learn-2021-11-30-20-45-53-997/output/processed_data', 'LocalPath': '/opt/ml/processing/data', 'S3UploadMode': 'EndOfJob'}}]
..............

## Prediction/Inference

The transformer already knows that, by default, the first column is the target

Finally, do the batch transform with the processed data

In [42]:
# S3 location for the batch transform results, under this example's prefix.
output_path = 's3://{}/{}/titanic_results'.format(bucket, prefix)

In [43]:
# `output_path` is an S3 *prefix*: Batch Transform stores each result under it
# as '<input file name>.out'. Passing the prefix itself therefore yields
# '<output_path>/data.csv.out', which is the object the download cells fetch.
# (Appending 'data.csv.out' here would nest the result one level deeper.)
transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    assemble_with='Line',
    accept='text/csv',
    output_path=output_path)

# Run the batch transform on the preprocessed CSV.
transformer.transform(data=processed_data, content_type='text/csv')

..............................
[34m2021-11-30 22:49:33,180 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-11-30 22:49:33,182 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-11-30 22:49:33,183 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[35m2021-11-30 22:49:33,180 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2021-11-30 22:49:33,182 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2021-11-30 22:49:33,183 INFO - sagemaker-containers - nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  defa

Bring a copy of the results from S3

In [56]:
# Show the S3 URI of the results file. Formatted directly instead of using
# os.path.join, which would insert '\' on Windows.
'{}/data.csv.out'.format(output_path)

's3://sagemaker-us-east-1-296025910508/titanic_example/titanic_results/data.csv.out'

In [57]:
!aws s3 cp 's3://sagemaker-us-east-1-296025910508/titanic_example/titanic_results/data.csv.out' ./

Completed 3.5 KiB/3.5 KiB (2.5 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-296025910508/titanic_example/titanic_results/data.csv.out to ./data.csv.out
