In [109]:
import boto3
import sagemaker
from sagemaker import get_execution_role

region = boto3.session.Session().region_name

role = get_execution_role()

In [110]:
%%writefile src/Dockerfile

FROM python:3.7-slim-buster
COPY . /src
WORKDIR /src
RUN pip3 install -r /src/requirements.txt
ENV PYTHONUNBUFFERED=TRUE
ENTRYPOINT ["python3"]

Overwriting src/Dockerfile


In [111]:
account_id = boto3.client('sts').get_caller_identity().get('Account')
ecr_uri = '{}.dkr.ecr.{}.amazonaws.com'.format(account_id, region)
ecr_repository = 'sagemaker-processing-container'
tag = ':latest'
processing_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(account_id, region, ecr_repository + tag)

In [112]:
# Create Repo, Throws error is the repo is already present. Ignore and move forward
!aws ecr create-repository --repository-name $ecr_repository


An error occurred (RepositoryAlreadyExistsException) when calling the CreateRepository operation: The repository with name 'sagemaker-processing-container' already exists in the registry with id '895300689201'


In [113]:
print(processing_repository_uri)

# Create ECR repository and push docker image
!docker build -t $ecr_repository src
!aws ecr get-login-password --region ap-south-1 | docker login --username AWS --password-stdin $ecr_uri
!docker tag {ecr_repository + tag} $processing_repository_uri

895300689201.dkr.ecr.ap-south-1.amazonaws.com/sagemaker-processing-container:latest
Sending build context to Docker daemon  7.168kB
Step 1/6 : FROM python:3.7-slim-buster
 ---> 30a42f143b4e
Step 2/6 : COPY . /src
 ---> a03d6609652f
Step 3/6 : WORKDIR /src
 ---> Running in 7074bb02c82d
Removing intermediate container 7074bb02c82d
 ---> 5216dede4255
Step 4/6 : RUN pip3 install -r /src/requirements.txt
 ---> Running in 9221eb333399
Collecting pandas==0.25.3
  Downloading pandas-0.25.3-cp37-cp37m-manylinux1_x86_64.whl (10.4 MB)
Collecting scikit-learn==0.21.3
  Downloading scikit_learn-0.21.3-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)
Collecting pytz>=2017.2
  Downloading pytz-2021.1-py2.py3-none-any.whl (510 kB)
Collecting python-dateutil>=2.6.1
  Downloading python_dateutil-2.8.1-py2.py3-none-any.whl (227 kB)
Collecting numpy>=1.13.3
  Downloading numpy-1.20.2-cp37-cp37m-manylinux2010_x86_64.whl (15.3 MB)
Collecting scipy>=0.17.0
  Downloading scipy-1.6.2-cp37-cp37m-manylinux1_x86_64.whl 

In [114]:
!docker push $processing_repository_uri

The push refers to repository [895300689201.dkr.ecr.ap-south-1.amazonaws.com/sagemaker-processing-container]

[1B8ce1e414: Preparing 
[1B3459816c: Preparing 
[1B10e872c2: Preparing 
[1B6bb9ffea: Preparing 
[1B663e622b: Preparing 
[1B845af46d: Preparing 
[7B8ce1e414: Pushed   299.6MB/293.9MB[6A[2K[7A[2K[7A[2K[6A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[

In [115]:
# Provide the startup script pre-processing py, this is the actual code that gets executed.
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

script_processor = ScriptProcessor(command=['python3', '/src/preprocessing.py'],
                image_uri=processing_repository_uri
                role=role,
                instance_count=1,
                instance_type='ml.m5.xlarge')

In [116]:
# Input and Output files
input_data = 's3://sagemaker-sample-data-{}/processing/census/census-income.csv'.format(region)
output_data = 's3://sagemaker-ck-data/processing/output'

In [None]:
# Process the files
from sagemaker.processing import ProcessingInput, ProcessingOutput
script_processor.run(
    code='src/preprocessing.py',
    inputs=[
        ProcessingInput(source=input_data,destination='/opt/ml/processing/input')
    ],
    outputs=[
        ProcessingOutput(source='/opt/ml/processing/output/train',output_name='train_data',destination = output_data),
        ProcessingOutput(source='/opt/ml/processing/output/validation',output_name='validation_data',destination = output_data),
        ProcessingOutput(source='/opt/ml/processing/output/test',output_name='test_data',destination = output_data)
    ],
    arguments=['--train-test-split-ratio', '0.2']
)


Job Name:  sagemaker-processing-container-2021-04-13-07-45-59-719
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-sample-data-ap-south-1/processing/census/census-income.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-south-1-895300689201/sagemaker-processing-container-2021-04-13-07-45-59-719/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-ck-data/processing/output', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation_data', 'AppManaged': 