In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# Resolve the AWS region configured for the default boto3 session.
default_session = boto3.session.Session()
region = default_session.region_name

# Execution role of this notebook; handed to the processing job further down.
role = get_execution_role()

In [None]:
%%writefile src/Dockerfile

# Image used for the SageMaker processing job defined later in this notebook.
FROM python:3.7-slim-buster
# Copy the whole build context into /src inside the image.
COPY . /src
WORKDIR /src
# Install the Python dependencies listed in the copied requirements.txt.
RUN pip3 install -r /src/requirements.txt
# Any non-empty value makes Python write stdout/stderr unbuffered.
ENV PYTHONUNBUFFERED=TRUE
# NOTE(review): the processing job may override this entrypoint with the
# ScriptProcessor `command` -- confirm against the SageMaker docs.
ENTRYPOINT ["python3"]

In [None]:
# Look up the AWS account that owns the current credentials via STS.
sts_client = boto3.client('sts')
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity.get('Account')

# Registry host for this account/region, plus the repository name and tag suffix.
ecr_uri = '{}.dkr.ecr.{}.amazonaws.com'.format(account_id, region)
ecr_repository = 'sagemaker-processing-container'
tag = ':latest'

# Fully qualified image reference: <registry host>/<repository>:<tag>.
image_ref = ecr_repository + tag
processing_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(account_id, region, image_ref)

In [None]:
# Create Repo, Throws error is the repo is already present. Ignore and move forward
!aws ecr create-repository --repository-name $ecr_repository

In [None]:
print(processing_repository_uri)

# Create ECR repository and push docker image
!docker build -t $ecr_repository src
!aws ecr get-login-password --region ap-south-1 | docker login --username AWS --password-stdin $ecr_uri
!docker tag {ecr_repository + tag} $processing_repository_uri

In [None]:
!docker push $processing_repository_uri

In [None]:
# Configure the processing job: which container image to run, under which
# role, on what hardware, and the command used to start the script.
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

# NOTE(review): ScriptProcessor is documented to append the uploaded `code`
# path to `command`; with '/src/preprocessing.py' already in `command`, the
# script may receive that path as an extra positional argument -- confirm
# this is intended (the SDK examples use command=['python3']).
script_processor = ScriptProcessor(
    command=['python3', '/src/preprocessing.py'],
    image_uri=processing_repository_uri,  # comma was missing here (SyntaxError)
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
)

In [None]:
# S3 locations: the input object is looked up in a region-specific bucket,
# while the job outputs go to a fixed bucket/prefix.
INPUT_URI_TEMPLATE = 's3://sagemaker-sample-data-{}/processing/census/census-income.csv'
input_data = INPUT_URI_TEMPLATE.format(region)
output_data = 's3://sagemaker-ck-data/processing/output'

In [None]:
# Launch the processing job: one S3 input mounted into the container and
# three container output directories uploaded to `output_data`.
from sagemaker.processing import ProcessingInput, ProcessingOutput

census_input = ProcessingInput(
    source=input_data,
    destination='/opt/ml/processing/input',
)

# NOTE(review): all three outputs share the same S3 destination prefix --
# confirm the script writes distinctly named files into each directory.
train_output = ProcessingOutput(
    source='/opt/ml/processing/output/train',
    output_name='train_data',
    destination=output_data,
)
validation_output = ProcessingOutput(
    source='/opt/ml/processing/output/validation',
    output_name='validation_data',
    destination=output_data,
)
test_output = ProcessingOutput(
    source='/opt/ml/processing/output/test',
    output_name='test_data',
    destination=output_data,
)

script_processor.run(
    code='src/preprocessing.py',
    inputs=[census_input],
    outputs=[train_output, validation_output, test_output],
    arguments=['--train-test-split-ratio', '0.2'],
)