## Run Tokeniser on SageMaker

#### Prerequisites

1. Create the Docker image. See **infra/README.md** for more details.


2. Make sure you deploy the SageMaker stack to set up roles and permissions. See **infra/README.md** for more details.

In [1]:
import sys
# Add ./src (relative to the current working directory) to the module search
# path so that modules located there can be imported from this notebook.
sys.path.append("./src")

In [2]:
# Name of the IAM role assumed by the SageMaker job; expanded to a full ARN
# later in the notebook.
role_name = "bpe-sagemaker-SageMakerRole"
# Name of the ECR repository that holds the tokeniser image.
ecr_repo_name = "bpe-tokeniser"

# S3 prefix holding the input data for the processing job.
s3_input = "s3://aegovan-data/pubmed-json/"
# S3 prefix the job output is uploaded to. Must be a full "s3://" URI
# (the scheme separator was previously missing its colon).
s3_output = "s3://aegovan-data-bert/vocab/"


In [3]:
import boto3

# Ask STS who the current caller is and keep the account id from the reply.
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity.get("Account")

# Region configured for a freshly created boto3 session.
default_session = boto3.session.Session()
region = default_session.region_name



In [4]:
# Instance type used for the processing job.
instance_type = "ml.m5.4xlarge"

# Templates for the ECR image URI (always the ":latest" tag) and the IAM role
# ARN; both are filled in from the account/region/name values set in the
# earlier cells.
ecr_image_template = "{}.dkr.ecr.{}.amazonaws.com/{}:latest"
iam_role_template = "arn:aws:iam::{}:role/{}"

repository_uri = ecr_image_template.format(account_id, region, ecr_repo_name)
sagemaker_role_arn = iam_role_template.format(account_id, role_name)



In [5]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor

In [None]:
import os
import posixpath

# Directories inside the processing container: the S3 input is made available
# under sm_local_input, and whatever is written under sm_local_output is
# uploaded to s3_output.
sm_local_input = "/opt/ml/processing/input/data"
sm_local_output = "/opt/ml/processing/output"

# This path is interpreted inside the container, which uses POSIX-style
# paths. Build it with posixpath instead of os.path so the separator does not
# depend on the operating system this notebook runs on.
vocab_output_file = posixpath.join(sm_local_output, "vocab.json")

# Processor that runs a Python script with "python3" inside the custom image.
script_processor = ScriptProcessor(
    image_uri=repository_uri,
    command=["python3"],
    env={"mode": "python"},
    role=sagemaker_role_arn,
    instance_type=instance_type,
    instance_count=1,
    volume_size_in_gb=150,
    network_config=NetworkConfig(enable_network_isolation=False),
)

# Launch the job: run the tokeniser script against the input directory and
# write the vocabulary file into the output directory.
script_processor.run(
    code="src/pubmed_bpe_tokeniser.py",
    arguments=[
        "--datadir", sm_local_input,
        "--outputfile", vocab_output_file,
        "--vocabsize", "30000",
        "--log-level", "INFO",
    ],
    inputs=[
        ProcessingInput(source=s3_input, destination=sm_local_input),
    ],
    outputs=[
        ProcessingOutput(source=sm_local_output, destination=s3_output),
    ],
)

.