### SageMaker chemprot large scale data preparation

In [1]:
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
account_id =  boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name


#role = sagemaker.get_execution_role()
role="arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449".format(account_id)


In [2]:
version_tag="202208062230"
pytorch_custom_image_name=f"large-scale-ptm-ppi:cpu-{version_tag}"
instance_type = "ml.m5.xlarge" 

In [3]:
docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

In [4]:
bucket = "aegovan-data"

In [5]:
ner_s3_uri = f"s3://{bucket}/chemprotnerlargescale/chemprot-ner-largescale-20220730125708/"

In [6]:
import datetime
date_fmt = datetime.datetime.today().strftime("%Y%m%d%H")

### Run  data prep

In [7]:
s3_output_predictions = "s3://aegovan-data/chemprotlargescale/input/data_".format(date_fmt)
s3_input_data = ner_s3_uri
s3_data_type="S3Prefix"

instance_count = 1


In [8]:
s3_input_data, s3_data_type

('s3://aegovan-data/chemprotnerlargescale/chemprot-ner-largescale-20220730125708/',
 'S3Prefix')

In [None]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor

script_processor = ScriptProcessor(image_uri=docker_repo,
                                       command=["python"],
                                       env={'mode': 'python', 'PYTHONPATH':'/opt/ml/code'},
                                       role=role,
                                       instance_type=instance_type,
                                       instance_count=instance_count,
                                       max_runtime_in_seconds= 5 * 24 * 60 * 60,
                                       volume_size_in_gb = 250,
                                       network_config=NetworkConfig(enable_network_isolation=False),
                                       base_job_name ="chemprot-inference"
                                       )



sm_local_output = "/opt/ml/processing/output"
sm_local_input_data = "/opt/ml/processing/input/data"



script_processor.run(
        code='../src/preprocessors/chemprot_abstract_jsondir_converter.py',

        arguments=[
          "--inputdir", sm_local_input_data,
          "--outputdir", sm_local_output,
          "--abstractfilesuffix", ".abstract.tsv",
          "--entitiesfilesuffix", ".anon.txt"
       

        ],

        inputs=[
                ProcessingInput(
                    source=s3_input_data,
                    s3_data_type = s3_data_type,
                    destination=sm_local_input_data,
                    s3_data_distribution_type="FullyReplicated")
            ],


        outputs=[ProcessingOutput(
                source=sm_local_output, 
                destination=s3_output_predictions,
                output_name='predictions')]
    )





Job Name:  chemprot-inference-2022-08-06-22-37-03-054
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/chemprotnerlargescale/chemprot-ner-largescale-20220730125708/', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-324346001917/chemprot-inference-2022-08-06-22-37-03-054/input/code/chemprot_abstract_jsondir_converter.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'predictions', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://aegovan-data/chemprotlargescale/input/data_', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
.