# ChemProt: BERT NER on PubMed Abstracts using PyTorch Prediction




In [1]:
import sys, os
import logging

# Make the local "src" directory importable from this notebook.
sys.path.append("src")

# Route INFO-and-above log records to stdout so they appear in the cell output.
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(level="INFO", handlers=[stdout_handler], format=log_format)

In [2]:
# Name of the SageMaker training job whose model artifact is looked up and used for prediction.
training_job = "chemprot-ner-bert-2022-07-23-21-56-34-969"

In [3]:
# Local scratch directory; it is deleted and recreated by the shell commands in the following cell.
local_temp="temp"

In [4]:
# Start from an empty scratch directory: remove any previous copy, then recreate it.
# Both commands run in a shell; $local_temp is interpolated from the Python variable.
!rm -rf $local_temp
!mkdir -p $local_temp

### Bucket and role set up

In [5]:

# Every S3 location used by this notebook lives under the same bucket.
s3_bucket_uri = "s3://aegovan-data"

# Input data for the prediction job.
s3_uri_pubmedjson = s3_bucket_uri + "/pubmed-json/pubmed19n06"

# Base prefix for the prediction output (kept with its trailing slash).
s3_output_base = s3_bucket_uri + "/chemprotnerlargescale/"
# Prefix passed to the processor as its code location.
s3_code_path = s3_bucket_uri + "/chemprotnercode"


In [6]:
import boto3
import sagemaker.session

# SageMaker session created with the default constructor arguments.
sm_session = sagemaker.session.Session()

# Look up the AWS account id of the current credentials through STS.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')

# The execution role ARN is assembled from the account id and a fixed role path.
# Alternative kept from the original notebook: role = sagemaker.get_execution_role()
role_path = "service-role/AmazonSageMaker-ExecutionRole-20181222T162635"
role = "arn:aws:iam::{}:role/{}".format(account_id, role_path)


2022-07-24 17:55:16,574 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2022-07-24 17:55:16,656 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


## Predict

This section runs NER prediction over the PubMed abstracts as a SageMaker Processing job, using the model produced by the training job referenced above.

In [7]:
# Instance type for the processing job. SageMaker validates it against a fixed list of
# allowed values when the job is created. The value 'local' is treated specially by the
# spot/run-time configuration cell further down.
instance_type = "ml.p3.2xlarge"
# Instance count setting for the job (a single instance).
instance_count=1

In [8]:
import sagemaker
# Attach to the existing training job by name to look up its model artifact.
estimator = sagemaker.estimator.Estimator.attach(training_job)
# S3 URI of the trained model archive; later passed to the processing job as an input.
model_uri = estimator.model_data
# Show the URI as the cell output.
model_uri


2022-07-23 22:40:11 Starting - Preparing the instances for training
2022-07-23 22:40:11 Downloading - Downloading input data
2022-07-23 22:40:11 Training - Training image download completed. Training in progress.
2022-07-23 22:40:11 Uploading - Uploading generated training model
2022-07-23 22:40:11 Completed - Training job completed


's3://aegovan-data/chemprotner/output/chemprot-ner-bert-2022-07-23-21-56-34-969/output/model.tar.gz'

In [9]:
# Spot capacity is opt-in: switch this to True to request spot instances.
use_spot = False

# Run-time ceiling: five days, expressed in seconds.
max_run_secs = 5 * 24 * 60 * 60
# Additional five minutes allowed for waiting on spot capacity.
spot_wait_sec = 5 * 60

# A wait limit is only set when spot is requested; otherwise it stays None.
max_wait_time_secs = (max_run_secs + spot_wait_sec) if use_spot else None

# Local mode never uses spot: force it off, zero the wait limit and set wait.
if instance_type == 'local':
    use_spot = False
    max_wait_time_secs = 0
    wait = True
   

In [10]:
from datetime import datetime

job_type = "chemprot-ner-largescale"

# Append a launch timestamp (YYYYmmddHHMMSS) to the job type to form the job name.
launch_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
job_name = "-".join([job_type, launch_stamp])
job_name

'chemprot-ner-largescale-20220724175517'

In [11]:
from sagemaker.pytorch.processing import PyTorchProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Processing job definition. The instance count is read from the notebook's
# `instance_count` setting so that changing it in the configuration cell
# actually changes the size of the cluster.
processor = PyTorchProcessor(
    role=role,
    framework_version="1.4.0",
    code_location=s3_code_path,
    instance_count=instance_count,
    instance_type=instance_type,
    max_runtime_in_seconds=max_run_secs,
    volume_size_in_gb=250,
)

# Paths inside the processing container where inputs are placed and output is collected.
sm_data_local = '/opt/ml/processing/input/data'
sm_model_local = '/opt/ml/processing/input/model'
sm_output_local = '/opt/ml/processing/output/data'

# Results go under <output base>/<job name>; the base's trailing slash is
# stripped first so the joined URI does not contain "//".
s3_output_uri = "{}/{}".format(s3_output_base.rstrip("/"), job_name)

# File name of the model archive (last path component of the model URI).
model_tar_name = model_uri.split("/")[-1]

processor.run(
    job_name=job_name,
    code='chemprot_batch_inference.py',
    source_dir='src',
    # Command-line arguments handed to the inference script.
    arguments=[
        "--inputdatadir", sm_data_local,
        "--modeltar", "{}/{}".format(sm_model_local, model_tar_name),
        "--outputdatadir", sm_output_local,
        "--batchsize", "8",
    ],
    inputs=[
        # Input data: distributed across instances by S3 key.
        ProcessingInput(
            source=s3_uri_pubmedjson,
            destination=sm_data_local,
            s3_data_distribution_type="ShardedByS3Key",
        ),
        # Model archive: replicated in full to every instance.
        ProcessingInput(
            source=model_uri,
            destination=sm_model_local,
            s3_data_distribution_type="FullyReplicated",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name='output',
            source=sm_output_local,
            destination=s3_output_uri,
        ),
    ],
)

2022-07-24 17:55:19,658 - sagemaker.processing - INFO - Uploaded src to s3://aegovan-data/chemprotnercode/chemprot-ner-largescale-20220724175517/source/sourcedir.tar.gz
2022-07-24 17:55:20,139 - sagemaker.processing - INFO - runproc.sh uploaded to s3://aegovan-data/chemprotnercode/chemprot-ner-largescale-20220724175517/source/runproc.sh

Job Name:  chemprot-ner-largescale-20220724175517
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/pubmed-json/pubmed19n06', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/chemprotner/output/chemprot-ner-bert-2022-07-23-21-56-34-969/output/model.tar.gz', 'LocalPath': '/opt/ml/processing/input/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3Compr

ClientError: An error occurred (ValidationException) when calling the CreateProcessingJob operation: 1 validation error detected: Value 'ml.p3.3xlarge' at 'processingResources.clusterConfig.instanceType' failed to satisfy constraint: Member must satisfy enum value set: [ml.r5.12xlarge, ml.m5.4xlarge, ml.p2.xlarge, ml.m4.16xlarge, ml.r5.24xlarge, ml.t3.xlarge, ml.r5.16xlarge, ml.m5.large, ml.p3.16xlarge, ml.p2.16xlarge, ml.c4.2xlarge, ml.c5.2xlarge, ml.c4.4xlarge, ml.c5.4xlarge, ml.g4dn.xlarge, ml.g4dn.12xlarge, ml.g4dn.2xlarge, ml.c4.8xlarge, ml.g4dn.4xlarge, ml.c5.9xlarge, ml.g4dn.16xlarge, ml.c5.xlarge, ml.c4.xlarge, ml.g4dn.8xlarge, ml.t3.2xlarge, ml.t3.medium, ml.c5.18xlarge, ml.r5.2xlarge, ml.p3.2xlarge, ml.m5.xlarge, ml.m4.10xlarge, ml.r5.4xlarge, ml.m5.12xlarge, ml.m4.xlarge, ml.t3.large, ml.m5.24xlarge, ml.m4.2xlarge, ml.m5.2xlarge, ml.p2.8xlarge, ml.r5.8xlarge, ml.r5.xlarge, ml.r5.large, ml.p3.8xlarge, ml.m4.4xlarge]