### SageMaker Large Scale prediction

In [1]:
import sagemaker
import boto3

# SageMaker SDK session used by the rest of the notebook.
sagemaker_session = sagemaker.Session()

# Account id (looked up through STS) and region of the active AWS credentials.
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# The execution role ARN is assembled from the account id instead of being looked up.
# Alternative: role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"


In [2]:
# Build tag shared by the custom container images referenced in this notebook.
version_tag="202106010047"
# "repository:tag" of the CPU image; combined with the ECR registry host in the next cell.
pytorch_custom_image_name=f"ppi:cpu-{version_tag}"
# Instance type used by the model-packer processing job defined further down.
# Both this and the image name are reassigned to GPU values before the inference job.
instance_type = "ml.m5.large" 

In [3]:
# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository:tag>
docker_repo = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{pytorch_custom_image_name}"

In [4]:
# S3 bucket used to build the dataset and vocabulary locations below.
# Several other locations in this notebook hard-code the same bucket name.
bucket = "aegovan-data"

In [5]:
# Train / validation / test split locations inside the bucket.
testfile = f"s3://{bucket}/processed_dataset/test_multiclass.json"
valfile = f"s3://{bucket}/processed_dataset/val_multiclass.json"
trainfile = f"s3://{bucket}/processed_dataset/train_multiclass.json"

# Prefix holding the large scale inference input files.
# The spelling of the path segments is kept exactly as written: these strings are S3 locations.
largescalefiles = "s3://aegovan-data/pubmed_asbtract/inference_multi_2021053113/"

# Manifest files live under a prefix named after the last path segment of `largescalefiles`.
largescalefiles_manifest = (
    "s3://aegovan-data/pubmed_asbtract/inference_mainifest/"
    + largescalefiles.rstrip("/").rsplit("/", 1)[-1]
)

### Step 1: Convert pubtator format to inference json

The input PubTator files look like the sample below. They are converted into the JSON files used as inference input.

```text
20791654|a|Liver scan characteristics and liver function tests of 72 patients with proved hepatic malignancy (54 metastatic, 18 primary) were evaluated. Well-defined focal defects were observed in 83% of patients with metastatic and 77% of patients with primary liver carcinoma. In 10% of the patients with metastatic liver disease the distribution of radioactivity was normal. Four or more biochemical liver function tests were normal in 33% of metastatic and 29% of primary liver cancer patients. Hepatic enlargement was present in the scan in 94% of the patients with liver metastases; however, data obtained from 104 necropsies of patients with hepatic metastases showed that only 46% had hepatomegaly. We recommend, therefore, that a liver scan should be performed before major tumour surgery in every patient with known malignancy regardless of normal liver size or normal liver function tests.
20791654	58	66	patients	Species	9606
20791654	193	201	patients	Species	9606
20791654	229	237	patients	Species	9606
20791654	282	290	patients	Species	9606
20791654	478	486	patients	Species	9606
20791654	546	554	patients	Species	9606
20791654	624	632	patients	Species	9606
20791654	796	803	patient	Species	9606

20791817|a|5-Aminosalicylic acid given to rats as a single intravenous injection led to necrosis of the proximal convoluted tubules and of the renal papilla. These two lesions developed at the same time and the cortical lesions did not appear to be a consequence of the renal papillary necrosis. Since the compound possesses the molecular structure both of a phenacetin derivative and of a salicylate these observations may be relevant to the problem of renal damage incident to abuse of analgesic compounds and suggest the possibility that in this syndrome cortical lesions may develop independently of renal papillary necrosis.
20791817	31	35	rats	Species	10116

```

In [6]:
# Import the class rather than the module: a later cell runs `from datetime import datetime`,
# which rebinds the name `datetime` to the class. With `import datetime` here, re-running this
# cell after that one failed with AttributeError on `datetime.datetime`.
from datetime import datetime

# Hour-resolution timestamp (YYYYMMDDHH) used to name the output location of a run.
date_fmt = datetime.today().strftime("%Y%m%d%H")

In [7]:
# Single-file variant of the input, kept for reference:
#s3_input_pubtator = "s3://aegovan-data/pubmed_json_parts_annotation_iseries/pubmed19n0550.json.txt"
# Prefix with the annotation files to convert.
s3_input_pubtator = "s3://aegovan-data/pubmed_json_parts_annotation_iseries/"
# Id mapping file passed to the conversion script.
s3_id_mapping_file="s3://aegovan-data/settings/HUMAN_9606_idmapping.dat"

# Output prefix for the converted inference files, stamped with date_fmt (YYYYMMDDHH).
s3_output_pubmed_asbtract = f"s3://aegovan-data/pubmed_asbtract/inference_multi_{date_fmt}/"

In [8]:
# from sagemaker.network import NetworkConfig
# from sagemaker.processing import ProcessingInput, ProcessingOutput
# from sagemaker.processing import ScriptProcessor

# script_processor = ScriptProcessor(image_uri=docker_repo,
#                                        command=["python"],
#                                        env={'mode': 'python', 'PYTHONPATH':'/opt/ml/code'},
#                                        role=role,
#                                        instance_type=instance_type,
#                                        instance_count=10,
#                                        max_runtime_in_seconds=172800,
#                                        volume_size_in_gb = 50,
#                                        network_config=NetworkConfig(enable_network_isolation=False),
#                                        base_job_name ="ppi-large-inference-data-prep"


#                                        )


# sm_local_input_pubtator_txt = "/opt/ml/processing/input/data/json"
# sm_local_input_idmapping = "/opt/ml/processing/input/data/mapping"
# sm_local_output = "/opt/ml/processing/output"


# script_processor.run(
#         code='source/datatransformer/pubtator_annotations_inference_transformer.py',

#         arguments=[
        
#             sm_local_input_pubtator_txt,
#             sm_local_output,
#            "{}/{}".format(sm_local_input_idmapping,s3_id_mapping_file.split("/")[-1]) 

#         ],
    
#        inputs=[
#                 ProcessingInput(
#                     source=s3_input_pubtator,
#                     destination=sm_local_input_pubtator_txt,
#                     s3_data_distribution_type="ShardedByS3Key")

#             ,ProcessingInput(
#                     source=s3_id_mapping_file,
#                     destination=sm_local_input_idmapping,
#                     s3_data_distribution_type="FullyReplicated")
#             ],

#         outputs=[ProcessingOutput(
#                 source=sm_local_output, 
#                 destination=s3_output_pubmed_asbtract,
#                 output_name='inferenceabstracts')]
#    )

## Step 2: Run predictions

In [9]:
# Set to True to run the processing jobs that pack the ensemble models (guarded cell below).
prepare_models = False

In [10]:
# Job names of the "f1" model variants; each name is substituted into the model artifact
# path template further down. (Presumably SageMaker training job names - confirm.)
jobs_f1 = [
"ppimulticlass-bert-f1-2021-05-11-03-51-45-979",
"ppimulticlass-bert-f1-2021-05-11-03-51-32-646",
"ppimulticlass-bert-f1-2021-05-11-03-51-18-103",
"ppimulticlass-bert-f1-2021-05-11-03-51-11-189",
"ppimulticlass-bert-f1-2021-05-10-19-31-36-944",
"ppimulticlass-bert-f1-2021-05-10-19-31-18-988",
"ppimulticlass-bert-f1-2021-05-10-16-23-25-703",
"ppimulticlass-bert-f1-2021-05-10-16-23-19-189",
"ppimulticlass-bert-f1-2021-05-10-16-23-03-547",
"ppimulticlass-bert-f1-2021-05-10-16-22-33-226"
]

In [11]:
# Alternative set of job names (the "loss" variants). Not selected by default:
# the next cell assigns jobs_f1 to `jobs`.
jobs_loss = [
   
"ppimulticlass-bert-2021-05-08-20-29-59-117",
"ppimulticlass-bert-2021-05-08-20-29-39-694",
"ppimulticlass-bert-2021-05-08-20-29-11-549",
"ppimulticlass-bert-2021-05-08-20-29-06-842",
"ppimulticlass-bert-2021-05-08-19-21-50-296",
"ppimulticlass-bert-2021-05-08-19-21-43-271",
"ppimulticlass-bert-2021-05-08-19-21-15-995",
"ppimulticlass-bert-2021-05-08-19-08-50-131",
"ppimulticlass-bert-2021-05-08-17-12-32-759",
"ppimulticlass-bert-2021-05-08-17-11-49-920"

]



In [12]:
# Select which set of jobs forms the ensemble (jobs_f1 or jobs_loss).
jobs=jobs_f1

In [13]:
# Template for the location of one job's model artifact; {} is replaced with the job name.
s3_model_path_format = "s3://aegovan-data/ppi_multiclass_sagemakerresults/{}/output/model.tar.gz"

# One model artifact URI per selected job, in the same order as `jobs`.
s3_model_paths = [s3_model_path_format.format(job_name) for job_name in jobs]

In [14]:
# Ensemble identifier: the first 32 characters of the last job name plus the number of jobs.
# For the jobs_f1 names this gives "ppimulticlass-bert-f1-2021-05-10-10".
job_prefix = f"{jobs[-1][:32]}-{len(jobs)}"

# S3 prefix where the unpacked ensemble models are collected.
s3_output_ensemble_models = f"s3://aegovan-data/ppi_multiclass_ensemble_models/{job_prefix}"

### Prepare ensemble models
TODO: This is just a hack to untar a bunch of zipped models and upload them to a single S3 location. Having a processing job do this is overkill...

In [15]:
def get_processing_inputs_s3_local_path(s3_model_paths, sm_local_input):
    """
    Build one ProcessingInput per S3 model path.

    The model at position ``i`` of ``s3_model_paths`` is mapped to the local directory
    ``<sm_local_input>/<i>``; trailing slashes on ``sm_local_input`` are ignored.

    :param s3_model_paths: S3 URIs of the model artifacts
    :param sm_local_input: local base directory under which each model gets its own subdirectory
    :return: list of ProcessingInput objects, in the same order as ``s3_model_paths``
    """
    return [
        ProcessingInput(
            source=model_uri,
            destination="{}/{}".format(sm_local_input.rstrip("/"), position),
        )
        for position, model_uri in enumerate(s3_model_paths)
    ]


In [16]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor


# Local directories for the model inputs and the packed output of the processing job.
sm_local_input = "/opt/ml/processing/input/models"
sm_local_output = "/opt/ml/processing/output"

# Processor for the ensemble model packing job: one instance of `instance_type`,
# running the image at `docker_repo` with `python` as the command.
script_processor = ScriptProcessor(image_uri=docker_repo,
                                       command=["python"],
                                       env={'mode': 'python', 'PYTHONPATH':'/opt/ml/code'},
                                       role=role,
                                       instance_type=instance_type,
                                       instance_count=1,
                                       # 172800 seconds = 48 hours
                                       max_runtime_in_seconds=172800,
                                       volume_size_in_gb = 50,
                                       network_config=NetworkConfig(enable_network_isolation=False),
                                       base_job_name ="ppi-ensemble-model-packer"
                                       )


In [17]:

# Runs only when prepare_models is True: packs the selected models into the shared
# ensemble location, one processing job per chunk of models.
if prepare_models:
    # Work around to get over the processing job input limit size
    chunk_size=5
    # One processing job per chunk of at most chunk_size model artifacts.
    for i in range(0, len(s3_model_paths), chunk_size ):

        # NOTE(review): get_processing_inputs_s3_local_path numbers the local input
        # directories from 0 for every chunk, and every chunk writes to the same
        # s3_output_ensemble_models destination. Confirm that the packer script does not
        # name its outputs after those indices, otherwise later chunks could overwrite
        # the output of earlier ones.
        script_processor.run(
                code='../src/inference/ensemble_inference_prepare_models.py',

                arguments=[
                    "--input-dir",
                    sm_local_input,
                    "--dest-dir",
                    sm_local_output

                ],

                inputs=get_processing_inputs_s3_local_path(s3_model_paths[i:i+chunk_size], sm_local_input),


                outputs=[ProcessingOutput(
                        source=sm_local_output, 
                        destination=s3_output_ensemble_models,
                        output_name='models')]
            )



### Run ensemble prediction

In [18]:
# When True, the manifest files for the large scale input are (re)generated in a later cell.
prepare_manifest=True

In [19]:
from sagemaker.s3 import S3Downloader, S3Uploader, parse_s3_url
import tempfile
import boto3, json
from datetime import datetime


def prepare_manifest_file(s3_input_file_prefix, s3_manifest_output_prefix, batch_size=250):
    """
    Split the files under an S3 prefix into batches and upload one manifest file per batch.

    Workaround for the 5 day limit: rather than one job over the whole prefix, each manifest
    lists at most ``batch_size`` files so that the input can be processed in several runs.

    Each manifest is a JSON list whose first element is ``{"prefix": s3_input_file_prefix}``,
    followed by the listed file URIs with that prefix stripped. Manifests are written to
    ``<s3_manifest_output_prefix>/manifest_<00000, 00001, ...>.json``.

    :param s3_input_file_prefix: S3 URI prefix whose files are listed
    :param s3_manifest_output_prefix: S3 URI prefix under which the manifests are uploaded
    :param batch_size: maximum number of files per manifest
    """
    all_uris = S3Downloader.list(s3_input_file_prefix)
    output_prefix = s3_manifest_output_prefix.rstrip("/")
    prefix_length = len(s3_input_file_prefix)

    for batch_no, start in enumerate(range(0, len(all_uris), batch_size)):
        # File URIs of this batch, relative to the input prefix
        relative_paths = [uri[prefix_length:] for uri in all_uris[start:start + batch_size]]
        manifest_entries = [{"prefix": s3_input_file_prefix}] + relative_paths

        S3Uploader.upload_string_as_file_body(
            json.dumps(manifest_entries),
            desired_s3_uri="{}/manifest_{:05}.json".format(output_prefix, batch_no),
        )
        

def _s3_move_file(s3_src, s3_dest):
    """
    Move a single S3 object: copy it to the destination URI, then delete the source.

    :param s3_src: S3 URI of the object to move
    :param s3_dest: S3 URI the object is copied to
    """
    src_bucket, src_key = parse_s3_url(s3_src)
    dest_bucket, dest_key = parse_s3_url(s3_dest)

    s3 = boto3.resource('s3')

    # Copy first; the source is removed only after the copy call has returned
    s3.Bucket(dest_bucket).Object(dest_key).copy({'Bucket': src_bucket, 'Key': src_key})
    s3.Object(src_bucket, src_key).delete()


def _s3_delete_files(s3_prefix, filter_func=None):
    """
    Delete the S3 objects listed under a prefix, optionally restricted by a predicate.

    :param s3_prefix: S3 URI prefix; it is listed with exactly one trailing slash
    :param filter_func: optional callable taking an S3 URI; only URIs for which it returns
        a truthy value are deleted. All listed objects are deleted when omitted.
    """
    listed_uris = S3Downloader.list("{}/".format(s3_prefix.rstrip("/")))

    s3 = boto3.resource('s3')

    should_delete = filter_func if filter_func else (lambda _uri: True)

    for uri in listed_uris:
        if not should_delete(uri):
            continue
        bucket_name, key = parse_s3_url(uri)
        print("Deleting {}".format(uri))
        s3.Object(bucket_name, key).delete()
 
def get_manifest_file_with_rename(s3_manifest_output_prefix, rename_file=True):
    """
    Return the first pending manifest file under the prefix, optionally marking it as processed.

    Pending manifests are the objects whose URI starts with ``<prefix>/manifest_``. When
    ``rename_file`` is true, the chosen manifest is moved to
    ``<prefix>/processed_<YYYYmmddHHMM>_<original name>`` so that it no longer matches the
    pending listing, and the new URI is returned. Otherwise the original URI is returned and
    nothing is changed in S3.

    :param s3_manifest_output_prefix: S3 URI prefix that holds the manifest files
    :param rename_file: move the manifest to its ``processed_`` name before returning it
    :return: S3 URI of the manifest to use
    :raises IndexError: when no pending manifest file is found under the prefix
    """
    pending_prefix = "{}/manifest_".format(s3_manifest_output_prefix.rstrip("/"))
    s3_pending_manifest_files = S3Downloader.list(pending_prefix)

    if not s3_pending_manifest_files:
        # Same exception type the bare [0] lookup used to raise, but with an actionable
        # message: this state is reached once every manifest has been processed.
        raise IndexError("No pending manifest files found under {}".format(pending_prefix))

    src_file = s3_pending_manifest_files[0]
    if not rename_file:
        return src_file

    # Split "<dir>/<name>" on the last slash
    processed_file_prefix, _, processed_file_name = src_file.rpartition("/")
    process_date = datetime.now().strftime("%Y%m%d%H%M")
    processed_file = "{}/processed_{}_{}".format(processed_file_prefix, process_date, processed_file_name)

    _s3_move_file(src_file, processed_file)
    return processed_file


    

In [20]:
# Generate the batched manifest files for the large scale input only when requested.
if prepare_manifest:
    prepare_manifest_file(largescalefiles, largescalefiles_manifest)

In [21]:
#s3_output_predictions = "s3://aegovan-data/pubmed_asbtract/predictions_largescale_{}_{}/".format(job_prefix,date_fmt)
# The output prefix uses a fixed date string instead of date_fmt (see the commented-out line
# above) - presumably so that the runs for successive manifests write to the same prefix.
s3_output_predictions = "s3://aegovan-data/pubmed_asbtract/predictions_largescale_{}_{}/".format(job_prefix,"2021-07-01")
# NOTE: not idempotent. With the default rename_file=True this call moves the selected manifest
# to a "processed_..." name in S3, so every execution of this cell consumes the next pending manifest.
s3_input_data = get_manifest_file_with_rename(largescalefiles_manifest)
s3_data_type="ManifestFile" # S3Prefix
# Passed to the prediction script as --filter and --filterstdthreshold.
usefilter=1
filter_threshold_std=1.0
# Number of instances for the inference processing job.
instance_count = 2

# The models collected by the ensemble preparation step, and the vocabulary location.
s3_input_models = s3_output_ensemble_models
s3_input_vocab = "s3://{}/embeddings/bert/".format(bucket)

In [22]:
# Display the manifest selected for this run and the input data type.
s3_input_data, s3_data_type

('s3://aegovan-data/pubmed_asbtract/inference_mainifest/inference_multi_2021053113/processed_202106030930_manifest_00000.json',
 'ManifestFile')

In [23]:
# Switch from the CPU image / instance type set at the top of the notebook to the GPU ones.
# NOTE: these names are reassigned, so cells above that read them see the GPU values if re-run.
pytorch_custom_image_name=f"ppi:gpu-{version_tag}"


instance_type = "ml.p3.2xlarge" 


In [24]:
# Recompute the full ECR image URI now that the image name points at the GPU build.
docker_repo = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{pytorch_custom_image_name}"

In [None]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor

# Processor for the ensemble inference job; replaces the model-packer processor bound to the
# same name earlier in the notebook.
script_processor = ScriptProcessor(image_uri=docker_repo,
                                       command=["python"],
                                       env={'mode': 'python', 'PYTHONPATH':'/opt/ml/code'},
                                       role=role,
                                       instance_type=instance_type,
                                       instance_count=instance_count,
                                       # 5 days, expressed in seconds
                                       max_runtime_in_seconds= 5 * 24 * 60 * 60,
                                       volume_size_in_gb = 250,
                                       network_config=NetworkConfig(enable_network_isolation=False),
                                       base_job_name ="ppi-ensemble-inference"
                                       )


# Local directories the three S3 inputs are mapped to.
sm_local_input_models = "/opt/ml/processing/input/data/models"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"


# Local directory whose content is uploaded to s3_output_predictions.
sm_local_output = "/opt/ml/processing/output"



script_processor.run(
        code='../src/inference/ppi_multiclass_batch_predict.py',

        # Three positional arguments (data, models and output directories) followed by options.
        arguments=[
            sm_local_input_data,
            sm_local_input_models,
            sm_local_output,
            "--ensemble", "1",
            "--tokenisor_data_dir", sm_local_input_vocab,
            "--protein_name_replacer_random_seed", "43",
            "--filter", str(usefilter),
            "--batch", "32",
            "--filterstdthreshold", str(filter_threshold_std)
        ],

        inputs=[
                # Data input: the selected manifest file, sharded by S3 key.
                ProcessingInput(
                    source=s3_input_data,
                    s3_data_type = s3_data_type,
                    destination=sm_local_input_data,
                    s3_data_distribution_type="ShardedByS3Key"),

            # Models and vocabulary are fully replicated.
            ProcessingInput(
                    source=s3_input_models,
                    destination=sm_local_input_models,
                    s3_data_distribution_type="FullyReplicated"),
            
            ProcessingInput(
                    source=s3_input_vocab,
                    destination=sm_local_input_vocab,
                    s3_data_distribution_type="FullyReplicated")
            ],


        outputs=[ProcessingOutput(
                source=sm_local_output, 
                destination=s3_output_predictions,
                output_name='predictions')]
    )





Job Name:  ppi-ensemble-inference-2021-06-03-16-30-29-429
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/pubmed_asbtract/inference_mainifest/inference_multi_2021053113/processed_202106030930_manifest_00000.json', 'LocalPath': '/opt/ml/processing/input/data/jsonlines', 'S3DataType': 'ManifestFile', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/ppi_multiclass_ensemble_models/ppimulticlass-bert-f1-2021-05-10-10', 'LocalPath': '/opt/ml/processing/input/data/models', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-3', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/embeddings/bert/', 'LocalPath': '/opt/ml/processing/input/data/vocab', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistrib