## Compare effect of PPI Marker

This notebook compares the large-scale data with the training file.


In [1]:
# S3 bucket that every location used by this notebook is built from.
bucket = "aegovan-data"

# Prefix holding the ensemble models (reused later as the model input of the job).
s3_output_ensemble_models = f"s3://{bucket}/ppi_multiclass_ensemble_models/ppimulticlass-bert-f1-2021-05-10-10"

# Multiclass train / test JSON files.
s3_testfile = f"s3://{bucket}/processed_dataset/test_multiclass.json"
s3_trainfile = f"s3://{bucket}/processed_dataset/train_multiclass.json"


In [2]:
# Instance type the processing job is launched on.
instance_type = "ml.p3.2xlarge"

# Image name (repository:tag) of the custom container; the tag embeds the
# docker build version so a new build only requires changing one value.
docker_version_tag = "202110290314"
pytorch_custom_image_name = f"large-scale-ptm-ppi:gpu-{docker_version_tag}"

In [3]:
# Interaction-type labels in a fixed order.
# NOTE(review): not referenced by any other cell visible here - confirm it is still needed.
label_order = [
    "acetylation",
    "methylation",
    "phosphorylation",
    "dephosphorylation",
    "ubiquitination",
]

In [5]:
import os
import sys

# Put the relative "src" directory on the module search path so project
# modules can be imported from this notebook.
sys.path.append("src")



In [4]:
import boto3
import sagemaker

# SageMaker session created with the default configuration.
sagemaker_session = sagemaker.Session()

# Region of the default boto3 session and the account id reported by STS;
# both are used later to build the ECR image URI.
region = boto3.session.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# The execution role ARN is assembled from the account id instead of being
# looked up with sagemaker.get_execution_role().
role = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449".format(account_id)


In [7]:
# --- Job inputs -----------------------------------------------------------
# Data to predict on (the test file), the ensemble models and the BERT vocab.
s3_input_data = s3_testfile
s3_data_type = "S3Prefix"
s3_input_models = s3_output_ensemble_models
s3_input_vocab = "s3://{}/embeddings/bert/".format(bucket)

# --- Job output -----------------------------------------------------------
# NOTE(review): "asbtract" looks like a typo of "abstract"; it is kept as-is
# because it is the literal S3 prefix written by this job - confirm before renaming.
s3_output_predictions = "s3://{}/pubmed_asbtract/predictions_test_{}_{}/".format(bucket, "test", "2021-10-01")

# --- Prediction filtering (passed to the script as --filter / --filterstdthreshold)
usefilter = 0
filter_threshold_std = 1.0

# --- Compute --------------------------------------------------------------
instance_count = 1


In [None]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor

# Full ECR URI of the custom image: <account>.dkr.ecr.<region>.amazonaws.com/<name:tag>
docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

# Paths used inside the processing container for each input and for the output.
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_models = "/opt/ml/processing/input/data/models"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"
sm_local_output = "/opt/ml/processing/output"

# Processor that runs a python script in the custom image.
script_processor = ScriptProcessor(
    image_uri=docker_repo,
    command=["python"],
    env={'mode': 'python', 'PYTHONPATH': '/opt/ml/code'},
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days
    volume_size_in_gb=250,
    network_config=NetworkConfig(enable_network_isolation=False),
    base_job_name="ppi-ensemble-test",
)

# Command line handed to the prediction script: three positional paths
# (presumably data dir, models dir, output dir - verify against the script)
# followed by the options.
batch_predict_arguments = [
    sm_local_input_data,
    sm_local_input_models,
    sm_local_output,
    "--datasetfactory", "datasets.ppi_multiclass_error_analysis_dataset_factory.PpiMulticlassErrorAnalysisDatasetFactory",
    "--ensemble", "1",
    "--tokenisor_data_dir", sm_local_input_vocab,
    "--protein_name_replacer_random_seed", "43",
    "--filter", str(usefilter),
    "--batch", "32",
    "--filterstdthreshold", str(filter_threshold_std),
    "--log-level", "DEBUG",
]

# The data input is the only one that sets s3_data_type and is sharded by
# S3 key; models and vocab are fully replicated.
processing_inputs = [
    ProcessingInput(
        source=s3_input_data,
        s3_data_type=s3_data_type,
        destination=sm_local_input_data,
        s3_data_distribution_type="ShardedByS3Key",
    ),
    ProcessingInput(
        source=s3_input_models,
        destination=sm_local_input_models,
        s3_data_distribution_type="FullyReplicated",
    ),
    ProcessingInput(
        source=s3_input_vocab,
        destination=sm_local_input_vocab,
        s3_data_distribution_type="FullyReplicated",
    ),
]

# Everything written to the container output directory goes to the
# predictions prefix on S3.
processing_outputs = [
    ProcessingOutput(
        source=sm_local_output,
        destination=s3_output_predictions,
        output_name='predictions',
    ),
]

script_processor.run(
    code='../src/inference/ppi_multiclass_batch_predict.py',
    arguments=batch_predict_arguments,
    inputs=processing_inputs,
    outputs=processing_outputs,
)



In [11]:
# Display the S3 prefix used as the destination of the job's predictions.
s3_output_predictions

's3://aegovan-data/pubmed_asbtract/predictions_test_test_2021-10-01/'