### SageMaker Counterfactual prediction

In [1]:
import sagemaker
import boto3

# Resolve the AWS context shared by the rest of the notebook: the SageMaker
# session, the caller's account id (via STS) and the default boto3 region.
sagemaker_session = sagemaker.Session()
account_id = boto3.client("sts").get_caller_identity().get("Account")
region = boto3.session.Session().region_name

# The execution role is pinned to a fixed role name inside the caller's account
# instead of being looked up with sagemaker.get_execution_role().
role = (
    f"arn:aws:iam::{account_id}:role/service-role/"
    "AmazonSageMaker-ExecutionRole-20190118T115449"
)


In [2]:
# Tag of the custom container image, and the image name (repository:tag) built from it.
version_tag = "202304091951"
pytorch_custom_image_name = "large-scale-ptm-ppi:gpu-{}".format(version_tag)

# Compute used for the processing job.
instance_type, instance_count = "ml.g4dn.2xlarge", 1



In [3]:
# ECR image URI assembled from the account id, region and image name (repository:tag).
docker_repo = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{pytorch_custom_image_name}"

In [4]:
# Name of the S3 bucket holding this project's data.
# NOTE(review): the S3 URIs in the cells visible in this section spell the bucket
# name out literally rather than reading this variable.
bucket = "aegovan-data"

In [5]:

# S3 location of each evaluation file, keyed by a short dataset name.
dataset = dict(
    yelp="s3://aegovan-data/yelp_polarity/test.csv",
    amazon="s3://aegovan-data/amazon_reviews_polarity/test.csv",
    semeval="s3://aegovan-data/semeval2017taskb/SemEval2017-task4-test.subtask-BD.english.txt",
)

# Dataset selected for this run; eval_file is its S3 URI.
dataset_type = "semeval"
eval_file = dataset[dataset_type]



In [6]:
import datetime

# Hour-resolution timestamp string (YYYYMMDDHH) taken from the current local time.
date_fmt = f"{datetime.datetime.today():%Y%m%d%H}"

In [7]:
# Training job whose model artefact is scored in this run.
# Alternative job names kept from earlier runs (original notes preserved):
#   "counterfact-imdb-original-2023-04-09-19-03-45-522"  # .90 adv
#   "counterfact-imdb-simple-2023-04-03-01-35-49-441"    # .25 adv
#   "counterfact-imdb-simple-2023-04-09-02-17-57-367"    # .10 adv
training_job = "counterfact-imdb-0-0-0-2023-04-09-21-10-10-142"

# Model archive of that training job.
s3_model_path = "s3://aegovan-data/sagemakerresults/{}/output/model.tar.gz".format(training_job)

# Output prefix for the predictions: one folder per dataset, training job and timestamp.
s3_output_predictions = (
    f"s3://aegovan-data/counterfactual_{dataset_type}"
    f"/predictions_{training_job}/{date_fmt}"
)



### Run prediction

In [8]:
# S3 inputs handed to the processing job: evaluation data, trained model, tokenizer vocab.
s3_input_data, s3_data_type = eval_file, "S3Prefix"
s3_input_models = s3_model_path
s3_input_vocab = "s3://aegovan-data/pretrained_models/bert-base-uncased/"

# Filter settings; not referenced by the cells visible in this section.
usefilter, filter_threshold_std = 0, 1.0

In [9]:
# Echo the resolved input URI and S3 data type as a quick visual check.
s3_input_data, s3_data_type

('s3://aegovan-data/semeval2017taskb/SemEval2017-task4-test.subtask-BD.english.txt',
 'S3Prefix')

In [10]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor

# Processor that runs a Python script inside the custom image on the
# configured instance type/count.
script_processor = ScriptProcessor(
    image_uri=docker_repo,
    command=["python"],
    env={'mode': 'python', 'PYTHONPATH': '/opt/ml/code'},
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # five days, expressed in seconds
    volume_size_in_gb=200,
    network_config=NetworkConfig(enable_network_isolation=False),
    base_job_name=f"{dataset_type}-inference",
)

# Container-side directories that the S3 inputs are mapped to.
sm_local_input_models = "/opt/ml/processing/input/data/models"
sm_local_input_data = "/opt/ml/processing/input/data/jsondata"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"

# Container-side directory whose contents are uploaded to s3_output_predictions.
sm_local_output = "/opt/ml/processing/output"

# Last path component of the data input URI. It is derived from s3_input_data,
# the same URI passed as the ProcessingInput source below, so the file path
# handed to the script always matches the input that is actually downloaded.
input_file_name = s3_input_data.split("/")[-1]

script_processor.run(
    code='../src/inference/main_predict_sentiment_polarity.py',

    # Command-line arguments forwarded to the script, in the order given here.
    arguments=[
        f"{sm_local_input_data}/{input_file_name}",
        sm_local_input_models,
        sm_local_output,
        dataset_type,
        "--tokenisor_data_dir", sm_local_input_vocab,
        "--batch", "32",
    ],

    inputs=[
        # Evaluation data.
        ProcessingInput(
            source=s3_input_data,
            s3_data_type=s3_data_type,
            destination=sm_local_input_data,
            s3_data_distribution_type="ShardedByS3Key",
        ),
        # Trained model archive.
        ProcessingInput(
            source=s3_input_models,
            destination=sm_local_input_models,
            s3_data_distribution_type="FullyReplicated",
        ),
        # Pretrained tokenizer/vocabulary files.
        ProcessingInput(
            source=s3_input_vocab,
            destination=sm_local_input_vocab,
            s3_data_distribution_type="FullyReplicated",
        ),
    ],

    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination=s3_output_predictions,
            output_name='predictions',
        )
    ],
)





Job Name:  semeval-inference-2023-04-10-00-47-05-244
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/semeval2017taskb/SemEval2017-task4-test.subtask-BD.english.txt', 'LocalPath': '/opt/ml/processing/input/data/jsondata', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/sagemakerresults/counterfact-imdb-0-0-0-2023-04-09-21-10-10-142/output/model.tar.gz', 'LocalPath': '/opt/ml/processing/input/data/models', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-3', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/pretrained_models/bert-base-uncased/', 'LocalPath': '/opt/ml/processing/input/data/vocab', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'Ful