### SageMaker Counterfactual prediction

In [1]:
import sagemaker
import boto3

# Resolve the AWS context (SageMaker session, region, account) that the later
# cells use to build the image URI and to launch the processing job.
sagemaker_session = sagemaker.Session()
region = boto3.session.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# The execution role is pinned by name inside the caller's account.
# Alternative (left disabled): role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"


In [2]:
# Job configuration: which image tag to run and on what hardware.
version_tag = "202304091951"
pytorch_custom_image_name = "large-scale-ptm-ppi:gpu-{}".format(version_tag)

instance_count = 1
instance_type = "ml.m5.2xlarge"  # alternative: ml.g4dn.2xlarge



In [3]:
# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<name:tag>
docker_repo = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{pytorch_custom_image_name}"

In [4]:
# Name of the S3 bucket; the output prefix defined below lives in this same bucket.
bucket = 'aegovan-data'

In [5]:
import datetime
# Hour-resolution local timestamp (YYYYMMDDHH); used below to give the run's
# S3 output its own prefix.
date_fmt = f"{datetime.datetime.now():%Y%m%d%H}"

In [6]:

# S3 prefix that receives this run's output. It is built from `bucket` rather
# than repeating the bucket name, so changing `bucket` redirects the output too.
s3_output_predictions = f"s3://{bucket}/counterfactuals/imdb/{date_fmt}"



### Run data prep

In [7]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor

# Directories as seen from inside the processing container. Only the output
# directory is wired into the job below; the three input locations are defined
# here but are not passed to this run.
sm_local_input_models = "/opt/ml/processing/input/data/models"
sm_local_input_data = "/opt/ml/processing/input/data/jsondata"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"
sm_local_output = "/opt/ml/processing/output"

# Processor that runs a Python script inside the custom ECR image.
script_processor = ScriptProcessor(
    base_job_name="imdbdataprep",
    image_uri=docker_repo,
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    role=role,
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size_in_gb=200,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # five days, expressed in seconds
    network_config=NetworkConfig(enable_network_isolation=False),
)

# Launch the data-prep script. Whatever it writes to the container's output
# directory is published to the S3 prefix in `s3_output_predictions`.
script_processor.run(
    code="../src/utils/counterfactuals_imdb_dataprep.py",
    arguments=["--outputdir", sm_local_output],
    outputs=[
        ProcessingOutput(
            output_name="predictions",
            source=sm_local_output,
            destination=s3_output_predictions,
        )
    ],
)





Job Name:  imdbdataprep-2023-04-15-20-05-20-856
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-324346001917/imdbdataprep-2023-04-15-20-05-20-856/input/code/counterfactuals_imdb_dataprep.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'predictions', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://aegovan-data/counterfactuals/imdb/2023041513', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
......................[34m{'outputdir': '/opt/ml/processing/output', 'log_level': 'INFO'}[0m
[34m2023-04-15 20:09:06,126 - __main__ - INFO - Train, val: ((15409, 3), (3853, 3))[0m
[34m2023-04-15 20:09:06,413 - __main__ - INFO - Counter factual train: (3414, 3)[0m
[34m2023-04-15 20:09:06,559 - __main__ - INFO - Counter factual val: (490, 3)[0m
[34m2023-04-15 20:09:

[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m2023-04-15 20:59:01,926 - __main__ - INFO - {"Unique": 3396, "PosRate": 0.5, "AdvRatePN": 0.0, "AffRateP": 0.0058823529411764705, "AffRateN": 0.001176470588235294, "Total": 3400}[0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m2023-04-15 20:59:22,762 - __main__ - INFO - {"Unique": 680, "PosRate": 0.5, "AdvRatePN": 0.0, "AffRateP": 0.0, "AffRateN": 0.0058823529411764705, "Total": 680}[0m
[34m2023-04-15 20:59:22,773 - __main__ - INFO - Target Total size:3400,  target_neg_samples:1700 target_pos_samples: 1700[0m


[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m2023-04-15 21:32:48,744 - __main__ - INFO - {"Unique": 3396, "PosRate": 0.5, "AdvRatePN": 0.0, "AffRateP": 0.0058823529411764705, "AffRateN": 0.001176470588235294, "Total": 3400}[0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m2023-04-15 21:33:09,708 - __main__ - INFO - {"Unique": 680, "PosRate": 0.5, "AdvRatePN": 0.0, "AffRateP": 0.0, "AffRateN": 0.0, "Total": 680}[0m
[34m2023-04-15 21:33:09,718 - __main__ - INFO - Target Total size:3400,  target_neg_samples:1700 target_pos_samples: 1700[0m
[34m2023-04-15 21

[34m2023-04-15 22:14:52,792 - __main__ - INFO - {"Unique": 3393, "PosRate": 0.5, "AdvRatePN": 0.09411764705882353, "AffRateP": 0.006470588235294118, "AffRateN": 0.002352941176470588, "Total": 3400}[0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m[nltk_data] Downloading package punkt to /root/nltk_data...[0m
[34m[nltk_data]   Package punkt is already up-to-date![0m
[34m2023-04-15 22:15:14,093 - __main__ - INFO - {"Unique": 680, "PosRate": 0.5, "AdvRatePN": 0.1, "AffRateP": 0.0, "AffRateN": 0.0, "Total": 680}[0m
[34m2023-04-15 22:15:14,104 - __main__ - INFO - Target Total size:3400,  target_neg_samples:1700 target_pos_samples: 1700[0m
[34m2023-04-15 22:15:14,106 - __main__ - INFO - DF value counts Negative    7712[0m
[34mPositive    7697[0m
[34mName: Sentiment