In [1]:
import sagemaker
import boto3
from uuid import uuid4
import os
import shutil
from datetime import datetime
# AWS context for the notebook: SageMaker session, caller account id and region.
sagemaker_session = sagemaker.Session()
region = boto3.session.Session().region_name
account_id = (
    boto3.client("sts")
    .get_caller_identity()
    .get("Account")
)

# The execution role ARN is assembled from the caller's account id; the
# alternative of calling sagemaker.get_execution_role() is not used here.
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"


In [2]:
# Switch for the model-packaging step; the step is skipped while this is False.
PACKAGE_MODEL = False

In [3]:
# Local working directory; the transformers checkout is placed underneath it.
temp_dir = "temp"

In [4]:
# Local directory that the Hugging Face transformers repository is cloned into.
transformer_examples_dir = os.path.join(temp_dir, "hugging_face_example")


### 2. Setup image and instance type

In [5]:
# Container image (repository:tag) and the AWS account id of the registry hosting it.
custom_image_name = "huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04"
image_account_id = "763104351884"

# Instance type and count used for the jobs.
instance_type = "ml.p3.2xlarge"
instance_count = 1

# Lookup keyed by instance type - presumably the number of GPUs per instance.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [6]:
# Full ECR image URI: <registry account>.dkr.ecr.<region>.amazonaws.com/<repository:tag>.
docker_repo = f"{image_account_id}.dkr.ecr.{region}.amazonaws.com/{custom_image_name}"

### 3. Configure train, test and validation datasets



In [7]:
# S3 bucket that the data, model and output locations in this notebook are built on.
bucket = "aegovan-data"

In [55]:
# All S3 locations are derived from `bucket`, so pointing the notebook at a
# different bucket only requires changing that one variable.

# Pretrained BERT artifacts (bert_base_cased prefix).
pretrained_bert = f"s3://{bucket}/embeddings/bert_base_cased/"

# GLUE training data.
trainfile = f"s3://{bucket}/glue_data/train.tsv"

# Prediction inputs: matched dev set, a shuffled copy of it, and the mismatched dev set.
val_matched_file = f"s3://{bucket}/glue_data_pred/dev_matched.csv"
val_matched_shuffle_file = f"s3://{bucket}/glue_data_pred/dev_matched_shuffled.csv"
val_mismatched_file = f"s3://{bucket}/glue_data_pred/dev_mismatched.csv"

# Name of the previously trained model run that the paths below refer to.
model_version = "mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327"

# Raw model archive, the destination of its packaged form, and the prefix
# holding config.json / vocab.txt (the same prefix as the pretrained BERT files).
s3_model_path = f"s3://{bucket}/mnli_sagemakerresults/{model_version}/output/model.tar.gz"
s3_model_package_path = f"s3://{bucket}/models/{model_version}/output"
s3_model_config_vocab_path = pretrained_bert

# Job outputs, uploaded source code, and a per-run unique checkpoint prefix.
s3_output_path = f"s3://{bucket}/gluebenchmark_sagemakerresults/"
s3_code_path = f"s3://{bucket}/gbucket_code"
s3_checkpoint = f"s3://{bucket}/mnli_bert_checkpoint/{uuid4()}"

## Run training as a processing job

### Get the training script

In [9]:
# Start every run from a clean, empty checkout directory.
if os.path.exists(transformer_examples_dir):
    shutil.rmtree(transformer_examples_dir)
# Create the directory (and its parent) whether or not an earlier checkout
# existed, so the state after this cell is the same on first and later runs.
os.makedirs(transformer_examples_dir, exist_ok=True)

In [10]:
!git clone https://github.com/huggingface/transformers $transformer_examples_dir
!git -C $transformer_examples_dir checkout tags/v4.12.3

Cloning into 'temp/hugging_face_example'...
remote: Enumerating objects: 101038, done.[K
remote: Total 101038 (delta 0), reused 0 (delta 0), pack-reused 101038[K
Receiving objects: 100% (101038/101038), 86.53 MiB | 1.30 MiB/s, done.
Resolving deltas: 100% (73350/73350), done.
Note: switching to 'tags/v4.12.3'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 3ea15d278 Style


In [11]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor that runs a Python script inside the custom Hugging Face image.
# NOTE(review): py_version is "py36" while the image tag contains "py38";
# presumably py_version is not used for image selection because image_uri is
# passed explicitly - confirm.
framework_processor = FrameworkProcessor(
    HuggingFace,
    framework_version=None,
    image_uri=docker_repo,
    py_version="py36",
    command=["python"],
    code_location=s3_code_path,
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size_in_gb=250,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days expressed in seconds
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    network_config=NetworkConfig(enable_network_isolation=False),
)



## Run base MNLI

In [12]:


# Container-side paths. Only the output path is used by this job; the input
# path names are still assigned so they stay defined for anything referring to them.
sm_local_input_models = "/opt/ml/processing/input/data/models"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"

sm_local_output = "/opt/ml/processing/output"

# Job name carries a minute-resolution timestamp.
job_name = "gluebenchmark-bertbase-{}".format(datetime.now().strftime("%Y%m%d%H%M"))

# Fine-tune bert-base-cased on MNLI with the run_glue.py example script.
# wait=False submits the job without blocking the notebook.
framework_processor.run(
    job_name=job_name,
    wait=False,
    code="run_glue.py",
    source_dir=f"{transformer_examples_dir}/examples/pytorch/text-classification",
    arguments=[
        "--task_name", "mnli",
        "--model_name_or_path", "bert-base-cased",
        "--do_train", "1",
        "--do_eval", "1",
        "--do_predict", "1",
        "--max_seq_length", str(512),
        "--per_device_train_batch_size", str(8),
        "--gradient_accumulation_steps", str(4),
        "--learning_rate", str(2e-5),
        "--num_train_epochs", str(3),
        "--output_dir", sm_local_output,
        "--overwrite_output_dir", "1",
        # Load the best model when training finishes (default metric is loss).
        "--load_best_model_at_end", "1",
        "--eval_steps", "200",
        "--save_steps", "200",
        # A checkpoint is written every 200 steps into output_dir, and that
        # directory is uploaded in full when the job ends. Cap the number of
        # retained checkpoints so they do not accumulate for the whole run.
        "--save_total_limit", "2",
        "--evaluation_strategy", "steps",
        "--disable_tqdm", "1",
    ],
    # No extra inputs are mounted; the model and the task are passed by name
    # in the arguments above.
    inputs=[],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination="{}/{}".format(s3_output_path.rstrip("/"), job_name),
            output_name="predictions",
        )
    ],
)


Job Name:  gluebenchmark-bertbase-202202201547
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/gluebenchmark-bertbase-202202201547/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/gluebenchmark-bertbase-202202201547/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'predictions', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://aegovan-data/gluebenchmark_sagemakerresults/gluebenchmark-bertbase-202202201547', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]


## Run with reverse train

### Run model packaging

In [13]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Container-side mount points used by the packaging job.
sm_local_input_model = "/opt/ml/processing/input/data/model"
sm_local_input_config_vocab = "/opt/ml/processing/input/data/config_vocab"
sm_local_output = "/opt/ml/processing/output"

# Packaging only runs when the PACKAGE_MODEL switch is on.
if PACKAGE_MODEL:
    # CPU instance for packaging; the job name is derived from base_job_name.
    framework_processor = FrameworkProcessor(
        HuggingFace,
        framework_version=None,
        image_uri=docker_repo,
        py_version="py36",
        command=["python"],
        code_location=s3_code_path,
        role=role,
        instance_type="ml.m5.large",
        instance_count=1,
        volume_size_in_gb=250,
        max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days expressed in seconds
        env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
        network_config=NetworkConfig(enable_network_isolation=False),
        base_job_name="model-packaging",
    )

    # The script receives the model archive plus config.json / vocab.txt and
    # writes into the output directory, which is uploaded to s3_model_package_path.
    framework_processor.run(
        code="model_package_bert_utils.py",
        source_dir="../src/utils",
        arguments=[
            "--modeltarfile", sm_local_input_model + "/model.tar.gz",
            "--modelconfigfile", sm_local_input_config_vocab + "/config.json",
            "--vocabfile", sm_local_input_config_vocab + "/vocab.txt",
            "--outdir", sm_local_output,
        ],
        inputs=[
            ProcessingInput(
                source=s3_model_path,
                s3_data_type="S3Prefix",
                destination=sm_local_input_model,
                s3_data_distribution_type="FullyReplicated",
            ),
            ProcessingInput(
                source=s3_model_config_vocab_path,
                destination=sm_local_input_config_vocab,
                s3_data_distribution_type="FullyReplicated",
            ),
        ],
        outputs=[
            ProcessingOutput(
                source=sm_local_output,
                destination=s3_model_package_path,
                output_name="predictions",
            )
        ],
    )

### Train with reverse MNLI

In [99]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor that runs a Python script inside the custom Hugging Face image.
framework_processor = FrameworkProcessor(
    HuggingFace,
    framework_version=None,
    image_uri=docker_repo,
    code_location=s3_code_path,
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days expressed in seconds
    volume_size_in_gb=250,
    network_config=NetworkConfig(enable_network_isolation=False),
)

# Job name carries a minute-resolution timestamp.
job_name = "gluebenchmark-reversemnli-{}".format(datetime.now().strftime("%Y%m%d%H%M"))

# Container-side paths. This job uses the model input path and the output
# path; the other names are still assigned so they stay defined.
sm_local_input_model = "/opt/ml/processing/input/data/model"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"

sm_local_output = "/opt/ml/processing/output"

# Fine-tune on MNLI starting from the packaged model mounted at
# sm_local_input_model. wait=False submits the job without blocking the notebook.
framework_processor.run(
    job_name=job_name,
    wait=False,
    code="run_glue.py",
    source_dir=f"{transformer_examples_dir}/examples/pytorch/text-classification",
    arguments=[
        "--task_name", "mnli",
        "--model_name_or_path", sm_local_input_model,
        "--do_train", "1",
        "--do_eval", "1",
        "--do_predict", "1",
        "--max_seq_length", str(512),
        "--per_device_train_batch_size", str(8),
        "--gradient_accumulation_steps", str(4),
        "--learning_rate", str(2e-5),
        "--num_train_epochs", str(10),
        "--output_dir", sm_local_output,
        "--overwrite_output_dir", "1",
        # Load the best model when training finishes (default metric is loss).
        "--load_best_model_at_end", "1",
        "--eval_steps", "200",
        "--save_steps", "200",
        # A checkpoint is written every 200 steps into output_dir, and that
        # directory is uploaded in full when the job ends. Cap the number of
        # retained checkpoints so they do not accumulate over all 10 epochs.
        "--save_total_limit", "2",
        "--evaluation_strategy", "steps",
        "--disable_tqdm", "1",
    ],
    inputs=[
        # Packaged model, copied to the path given as --model_name_or_path.
        ProcessingInput(
            source=s3_model_package_path,
            destination=sm_local_input_model,
            s3_data_distribution_type="FullyReplicated",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination="{}/{}".format(s3_output_path.rstrip("/"), job_name),
            output_name="predictions",
        )
    ],
)


Job Name:  gluebenchmark-reversemnli-202202271646
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/models/mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327/output', 'LocalPath': '/opt/ml/processing/input/data/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/gluebenchmark-reversemnli-202202271646/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/gluebenchmark-reversemnli-202202271646/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDi

## Inference

In [64]:
# Every experiment pairs one model location with one prediction file.
# Keys follow "glue-<model tag>-pred-dev-m<data suffix>"; the table is the
# cross product of the two datasets (outer loop) and the four models (inner loop).
inference_experiments = {
    f"glue-{model_tag}-pred-dev-m{data_suffix}": {"model": model_uri, "data": data_uri}
    for data_suffix, data_uri in (
        ("", val_matched_file),
        ("-shuf", val_matched_shuffle_file),
    )
    for model_tag, model_uri in (
        ("reverse-mnli-pretrained", s3_model_package_path),
        ("reverse-mnli-finetuned", "s3://aegovan-data/gluebenchmark_sagemakerresults/gluebenchmark-reversemnli-202202201547/"),
        ("bert-base-pretrained", pretrained_bert),
        ("bert-base-finetuned", "s3://aegovan-data/gluebenchmark_sagemakerresults/gluebenchmark-bertbase-202202201547/"),
    )
}

In [97]:
# Key into inference_experiments selecting the model/data pair for the inference job.
experiment = "glue-bert-base-finetuned-pred-dev-m"

In [98]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor that runs a Python script inside the custom Hugging Face image.
framework_processor = FrameworkProcessor(
    HuggingFace,
    framework_version=None,
    image_uri=docker_repo,
    code_location=s3_code_path,
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days expressed in seconds
    volume_size_in_gb=250,
    network_config=NetworkConfig(enable_network_isolation=False),
)

# Resolve the selected experiment into a job name, a model location and a data file.
job_name = "{}-{}".format(experiment, datetime.now().strftime("%Y%m%d%H%M"))
s3_inference_model = inference_experiments[experiment]["model"]
prediction_s3_file = inference_experiments[experiment]["data"]

# Container-side paths. This job uses the model and data input paths and the
# output path; sm_local_input_vocab is still assigned so it stays defined.
sm_local_input_model = "/opt/ml/processing/input/data/model"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"

sm_local_output = "/opt/ml/processing/output"

# Path of the prediction file inside the container: the data mount point plus
# the file name taken from the end of the S3 URI.
predict_local_file = "{}/{}".format(sm_local_input_data.rstrip("/"), prediction_s3_file.split("/")[-1])

# Evaluate and predict only (no training). --task_name is deliberately not
# passed; the same local file is given as train, validation and test file.
framework_processor.run(
    job_name=job_name,
    wait=False,
    code="run_glue.py",
    source_dir=f"{transformer_examples_dir}/examples/pytorch/text-classification",
    arguments=[
        "--model_name_or_path", sm_local_input_model,
        "--do_train", "0",
        "--do_eval", "1",
        "--do_predict", "1",
        "--train_file", predict_local_file,
        "--validation_file", predict_local_file,
        "--test_file", predict_local_file,
        "--max_seq_length", str(512),
        # Training is disabled, so the batch size that matters here is the
        # evaluation/prediction one; the training batch size has no effect.
        "--per_device_eval_batch_size", str(32),
        "--output_dir", sm_local_output,
        "--disable_tqdm", "1",
    ],
    inputs=[
        # File to run predictions on.
        ProcessingInput(
            source=prediction_s3_file,
            destination=sm_local_input_data,
            s3_data_distribution_type="FullyReplicated",
        ),
        # Model to evaluate, copied to the path given as --model_name_or_path.
        ProcessingInput(
            source=s3_inference_model,
            destination=sm_local_input_model,
            s3_data_distribution_type="FullyReplicated",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination="{}/{}".format(s3_output_path.rstrip("/"), job_name),
            output_name="predictions",
        )
    ],
)


Job Name:  glue-bert-base-finetuned-pred-dev-m-202202271506
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/glue_data_pred/dev_matched.csv', 'LocalPath': '/opt/ml/processing/input/data/jsonlines', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gluebenchmark_sagemakerresults/gluebenchmark-bertbase-202202201547/', 'LocalPath': '/opt/ml/processing/input/data/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/glue-bert-base-finetuned-pred-dev-m-202202271506/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'Fu