In [1]:
import sagemaker
import boto3
from uuid import uuid4
import os
import shutil

# SageMaker session plus the caller's AWS account id and default region;
# the account id is used to build the role ARN below.
sagemaker_session = sagemaker.Session()
account_id = boto3.client("sts").get_caller_identity().get("Account")
region = boto3.session.Session().region_name

# Fixed execution role in the caller's account.
# Alternative when running inside SageMaker: role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
max_runs = 1

In [2]:
# Local scratch directory (relative to the notebook's working directory).
temp_dir = "temp"

In [3]:
# Local directory the Hugging Face transformers repository is cloned into.
transformer_examples_dir = os.path.join(temp_dir, "hugging_face_example")


### 2. Set up image and instance type

In [4]:
# Container image (repository:tag) used for the GPU jobs, and the account id
# that is used as the ECR registry prefix when the image URI is built.
custom_image_name = "huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04"
image_account_id = "763104351884"

# Instance configuration for the GPU jobs.
instance_type = "ml.p3.2xlarge"
# Presumably the number of GPUs per instance type -- TODO confirm.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}
instance_count = 1

In [5]:
# Full ECR image URI: <registry account>.dkr.ecr.<region>.amazonaws.com/<repository:tag>.
docker_repo = f"{image_account_id}.dkr.ecr.{region}.amazonaws.com/{custom_image_name}"

### 3. Configure train, test and validation datasets



In [6]:
# S3 bucket that holds the datasets, model artifacts, uploaded code and job outputs.
bucket = "aegovan-data"

In [7]:
# S3 locations for this experiment. Every path is derived from `bucket`, so
# pointing the notebook at another bucket only requires changing that one value.

# bert_base_cased embeddings prefix.
pretrained_bert = f"s3://{bucket}/embeddings/bert_base_cased/"

# Datasets.
trainfile = f"s3://{bucket}/glue_dataset/train/multinli_1.0_train.jsonl"
# Alternative validation file (full dev-matched set):
# valfile = f"s3://{bucket}/mnli_dataset/val/multinli_1.0_dev_matched.jsonl"
# Alternative training file (mini set):
# trainfile = f"s3://{bucket}/mnli_dataset_mini/train/multinli.jsonl"
# NOTE(review): the active validation file lives under glue_dataset_mini/train/ --
# confirm this is the intended validation set.
valfile = f"s3://{bucket}/glue_dataset_mini/train/multinli.jsonl"

# Name of the previously trained model whose artifacts are packaged and fine-tuned below.
model_version = "mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327"

# Model artifacts: the raw training output, the packaged model, and the
# config.json / vocab.txt location used when packaging.
s3_model_path = f"s3://{bucket}/mnli_sagemakerresults/{model_version}/output/model.tar.gz"
s3_model_package_path = f"s3://{bucket}/models/{model_version}/output"
s3_model_config_vocab_path = f"s3://{bucket}/embeddings/bert_base_cased/"

# Job outputs, uploaded source code, and a checkpoint prefix that is unique per
# execution of this cell (random UUID).
s3_output_path = f"s3://{bucket}/gluebenchmark_sagemakerresults/{model_version}/"
s3_code_path = f"s3://{bucket}/gbucket_code"
s3_checkpoint = f"s3://{bucket}/mnli_bert_checkpoint/{uuid4()}"

## Run training as a processing job

### Get the training script

In [8]:
# Start from an empty checkout directory: drop any previous clone, then make
# sure the directory exists. The directory is created unconditionally so the
# cell leaves the same state whether or not an earlier clone was present.
if os.path.exists(transformer_examples_dir):
    shutil.rmtree(transformer_examples_dir)
os.makedirs(transformer_examples_dir, exist_ok=True)

In [9]:
!git clone https://github.com/huggingface/transformers $transformer_examples_dir
!git -C $transformer_examples_dir checkout tags/v4.12.3

Cloning into 'temp/hugging_face_example'...
remote: Enumerating objects: 101039, done.[K
remote: Counting objects: 100% (82/82), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 101039 (delta 45), reused 44 (delta 28), pack-reused 100957[K
Receiving objects: 100% (101039/101039), 86.59 MiB | 15.83 MiB/s, done.
Resolving deltas: 100% (73330/73330), done.
Note: switching to 'tags/v4.12.3'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 3ea15d278 Style


In [10]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor for the GLUE run: executes the uploaded script with `python` inside
# the custom image on `instance_count` x `instance_type`.
# NOTE(review): py_version is "py36" while the image tag contains "py38";
# presumably it has no effect because image_uri is given explicitly -- confirm.
framework_processor = FrameworkProcessor(
    HuggingFace,
    framework_version=None,
    image_uri=docker_repo,
    code_location=s3_code_path,
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days
    volume_size_in_gb=250,
    network_config=NetworkConfig(enable_network_isolation=False),
    base_job_name="glue-processing",
)



## Run base MNLI

In [11]:

# Paths inside the processing container. The three input paths are declared
# here but this job passes no inputs.
sm_local_input_models = "/opt/ml/processing/input/data/models"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"
sm_local_output = "/opt/ml/processing/output"

# Baseline: run the Hugging Face `run_glue.py` example on MNLI starting from
# the stock `bert-base-cased` model name.
# NOTE(review): results are written to `s3_output_path`, the same destination
# the reverse-MNLI run uses -- confirm the two runs are not meant to be kept apart.
framework_processor.run(
    code="run_glue.py",
    source_dir=f"{transformer_examples_dir}/examples/pytorch/text-classification",
    arguments=[
        "--task_name", "mnli",
        "--model_name_or_path", "bert-base-cased",
        "--do_train", "1",
        "--do_eval", "1",
        "--do_predict", "1",
        "--max_seq_length", "512",
        "--per_device_train_batch_size", "8",
        "--gradient_accumulation_steps", "4",
        "--learning_rate", "2e-05",
        "--num_train_epochs", "3",
        "--output_dir", sm_local_output,
        "--overwrite_output_dir", "1",
        # Reload the best checkpoint once training ends (default metric is loss).
        "--load_best_model_at_end", "1",
        # Evaluate and save on the same 200-step cadence.
        "--eval_steps", "200",
        "--save_steps", "200",
        "--evaluation_strategy", "steps",
        "--disable_tqdm", "1",
    ],
    # No ProcessingInputs: the data / model / vocab inputs are disabled for this run.
    inputs=[],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination=s3_output_path,
            output_name="predictions",
        )
    ],
)


Job Name:  glue-processing-2022-02-19-17-25-50-403
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/glue-processing-2022-02-19-17-25-50-403/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/glue-processing-2022-02-19-17-25-50-403/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'predictions', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://aegovan-data/gluebenchmark_sagemakerresults/mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327/', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]


[34m#015Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]#015Downloading:   0%|          | 52.2k/313M [00:00<15:51, 329kB/s]#015Downloading:   0%|          | 296k/313M [00:00<05:00, 1.04MB/s]#015Downloading:   0%|          | 1.25M/313M [00:00<01:34, 3.29MB/s]#015Downloading:   1%|          | 3.46M/313M [00:00<00:35, 8.71MB/s]#015Downloading:   2%|▏         | 7.14M/313M [00:00<00:18, 16.9MB/s]#015Downloading:   4%|▍         | 12.1M/313M [00:00<00:11, 26.7MB/s]#015Downloading:   5%|▌         | 16.4M/313M [00:00<00:09, 31.4MB/s]#015Downloading:   7%|▋         | 21.4M/313M [00:00<00:07, 36.8MB/s]#015Downloading:   8%|▊         | 26.1M/313M [00:01<00:07, 39.9MB/s]#015Downloading:  10%|█         | 31.5M/313M [00:01<00:06, 44.0MB/s]#015Downloading:  12%|█▏        | 36.4M/313M [00:01<00:06, 44.1MB/s]#015Downloading:  13%|█▎        | 41.8M/313M [00:01<00:05, 46.8MB/s]#015Downloading:  15%|█▌        | 47.1M/313M [00:01<00:05, 48.7MB/s]#015Downloading:  17%|█▋        | 52.5M/313M [00:01<00

KeyboardInterrupt: 

## Run with reverse train

### Run model packaging

In [None]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor for the model-packaging job: same image and settings as the GLUE
# processors, but on a single ml.m5.large instance.
# NOTE(review): py_version is "py36" while the image tag contains "py38";
# presumably it has no effect because image_uri is given explicitly -- confirm.
framework_processor = FrameworkProcessor(
    HuggingFace,
    framework_version=None,
    image_uri=docker_repo,
    code_location=s3_code_path,
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days
    volume_size_in_gb=250,
    network_config=NetworkConfig(enable_network_isolation=False),
    base_job_name="model-packaging",
)

# Paths inside the processing container.
sm_local_input_model = "/opt/ml/processing/input/data/model"
sm_local_input_config_vocab = "/opt/ml/processing/input/data/config_vocab"
sm_local_output = "/opt/ml/processing/output"

# Run `model_package_bert_utils.py` on the trained model archive together with
# the config.json and vocab.txt files; whatever the script writes to the output
# directory is uploaded to `s3_model_package_path`.
framework_processor.run(
    code="model_package_bert_utils.py",
    source_dir="../src/utils",
    arguments=[
        "--modeltarfile", f"{sm_local_input_model}/model.tar.gz",
        "--modelconfigfile", f"{sm_local_input_config_vocab}/config.json",
        "--vocabfile", f"{sm_local_input_config_vocab}/vocab.txt",
        "--outdir", sm_local_output,
    ],
    inputs=[
        # Trained model archive (model.tar.gz).
        ProcessingInput(
            source=s3_model_path,
            s3_data_type="S3Prefix",
            destination=sm_local_input_model,
            s3_data_distribution_type="FullyReplicated",
        ),
        # Location holding config.json and vocab.txt.
        ProcessingInput(
            source=s3_model_config_vocab_path,
            destination=sm_local_input_config_vocab,
            s3_data_distribution_type="FullyReplicated",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination=s3_model_package_path,
            output_name="predictions",
        )
    ],
)

### Train with reverse MNLI

In [12]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor for the reverse-MNLI run: executes the uploaded script with `python`
# inside the custom image on `instance_count` x `instance_type`.
# NOTE(review): py_version is "py36" while the image tag contains "py38";
# presumably it has no effect because image_uri is given explicitly -- confirm.
framework_processor = FrameworkProcessor(
    HuggingFace,
    framework_version=None,
    image_uri=docker_repo,
    code_location=s3_code_path,
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days
    volume_size_in_gb=250,
    network_config=NetworkConfig(enable_network_isolation=False),
    base_job_name="glue-reverse-mnli",
)






# Paths inside the processing container. Only the model input and the output
# directory are used by this job; the data and vocab paths are declared but unused.
sm_local_input_model = "/opt/ml/processing/input/data/model"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"
sm_local_output = "/opt/ml/processing/output"

# Same `run_glue.py` MNLI recipe as the baseline, except the model is loaded
# from the packaged model copied in from `s3_model_package_path`.
framework_processor.run(
    code="run_glue.py",
    source_dir=f"{transformer_examples_dir}/examples/pytorch/text-classification",
    arguments=[
        "--task_name", "mnli",
        "--model_name_or_path", sm_local_input_model,
        "--do_train", "1",
        "--do_eval", "1",
        "--do_predict", "1",
        "--max_seq_length", "512",
        "--per_device_train_batch_size", "8",
        "--gradient_accumulation_steps", "4",
        "--learning_rate", "2e-05",
        "--num_train_epochs", "3",
        "--output_dir", sm_local_output,
        "--overwrite_output_dir", "1",
        # Reload the best checkpoint once training ends (default metric is loss).
        "--load_best_model_at_end", "1",
        # Evaluate and save on the same 200-step cadence.
        "--eval_steps", "200",
        "--save_steps", "200",
        "--evaluation_strategy", "steps",
        "--disable_tqdm", "1",
    ],
    inputs=[
        # Packaged model directory; the separate data and vocab inputs are disabled.
        ProcessingInput(
            source=s3_model_package_path,
            destination=sm_local_input_model,
            s3_data_distribution_type="FullyReplicated",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination=s3_output_path,
            output_name="predictions",
        )
    ],
)


Job Name:  glue-reverse-mnli-2022-02-19-17-33-44-928
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/models/mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327/output', 'LocalPath': '/opt/ml/processing/input/data/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/glue-reverse-mnli-2022-02-19-17-33-44-928/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/gbucket_code/glue-reverse-mnli-2022-02-19-17-33-44-928/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 

[34m#015Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]#015Downloading:   0%|          | 52.2k/313M [00:00<16:34, 314kB/s]#015Downloading:   0%|          | 296k/313M [00:00<05:10, 1.01MB/s]#015Downloading:   0%|          | 1.25M/313M [00:00<01:37, 3.18MB/s]#015Downloading:   1%|          | 3.40M/313M [00:00<00:37, 8.34MB/s]#015Downloading:   2%|▏         | 6.96M/313M [00:00<00:18, 16.2MB/s]#015Downloading:   4%|▍         | 12.3M/313M [00:00<00:11, 26.9MB/s]#015Downloading:   5%|▌         | 16.1M/313M [00:00<00:10, 29.3MB/s]#015Downloading:   7%|▋         | 21.1M/313M [00:01<00:08, 35.1MB/s]#015Downloading:   8%|▊         | 26.3M/313M [00:01<00:07, 40.3MB/s]#015Downloading:  10%|█         | 31.5M/313M [00:01<00:06, 43.7MB/s]#015Downloading:  12%|█▏        | 36.9M/313M [00:01<00:05, 46.4MB/s]#015Downloading:  13%|█▎        | 41.7M/313M [00:01<00:06, 45.1MB/s]#015Downloading:  15%|█▌        | 47.3M/313M [00:01<00:05, 48.4MB/s]#015Downloading:  17%|█▋        | 52.2M/313M [00:01<00

[34m[INFO|modeling_utils.py:1607] 2022-02-19 17:41:04,429 >> All model checkpoint weights were used when initializing BertForSequenceClassification.[0m
[34mYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.[0m
[34m02/19/2022 17:41:04 - INFO - datasets.arrow_dataset - Caching processed dataset at /root/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-89e6c98322b0322d.arrow[0m
[34m#015Running tokenizer on dataset:   0%|          | 0/393 [00:00<?, ?ba/s]#015Running tokenizer on dataset:   0%|          | 1/393 [00:00<02:45,  2.36ba/s]#015Running tokenizer on dataset:   1%|          | 2/393 [00:00<01:46,  3.66ba/s]#015Running tokenizer on dataset:   1%|          | 3/393 [00:00<01:43,  3.78ba/s]#015Running tokenizer on dataset:   1%|          | 4/393 [00:01<01:28,  4.39ba/s]#015Running tokenizer on dataset:   1%|▏         | 5/393 [00:01<01:20,  4.82ba/s]#015Run

[34m393 [00:36<00:34,  5.80ba/s]#015Running tokenizer on dataset:  49%|████▉     | 193/393 [00:36<00:34,  5.82ba/s]#015Running tokenizer on dataset:  49%|████▉     | 194/393 [00:36<00:34,  5.84ba/s]#015Running tokenizer on dataset:  50%|████▉     | 195/393 [00:37<00:34,  5.80ba/s]#015Running tokenizer on dataset:  50%|████▉     | 196/393 [00:37<00:36,  5.37ba/s]#015Running tokenizer on dataset:  50%|█████     | 197/393 [00:37<00:35,  5.57ba/s]#015Running tokenizer on dataset:  50%|█████     | 198/393 [00:37<00:34,  5.71ba/s]#015Running tokenizer on dataset:  51%|█████     | 199/393 [00:37<00:33,  5.78ba/s]#015Running tokenizer on dataset:  51%|█████     | 200/393 [00:38<00:33,  5.82ba/s]#015Running tokenizer on dataset:  51%|█████     | 201/393 [00:38<00:32,  5.84ba/s]#015Running tokenizer on dataset:  51%|█████▏    | 202/393 [00:38<00:32,  5.87ba/s]#015Running tokenizer on dataset:  52%|█████▏    | 203/393 [00:38<00:32,  5.86ba/s]#015Running tokenizer on dataset:  52%|█████▏    | 204

KeyboardInterrupt: 