In [1]:
import sagemaker
import boto3
from uuid import uuid4
import os
import shutil

# SageMaker session plus the AWS account and region of the current credentials.
sagemaker_session = sagemaker.Session()
_caller_identity = boto3.client("sts").get_caller_identity()
account_id = _caller_identity.get("Account")
region = boto3.session.Session().region_name

# The execution role ARN is built from the caller's account id and a fixed
# role name; sagemaker.get_execution_role() is deliberately left disabled.
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"

max_runs = 1

In [2]:
# Local scratch directory (relative to the notebook's working directory).
temp_dir = 'temp'

In [3]:
# Local directory, under the scratch directory, that will hold the
# Hugging Face transformers checkout.
transformer_examples_dir = os.path.join(
    temp_dir,
    "hugging_face_example",
)


### 2. Set up image and instance type

In [4]:
# Container image (repository:tag) and the AWS account whose ECR registry hosts it.
custom_image_name = "huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04"
image_account_id = "763104351884"

# Instance selection for the job.
instance_type = "ml.p3.2xlarge"
instance_count = 1

# Lookup table from instance type to a GPU count.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [5]:
# Full ECR image URI: <registry account>.dkr.ecr.<region>.amazonaws.com/<repository:tag>
docker_repo = f"{image_account_id}.dkr.ecr.{region}.amazonaws.com/{custom_image_name}"

### 3. Configure train, test and validation datasets



In [6]:
# S3 bucket that every s3:// location in this notebook is built on.
bucket = 'aegovan-data'

In [7]:
# S3 prefix of the pretrained BERT (base, cased) artefacts.
pretrained_bert = f"s3://{bucket}/embeddings/bert_base_cased/"

# Training data. Alternative (mini) training file, kept for reference:
#   trainfile = "s3://{}/mnli_dataset_mini/train/multinli.jsonl".format(bucket)
trainfile = f"s3://{bucket}/glue_dataset/train/multinli_1.0_train.jsonl"

# Validation data currently points at the mini file. Alternative, kept for reference:
#   valfile = "s3://{}/mnli_dataset/val/multinli_1.0_dev_matched.jsonl".format(bucket)
valfile = f"s3://{bucket}/glue_dataset_mini/train/multinli.jsonl"

# Where job results and the packaged source code are written.
s3_output_path = f"s3://{bucket}/glue_sagemakerresults/"
s3_code_path = f"s3://{bucket}/glue_code"

# A fresh random UUID gives every execution of this cell its own checkpoint prefix.
s3_checkpoint = f"s3://{bucket}/mnli_bert_checkpoint/{str(uuid4())}"

## Run training as a SageMaker Processing job

### Get the training script

In [8]:
# Reset the checkout location so the clone that follows starts from an empty
# directory. Any previous checkout is removed first.
if os.path.exists(transformer_examples_dir):
    shutil.rmtree(transformer_examples_dir)

# Create the directory unconditionally so this cell leaves the same state on a
# first run (nothing to delete) as on a re-run (old checkout deleted).
os.makedirs(transformer_examples_dir)

In [9]:
!git clone https://github.com/huggingface/transformers $transformer_examples_dir
!git -C $transformer_examples_dir checkout tags/v4.12.3

Cloning into 'temp/hugging_face_example'...
remote: Enumerating objects: 99654, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 99654 (delta 7), reused 17 (delta 5), pack-reused 99631[K
Receiving objects: 100% (99654/99654), 84.61 MiB | 2.65 MiB/s, done.
Resolving deltas: 100% (72298/72298), done.
Note: switching to 'tags/v4.12.3'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 3ea15d278 Style


In [10]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor that runs a script inside the configured Hugging Face container
# as a SageMaker Processing job.
script_processor = FrameworkProcessor(
    HuggingFace,
    # --- container ---------------------------------------------------------
    image_uri=docker_repo,
    framework_version=None,
    # NOTE(review): the image tag configured earlier in this notebook says
    # py38 while this says py36 - confirm whether the SDK uses this value at
    # all when an explicit image_uri is supplied.
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    # --- compute -----------------------------------------------------------
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size_in_gb=250,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days, expressed in seconds
    # --- job plumbing ------------------------------------------------------
    role=role,
    code_location=s3_code_path,
    network_config=NetworkConfig(enable_network_isolation=False),
    base_job_name="glue-processing",
)



In [None]:

# Paths inside the processing container. These are container paths, so they
# are built with "/" explicitly rather than with the host's path separator.
_sm_processing_root = "/opt/ml/processing"
_sm_input_data_root = f"{_sm_processing_root}/input/data"

sm_local_input_models = f"{_sm_input_data_root}/models"
sm_local_input_data = f"{_sm_input_data_root}/jsonlines"
sm_local_input_vocab = f"{_sm_input_data_root}/vocab"

# Directory used below both as the script's --output_dir and as the
# ProcessingOutput source.
sm_local_output = f"{_sm_processing_root}/output"



# python run_glue.py \
#   --model_name_or_path bert-base-cased \
#   --task_name $TASK_NAME \
#   --do_train \
#   --do_eval \
#   --max_seq_length 128 \
#   --per_device_train_batch_size 32 \
#   --learning_rate 2e-5 \
#   --num_train_epochs 3 \
#   --output_dir /tmp/$TASK_NAME/


# Launch run_glue.py from the transformers checkout as a processing job.
#
# The script is given a model name ("bert-base-cased") and a GLUE task name
# ("mnli") rather than S3 inputs, so no ProcessingInput channels are attached.
# Everything written to `sm_local_output` is uploaded to `s3_output_path`.
script_processor.run(
    code="run_glue.py",
    source_dir=f"{transformer_examples_dir}/examples/pytorch/text-classification",
    arguments=[
        # What to run
        "--task_name", "mnli",
        "--model_name_or_path", "bert-base-cased",
        "--do_train", "1",
        "--do_eval", "1",
        "--do_predict", "1",
        # Optimisation settings (effective train batch per device = 8 * 4)
        "--max_seq_length", str(512),
        "--per_device_train_batch_size", str(8),
        "--gradient_accumulation_steps", str(4),
        "--learning_rate", str(2e-5),
        "--num_train_epochs", str(3),
        # Outputs and checkpointing
        "--output_dir", sm_local_output,
        "--overwrite_output_dir", "1",
        # Load the best model when finished training (default metric is loss).
        "--load_best_model_at_end", "1",
        "--evaluation_strategy", "steps",
        "--eval_steps", "200",
        "--save_steps", "200",
        # Checkpoints are written into the output directory every 200 steps.
        # Without a limit they all accumulate on the job volume and are all
        # uploaded with the job output; keep only the most recent ones.
        "--save_total_limit", "2",
        "--disable_tqdm", "1",
    ],
    inputs=[],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination=s3_output_path,
            output_name="predictions",
        )
    ],
)


Job Name:  glue-processing-2022-02-12-18-55-12-257
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/glue_code/glue-processing-2022-02-12-18-55-12-257/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/glue_code/glue-processing-2022-02-12-18-55-12-257/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'predictions', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://aegovan-data/glue_sagemakerresults/', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
[34mCollecting accelerate
  Downloading accelerate-0.5.1-py3-none-any.

[34m#015Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]#015Downloading:   0%|          | 9.22k/313M [00:00<57:04, 91.3kB/s]#015Downloading:   0%|          | 54.3k/313M [00:00<17:51, 292kB/s] #015Downloading:   0%|          | 124k/313M [00:00<11:02, 472kB/s] #015Downloading:   0%|          | 298k/313M [00:00<05:24, 964kB/s]#015Downloading:   0%|          | 625k/313M [00:00<02:54, 1.79MB/s]#015Downloading:   0%|          | 1.26M/313M [00:00<01:34, 3.29MB/s]#015Downloading:   1%|          | 2.53M/313M [00:00<00:49, 6.31MB/s]#015Downloading:   2%|▏         | 5.08M/313M [00:00<00:25, 12.2MB/s]#015Downloading:   3%|▎         | 9.18M/313M [00:00<00:14, 21.0MB/s]#015Downloading:   4%|▍         | 12.7M/313M [00:01<00:11, 25.4MB/s]#015Downloading:   5%|▌         | 16.7M/313M [00:01<00:10, 29.0MB/s]#015Downloading:   7%|▋         | 20.9M/313M [00:01<00:09, 31.7MB/s]#015Downloading:   8%|▊         | 25.0M/313M [00:01<00:08, 34.6MB/s]#015Downloading:   9%|▉         | 28.5M/313M [00:01<00:0

[34m#015Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]#015Downloading:   0%|          | 937k/416M [00:00<00:45, 9.59MB/s]#015Downloading:   2%|▏         | 6.27M/416M [00:00<00:11, 37.0MB/s]#015Downloading:   2%|▏         | 10.2M/416M [00:00<00:10, 39.1MB/s]#015Downloading:   3%|▎         | 14.2M/416M [00:00<00:10, 40.2MB/s]#015Downloading:   4%|▍         | 18.1M/416M [00:00<00:11, 35.5MB/s]#015Downloading:   5%|▌         | 21.5M/416M [00:00<00:12, 32.6MB/s]#015Downloading:   6%|▌         | 25.5M/416M [00:00<00:11, 35.1MB/s]#015Downloading:   7%|▋         | 29.9M/416M [00:00<00:10, 38.5MB/s]#015Downloading:   8%|▊         | 33.7M/416M [00:00<00:10, 36.5MB/s]#015Downloading:   9%|▉         | 37.2M/416M [00:01<00:11, 34.4MB/s]#015Downloading:  10%|▉         | 41.2M/416M [00:01<00:10, 36.2MB/s]#015Downloading:  11%|█         | 45.6M/416M [00:01<00:09, 39.2MB/s]#015Downloading:  12%|█▏        | 49.4M/416M [00:01<00:10, 37.0MB/s]#015Downloading:  13%|█▎        | 53.3M/416M [00:01<0

[34m#015Running tokenizer on dataset:   0%|          | 0/393 [00:00<?, ?ba/s]#015Running tokenizer on dataset:   0%|          | 1/393 [00:00<02:33,  2.56ba/s]#015Running tokenizer on dataset:   1%|          | 2/393 [00:00<01:40,  3.88ba/s]#015Running tokenizer on dataset:   1%|          | 3/393 [00:00<01:35,  4.08ba/s]#015Running tokenizer on dataset:   1%|          | 4/393 [00:00<01:22,  4.73ba/s]#015Running tokenizer on dataset:   1%|▏         | 5/393 [00:01<01:15,  5.16ba/s]#015Running tokenizer on dataset:   2%|▏         | 6/393 [00:01<01:11,  5.44ba/s]#015Running tokenizer on dataset:   2%|▏         | 7/393 [00:01<01:08,  5.64ba/s]#015Running tokenizer on dataset:   2%|▏         | 8/393 [00:01<01:06,  5.75ba/s]#015Running tokenizer on dataset:   2%|▏         | 9/393 [00:01<01:05,  5.88ba/s]#015Running tokenizer on dataset:   3%|▎         | 10/393 [00:01<01:04,  5.97ba/s]#015Running tokenizer on dataset:   3%|▎         | 11/393 [00:02<01:03,  5.99ba/s]#015Running tokenizer on data

[34m393 [00:32<00:33,  6.06ba/s]#015Running tokenizer on dataset:  49%|████▉     | 193/393 [00:32<00:32,  6.06ba/s]#015Running tokenizer on dataset:  49%|████▉     | 194/393 [00:32<00:32,  6.08ba/s]#015Running tokenizer on dataset:  50%|████▉     | 195/393 [00:32<00:32,  6.05ba/s]#015Running tokenizer on dataset:  50%|████▉     | 196/393 [00:33<00:34,  5.68ba/s]#015Running tokenizer on dataset:  50%|█████     | 197/393 [00:33<00:33,  5.84ba/s]#015Running tokenizer on dataset:  50%|█████     | 198/393 [00:33<00:32,  5.97ba/s]#015Running tokenizer on dataset:  51%|█████     | 199/393 [00:33<00:32,  6.04ba/s]#015Running tokenizer on dataset:  51%|█████     | 200/393 [00:33<00:31,  6.08ba/s]#015Running tokenizer on dataset:  51%|█████     | 201/393 [00:33<00:31,  6.11ba/s]#015Running tokenizer on dataset:  51%|█████▏    | 202/393 [00:34<00:31,  6.11ba/s]#015Running tokenizer on dataset:  52%|█████▏    | 203/393 [00:34<00:31,  6.10ba/s]#015Running tokenizer on dataset:  52%|█████▏    | 204

[34m#015Running tokenizer on dataset:   0%|          | 0/10 [00:00<?, ?ba/s]#015Running tokenizer on dataset:  10%|█         | 1/10 [00:00<00:01,  4.54ba/s]#015Running tokenizer on dataset:  20%|██        | 2/10 [00:00<00:01,  5.30ba/s]#015Running tokenizer on dataset:  30%|███       | 3/10 [00:00<00:01,  5.59ba/s]#015Running tokenizer on dataset:  40%|████      | 4/10 [00:00<00:01,  5.84ba/s]#015Running tokenizer on dataset:  50%|█████     | 5/10 [00:00<00:00,  6.03ba/s]#015Running tokenizer on dataset:  60%|██████    | 6/10 [00:01<00:00,  6.09ba/s]#015Running tokenizer on dataset:  70%|███████   | 7/10 [00:01<00:00,  6.14ba/s]#015Running tokenizer on dataset:  80%|████████  | 8/10 [00:01<00:00,  5.58ba/s]#015Running tokenizer on dataset:  90%|█████████ | 9/10 [00:01<00:00,  5.74ba/s]#015Running tokenizer on dataset: 100%|██████████| 10/10 [00:01<00:00,  6.16ba/s]#015Running tokenizer on dataset: 100%|██████████| 10/10 [00:01<00:00,  5.87ba/s][0m
[34m02/12/2022 19:03:17 - INFO - da

[34m[INFO|trainer.py:540] 2022-02-12 19:06:41,189 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-12 19:06:41,191 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-12 19:06:41,191 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-12 19:06:41,191 >>   Batch size = 8[0m
[34m02/12/2022 19:08:25 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.7826343178749084, 'eval_accuracy': 0.6681609780947529, 'eval_runtime': 104.5461, 'eval_samples_per_second': 93.882, 'eval_steps_per_second': 11.736, 'epoch': 0.02}[0m
[34m[INFO|trainer.py:1995] 2022-02-12 19:08:25,737 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-200[0m
[34m[INFO|configuration_utils.py:417] 2022-02-12 19:08:25,738 >> Con

[34m02/12/2022 19:33:41 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.58695387840271, 'eval_accuracy': 0.7615894039735099, 'eval_runtime': 105.4892, 'eval_samples_per_second': 93.043, 'eval_steps_per_second': 11.632, 'epoch': 0.1}[0m
[34m[INFO|trainer.py:1995] 2022-02-12 19:33:41,999 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-1200[0m
[34m[INFO|configuration_utils.py:417] 2022-02-12 19:33:42,000 >> Configuration saved in /opt/ml/processing/output/checkpoint-1200/config.json[0m
[34m[INFO|modeling_utils.py:1058] 2022-02-12 19:33:42,693 >> Model weights saved in /opt/ml/processing/output/checkpoint-1200/pytorch_model.bin[0m
[34m[INFO|tokenization_utils_base.py:2034] 2022-02-12 19:33:42,694 >> tokenizer config file saved in /opt/ml/processing/output/checkpoint-1200/tokenizer_config.json[0m
[34m[INFO|tokenization_utils_base.py:2040] 2022-02-12 19:33:42,694 >> Special toke