In [1]:
import sagemaker
import boto3
from uuid import uuid4
import os
import shutil

# SageMaker session plus the AWS account / region the notebook is running against.
sagemaker_session = sagemaker.Session()
region = boto3.session.Session().region_name
account_id = boto3.client('sts').get_caller_identity().get('Account')

# The execution role is pinned to a specific named role in the caller's account.
# Alternative when running inside SageMaker: role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"

# NOTE(review): max_runs is not referenced in the visible cells — confirm it is still needed.
max_runs = 1

In [2]:
# Local scratch directory (relative to the notebook's working directory);
# the transformers checkout path is built underneath it.
temp_dir = "temp"

In [3]:
# Local path the huggingface/transformers repository is cloned into; the
# processing job later uses its examples/pytorch/text-classification folder as source_dir.
transformer_examples_dir = os.path.join(temp_dir, "hugging_face_example")


### 2. Set up image and instance type

In [4]:
# Container image, written as "<repository>:<tag>", and the registry account that hosts it.
# The tag pins transformers 4.12.3 / PyTorch 1.9.1 / py38, which is the same
# transformers version as the example scripts checked out further down.
custom_image_name = "huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04"
image_account_id = "763104351884"

# Compute used for the job.
instance_type = "ml.p3.2xlarge"
instance_count = 1

# Presumably the number of GPUs per instance type — not referenced in the visible cells.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [5]:
# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
docker_repo = f"{image_account_id}.dkr.ecr.{region}.amazonaws.com/{custom_image_name}"

### 3. Configure train, test, and validation datasets



In [6]:
# S3 bucket from which the dataset, code, output and checkpoint URIs below are built.
bucket = "aegovan-data"

In [7]:
# Pretrained BERT artifacts.
pretrained_bert = f"s3://{bucket}/embeddings/bert_base_cased/"

# Training data: the full MultiNLI train split.
# Smaller alternative: s3://{bucket}/mnli_dataset_mini/train/multinli.jsonl
trainfile = f"s3://{bucket}/glue_dataset/train/multinli_1.0_train.jsonl"

# Validation data.
# NOTE(review): this points at a ".../train/" key of the mini dataset — presumably a
# quick-run stand-in for s3://{bucket}/mnli_dataset/val/multinli_1.0_dev_matched.jsonl; confirm.
valfile = f"s3://{bucket}/glue_dataset_mini/train/multinli.jsonl"

# Where job results and the packaged source code are written.
s3_output_path = f"s3://{bucket}/glue_sagemakerresults/"
s3_code_path = f"s3://{bucket}/glue_code"

# A fresh checkpoint prefix is generated every time this cell runs.
# NOTE(review): prefix is "mnli_..." while the other paths use "glue_..." — confirm intended.
s3_checkpoint = f"s3://{bucket}/mnli_bert_checkpoint/{uuid4()}"

## Run training as a processing job

### Get the training script

In [8]:
# Start from a clean, empty checkout directory regardless of prior state.
# Previously the directory was only (re)created when it already existed, so the
# cell left a different state behind on a fresh run than on a re-run.
if os.path.exists(transformer_examples_dir):
    shutil.rmtree(transformer_examples_dir)
# exist_ok keeps the cell idempotent; git can clone into an existing empty directory.
os.makedirs(transformer_examples_dir, exist_ok=True)

In [9]:
!git clone https://github.com/huggingface/transformers $transformer_examples_dir
!git -C $transformer_examples_dir checkout tags/v4.12.3

Cloning into 'temp/hugging_face_example'...
remote: Enumerating objects: 99654, done.
remote: Counting objects: 100% (23/23), done.
remote: Compressing objects: 100% (15/15), done.
remote: Total 99654 (delta 7), reused 17 (delta 5), pack-reused 99631
Receiving objects: 100% (99654/99654), 84.61 MiB | 2.65 MiB/s, done.
Resolving deltas: 100% (72298/72298), done.
Note: switching to 'tags/v4.12.3'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 3ea15d278 Style


In [10]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor that runs a Python entry point inside the container image given by
# `docker_repo`, with the source code staged under `s3_code_path`.
script_processor = FrameworkProcessor(
    HuggingFace,
    # --- container / runtime ---
    image_uri=docker_repo,
    framework_version=None,
    # NOTE(review): "py36" does not match the py38 image tag; presumably ignored
    # because image_uri is given explicitly — confirm before changing.
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    # --- code staging / naming ---
    code_location=s3_code_path,
    base_job_name="glue-processing",
    # --- permissions / networking ---
    role=role,
    network_config=NetworkConfig(enable_network_isolation=False),
    # --- compute ---
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size_in_gb=250,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days
)



In [None]:

# Paths inside the processing container.
# NOTE(review): the three input paths are only referenced by the commented-out
# ProcessingInput entries below.
sm_local_input_models = "/opt/ml/processing/input/data/models"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"
sm_local_output = "/opt/ml/processing/output"

# Reference command line this job is modelled on:
#
# python run_glue.py \
#   --model_name_or_path bert-base-cased \
#   --task_name $TASK_NAME \
#   --do_train \
#   --do_eval \
#   --max_seq_length 128 \
#   --per_device_train_batch_size 32 \
#   --learning_rate 2e-5 \
#   --num_train_epochs 3 \
#   --output_dir /tmp/$TASK_NAME/

# Options passed to run_glue.py, in the order they appear on the command line.
# With per-device batch size 8 and 4 accumulation steps, 32 examples contribute
# to each optimizer step per device.
glue_script_options = {
    "--task_name": "mnli",
    "--model_name_or_path": "bert-base-cased",
    "--do_train": "1",
    "--do_eval": "1",
    "--do_predict": "1",
    "--max_seq_length": 512,
    "--per_device_train_batch_size": 8,
    "--gradient_accumulation_steps": 4,
    "--learning_rate": 2e-5,
    "--num_train_epochs": 3,
    "--output_dir": sm_local_output,
}

# Flatten to ["--flag", "value", ...]; every value is passed as a string.
glue_cli_args = [
    token
    for flag, value in glue_script_options.items()
    for token in (flag, str(value))
]

# Everything written to sm_local_output is uploaded to s3_output_path.
predictions_output = ProcessingOutput(
    source=sm_local_output,
    destination=s3_output_path,
    output_name='predictions',
)

script_processor.run(
    code='run_glue.py',
    source_dir='{}/examples/pytorch/text-classification'.format(transformer_examples_dir),
    arguments=glue_cli_args,
    # No S3 inputs are mounted at the moment; previous wiring kept for reference:
    inputs=[
        # ProcessingInput(
        #     source=s3_input_data,
        #     s3_data_type = s3_data_type,
        #     destination=sm_local_input_data,
        #     s3_data_distribution_type="FullyReplicated"),
        #
        # ProcessingInput(
        #     source=s3_model_path,
        #     destination=sm_local_input_models,
        #     s3_data_distribution_type="FullyReplicated"),
        #
        # ProcessingInput(
        #     source=s3_input_vocab,
        #     destination=sm_local_input_vocab,
        #     s3_data_distribution_type="FullyReplicated")
    ],
    outputs=[predictions_output],
)