### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3
from uuid import uuid4

# SageMaker session, plus the AWS account id (via STS) and the region of the
# default boto3 session.
sagemaker_session = sagemaker.Session()
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# IAM role ARNs are built from fixed role names inside the caller's account.
# Alternative for the execution role: role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
step_func_role = f"arn:aws:iam::{account_id}:role/AmazonSageMaker-StepFunctionsWorkflowExecutionRole"
max_runs = 1



#### 2. Set up image and instance type

In [2]:
# pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201910130520"
# Instance type used for the training job.
instance_type = "ml.p3.2xlarge"

# GPU count for each instance type this notebook knows about.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [3]:
# docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train, test, and validation datasets

In [4]:
# S3 bucket name interpolated into the S3 URIs built in the following cells.
bucket = "aegovan-data"

In [5]:
# All S3 locations are derived from `bucket`, so pointing the notebook at a
# different bucket only requires changing that one variable.
pretrained_bert = "s3://{}/embeddings/bert/".format(bucket)

# abstract_train_prefix= "s3://aegovan-data/chemprot_adversarial/"
# abstract_testfile= "s3://{}/chemprot_adversarial/chemprot_abstract_test.json".format(bucket)
# abstract_valfile="s3://{}/chemprot_adversarial/chemprot_abstract_val.json".format(bucket)

# abstract_train_prefix="s3://aegovan-data/chemprot_adversarial/202212041518/affable"
# abstract_testfile= "s3://{}/chemprot_adversarial/202212041518/chemprot_abstract_test.json".format(bucket)
# abstract_valfile="s3://{}/chemprot_adversarial/202212041518/chemprot_abstract_val.json".format(bucket)

# Dataset prefixes, one per supported task.
sst2_dataset = "s3://{}/glue_full_set/SST-2/".format(bucket)
mnli_dataset = "s3://{}/glue_full_set/mnli/".format(bucket)
imdb_5h_dataset = "s3://{}/glue_full_set/imdb/imdb-5h/202306041250/00_00_00_01/".format(bucket)

# Destinations for training output and uploaded source code.
s3_output_path = "s3://{}/hiddencut_sagemakerresults/".format(bucket)
s3_code_path = "s3://{}/hiddencut_code".format(bucket)
# A fresh UUID suffix is generated every time this cell runs, so each run
# gets its own checkpoint prefix.
s3_checkpoint = "s3://{}/hiddencut_bert_checkpoint/{}".format(bucket, str(uuid4()))

### Start training

In [6]:
# Checkpoint directory path.
# NOTE(review): presumably the in-container checkpoint location; it is not
# referenced by the visible cells — confirm where it is used.
sm_localcheckpoint_dir="/opt/ml/checkpoints/"


In [7]:
# Task to train on. Used as the key into `inputs`, as the `task_name`
# hyperparameter, and as part of the training job's base name.
dataset = "imdb-json"

In [8]:
# Input channels per task: every task exposes a single "all" channel that
# points at the task's S3 dataset prefix.
inputs = {
    task: {"all": s3_uri}
    for task, s3_uri in (
        ("SST-2", sst2_dataset),
        ("mnli", mnli_dataset),
        ("imdb-json", imdb_5h_dataset),
    )
}

In [9]:
# Hyperparameters handed to the training entry point.
#
# `full_hyperparameters` is the full-length configuration. The short
# configuration is derived from it by overriding only the values that differ,
# so the two can no longer drift apart through copy-paste.
full_hyperparameters = {
    "model_name_or_path": "roberta-base",
    "data_dir": "/opt/ml/input/data/all",
    "task_name": dataset,
    "do_train": 1,
    "do_eval": 1,
    "evaluate_during_training": 1,
    "do_aug": 1,
    "aug_type": 'attn_span_cutoff',
    "aug_cutoff_ratio": "0.1",
    "aug_ce_loss": "1.0",
    "aug_js_loss": "1.0",
    "learning_rate": "7e-6",
    "num_train_epochs": "10",
    "logging_steps": "500",
    "save_steps": "500",
    "per_gpu_train_batch_size": "16",
    "output_dir": "/opt/ml/model",
    "output_data_dir": "/opt/ml/output/data",
    "early_stop": 100,
    "seed": 42,
}

# Short run: a single epoch with logging and checkpointing every 50 steps.
# Overriding existing keys keeps the original key order.
temp_hyperparameters = {
    **full_hyperparameters,
    "num_train_epochs": "1",
    "logging_steps": "50",
    "save_steps": "50",
}

# The short configuration is the one in effect; bind `full_hyperparameters`
# here instead for a full-length run.
hyperparameters = temp_hyperparameters

In [10]:
# Metric definitions passed to the estimator: each entry names a metric and
# gives the regex whose first capture group holds its value in the log.
# The patterns are raw strings so that `\d` reaches the regex engine verbatim
# instead of being an invalid string escape (a warning on recent Pythons).
metric_definitions = [
    # Loss
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    # Macro AUC
    {"Name": "TrainAucScore",
     "Regex": r"###score: train_ResultScorerAucMacro_score### (\d*[.]?\d*)"},
    {"Name": "ValidationAucScore",
     "Regex": r"###score: val_ResultScorerAucMacro_score### (\d*[.]?\d*)"},
    # Binary F1
    {"Name": "TrainF1BinaryScore",
     "Regex": r"###score: train_ResultScorerF1Binary_score### (\d*[.]?\d*)"},
    {"Name": "ValidationF1BinaryScore",
     "Regex": r"###score: val_ResultScorerF1Binary_score### (\d*[.]?\d*)"},
]

In [11]:
# Spot-training switches. Set `use_spot` to True to request spot instances.
use_spot = False
train_max_run_secs = 5 * 24 * 60 * 60
spot_wait_sec = 5 * 60

# A wait limit (run time plus spot wait) is only computed for spot runs;
# otherwise it is left as None.
max_wait_time_secs = (train_max_run_secs + spot_wait_sec) if use_spot else None

# Local mode never uses spot and gets a zero wait limit.
if instance_type == 'local':
    use_spot = False
    max_wait_time_secs = 0
    wait = True
    # TODO: use a smaller dataset when running locally
    #  inputs = inputs_sample

In [12]:
import datetime
# Timestamp of the moment this cell runs, formatted as YYYYMMDDHHMM.
date_fmt = f"{datetime.datetime.today():%Y%m%d%H%M}"

In [13]:
# Display the hyperparameters currently in effect before launching the job.
hyperparameters

{'model_name_or_path': 'roberta-base',
 'data_dir': '/opt/ml/input/data/all',
 'task_name': 'imdb-json',
 'do_train': 1,
 'do_eval': 1,
 'evaluate_during_training': 1,
 'do_aug': 1,
 'aug_type': 'attn_span_cutoff',
 'aug_cutoff_ratio': '0.1',
 'aug_ce_loss': '1.0',
 'aug_js_loss': '1.0',
 'learning_rate': '7e-6',
 'num_train_epochs': '1',
 'logging_steps': '50',
 'save_steps': '50',
 'per_gpu_train_batch_size': '16',
 'output_dir': '/opt/ml/model',
 'output_data_dir': '/opt/ml/output/data',
 'early_stop': 100,
 'seed': 42}

In [14]:
from sagemaker.pytorch import PyTorch
from sagemaker.inputs import TrainingInput

import random


# Configure the training job: run_glue.py from ../src (with the bundled
# transformers package as a dependency) on a single instance of
# `instance_type`.
estimator = PyTorch(
    entry_point='run_glue.py',
    source_dir='../src',
    dependencies=['../src/transformers'],
    # git_config=git_config,
    # image_name=docker_repo,
    role=role,
    framework_version="1.4.0",
    py_version='py3',
    instance_count=1,
    instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    volume_size=30,
    code_location=s3_code_path,
    debugger_hook_config=False,
    base_job_name=f"hiddencut-{dataset}",
    max_run=train_max_run_secs,
    # Pass the spot switch through explicitly: `max_wait` is only set for
    # spot runs, so the two must reach the estimator together. With
    # `use_spot = False` this equals the SDK default.
    use_spot_instances=use_spot,
    max_wait=max_wait_time_secs,
)

# Launch the job on the selected task's input channels and block until it
# finishes.
estimator.fit(inputs[dataset], wait=True)

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: hiddencut-imdb-json-2023-09-10-01-35-49-599


2023-09-10 01:35:51 Starting - Starting the training job...
2023-09-10 01:36:06 Starting - Preparing the instances for training.........
2023-09-10 01:37:48 Downloading - Downloading input data
2023-09-10 01:37:48 Training - Downloading the training image.........
2023-09-10 01:39:19 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-09-10 01:39:46,539 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-09-10 01:39:46,571 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-09-10 01:39:46,573 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-09-10 01:39:46,869 sagemaker-containers INFO     Module default_user_module_name does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2023-0

[34m2023-09-10 01:40:01,132 - transformers.training_args - INFO - PyTorch: setting up devices[0m
[34m2023-09-10 01:40:01,152 - root - INFO - Training/evaluation parameters TrainingArguments(output_dir='/opt/ml/model', output_data_dir='/opt/ml/output/data', overwrite_output_dir=False, do_train=True, do_eval=True, do_eval_all=False, do_predict=False, evaluate_during_training=True, do_debug=False, per_gpu_train_batch_size=16, per_gpu_eval_batch_size=64, gradient_accumulation_steps=1, learning_rate=7e-06, weight_decay=0.1, adam_epsilon=1e-06, adam_betas='0.9,0.98', max_grad_norm=0.0, num_train_epochs=1.0, max_steps=-1, early_stop=100, warmup_steps=0, warmup_ratio=0.06, do_aug=True, aug_type='attn_span_cutoff', aug_ce_loss=1.0, aug_js_loss=1.0, aug_cutoff_ratio=0.1, logging_dir=None, logging_first_step=False, logging_steps=50, save_steps=50, save_total_limit=1, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False)[0m
[34m20

[34m2023-09-10 01:40:10,575 - transformers.file_utils - INFO - storing https://cdn.huggingface.co/roberta-base-pytorch_model.bin in cache at /root/.cache/torch/transformers/80b4a484eddeb259bec2f06a6f2f05d90934111628e0e1c09a33bd4a121358e1.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e[0m
[34m2023-09-10 01:40:10,575 - transformers.file_utils - INFO - creating metadata file for /root/.cache/torch/transformers/80b4a484eddeb259bec2f06a6f2f05d90934111628e0e1c09a33bd4a121358e1.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e[0m
[34m2023-09-10 01:40:10,575 - filelock - INFO - Lock 139840398774792 released on /root/.cache/torch/transformers/80b4a484eddeb259bec2f06a6f2f05d90934111628e0e1c09a33bd4a121358e1.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e.lock[0m
[34m2023-09-10 01:40:10,576 - transformers.modeling_utils - INFO - loading weights file https://cdn.huggingface.co/roberta-base-pytorch_model.bin from cache at /root/.cache/torch/tr

[34m2023-09-10 01:40:22,195 - root - INFO - You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.[0m
[34m2023-09-10 01:40:22,198 - root - INFO - ***** Running training *****[0m
[34m2023-09-10 01:40:22,198 - root - INFO -   Num examples = 500[0m
[34m2023-09-10 01:40:22,198 - root - INFO -   Num Epochs = 1[0m
[34m2023-09-10 01:40:22,198 - root - INFO -   Instantaneous batch size per device = 16[0m
[34m2023-09-10 01:40:22,199 - root - INFO -   Total train batch size (w. parallel, distributed & accumulation) = 16[0m
[34m2023-09-10 01:40:22,199 - root - INFO -   Gradient Accumulation steps = 1[0m
[34m2023-09-10 01:40:22,199 - root - INFO -   Total optimization steps = 32[0m
[34m2023-09-10 01:40:38,730 - root - INFO - ***** Running Evaluation *****[0m
[34m2023-09-10 01:40:38,731 - root - INFO -   Num examples = 75[0m
[34m2023-09-10 01:40:38,731 - root - INFO -   Batch s