### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3
from uuid import uuid4

# Session handle plus the AWS account / region the notebook is running against.
sagemaker_session = sagemaker.Session()
account_id = boto3.client("sts").get_caller_identity().get("Account")
region = boto3.session.Session().region_name

# IAM roles are addressed by fixed role names inside the caller's account.
# Alternative: role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
step_func_role = f"arn:aws:iam::{account_id}:role/AmazonSageMaker-StepFunctionsWorkflowExecutionRole"

# NOTE(review): max_runs is not referenced by any later visible cell — confirm it is still needed.
max_runs = 1



#### 2. Set up image and instance type

In [2]:
# Custom image (currently disabled):
# pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201910130520"

# Instance type the training job is launched on.
instance_type = "ml.p3.2xlarge"

# Lookup table keyed by instance type; the values look like GPU counts per instance.
# NOTE(review): not referenced by any later visible cell.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [3]:
# docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train, test and validation datasets

In [4]:
# S3 bucket from which the pretrained-model, output, code and checkpoint paths in the next cell are built.
bucket = "aegovan-data"

In [5]:
# All S3 locations used by the job, derived from `bucket` so that changing the
# bucket in one place relocates inputs and outputs together.
pretrained_bert = "s3://{}/embeddings/bert/".format(bucket)

# Alternative (ChemProt) dataset locations, currently disabled:
# abstract_train_prefix= "s3://aegovan-data/chemprot_adversarial/"
# abstract_testfile= "s3://{}/chemprot_adversarial/chemprot_abstract_test.json".format(bucket)
# abstract_valfile="s3://{}/chemprot_adversarial/chemprot_abstract_val.json".format(bucket)

# abstract_train_prefix="s3://aegovan-data/chemprot_adversarial/202212041518/affable"
# abstract_testfile= "s3://{}/chemprot_adversarial/202212041518/chemprot_abstract_test.json".format(bucket)
# abstract_valfile="s3://{}/chemprot_adversarial/202212041518/chemprot_abstract_val.json".format(bucket)

# SST-2 dataset prefix. Previously hard-coded to one bucket name; it now follows
# `bucket` like every other active path in this cell.
sst2_dataset = "s3://{}/glue_full_set/SST-2/".format(bucket)

s3_output_path = "s3://{}/hiddencut_sagemakerresults/".format(bucket)
s3_code_path = "s3://{}/hiddencut_code".format(bucket)
# A fresh UUID on every execution of this cell keeps checkpoint prefixes of different runs apart.
s3_checkpoint = "s3://{}/hiddencut_bert_checkpoint/{}".format(bucket, str(uuid4()))

### Start training

In [6]:
# Checkpoint directory path inside the training container.
# NOTE(review): not referenced by any later visible cell — confirm whether it
# should be passed to the estimator (e.g. as checkpoint_local_path).
sm_localcheckpoint_dir="/opt/ml/checkpoints/"


In [7]:
# Single input channel named "all"; the "data_dir" hyperparameter points at
# /opt/ml/input/data/all, i.e. the same channel name.
inputs = {"all": sst2_dataset}

In [9]:
# Hyperparameters handed to the training entry point.
# Value types (str vs. int) are kept exactly as originally specified.
hyperparameters = {
    # model / data
    "model_name_or_path": "roberta-base",
    "data_dir": "/opt/ml/input/data/all",
    "task_name": "SST-2",
    # run modes
    "do_train": "1",
    "do_eval": "1",
    "evaluate_during_training": 1,
    # aug_* options
    "do_aug": 1,
    "aug_type": "attn_span_cutoff",
    "aug_cutoff_ratio": "0.1",
    "aug_ce_loss": "1.0",
    "aug_js_loss": "1.0",
    # optimisation / logging
    "learning_rate": "7e-6",
    "num_train_epochs": "1",
    "logging_steps": "30",
    "save_steps": "30",
    "per_gpu_train_batch_size": "16",
    "output_dir": "/opt/ml/model",
    "early_stop": 100,
    "seed": 42,
}

In [10]:
# Metric definitions passed to the estimator: each entry pairs a metric name
# with a regex whose single capture group is the numeric value on a
# "###score: <key>### <number>" line.
# The regexes are raw strings: "\d" inside a normal string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on newer Python versions).
# The resulting string values are unchanged.
metric_definitions = [
    # losses
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    # macro AUC
    {"Name": "TrainAucScore",
     "Regex": r"###score: train_ResultScorerAucMacro_score### (\d*[.]?\d*)"},
    {"Name": "ValidationAucScore",
     "Regex": r"###score: val_ResultScorerAucMacro_score### (\d*[.]?\d*)"},
    # binary F1
    {"Name": "TrainF1BinaryScore",
     "Regex": r"###score: train_ResultScorerF1Binary_score### (\d*[.]?\d*)"},
    {"Name": "ValidationF1BinaryScore",
     "Regex": r"###score: val_ResultScorerF1Binary_score### (\d*[.]?\d*)"},
]

In [11]:
# Flip to True to request spot capacity.
use_spot = False

# Stop conditions, in seconds.
train_max_run_secs = 5 * 24 * 60 * 60  # 5 days
spot_wait_sec = 5 * 60                 # 5 minutes

# Run time plus spot wait when spot is requested; otherwise no wait limit (None).
max_wait_time_secs = (train_max_run_secs + spot_wait_sec) if use_spot else None

# Local mode: never spot, zero wait limit, and block until the job completes.
if instance_type == 'local':
    use_spot, max_wait_time_secs, wait = False, 0, True
    # Use smaller dataset to run locally
    # TODO:
    #  inputs = inputs_sample

In [12]:
import datetime

# Minute-resolution timestamp string of the form YYYYmmddHHMM, taken when this cell runs.
# NOTE(review): despite its name this holds a formatted value, not a format string,
# and it is not referenced by any later visible cell.
date_fmt = f"{datetime.datetime.today():%Y%m%d%H%M}"

In [13]:
# Echo the hyperparameter dict as the cell output for a visual check before launching the job.
hyperparameters

{'model_name_or_path': 'roberta-base',
 'data_dir': '/opt/ml/input/data/all',
 'task_name': 'SST-2',
 'do_train': '1',
 'do_eval': '1',
 'evaluate_during_training': 1,
 'do_aug': 1,
 'aug_type': 'attn_span_cutoff',
 'aug_cutoff_ratio': '0.1',
 'aug_ce_loss': '1.0',
 'aug_js_loss': '1.0',
 'learning_rate': '7e-6',
 'num_train_epochs': '1',
 'logging_steps': '30',
 'save_steps': '30',
 'per_gpu_train_batch_size': '16',
 'output_dir': '/opt/ml/model',
 'early_stop': 100,
 'seed': 42}

In [17]:
from sagemaker.pytorch import PyTorch
from sagemaker.inputs import TrainingInput

import random


# Training job definition: runs ../src/run_glue.py (with ../src/transformers
# shipped as a dependency) on a single `instance_type` instance.
estimator = PyTorch(
    entry_point='run_glue.py',
    source_dir='../src',
    dependencies=['../src/transformers'],
    # git_config=git_config,
    # image_name=docker_repo,
    role=role,
    framework_version="1.4.0",
    py_version='py3',
    instance_count=1,
    instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    volume_size=30,
    code_location=s3_code_path,
    debugger_hook_config=False,
    base_job_name="hiddencut-sst2",
    # Forward the spot switch. Previously `use_spot` only influenced `max_wait`,
    # so enabling it sent a wait limit without actually requesting spot capacity.
    use_spot_instances=use_spot,
    max_run=train_max_run_secs,
    max_wait=max_wait_time_secs,
)
# NOTE(review): s3_checkpoint and sm_localcheckpoint_dir are defined in earlier
# cells but are not passed here (checkpoint_s3_uri / checkpoint_local_path) —
# confirm whether checkpoint syncing is intended, especially for spot runs.

# Submit the job and return immediately instead of streaming logs until it finishes.
estimator.fit(inputs, wait=False)

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: hiddencut-sst2-2023-09-09-02-32-11-132
