### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3
from uuid import uuid4

# SageMaker session plus the AWS account and region of the current credentials.
sagemaker_session = sagemaker.Session()
account_id = (
    boto3.client("sts")
    .get_caller_identity()
    .get("Account")
)
region = boto3.session.Session().region_name

# Both IAM roles are addressed by ARN inside the caller's own account.
# Alternative kept from the original cell:
#   role = sagemaker.get_execution_role()
role = (
    "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
    .format(account_id)
)
step_func_role = (
    "arn:aws:iam::{}:role/AmazonSageMaker-StepFunctionsWorkflowExecutionRole"
    .format(account_id)
)

max_runs = 1

#### 2. Set up image and instance type

In [2]:
# pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201910130520"
# Instance type used for training, and the per-instance-type multiplier that the
# hyperparameter cell uses to scale the batch size.
instance_type = "ml.p3.2xlarge"
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [3]:
# docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train, test, and validation datasets

In [4]:
# S3 bucket from which the embedding, output, code, and checkpoint URIs in the
# next cell are built.
bucket = "aegovan-data"

In [5]:
# Pretrained BERT artifacts; passed to each training job as the PRETRAINED_MODEL input.
pretrained_bert = "s3://{}/embeddings/bert/".format(bucket)

# Earlier dataset snapshots, kept for reference:
# abstract_train_prefix= "s3://aegovan-data/chemprot_adversarial/"
# abstract_testfile= "s3://{}/chemprot_adversarial/chemprot_abstract_test.json".format(bucket)
# abstract_valfile="s3://{}/chemprot_adversarial/chemprot_abstract_val.json".format(bucket)

# abstract_train_prefix="s3://aegovan-data/chemprot_adversarial/202212041518/affable"
# abstract_testfile= "s3://{}/chemprot_adversarial/202212041518/chemprot_abstract_test.json".format(bucket)
# abstract_valfile="s3://{}/chemprot_adversarial/202212041518/chemprot_abstract_val.json".format(bucket)

# Active dataset snapshot. All three URIs are derived from `bucket`; previously the
# bucket name was hard-coded and `.format(bucket)` was a no-op because the strings
# had no `{}` placeholder.
abstract_train_prefix = "s3://{}/chemprot_adversarial/202212132150/affablepositives".format(bucket)
abstract_valfile = "s3://{}/chemprot_adversarial/202212132150/chemprot_abstract_val.json".format(bucket)
abstract_testfile = "s3://{}/chemprot_adversarial/202212132150/chemprot_abstract_test.json".format(bucket)

# Destinations for training output and uploaded source code.
s3_output_path = "s3://{}/chemprot_sagemakerresults/".format(bucket)
s3_code_path = "s3://{}/chemprot_code".format(bucket)

# A fresh checkpoint prefix is generated every time this cell runs.
# NOTE(review): no other cell visible in this notebook reads `s3_checkpoint` - confirm
# whether it should be wired into the estimator.
s3_checkpoint = "s3://{}/chemprot_bert_checkpoint/{}".format(bucket, str(uuid4()))

### Start training

In [6]:
# Commit hash sent to every training job through the `commit_id` hyperparameter.
# NOTE(review): the recorded `git log -1` output further down shows a different
# commit (1923f13a...) - confirm this hash is the intended code version.
commit_id = "a6211b46f5940b9ac48fd3bde9274734ec3605a5"

In [7]:
# Checkpoint directory handed to the training script via the `checkpointdir` hyperparameter.
sm_localcheckpoint_dir="/opt/ml/checkpoints/"


In [8]:
# Hyperparameters handed to the estimator for every training job in this notebook.
BertNetworkFactoryhyperparameters = {
    "datasetfactory": "datasets.chemprot_abstract_adverserial_dataset_factory.ChemprotAbstractAdverserialDatasetFactory",
    "modelfactory": "models.bert_model_factory.BertModelFactory",
    "tokenisor_lower_case": 0,
    "uselosseval": 0,
    # Base batch size of 8 scaled by the value looked up for the chosen instance type.
    # Multiply as integers before converting: `"8" * n` repeats the string
    # (it would yield "8888" for n = 4) instead of scaling the number.
    "batch": str(8 * instance_type_gpu_map[instance_type]),
    "gradientaccumulationsteps": "8",
    # "protein_name_replacer_random_seed":42,
    "epochs": "200",
    "log-level": "INFO",
    "learningrate": .00001,
    "earlystoppingpatience": 50,
    "checkpointdir": sm_localcheckpoint_dir,
    # Checkpoints once every n epochs
    "checkpointfreq": 2,
    "weight_decay": 0.01,
    "commit_id": commit_id,
}

In [9]:
# Metric definitions passed to the estimator. Each regex captures the number that
# follows a "###score: <name>###" marker.
# Raw strings keep `\d` as a regex escape; in a normal string literal it is an
# invalid escape sequence that newer Python versions warn about.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},

    {"Name": "TrainAucScore",
     "Regex": r"###score: train_ResultScorerAucMacro_score### (\d*[.]?\d*)"},
    {"Name": "ValidationAucScore",
     "Regex": r"###score: val_ResultScorerAucMacro_score### (\d*[.]?\d*)"},

    {"Name": "TrainF1BinaryScore",
     "Regex": r"###score: train_ResultScorerF1Binary_score### (\d*[.]?\d*)"},
    {"Name": "ValidationF1BinaryScore",
     "Regex": r"###score: val_ResultScorerF1Binary_score### (\d*[.]?\d*)"},
]

In [10]:
# Show which commit the notebook is being run from: the first line of `git log -1`
# (the "commit <hash>" line) and its fifth line (the commit subject in the recorded output).
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit 1923f13a6d8f939d797742b9659de5ac09eac068
    Update notebook - Add affable positives samples


In [11]:
# Spot capacity is opt-in: flip this to True if you need spot instances.
use_spot = False

# Time budgets in seconds: five days of training, five minutes of spot wait.
train_max_run_secs = 60 * 60 * 24 * 5
spot_wait_sec = 60 * 5

# A wait budget only exists for spot runs; otherwise it is None.
max_wait_time_secs = (train_max_run_secs + spot_wait_sec) if use_spot else None

# During local mode, no spot.
if instance_type == "local":
    use_spot, max_wait_time_secs, wait = False, 0, True
    # TODO: use a smaller dataset to run locally, e.g.
    #  inputs = inputs_sample

In [12]:
def create_per_train_hp(s3_prefix, val, prefix="fake"):
    """Build one experiment definition per training file found under an S3 prefix.

    Args:
        s3_prefix: S3 URI prefix to list. Only objects whose file name (the part
            after the last "/") starts with "train" are used.
        val: Value stored as the "val" input of every experiment.
        prefix: Label inserted into every experiment name.

    Returns:
        dict keyed by "chemprot-{prefix}-{base_name}", where base_name is the file
        name up to its first ".". Each value has an "hp" entry (a copy of
        BertNetworkFactoryhyperparameters) and an "inputs" entry with the "train",
        "val" and "PRETRAINED_MODEL" locations. Keys are inserted in descending
        order of the training file URI.
    """
    # Imported here, as in the original cell, so the function carries its own dependency.
    from sagemaker.s3 import S3Downloader

    train_files = sorted(
        (uri for uri in S3Downloader.list(s3_prefix)
         if uri.split("/")[-1].startswith("train")),
        reverse=True,
    )

    experiments = {}
    for uri in train_files:
        base_name = uri.split("/")[-1].split(".")[0]
        experiments[f"chemprot-{prefix}-{base_name}"] = {
            # Copy per experiment: sharing one dict object would make a tweak to one
            # experiment's hyperparameters silently apply to all of them.
            "hp": dict(BertNetworkFactoryhyperparameters),
            "inputs": {
                "train": f"{uri}",
                "val": val,
                "PRETRAINED_MODEL": pretrained_bert,
            },
        }

    return experiments


# One experiment per training file under the active prefix, all sharing the same
# validation file and labelled "affable".
per_train_experiments_hp = create_per_train_hp(
    s3_prefix=abstract_train_prefix,
    val=abstract_valfile,
    prefix="affable",
)

In [13]:
# Display the generated experiments (name -> hyperparameters and inputs) for inspection.
per_train_experiments_hp

{'chemprot-affable-train_767_210_84': {'hp': {'datasetfactory': 'datasets.chemprot_abstract_adverserial_dataset_factory.ChemprotAbstractAdverserialDatasetFactory',
   'modelfactory': 'models.bert_model_factory.BertModelFactory',
   'tokenisor_lower_case': 0,
   'uselosseval': 0,
   'batch': '8',
   'gradientaccumulationsteps': '8',
   'epochs': '200',
   'log-level': 'INFO',
   'learningrate': 1e-05,
   'earlystoppingpatience': 50,
   'checkpointdir': '/opt/ml/checkpoints/',
   'checkpointfreq': 2,
   'weight_decay': 0.01,
   'commit_id': 'a6211b46f5940b9ac48fd3bde9274734ec3605a5'},
  'inputs': {'train': 's3://aegovan-data/chemprot_adversarial/202212132150/affablepositives/train_767_210_84.json',
   'val': 's3://aegovan-data/chemprot_adversarial/202212132150/chemprot_abstract_val.json',
   'PRETRAINED_MODEL': 's3://aegovan-data/embeddings/bert/'}},
 'chemprot-affable-train_767_210_63': {'hp': {'datasetfactory': 'datasets.chemprot_abstract_adverserial_dataset_factory.ChemprotAbstractAdv

In [14]:
import datetime
# Minute-resolution local timestamp (YYYYMMDDHHMM) appended to job and workflow names.
date_fmt = datetime.datetime.now().strftime("%Y%m%d%H%M")

In [15]:
from sagemaker.pytorch import PyTorch
from sagemaker.inputs import TrainingInput
from stepfunctions.steps import *
from stepfunctions.workflow import Workflow
import random

# Build `variations` training steps for every experiment; each step gets its own
# estimator and a job name of the form <experiment>-<NN>-<timestamp>.
train_steps = []
variations = 5
for experiment_name, experiment in per_train_experiments_hp.items():
    # Experiment names contain underscores; job names here use hyphens instead.
    base_job_name = experiment_name.replace("_", "-")

    for variation in range(variations):
        job_name = f"{base_job_name}-{variation:02d}-{date_fmt}"

        estimator = PyTorch(
            entry_point="main_train_pipeline.py",
            source_dir="../src",
            dependencies=["../src/datasets", "../src/models", "../src/utils", "../src/scorers"],
            role=role,
            framework_version="1.4.0",
            py_version="py3",
            instance_count=1,
            instance_type=instance_type,
            hyperparameters=experiment["hp"],
            output_path=s3_output_path,
            metric_definitions=metric_definitions,
            volume_size=30,
            code_location=s3_code_path,
            debugger_hook_config=False,
            base_job_name=base_job_name,
            # Forward the spot switch explicitly. Without it, setting use_spot=True
            # would send max_wait for a job that never requests spot capacity.
            use_spot_instances=use_spot,
            max_run=train_max_run_secs,
            max_wait=max_wait_time_secs,
        )

        # `TrainingStep` is referenced directly: `sagemaker.TrainingStep` only resolves
        # when the wildcard import from stepfunctions.steps has rebound the name
        # `sagemaker` to that package's submodule.
        step_train = TrainingStep(
            f"Train-{job_name}",
            estimator,
            job_name,
            data=experiment["inputs"],
        )

        train_steps.append(step_train)
    


# Group the training steps into Parallel states of at most `max_parallel` branches.
# Inside a group, each branch first waits 30 seconds longer than the previous one
# (30, 60, 90, ...) before its training step runs.
max_parallel = 5
parallel_steps = []
for group_start in range(0, len(train_steps), max_parallel):
    group = states.Parallel(f"train-p-{group_start}")
    group_steps = train_steps[group_start:group_start + max_parallel]
    for position, training_step in enumerate(group_steps, start=1):
        delay_secs = 30 * position
        wait_state = states.Wait(f"wait-{group_start}-{delay_secs}", seconds=delay_secs)
        group.add_branch(Chain([wait_state, training_step]))
    parallel_steps.append(group)

    
# Chain the parallel groups one after another and wrap the chain in a workflow
# named after the run timestamp.
basic_path = Chain(parallel_steps)

basic_workflow = Workflow(
    name=f"chemprot-affable-training-{date_fmt}",
    definition=basic_path,
    role=step_func_role,
)



In [16]:
# Render the workflow definition as a graph in the cell output.
basic_workflow.render_graph()



In [17]:
# Create the workflow's state machine; the recorded cell output is the returned state machine ARN.
basic_workflow.create()

'arn:aws:states:us-east-2:324346001917:stateMachine:chemprot-affable-training-202212132155'

In [18]:
# Start an execution of the workflow.
basic_workflow.execute()