### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3
from uuid import uuid4

# SageMaker session, plus the region and AWS account the notebook is running against.
sagemaker_session = sagemaker.Session()
region = boto3.session.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# The execution role is pinned to a named role in the caller's account instead of
# being looked up at runtime.
# role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
max_runs = 1

#### 2. Set up image and instance type

In [2]:
# pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201910130520"

# Per-instance-type multiplier used when sizing the "batch" hyperparameter
# (presumably the GPU count of each instance type — confirm).
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

# Instance type the training jobs are submitted with.
instance_type = "ml.p3.2xlarge"

In [3]:
# docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train, test and validation datasets

In [4]:
# S3 bucket under which the dataset, pretrained-model, code and result paths are built.
bucket = "aegovan-data"

In [5]:
# Location of the pretrained BERT artefacts, passed to training as an input channel.
pretrained_bert = "s3://{}/embeddings/bert/".format(bucket)

# ChemProt train / test / validation files.
trainfile = "s3://{}/chemprot/chemprot_train.json".format(bucket)
testfile = "s3://{}/chemprot/chemprot_test.json".format(bucket)
valfile = "s3://{}/chemprot/chemprot_dev.json".format(bucket)

# Abstract-level ChemProt files: the full train set, a minimal train set
# (snapshot 202212161734), and the test / validation splits.
abstract_all_trainfile = "s3://{}/chemprot/chemprot_abstract_train.json".format(bucket)
# Built from `bucket` like every other path, so changing the bucket in one place
# cannot leave this entry pointing at the old location.
abstract_minimal_trainfile = "s3://{}/chemprot_minimal/202212161734/chemprot_abstract_train.json".format(bucket)
abstract_testfile = "s3://{}/chemprot/chemprot_abstract_test.json".format(bucket)
abstract_valfile = "s3://{}/chemprot/chemprot_abstract_val.json".format(bucket)

# Destinations for job output and uploaded code, and a per-session checkpoint
# prefix (a fresh UUID is generated each time this cell runs).
s3_output_path = "s3://{}/chemprot_sagemakerresults/".format(bucket)
s3_code_path = "s3://{}/chemprot_code".format(bucket)
s3_checkpoint = "s3://{}/chemprot_bert_checkpoint/{}".format(bucket, str(uuid4()))

### Start training

In [6]:
# Commit id attached to each job through the "commit_id" hyperparameter.
# NOTE(review): the `git log` output captured later in this notebook shows a
# different commit (d3f5c6c7...) — confirm this value matches the code being submitted.
commit_id = "a6211b46f5940b9ac48fd3bde9274734ec3605a5"

In [7]:
# Input channels for the ChemProt dataset.
train_inputs = dict(
    train=trainfile,
    val=valfile,
    PRETRAINED_MODEL=pretrained_bert,
)

In [8]:
# Input channels for the abstract-level dataset, using the full abstract train file.
train_abstract_inputs = dict(
    train=abstract_all_trainfile,
    val=abstract_valfile,
    PRETRAINED_MODEL=pretrained_bert,
)

In [9]:
# Input channels for the abstract-level dataset, using the minimal abstract train file.
train_abstract_min_inputs = dict(
    train=abstract_minimal_trainfile,
    val=abstract_valfile,
    PRETRAINED_MODEL=pretrained_bert,
)

In [10]:
# Checkpoint directory handed to the training script via the "checkpointdir" hyperparameter.
sm_localcheckpoint_dir="/opt/ml/checkpoints/"


In [11]:
# Baseline hyperparameters passed to the training entry point. The derived
# experiment configurations copy this dict and override individual keys.
BertNetworkFactoryhyperparameters = {
    "datasetfactory": "datasets.chemprot_dataset_factory.ChemprotDatasetFactory",
    "modelfactory": "models.bert_model_factory.BertModelFactory",
    "tokenisor_lower_case": 0,
    "uselosseval": 1,
    # 8 per unit in instance_type_gpu_map. Multiply as integers and then
    # stringify: "8" * n repeats the string (e.g. "8888" for a value of 4)
    # instead of scaling the batch size.
    "batch": str(8 * instance_type_gpu_map[instance_type]),
    "gradientaccumulationsteps": "8",
    # "protein_name_replacer_random_seed":42,
    "epochs": "200",
    "log-level": "INFO",
    "learningrate": .00001,
    "earlystoppingpatience": 50,
    "checkpointdir": sm_localcheckpoint_dir,
    # Checkpoints once every n epochs
    "checkpointfreq": 2,
    "weight_decay": 0.01,
    "commit_id": commit_id,
}

In [12]:
# Baseline hyperparameters with the abstract dataset factory and "uselosseval" switched off.
abstract_BertNetworkFactoryhyperparameters_max_f1 = dict(
    BertNetworkFactoryhyperparameters,
    datasetfactory="datasets.chemprot_abstract_dataset_factory.ChemprotAbstractDatasetFactory",
    uselosseval=0,
)

In [13]:
# Baseline hyperparameters with "uselosseval" switched off.
BertNetworkFactoryhyperparameters_max_f1 = dict(BertNetworkFactoryhyperparameters, uselosseval=0)

In [14]:
# Baseline hyperparameters with "uselosseval" switched off and a smaller weight decay.
BertNetworkFactoryhyperparameters_max_f1_aimed = dict(
    BertNetworkFactoryhyperparameters,
    uselosseval=0,
    weight_decay=0.001,
)

In [15]:
# Metric definitions handed to the estimator: each entry pairs a metric name with
# a regex whose single capture group picks the numeric value out of a
# "###score: <key>### <value>" log line.
# Raw strings are used so that `\d` reaches the regex engine as written instead of
# being treated as an (invalid) Python string escape.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},

    {"Name": "TrainAucScore",
     "Regex": r"###score: train_ResultScorerAucMacro_score### (\d*[.]?\d*)"},
    {"Name": "ValidationAucScore",
     "Regex": r"###score: val_ResultScorerAucMacro_score### (\d*[.]?\d*)"},

    {"Name": "TrainF1MacroScore",
     "Regex": r"###score: train_ResultScorerF1Macro_score### (\d*[.]?\d*)"},
    {"Name": "ValidationF1MacroScore",
     "Regex": r"###score: val_ResultScorerF1Macro_score### (\d*[.]?\d*)"},
]

In [16]:
# Record which commit the local checkout is on: print the first line of
# `git log -1` (the "commit <hash>" line) and then its fifth line.
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit d3f5c6c7525ac9e6e1d7b7571af8734fac1f15e9
    Add minimal train set


In [17]:
# Flip to True to train on spot instances.
use_spot = False

# Cap on training run time: 5 days, expressed in seconds.
train_max_run_secs = 5 * 24 * 60 * 60
# Extra 5 minutes added on top of the run cap to form the maximum wait.
spot_wait_sec = 5 * 60

# A maximum wait is only set for spot training; otherwise it stays unset.
max_wait_time_secs = (train_max_run_secs + spot_wait_sec) if use_spot else None

# Local mode never uses spot instances.
if instance_type == "local":
    use_spot = False
    max_wait_time_secs = 0
    # NOTE(review): `wait` is only bound in this branch and the fit call visible in
    # this notebook passes wait=False explicitly — confirm whether it is still needed.
    wait = True
    # TODO: use a smaller dataset to run locally
    #  inputs = inputs_sample

In [18]:
# Registry of runnable experiments. Each entry names the hyperparameter set ("hp")
# and the input channels ("inputs") to train with.
experiments = {
    "chemprot-bert": dict(
        hp=BertNetworkFactoryhyperparameters,
        inputs=train_inputs,
    ),
    "chemprot-bert-f1": dict(
        hp=BertNetworkFactoryhyperparameters_max_f1,
        inputs=train_inputs,
    ),
    "chemprot-abstract-bert-f1": dict(
        hp=abstract_BertNetworkFactoryhyperparameters_max_f1,
        inputs=train_abstract_inputs,
    ),
    "chemprot-abstract-bert-min-f1": dict(
        hp=abstract_BertNetworkFactoryhyperparameters_max_f1,
        inputs=train_abstract_min_inputs,
    ),
}

In [19]:
# Choose the experiment to run; its name doubles as the base name of the training jobs.
experiment_name = "chemprot-abstract-bert-f1"
base_name = experiment_name

# Pull the hyperparameters and input channels for the chosen experiment.
selected_experiment = experiments[experiment_name]
hyperparameters = selected_experiment["hp"]
inputs = selected_experiment["inputs"]

In [20]:
# Display the hyperparameters selected for this run.
hyperparameters

{'datasetfactory': 'datasets.chemprot_abstract_dataset_factory.ChemprotAbstractDatasetFactory',
 'modelfactory': 'models.bert_model_factory.BertModelFactory',
 'tokenisor_lower_case': 0,
 'uselosseval': 0,
 'batch': '8',
 'gradientaccumulationsteps': '8',
 'epochs': '200',
 'log-level': 'INFO',
 'learningrate': 1e-05,
 'earlystoppingpatience': 50,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2,
 'weight_decay': 0.01,
 'commit_id': 'a6211b46f5940b9ac48fd3bde9274734ec3605a5'}

In [21]:
# Git source description pinned to the commit recorded in the selected hyperparameters.
# NOTE(review): the estimator call visible in this notebook has `git_config` commented
# out and uses the local `source_dir` instead — confirm whether this is still needed.
git_config = dict(
    repo="https://github.com/elangovana/large-scale-ptm-ppi.git",
    branch="main",
    commit=hyperparameters["commit_id"],
)

In [22]:
# Display the selected hyperparameters once more before the jobs are submitted.
hyperparameters

{'datasetfactory': 'datasets.chemprot_abstract_dataset_factory.ChemprotAbstractDatasetFactory',
 'modelfactory': 'models.bert_model_factory.BertModelFactory',
 'tokenisor_lower_case': 0,
 'uselosseval': 0,
 'batch': '8',
 'gradientaccumulationsteps': '8',
 'epochs': '200',
 'log-level': 'INFO',
 'learningrate': 1e-05,
 'earlystoppingpatience': 50,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2,
 'weight_decay': 0.01,
 'commit_id': 'a6211b46f5940b9ac48fd3bde9274734ec3605a5'}

In [23]:
# Display the input channels selected for this run.
inputs

{'train': 's3://aegovan-data/chemprot/chemprot_abstract_train.json',
 'val': 's3://aegovan-data/chemprot/chemprot_abstract_val.json',
 'PRETRAINED_MODEL': 's3://aegovan-data/embeddings/bert/'}

In [24]:
from sagemaker.pytorch import PyTorch

def _new_estimator():
    """Build a PyTorch estimator for the selected experiment from the notebook-level settings."""
    return PyTorch(
        entry_point="main_train_pipeline.py",
        source_dir="../src",
        dependencies=["../src/datasets", "../src/models", "../src/utils", "../src/scorers"],
        # Disabled options: git_config=git_config, image_name=docker_repo
        role=role,
        framework_version="1.4.0",
        py_version="py3",
        instance_count=1,
        instance_type=instance_type,
        hyperparameters=hyperparameters,
        output_path=s3_output_path,
        metric_definitions=metric_definitions,
        volume_size=30,
        code_location=s3_code_path,
        debugger_hook_config=False,
        base_job_name=base_name,
        use_spot_instances=use_spot,
        max_run=train_max_run_secs,
        max_wait=max_wait_time_secs,
        # Disabled options: checkpoint_s3_uri=s3_checkpoint,
        #                   checkpoint_local_path=sm_localcheckpoint_dir
    )


# Submit five training jobs with the same configuration, without blocking on completion.
# NOTE(review): the repeat count is hard-coded here and `max_runs` from the set-up
# cell is not used — confirm which is intended.
for run_index in range(5):
    estimator = _new_estimator()
    estimator.fit(inputs, wait=False)