### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3
from datetime import datetime

# SageMaker session plus the AWS account id and region of the current credentials.
sagemaker_session = sagemaker.Session()

sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity().get('Account')

boto_session = boto3.session.Session()
region = boto_session.region_name

# The execution role ARN is built from a fixed role name inside the current
# account; the commented-out line is the SDK lookup alternative.
# role = sagemaker.get_execution_role()
role_arn_template = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
role = role_arn_template.format(account_id)

# NOTE(review): max_runs is not referenced anywhere else in the visible notebook.
max_runs = 1

#### 2. Set up image and instance type

In [2]:
# pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201910130520"
# Instance type passed to the estimator. The special value 'local' switches the
# spot-configuration cell further down into its local-mode branch.
instance_type = "ml.p3.2xlarge" 

In [3]:
# docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train/test and validation datasets

In [4]:
# S3 bucket from which every dataset, embedding, output, code and checkpoint
# URI in the next cell is built.
bucket = "aegovan-data"

In [5]:
def _bucket_uri(template):
    """Fill the single ``{}`` placeholder of ``template`` with the configured bucket name."""
    return template.format(bucket)


# K-fold dataset roots.
train_unique = _bucket_uri("s3://{}/aimed_dataset/kfold_unique")
train_random = _bucket_uri("s3://{}/aimed_dataset/kfold_random")

# Pretrained BERT artefacts.
pretrained_bert = _bucket_uri("s3://{}/embeddings/bert/")

# Fold 0 train/validation splits for single-fold runs.
train_unique_single_fold = _bucket_uri("s3://{}/aimed_dataset/kfold_unique/fold_0/train")
val_unique_single_fold = _bucket_uri("s3://{}/aimed_dataset/kfold_unique/fold_0/validation")

train_random_single_fold = _bucket_uri("s3://{}/aimed_dataset/kfold_random/fold_0/train")
val_random_single_fold = _bucket_uri("s3://{}/aimed_dataset/kfold_random/fold_0/validation")

# Job output, uploaded code and checkpoint locations. The checkpoint prefix is
# suffixed with the current time (month, day, year, hour, minute, second).
s3_output_path = _bucket_uri("s3://{}/sagemakerresults/")
s3_code_path = _bucket_uri("s3://{}/aimed_bert_code")
checkpoint_stamp = datetime.now().strftime("%m%d%Y%H%M%S")
s3_checkpoint = "s3://{}/aimed_bert_checkpoint/{}".format(bucket, checkpoint_stamp)

### Start training

In [6]:
# Git commit of the training code. It is stored in the hyperparameters under
# "commit_id" and, through them, becomes the 'commit' entry of git_config.
commit_id = "6df30be45e08af56a0f10fbfc8a724737f7ca9e1"

In [7]:
# Input channels for the k-fold run on the "unique" split.
unique_inputs = dict(
    train=train_unique,
    PRETRAINED_MODEL=pretrained_bert,
)

In [8]:
# Input channels for the k-fold run on the "random" split.
random_inputs = dict(
    train=train_random,
    PRETRAINED_MODEL=pretrained_bert,
)

In [9]:
# Input channels for a single fold (fold 0) of the "unique" split.
unique_single_fold = dict(
    train=train_unique_single_fold,
    val=val_unique_single_fold,
    PRETRAINED_MODEL=pretrained_bert,
)

In [10]:
# Input channels for a single fold (fold 0) of the "random" split.
random_single_fold = dict(
    train=train_random_single_fold,
    val=val_random_single_fold,
    PRETRAINED_MODEL=pretrained_bert,
)

In [11]:
# Local checkpoint path: passed to the training script as the "checkpointdir"
# hyperparameter and to the estimator as checkpoint_local_path.
sm_localcheckpoint_dir="/opt/ml/checkpoints/"


In [12]:
# Hyperparameters for the k-fold BERT run. The single-fold variants below are
# derived from this dictionary.
BertNetworkFactoryhyperparameters_kfold = {
    "datasetfactory": "datasets.aimed_dataset_factory.AimedDatasetFactory",
    "modelfactory": "models.bert_model_factory.BertModelFactory",
    "tokenisor_lower_case": 0,
    # presumably names the input channel that holds the k-fold data — confirm
    # against the training script
    "kfoldtrainprefix": "train",
    "batch": "8",
    "gradientaccumulationsteps": "8",
    # "protein_name_replacer_random_seed": 42,
    "epochs": "100",
    "log-level": "INFO",
    "learningrate": 1e-05,
    "earlystoppingpatience": 9,
    "checkpointdir": sm_localcheckpoint_dir,
    # Checkpoints once every n epochs
    "checkpointfreq": 2,
    "commit_id": commit_id,
}

In [13]:
# Single-fold runs reuse the k-fold settings minus the k-fold-only key.
# The copy is shallow, so the k-fold dictionary itself keeps the key.
BertNetworkFactoryhyperparameters_single_fold = dict(BertNetworkFactoryhyperparameters_kfold)
BertNetworkFactoryhyperparameters_single_fold.pop("kfoldtrainprefix")

'train'

In [14]:
# Single-fold settings with the "uselosseval" flag switched on.
BertNetworkFactoryhyperparameters_single_fold_loss = {
    **BertNetworkFactoryhyperparameters_single_fold,
    "uselosseval": 1,
}

In [15]:
# Metric definitions handed to the estimator. Each regex has one capture group
# that picks the numeric value following a "###score: <name>###" marker.
# Raw strings are used so that "\d" reaches the regex engine unchanged instead
# of being an invalid string escape sequence.
metric_definitions = [
    # Loss
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},

    # AUC
    {"Name": "TrainAucScore",
     "Regex": r"###score: train_ResultScorerAucBinary_score### (\d*[.]?\d*)"},
    {"Name": "ValidationAucScore",
     "Regex": r"###score: val_ResultScorerAucBinary_score### (\d*[.]?\d*)"},

    # PR
    {"Name": "TrainPRScore",
     "Regex": r"###score: train_ResultScorerPrBinary_score### (\d*[.]?\d*)"},
    {"Name": "ValidationPRScore",
     "Regex": r"###score: val_ResultScorerPrBinary_score### (\d*[.]?\d*)"},

    # F1 (binary)
    {"Name": "TrainF1BinaryScore",
     "Regex": r"###score: train_ResultScorerF1Binary_score### (\d*[.]?\d*)"},
    {"Name": "ValidationF1BinaryScore",
     "Regex": r"###score: val_ResultScorerF1Binary_score### (\d*[.]?\d*)"},
]

In [16]:
# Print the first line (the commit hash line) and the fifth line (the subject,
# for a regular commit) of the latest commit in the local checkout.
# NOTE(review): the training job is pinned to `commit_id` through git_config,
# which may differ from this local HEAD — confirm they match if that matters.
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit 19996f6b4651f32784885ed09fca43c1071696a0
    Update notebooks


In [17]:
# Spot-instance configuration for the training job.
# Set use_spot to True to request spot instances.
use_spot = True
train_max_run_secs = 5 * 24 * 60 * 60  # maximum run time: 5 days, in seconds
spot_wait_sec = 5 * 60                 # extra 5 minutes added on top of the run time
max_wait_time_secs = train_max_run_secs + spot_wait_sec

# Without spot instances no maximum wait time is passed to the estimator.
if not use_spot:
    max_wait_time_secs = None

# Local mode: no spot instances.
if instance_type == 'local':
    use_spot = False
    max_wait_time_secs = 0
    # NOTE(review): `wait` is not read by the visible fit call, which passes
    # wait=False explicitly.
    wait = True
    # No dataset is selected here: `inputs` is chosen from `experiments` in a
    # later cell, and this notebook defines no sample dataset. For a quick
    # local run, point the selected experiment's inputs at a smaller dataset.

In [None]:
# Named experiment presets: each maps a base job name to the hyperparameters
# ("hp") and the input channels ("inputs") used for that run.
experiments = {
    "aimed-bert-unique-1fold-loss": {
        "hp": BertNetworkFactoryhyperparameters_single_fold_loss,
        "inputs": unique_single_fold,
    },
    "aimed-bert-unique-1fold": {
        "hp": BertNetworkFactoryhyperparameters_single_fold,
        "inputs": unique_single_fold,
    },
    "aimed-bert-unique-kfold": {
        "hp": BertNetworkFactoryhyperparameters_kfold,
        "inputs": unique_inputs,
    },
}

In [18]:

# Pick the experiment to run. The key is also used as the training job's base name.
base_name = "aimed-bert-unique-1fold-loss"

selected_experiment = experiments[base_name]
hyperparameters = selected_experiment["hp"]
inputs = selected_experiment["inputs"]

In [19]:
# Display the hyperparameters of the selected experiment.
hyperparameters

{'datasetfactory': 'datasets.aimed_dataset_factory.AimedDatasetFactory',
 'modelfactory': 'models.bert_model_factory.BertModelFactory',
 'tokenisor_lower_case': 0,
 'batch': '8',
 'gradientaccumulationsteps': '8',
 'epochs': '100',
 'log-level': 'INFO',
 'learningrate': 1e-05,
 'earlystoppingpatience': 9,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2,
 'commit_id': '6df30be45e08af56a0f10fbfc8a724737f7ca9e1',
 'uselosseval': 1}

In [20]:
# Source repository for the training code, pinned to the commit recorded in
# the selected hyperparameters.
git_config = dict(
    repo='https://github.com/elangovana/ppi-aimed.git',
    branch='main',
    commit=hyperparameters["commit_id"],
)

In [21]:
# Display the hyperparameters once more before the job is submitted
# (same dictionary as shown above).
hyperparameters

{'datasetfactory': 'datasets.aimed_dataset_factory.AimedDatasetFactory',
 'modelfactory': 'models.bert_model_factory.BertModelFactory',
 'tokenisor_lower_case': 0,
 'batch': '8',
 'gradientaccumulationsteps': '8',
 'epochs': '100',
 'log-level': 'INFO',
 'learningrate': 1e-05,
 'earlystoppingpatience': 9,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2,
 'commit_id': '6df30be45e08af56a0f10fbfc8a724737f7ca9e1',
 'uselosseval': 1}

In [22]:
# Display the input channels of the selected experiment.
inputs

{'train': 's3://aegovan-data/aimed_dataset/kfold_unique/fold_0/train',
 'val': 's3://aegovan-data/aimed_dataset/kfold_unique/fold_0/validation',
 'PRETRAINED_MODEL': 's3://aegovan-data/embeddings/bert/'}

In [23]:


from sagemaker.pytorch import PyTorch

# Collect the estimator arguments first so they can be inspected before the
# job is created.
estimator_args = dict(
    # Training code, fetched from the repository described by git_config.
    entry_point='main_train_pipeline.py',
    source_dir='src',
    dependencies=['src/datasets', 'src/models', 'src/utils', 'src/scorers'],
    git_config=git_config,
    # image_name=docker_repo,
    role=role,
    framework_version="1.4.0",
    py_version='py3',
    # Compute resources.
    instance_count=1,
    instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    volume_size=30,
    code_location=s3_code_path,
    debugger_hook_config=False,
    base_job_name=base_name,
    # Spot settings and run-time limits.
    use_spot_instances=use_spot,
    max_run=train_max_run_secs,
    max_wait=max_wait_time_secs,
    # Checkpoint locations (S3 and local path).
    checkpoint_s3_uri=s3_checkpoint,
    checkpoint_local_path=sm_localcheckpoint_dir,
)

estimator = PyTorch(**estimator_args)

# Start the training job; fit is called with wait=False.
estimator.fit(inputs, wait=False)