# Train self-supervised BERT

### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3
from uuid import uuid4

# SageMaker session, plus the AWS account id and region of the active credentials.
sagemaker_session = sagemaker.Session()
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# The execution role ARN is assembled from the account id; the alternative
# sagemaker.get_execution_role() lookup is intentionally not used here.
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
max_runs = 1

#### 2. Set up image and instance type

In [2]:
# pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201910130520"
# Instance type used for training, and a GPU-count lookup per supported instance type.
instance_type = "ml.p3.2xlarge"
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [3]:
# docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train, test and validation datasets

In [4]:
# S3 bucket from which every dataset, embedding, code and output URI in this notebook is built.
bucket = "aegovan-data"

In [5]:
# S3 location passed to training as the PRETRAINED_MODEL channel.
pretrained_bert = f"s3://{bucket}/embeddings/bert/"

# Self-supervised dataset splits.
trainfile = f"s3://{bucket}/self-supervised/train.json"
testfile = f"s3://{bucket}/self-supervised/test.json"
valfile = f"s3://{bucket}/self-supervised/val.json"

# Destinations for job output and uploaded code, plus a checkpoint prefix that is
# unique per notebook run (random UUID suffix).
s3_output_path = f"s3://{bucket}/selfsupervised_results/"
s3_code_path = f"s3://{bucket}/selfsupervised_code"
s3_checkpoint = f"s3://{bucket}/selfsupervised_bert_checkpoint/{str(uuid4())}"

In [6]:
# Splits of the "self-supervised-fake" dataset variant, laid out like the main dataset.
fake_trainfile = f"s3://{bucket}/self-supervised-fake/train.json"
fake_testfile = f"s3://{bucket}/self-supervised-fake/test.json"
fake_valfile = f"s3://{bucket}/self-supervised-fake/val.json"

### Start training

In [7]:
# Commit id stored in the hyperparameters under "commit_id" and read back by git_config.
commit_id = "a6211b46f5940b9ac48fd3bde9274734ec3605a5"

In [8]:
# Input channels for training on the main self-supervised dataset.
train_inputs = dict(
    train=trainfile,
    val=valfile,
    PRETRAINED_MODEL=pretrained_bert,
)

In [9]:
# Input channels for training on the "fake" dataset variant; same pretrained model.
fake_train_inputs = dict(
    train=fake_trainfile,
    val=fake_valfile,
    PRETRAINED_MODEL=pretrained_bert,
)

In [10]:
# Local checkpoint directory: passed to training as the "checkpointdir" hyperparameter
# and to the estimator as checkpoint_local_path.
sm_localcheckpoint_dir="/opt/ml/checkpoints/"


In [11]:
# Hyperparameters handed to the training entry point through the SageMaker estimator.
BertNetworkFactoryhyperparameters = {
    "datasetfactory": "datasets.chemprot_selfsupervised_dataset_factory.ChemprotSelfsupervisedDatasetFactory",
    "modelfactory": "models.bert_model_factory.BertModelFactory",
    "tokenisor_lower_case": 0,
    "uselosseval": 1,
    # Batch size is 8 per GPU. Multiply as integers and only then convert to a
    # string: "8" * 4 would repeat the text and yield "8888" rather than "32".
    # Instance types missing from the map (e.g. 'local') count as a single device
    # instead of raising KeyError.
    "batch": str(8 * instance_type_gpu_map.get(instance_type, 1)),
    "gradientaccumulationsteps": "8",
    # "protein_name_replacer_random_seed":42,
    "epochs": "200",
    "log-level": "INFO",
    "learningrate": .00001,
    "earlystoppingpatience": 50,
    "checkpointdir": sm_localcheckpoint_dir,
    # Checkpoints once every n epochs
    "checkpointfreq": 2,
    "weight_decay": 0.01,
    "commit_id": commit_id,
}

In [12]:
# Shallow copy of the base hyperparameters with "uselosseval" switched off (0);
# the base dictionary itself is left unmodified.
BertNetworkFactoryhyperparameters_max_f1 = {
    **BertNetworkFactoryhyperparameters,
    "uselosseval": 0,
}

In [13]:
# Metric definitions passed to the estimator. Each regex looks for a
# "###score: <name>### <number>" marker and captures the number that follows it.
# Raw strings are used so that "\d" reaches the regex engine unchanged and does not
# trigger Python's invalid-escape-sequence warning.
metric_definitions = [
    # Loss
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},

    # Macro AUC
    {"Name": "TrainAucScore",
     "Regex": r"###score: train_ResultScorerAucMacro_score### (\d*[.]?\d*)"},
    {"Name": "ValidationAucScore",
     "Regex": r"###score: val_ResultScorerAucMacro_score### (\d*[.]?\d*)"},

    # Macro F1
    {"Name": "TrainF1MacroScore",
     "Regex": r"###score: train_ResultScorerF1Macro_score### (\d*[.]?\d*)"},
    {"Name": "ValidationF1MacroScore",
     "Regex": r"###score: val_ResultScorerF1Macro_score### (\d*[.]?\d*)"},
]

In [14]:
# Record which commit the local checkout is on: the first line of `git log -1`
# (the commit hash) and its fifth line (the commit message in the recorded output).
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit 7ab4967bbdd1a2e64dcaafc252321d7b2d3148c8
    Update notebook add fake negative samples


In [15]:
# Run-time limits for the training job and optional spot-instance settings.
use_spot = False  # set True if you need spot instance
train_max_run_secs = 5 * 24 * 60 * 60  # 5 days, in seconds
spot_wait_sec = 5 * 60  # 5 minutes, in seconds

# A wait limit (run time plus spot wait) is only set when spot instances are used.
max_wait_time_secs = (train_max_run_secs + spot_wait_sec) if use_spot else None

# During local mode, no spot.., use smaller dataset
if instance_type == 'local':
    use_spot, max_wait_time_secs, wait = False, 0, True
    # Use smaller dataset to run locally
    # TODO:
    #  inputs = inputs_sample

In [16]:
# Named experiment configurations. Each entry pairs the `_max_f1` hyperparameter set
# ("hp") with the input channels ("inputs") to train on.
experiments = {
    name: {"hp": BertNetworkFactoryhyperparameters_max_f1, "inputs": channels}
    for name, channels in (
        ("selfsupervised-bert-f1", train_inputs),
        ("selfsupervised-fake-bert-f1", fake_train_inputs),
    )
}

In [17]:
# Choose the experiment to run; its name is also used as the job base name.
experiment_name = "selfsupervised-fake-bert-f1"
base_name = experiment_name

# Unpack the chosen experiment's hyperparameters and input channels.
selected_experiment = experiments[experiment_name]
hyperparameters = selected_experiment["hp"]
inputs = selected_experiment["inputs"]

In [18]:
# Display the hyperparameters of the selected experiment for inspection.
hyperparameters

{'datasetfactory': 'datasets.chemprot_selfsupervised_dataset_factory.ChemprotSelfsupervisedDatasetFactory',
 'modelfactory': 'models.bert_model_factory.BertModelFactory',
 'tokenisor_lower_case': 0,
 'uselosseval': 0,
 'batch': '8',
 'gradientaccumulationsteps': '8',
 'epochs': '200',
 'log-level': 'INFO',
 'learningrate': 1e-05,
 'earlystoppingpatience': 50,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2,
 'weight_decay': 0.01,
 'commit_id': 'a6211b46f5940b9ac48fd3bde9274734ec3605a5'}

In [19]:
# Git source description for the training code, pinned to the commit id stored in
# the selected hyperparameters.
git_config = dict(
    repo='https://github.com/elangovana/large-scale-ptm-ppi.git',
    branch='main',
    commit=hyperparameters["commit_id"],
)

In [20]:
# Display the hyperparameters once more before the job is configured.
hyperparameters

{'datasetfactory': 'datasets.chemprot_selfsupervised_dataset_factory.ChemprotSelfsupervisedDatasetFactory',
 'modelfactory': 'models.bert_model_factory.BertModelFactory',
 'tokenisor_lower_case': 0,
 'uselosseval': 0,
 'batch': '8',
 'gradientaccumulationsteps': '8',
 'epochs': '200',
 'log-level': 'INFO',
 'learningrate': 1e-05,
 'earlystoppingpatience': 50,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2,
 'weight_decay': 0.01,
 'commit_id': 'a6211b46f5940b9ac48fd3bde9274734ec3605a5'}

In [21]:
# Display the input channels of the selected experiment for inspection.
inputs

{'train': 's3://aegovan-data/self-supervised-fake/train.json',
 'val': 's3://aegovan-data/self-supervised-fake/val.json',
 'PRETRAINED_MODEL': 's3://aegovan-data/embeddings/bert/'}

In [22]:
from sagemaker.pytorch import PyTorch

# Configure the SageMaker PyTorch training job for the selected experiment.
estimator = PyTorch(
    # Training code taken from the local ../src tree (git_config is not used).
    entry_point='main_train_pipeline.py',
    source_dir='../src',
    dependencies=['../src/datasets', '../src/models', '../src/utils', '../src/scorers'],
    # git_config=git_config,
    # image_name=docker_repo,

    # Framework and compute.
    role=role,
    framework_version="1.4.0",
    py_version='py3',
    instance_count=1,
    instance_type=instance_type,
    volume_size=30,

    # Job configuration, outputs and metrics.
    hyperparameters=hyperparameters,
    base_job_name=base_name,
    output_path=s3_output_path,
    code_location=s3_code_path,
    metric_definitions=metric_definitions,
    debugger_hook_config=False,

    # Run-time limits and spot settings.
    use_spot_instances=use_spot,
    max_run=train_max_run_secs,
    max_wait=max_wait_time_secs,

    # Checkpoint locations: S3 prefix and local directory.
    checkpoint_s3_uri=s3_checkpoint,
    checkpoint_local_path=sm_localcheckpoint_dir,
)

# Start the training job without blocking the notebook on its completion.
estimator.fit(inputs, wait=False)