### Set up

#### 1. Set up accounts and role

In [1]:
#!pip install sagemaker==1.39.0

In [2]:
import sagemaker
import boto3

# SageMaker session object for this notebook.
sagemaker_session = sagemaker.Session()

# Look up the caller's AWS account id through STS, and take the region from
# the default boto3 session.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')
region = boto3.session.Session().region_name

# The execution role ARN is assembled from the account id instead of being
# discovered at runtime; the runtime lookup is kept below for reference.
#role = sagemaker.get_execution_role()
role_arn_template = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
role = role_arn_template.format(account_id)


#### 2. Set up image and instance type

In [3]:
# Name:tag of the custom training image (expanded into a full ECR URI in the
# next cell) and the instance type the training job is requested on.
pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201910130520"
instance_type = "ml.p3.8xlarge" 

In [4]:
# Full image URI in the account's ECR registry:
# <account_id>.dkr.ecr.<region>.amazonaws.com/<image name:tag>
ecr_image_uri_template = "{}.dkr.ecr.{}.amazonaws.com/{}"
docker_repo = ecr_image_uri_template.format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train, test and validation datasets

In [5]:
# S3 bucket from which the dataset, pretrained-BERT, results and code
# locations in the following cell are all derived.
bucket = "aegovan-data"

In [6]:
# Exactly one dataset variant is active per run; edit these two values to switch.
# Known combinations:
#   dataset_prefix = "processed_dataset", dataset_variant = "unique_pub_v6_max_neg"   (active)
#   dataset_prefix = "processed_dataset", dataset_variant = "unique_pub_v6_less_negative"
#   dataset_prefix = "aimed",             dataset_variant = "unique_negative_entity_only"
dataset_prefix = "processed_dataset"
dataset_variant = "unique_pub_v6_max_neg"

# Train / test / validation splits share one naming scheme:
# s3://<bucket>/<prefix>/<split>_<variant>.json
trainfile = "s3://{}/{}/train_{}.json".format(bucket, dataset_prefix, dataset_variant)
testfile = "s3://{}/{}/test_{}.json".format(bucket, dataset_prefix, dataset_variant)
valfile = "s3://{}/{}/val_{}.json".format(bucket, dataset_prefix, dataset_variant)

# S3 prefix holding the pretrained BERT files (fed to the job as an input channel).
pretrained_bert = "s3://{}/embeddings/bert/".format(bucket)

# S3 locations passed to the estimator as its output path and code location.
s3_output_path = "s3://{}/results/".format(bucket)
s3_code_path = "s3://{}/aimed_bert_code".format(bucket)

### Start training

In [7]:
# Input channels for the training job: channel name -> S3 location.
pub_inputs = dict(
    train=trainfile,
    val=valfile,
    PRETRAINED_BIOBERT=pretrained_bert,
)

In [8]:
# Only the file name (last path component) of each S3 dataset URI is passed on.
train_file_name = trainfile.rsplit("/", 1)[-1]
val_file_name = valfile.rsplit("/", 1)[-1]

# Hyperparameter set using the PpiDatasetFactory dataset with the BioBERT
# network factory, including gradient accumulation steps.
BertNetworkFactoryhyperparameters = {
    "dataset": "PpiDatasetFactory",
    "network": "RelationExtractorBioBertFactory",
    "trainfile": train_file_name,
    "valfile": val_file_name,
    "batchsize": "8",
    "accumulation_steps": 8,
    "epochs": "1000",
    "log-level": "INFO",
    "learningrate": 1e-5,
    "earlystoppingpatience": 20,
}

In [9]:
# Only the file name (last path component) of each S3 dataset URI is passed on.
train_file_name = trainfile.rsplit("/", 1)[-1]
val_file_name = valfile.rsplit("/", 1)[-1]

# Alternative hyperparameter set: same network factory, but with the
# PpiNoInteractionDatasetFactory dataset and no accumulation_steps entry.
BertNoType_NetworkFactoryhyperparameters = {
    "dataset": "PpiNoInteractionDatasetFactory",
    "network": "RelationExtractorBioBertFactory",
    "trainfile": train_file_name,
    "valfile": val_file_name,
    "batchsize": "8",
    "epochs": "1000",
    "log-level": "INFO",
    "learningrate": 1e-5,
    "earlystoppingpatience": 20,
}

In [10]:
# Metric name -> regex pairs handed to the estimator. Each regex matches a
# "###score: <metric>### <number>" line in the training log and captures the
# number in group 1.
# Raw strings keep the regex backslashes literal; in a normal string literal
# "\d" is an invalid escape sequence that Python only tolerates with a warning.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainFScore",
     "Regex": r"###score: train_fscore### (\d*[.]?\d*)"},
    {"Name": "ValidationFScore",
     "Regex": r"###score: val_fscore### (\d*[.]?\d*)"},
]

In [11]:
# Record the checked-out commit (hash and subject line) in the notebook output.
# An explicit format is used rather than slicing `git log` by line number,
# because header lines vary (merge commits add a "Merge:" line).
!git log -1 --format="commit %H%n    %s"

commit 44905c0d617bc64cc5c3efa6fed7fde4e77eaf8b
    add dataset preprocessor


In [12]:
# Git repository and branch handed to the estimator as its git_config.
# A specific commit can be pinned by enabling the commented-out entry.
git_config = dict(
    repo='https://github.com/elangovana/PPI-typed-relation-extractor.git',
    branch='master',
    # commit='58a09e154935248667062a36fdae7d86b86b477c',
)

In [13]:
# Select the hyperparameter set and input channels used for this run; the
# trailing comment names the alternative hyperparameter set defined above.
hyperparameters =BertNetworkFactoryhyperparameters #BertNoType_NetworkFactoryhyperparameters 
inputs = pub_inputs 


In [14]:
# Echo the selected hyperparameters so they are recorded in the cell output.
hyperparameters

{'accumulation_steps': 8,
 'batchsize': '8',
 'dataset': 'PpiDatasetFactory',
 'earlystoppingpatience': 20,
 'epochs': '1000',
 'learningrate': 1e-05,
 'log-level': 'INFO',
 'network': 'RelationExtractorBioBertFactory',
 'trainfile': 'train_unique_pub_v6_max_neg.json',
 'valfile': 'val_unique_pub_v6_max_neg.json'}

In [15]:
# Echo the selected input channels so they are recorded in the cell output.
inputs

{'PRETRAINED_BIOBERT': 's3://aegovan-data/embeddings/bert/',
 'train': 's3://aegovan-data/processed_dataset/train_unique_pub_v6_max_neg.json',
 'val': 's3://aegovan-data/processed_dataset/val_unique_pub_v6_max_neg.json'}

In [16]:
from sagemaker.pytorch import PyTorch

# Source folders passed to the estimator as additional dependencies.
extra_source_dirs = [
    'source/algorithms',
    'source/datasets',
    'source/preprocessor',
    'source/modelnetworks',
    'source/trainpipelinesbuilders',
]

# All estimator settings collected in one place; the commented-out entries are
# alternatives that are currently switched off.
estimator_settings = dict(
    # entry_point='main_train_k_fold.py',
    entry_point='main_train_bert.py',
    source_dir='source/algorithms',
    dependencies=extra_source_dirs,
    role=role,
    framework_version="1.0.0",
    py_version='py3',
    git_config=git_config,
    image_name=docker_repo,
    train_instance_count=1,
    train_instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    # train_use_spot_instances=True,
    train_volume_size=30,
    code_location=s3_code_path,
    base_job_name="ppi-bert-extractor-neg",
)

estimator = PyTorch(**estimator_settings)

In [None]:
# Start the training job with the selected input channels; the job's status
# lines and training log appear in this cell's output.
estimator.fit(inputs)

2019-10-27 02:51:00 Starting - Starting the training job...
2019-10-27 02:51:01 Starting - Launching requested ML instances......
2019-10-27 02:52:25 Starting - Preparing the instances for training...
2019-10-27 02:53:23 Downloading - Downloading input data...
2019-10-27 02:53:39 Training - Downloading the training image.........
2019-10-27 02:55:37 Training - Training image download completed. Training in progress.[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-10-27 02:55:38,830 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-10-27 02:55:38,874 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-10-27 02:55:40,297 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-10-27 02:55:40,611 sagemaker-containers INFO     Module main_train_bert does not provide a 

[31m2019-10-27 02:55:45,837 - trainpipelinesbuilders.BertTrainInferenceBuilder - INFO - Retrieving key batchsize with default 32, found 8[0m
[31m2019-10-27 02:55:45,837 - trainpipelinesbuilders.BertTrainInferenceBuilder - INFO - Retrieving key pretrained_biobert_dir with default None, found /opt/ml/input/data/PRETRAINED_BIOBERT[0m
[31m2019-10-27 02:55:45,837 - pytorch_pretrained_bert.tokenization - INFO - loading vocabulary file /opt/ml/input/data/PRETRAINED_BIOBERT/vocab.txt[0m
[31m2019-10-27 02:55:45,864 - pytorch_pretrained_bert.tokenization - INFO - loading vocabulary file /opt/ml/input/data/PRETRAINED_BIOBERT/vocab.txt[0m
[31m2019-10-27 02:55:45,933 - modelnetworks.RelationExtractorBioBertFactory - INFO - Retrieving key pretrained_biobert_dir with default None, found /opt/ml/input/data/PRETRAINED_BIOBERT[0m
[31m2019-10-27 02:55:45,934 - pytorch_pretrained_bert.modeling - INFO - loading archive file /opt/ml/input/data/PRETRAINED_BIOBERT[0m
[31m2019-10-27 02:55:45,935 -

[31m2019-10-27 02:56:17,006 - algorithms.transform_berttext_tokenise - INFO - Completed TransformBertTextTokenise[0m
[31m2019-10-27 02:56:17,006 - algorithms.transform_berttext_token_to_index - INFO - Transforming TransformBertTextTokenToIndex[0m
[31m2019-10-27 02:56:17,664 - algorithms.transform_berttext_token_to_index - INFO - Completed TransformBertTextTokenToIndex[0m
[31m2019-10-27 02:56:17,703 - algorithms.transform_berttext_tokenise - INFO - Transforming TransformBertTextTokenise[0m
[31m2019-10-27 02:56:20,899 - algorithms.transform_berttext_tokenise - INFO - Completed TransformBertTextTokenise[0m
[31m2019-10-27 02:56:20,899 - algorithms.transform_berttext_token_to_index - INFO - Transforming TransformBertTextTokenToIndex[0m
[31m2019-10-27 02:56:20,979 - algorithms.transform_berttext_token_to_index - INFO - Completed TransformBertTextTokenToIndex[0m
[31m2019-10-27 02:56:20,983 - algorithms.transform_label_encoder - INFO - Running TransformLabelEncoder[0m
[31m2019

[31m2019-10-27 03:24:30,284 - algorithms.BertTrain - INFO - Train set result details:[0m
[31m2019-10-27 03:24:30,292 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_a558721a-b68b-4765-8fb3-496513b3ab9f_20191027_032430.csv: [0m
[31m[[4868    3]
 [ 824   15]][0m
[31m2019-10-27 03:24:30,299 - algorithms.BertTrain - INFO - Train set result details: 0.03500583430571762[0m
[31m2019-10-27 03:24:30,299 - algorithms.BertTrain - INFO - Validation set result details:[0m
[31m2019-10-27 03:24:38,531 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_76b2a28c-3c51-4f8e-99e9-1ea343704730_20191027_032438.csv: [0m
[31m[[604   0]
 [105   0]][0m
[31m2019-10-27 03:24:38,533 - algorithms.BertTrain - INFO - Validation set result details: 0.0 [0m
[31m2019-10-27 03:24:38,533 - algorithms.BertTrain - INFO - Snapshotting because the current loss 36.06203036010265 is lower than 37.4

[31m2019-10-27 03:52:45,226 - algorithms.BertTrain - INFO - Train set result details:[0m
[31m2019-10-27 03:52:45,234 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_bc2e79c8-9b1c-4575-b9a5-cbaf5b194a51_20191027_035245.csv: [0m
[31m[[4653  218]
 [ 228  611]][0m
[31m2019-10-27 03:52:45,242 - algorithms.BertTrain - INFO - Train set result details: 0.7326139088729018[0m
[31m2019-10-27 03:52:45,242 - algorithms.BertTrain - INFO - Validation set result details:[0m
[31m2019-10-27 03:52:53,474 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_41a9fbdd-8cef-4010-83a4-82306fde4a8f_20191027_035253.csv: [0m
[31m[[548  56]
 [ 52  53]][0m
[31m2019-10-27 03:52:53,476 - algorithms.BertTrain - INFO - Validation set result details: 0.4953271028037384 [0m
[31m2019-10-27 03:52:53,476 - algorithms.BertTrain - INFO - Run   3392    11      8568     7/714         1% 136.485858 3

[31m2019-10-27 04:58:32,647 - algorithms.BertTrain - INFO - Train set result details:[0m
[31m2019-10-27 04:58:32,655 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_41605314-7c96-4b64-852f-7d3125936abe_20191027_045832.csv: [0m
[31m[[4818   53]
 [  49  790]][0m
[31m2019-10-27 04:58:32,662 - algorithms.BertTrain - INFO - Train set result details: 0.93935790725327[0m
[31m2019-10-27 04:58:32,662 - algorithms.BertTrain - INFO - Validation set result details:[0m
[31m2019-10-27 04:58:40,896 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_bab3c3d6-4359-453e-8c64-72859f77ca51_20191027_045840.csv: [0m
[31m[[547  57]
 [ 50  55]][0m
[31m2019-10-27 04:58:40,898 - algorithms.BertTrain - INFO - Validation set result details: 0.5069124423963134 [0m
[31m2019-10-27 04:58:40,898 - algorithms.BertTrain - INFO - Run   7339    25     18564     7/714         1% 31.067253 52.2

[31m2019-10-27 05:26:53,768 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_1899ac06-574c-4546-a6c5-326c924f008a_20191027_052653.csv: [0m
[31m[[564  40]
 [ 60  45]][0m
[31m2019-10-27 05:26:53,770 - algorithms.BertTrain - INFO - Validation set result details: 0.4736842105263158 [0m
[31m2019-10-27 05:26:53,770 - algorithms.BertTrain - INFO - Run   9032    31     22848     7/714         1% 20.834247 64.614288       0.9690       0.4737[0m
[31m###score: train_loss### 20.834247324004536[0m
[31m###score: val_loss### 64.61428784727468[0m
[31m###score: train_fscore### 0.9690346083788707[0m
[31m###score: val_fscore### 0.4736842105263158[0m
[31m2019-10-27 05:31:27,612 - algorithms.BertTrain - INFO - Train set result details:[0m
[31m2019-10-27 05:31:27,621 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_adb46275-d856-4840-b8f5-e3286f795b7b_20191027_053127.csv: [

[31m2019-10-27 05:59:40,285 - algorithms.BertTrain - INFO - Train set result details:[0m
[31m2019-10-27 05:59:40,294 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_cc317fd1-226c-4687-afe7-2899a385071b_20191027_055940.csv: [0m
[31m[[4857   14]
 [  23  816]][0m
[31m2019-10-27 05:59:40,301 - algorithms.BertTrain - INFO - Train set result details: 0.9778310365488316[0m
[31m2019-10-27 05:59:40,301 - algorithms.BertTrain - INFO - Validation set result details:[0m
[31m2019-10-27 05:59:48,536 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_e6bb7463-f560-4721-95d2-d4c703251b5c_20191027_055948.csv: [0m
[31m[[556  48]
 [ 57  48]][0m
[31m2019-10-27 05:59:48,538 - algorithms.BertTrain - INFO - Validation set result details: 0.4776119402985075 [0m
[31m2019-10-27 05:59:48,538 - algorithms.BertTrain - INFO - Run  11007    38     27846     7/714         1% 11.912090 57