### Set up

#### 1. Set up accounts and role

In [1]:
# Optional, disabled by default: uncomment the next line to install the pinned SageMaker Python SDK version.
#!pip install sagemaker==1.39.0

In [2]:
import sagemaker
import boto3

# SageMaker session created with default settings (no arguments passed).
sagemaker_session = sagemaker.Session()

# Ask STS who the caller is and keep the account id from the response.
caller_identity = boto3.client("sts").get_caller_identity()
account_id = caller_identity.get("Account")

# Region name reported by a default boto3 session.
default_boto_session = boto3.session.Session()
region = default_boto_session.region_name

# Execution role ARN, built from the account id and a fixed role name.
# Disabled alternative:
# role = sagemaker.get_execution_role()
role = (
    "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
).format(account_id)


#### 2. Set up image and instance type

In [3]:
# Instance type requested for the training job.
instance_type = 'ml.p3.8xlarge'

# Custom training image, given as "<repository>:<tag>".
pytorch_custom_image_name = 'ppi-extractor:gpu-1.0.0-201910130520'

In [4]:
# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
ecr_uri_template = "{}.dkr.ecr.{}.amazonaws.com/{}"
docker_repo = ecr_uri_template.format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train/test and validation datasets

In [5]:
# S3 bucket that holds the datasets, embeddings, results and uploaded code.
bucket = 'aegovan-data'

In [6]:
# --- Datasets -------------------------------------------------------------
# Active train / validation files (preprocessed AIMed).
trainfile = "s3://{}/aimed/AIMedFull_preprocessed.json".format(bucket)
valfile = "s3://{}/aimed/AIMedval_preprocessed.json".format(bucket)

# Disabled alternatives, kept for reference:
# trainfile = "s3://{}/aimed/AIMedtrain.json".format(bucket)
# trainfile = "s3://{}/aimed/AIMedFull.json".format(bucket)
# trainfile = "s3://{}/aimed/AIMedtrain_pubmedoverlap.json".format(bucket)
# valfile = "s3://{}/aimed/AIMedval_pubmedoverlap.json".format(bucket)

# --- Pretrained model / embeddings ----------------------------------------
# S3 prefix of the pretrained BERT artefacts.
pretrained_bert = "s3://{}/embeddings/bert/".format(bucket)

# Disabled word-embedding alternatives, kept for reference:
# embeddingfile = "s3://{}/embeddings/PubMed-and-PMC-w2v.bin.txt".format(bucket)
# embeddingfile = "s3://{}/embeddings/bio_nlp_vec/PubMed-shuffle-win-30.bin.txt".format(bucket)

# --- Job output locations -------------------------------------------------
s3_code_path = "s3://{}/aimed_bert_code".format(bucket)
s3_output_path = "s3://{}/results/".format(bucket)

### Start training

In [7]:
# Input channels for the training job: channel name -> S3 location.
pub_inputs = {}
pub_inputs["train"] = trainfile
pub_inputs["PRETRAINED_BIOBERT"] = pretrained_bert

In [8]:
# Only the file name (last path segment of the S3 URI) is passed to the script.
train_file_name = trainfile.rsplit("/", 1)[-1]

# Hyperparameters for the BERT relation-extraction training run.
BertNetworkFactoryhyperparameters = {
    # Disabled alternative dataset factory: "PpiAimedDatasetFactory"
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorBioBertFactory",
    "trainfile": train_file_name,
    "batchsize": "8",
    "accumulation_steps": "4",
    "epochs": "1000",
    "log-level": "INFO",
    "learningrate": 1e-05,
    "earlystoppingpatience": 20,
}

In [9]:
# Metric definitions: each entry names a metric and gives the regex used to
# pull its value out of the training log. The training script prints lines
# such as "###score: val_loss### 30.32103531807661"; the capture group takes
# the number after the tag.
#
# The patterns are raw strings so that "\d" reaches the regex engine as
# written. In a normal string literal "\d" is an invalid escape sequence,
# which Python warns about. The resulting pattern text is unchanged.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainFScore",
     "Regex": r"###score: train_fscore### (\d*[.]?\d*)"},
    {"Name": "ValidationFScore",
     "Regex": r"###score: val_fscore### (\d*[.]?\d*)"},
]

In [10]:
# Record which commit of the local checkout this run was launched from.
# First line of the latest log entry (the "commit <hash>" line in the recorded output).
!git log -1 | head -1
# Fifth line of the same log entry (the commit message line in the recorded output).
!git log -1 | head -5 | tail -1

commit 0cc8fdd48eb7813e65179a3d325079b95fd0ae8b
    fix typo


In [11]:
# Git source for the training code: repository URL and branch.
git_config = {}
git_config["repo"] = "https://github.com/elangovana/PPI-typed-relation-extractor.git"
git_config["branch"] = "master"
# Disabled: pin to a specific commit instead of the branch head.
# git_config["commit"] = "58a09e154935248667062a36fdae7d86b86b477c"

In [12]:
# Select the hyperparameter set and input channels used for this run.
hyperparameters, inputs = BertNetworkFactoryhyperparameters, pub_inputs


In [13]:
# Show the hyperparameters selected for this run before launching the job.
hyperparameters

{'accumulation_steps': '4',
 'batchsize': '8',
 'dataset': 'PpiAimedDatasetPreprocessedFactory',
 'earlystoppingpatience': 20,
 'epochs': '1000',
 'learningrate': 1e-05,
 'log-level': 'INFO',
 'network': 'RelationExtractorBioBertFactory',
 'trainfile': 'AIMedFull_preprocessed.json'}

In [14]:
# Show the input channel -> S3 location mapping selected for this run.
inputs

{'PRETRAINED_BIOBERT': 's3://aegovan-data/embeddings/bert/',
 'train': 's3://aegovan-data/aimed/AIMedFull_preprocessed.json'}

In [15]:
from sagemaker.pytorch import PyTorch

# Configure the PyTorch estimator for the BERT k-fold training run.
estimator = PyTorch(
    # --- Training code (fetched via git_config) ---
    # Disabled alternative entry point: 'main_train_k_fold.py'
    entry_point='main_train_bert_k_fold.py',
    source_dir='source/algorithms',
    dependencies=[
        'source/algorithms',
        'source/datasets',
        'source/preprocessor',
        'source/modelnetworks',
        'source/trainpipelinesbuilders',
    ],
    git_config=git_config,
    code_location=s3_code_path,

    # --- Container / runtime ---
    image_name=docker_repo,
    framework_version="1.0.0",
    py_version='py3',
    role=role,

    # --- Compute resources ---
    train_instance_type=instance_type,
    train_instance_count=1,
    train_volume_size=30,
    train_max_run=60 * 60 * 24 * 4,  # 4 days, expressed in seconds
    # Disabled: train_use_spot_instances=True

    # --- Job configuration and outputs ---
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    output_path=s3_output_path,
    base_job_name="aimed-ppi-bert-acc",
)

In [16]:
# Launch the training job with the selected input channels.
# NOTE(review): the recorded output ends with an EndpointConnectionError after
# hours of streamed logs; presumably the notebook lost connectivity while the
# job kept running. Confirm the job's final status outside this notebook.
estimator.fit(inputs=inputs)

2019-10-27 06:44:43 Starting - Starting the training job...
2019-10-27 06:44:47 Starting - Launching requested ML instances...
2019-10-27 06:45:44 Starting - Preparing the instances for training......
2019-10-27 06:47:01 Downloading - Downloading input data
2019-10-27 06:47:01 Training - Downloading the training image..........[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-10-27 06:49:03,691 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-10-27 06:49:03,736 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-10-27 06:49:06,753 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-10-27 06:49:07,088 sagemaker-containers INFO     Module main_train_bert_k_fold does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-10-27 06:49:07,088 sagemaker-

[31m['--accumulation_steps', '4', '--batchsize', '8', '--learningrate', '1e-05'][0m
[31m{'dataset': 'PpiAimedDatasetPreprocessedFactory', 'network': 'RelationExtractorBioBertFactory', 'trainfile': 'AIMedFull_preprocessed.json', 'traindir': '/opt/ml/input/data/train', 'pretrained_biobert_dir': '/opt/ml/input/data/PRETRAINED_BIOBERT', 'outdir': '/opt/ml/output/data', 'modeldir': '/opt/ml/model', 'epochs': 1000, 'earlystoppingpatience': 20, 'interaction_type': None, 'log_level': 'INFO'}[0m
[31m{'accumulation_steps': '4', 'batchsize': '8', 'learningrate': '1e-05'}[0m
[31m2019-10-27 06:49:12,075 - __main__ - INFO - Running fold 0[0m
[31m2019-10-27 06:49:12,104 - trainpipelinesbuilders.BertTrainInferenceBuilder - INFO - Retrieving key batchsize with default 32, found 8[0m
[31m2019-10-27 06:49:12,104 - trainpipelinesbuilders.BertTrainInferenceBuilder - INFO - Retrieving key pretrained_biobert_dir with default None, found /opt/ml/input/data/PRETRAINED_BIOBERT[0m
[31m2019-10-27 06:

[31m2019-10-27 06:49:21,935 - algorithms.transform_berttext_token_to_index - INFO - Completed TransformBertTextTokenToIndex[0m
[31m2019-10-27 06:49:21,946 - algorithms.transform_berttext_tokenise - INFO - Transforming TransformBertTextTokenise[0m
[31m2019-10-27 06:49:22,478 - algorithms.transform_berttext_tokenise - INFO - Completed TransformBertTextTokenise[0m
[31m2019-10-27 06:49:22,478 - algorithms.transform_berttext_token_to_index - INFO - Transforming TransformBertTextTokenToIndex[0m
[31m2019-10-27 06:49:22,532 - algorithms.transform_berttext_token_to_index - INFO - Completed TransformBertTextTokenToIndex[0m
[31m2019-10-27 06:49:22,534 - algorithms.transform_label_encoder - INFO - Running TransformLabelEncoder[0m
[31m2019-10-27 06:49:22,535 - algorithms.transform_label_encoder - INFO - Complete TransformLabelEncoder[0m
[31m2019-10-27 06:49:22,580 - algorithms.BertTrainInferencePipeline - INFO - Retrieving key learningrate with default .01, found 1e-05[0m
[31m2019-

[31m2019-10-27 07:11:30,295 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_0507779c-2382-4b63-a23a-ca9b0d8f1ea0_20191027_071130.csv: [0m
[31m[[495   0]
 [ 89   0]][0m
[31m2019-10-27 07:11:30,297 - algorithms.BertTrain - INFO - Validation set result details: 0.0 [0m
[31m2019-10-27 07:11:30,297 - algorithms.BertTrain - INFO - Snapshotting because the current loss 30.32103531807661 is lower than 31.1331719905138 [0m
[31m2019-10-27 07:11:30,297 - algorithms.ModelSnapshotCallback - INFO - Snappshotting model to /opt/ml/output/data/best_snaphsotmodel.pt[0m
[31m2019-10-27 07:11:30,828 - algorithms.BertTrain - INFO - Run   1328     4      3285     3/657         0% 298.852976 30.321035       0.0000       0.0000[0m
[31m###score: train_loss### 298.85297636687756[0m
[31m###score: val_loss### 30.32103531807661[0m
[31m###score: train_fscore### 0.0[0m
[31m###score: val_fscore### 0.0[0m
[31m2019-10-27 07:15:48,795 - algor

[31m2019-10-27 07:37:59,318 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_e8a084a9-9895-4b9a-b77d-0308a7c79dd2_20191027_073759.csv: [0m
[31m[[452  43]
 [ 64  25]][0m
[31m2019-10-27 07:37:59,320 - algorithms.BertTrain - INFO - Validation set result details: 0.3184713375796179 [0m
[31m2019-10-27 07:37:59,320 - algorithms.BertTrain - INFO - Run   2916    10      7227     3/657         0% 218.178351 30.279408       0.5636       0.3185[0m
[31m###score: train_loss### 218.17835077643394[0m
[31m###score: val_loss### 30.2794082313776[0m
[31m###score: train_fscore### 0.5635566687539136[0m
[31m###score: val_fscore### 0.3184713375796179[0m
[31m2019-10-27 07:42:17,557 - algorithms.BertTrain - INFO - Train set result details:[0m
[31m2019-10-27 07:42:17,565 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_9257c436-50e2-4531-a359-4375cd6c3740_20191027_074217.csv: [

[31m2019-10-27 08:08:45,413 - algorithms.BertTrain - INFO - Train set result details:[0m
[31m2019-10-27 08:08:45,420 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_a18d1f30-a943-457b-a030-f48859335e40_20191027_080845.csv: [0m
[31m[[4276   63]
 [ 192  719]][0m
[31m2019-10-27 08:08:45,427 - algorithms.BertTrain - INFO - Train set result details: 0.8493797991730655[0m
[31m2019-10-27 08:08:45,427 - algorithms.BertTrain - INFO - Validation set result details:[0m
[31m2019-10-27 08:08:52,198 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_47ebaaa9-9af2-4230-b81e-5960c6cdbafd_20191027_080852.csv: [0m
[31m[[463  32]
 [ 45  44]][0m
[31m2019-10-27 08:08:52,200 - algorithms.BertTrain - INFO - Validation set result details: 0.5333333333333333 [0m
[31m2019-10-27 08:08:52,200 - algorithms.BertTrain - INFO - Run   4769    17     11826     3/657         0% 84.740633 26

[31m2019-10-27 08:39:39,276 - algorithms.BertTrain - INFO - Train set result details:[0m
[31m2019-10-27 08:39:39,284 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_a3e28936-63f4-43ff-bc84-998ad94a3ee5_20191027_083939.csv: [0m
[31m[[4313   26]
 [  55  856]][0m
[31m2019-10-27 08:39:39,291 - algorithms.BertTrain - INFO - Train set result details: 0.9548243167875069[0m
[31m2019-10-27 08:39:39,291 - algorithms.BertTrain - INFO - Validation set result details:[0m
[31m2019-10-27 08:39:46,069 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_7c2e6594-97e7-4d59-9594-06c93e09942d_20191027_083946.csv: [0m
[31m[[463  32]
 [ 33  56]][0m
[31m2019-10-27 08:39:46,071 - algorithms.BertTrain - INFO - Validation set result details: 0.6327683615819208 [0m
[31m2019-10-27 08:39:46,071 - algorithms.BertTrain - INFO - Snapshotting because the current score 0.6327683615819208 is

EndpointConnectionError: Could not connect to the endpoint URL: "https://api.sagemaker.us-east-2.amazonaws.com/"