### Set up

#### 1. Set  up  accounts and role

In [24]:
!pip install sagemaker==1.39.0

[31mawsebcli 3.12.4 has requirement requests<=2.9.1,>=2.6.1, but you'll have requests 2.20.1 which is incompatible.[0m
[31mspacy 2.1.8 has requirement numpy>=1.15.0, but you'll have numpy 1.14.1 which is incompatible.[0m
[31mblis 0.2.4 has requirement numpy>=1.15.0, but you'll have numpy 1.14.1 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [25]:
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
account_id =  boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name


#role = sagemaker.get_execution_role()
role="arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449".format(account_id)


#### 2. Setup image and instance type

In [26]:
pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201909140804"
instance_type = "ml.p3.8xlarge" 

In [27]:
docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train/ test and validation datasets

In [28]:
bucket = "aegovan-data"

In [29]:
trainfile = "s3://{}/aimed/AIMedtrain.json".format(bucket)
testfile= "s3://{}/aimed/AIMedval.json".format(bucket)
valfile="s3://{}/aimed/AIMedtest.json".format(bucket)
embeddingfile="s3://{}/embeddings/wikipedia-pubmed-and-PMC-w2v.bin.txt".format(bucket)
s3_output_path= "s3://{}/results/".format(bucket)

### Start training

In [30]:
inputs = {
    "train" : trainfile,
    "val" :valfile,
    "embedding" : embeddingfile
}

In [58]:
hyperparameters = {
    "dataset":"PpiAimedDataset",
    "trainfile":trainfile.split("/")[-1],
    "valfile":valfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
    "embeddim":200,
    "batchsize": "32",
    "epochs" : "1000",   
    "log-level" : "INFO",
    "lstmhiddensize": 50,
    "fclayersize": 50,
    "numlayers":3,
    "poolingkernelsize":4
   
}

In [59]:
metric_definitions = [{"Name": "TrainLoss",
                     "Regex": "###score: train_loss### (\d*[.]?\d*)"}
                    ,{"Name": "ValidationLoss",
                     "Regex": "###score: val_loss### (\d*[.]?\d*)"}
                    ,{"Name": "TrainFScore",
                     "Regex": "###score: train_fscore### (\d*[.]?\d*)"}
                   ,{"Name": "ValidationFScore",
                     "Regex": "###score: val_fscore### (\d*[.]?\d*)"}
                    ]

In [60]:
git_config = {'repo': 'https://github.com/elangovana/PPI-typed-relation-extractor.git',
              'branch': 'master',
              'commit': 'c050a2818efd6dc6d2adb3dac07c102b94c92eab'}

In [61]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
     entry_point='main_train.py',
                    source_dir = 'source/algorithms',
                    dependencies =['source/algorithms', 'source/datasets'],
                    role=role,
                    framework_version ="1.0.0",
                    py_version='py3',
                   # git_config= git_config,
                    image_name= docker_repo,
                    train_instance_count=1,
                    train_instance_type=instance_type,
                    hyperparameters =hyperparameters,
                    output_path=s3_output_path,
                    metric_definitions=metric_definitions,
                    #train_use_spot_instances = True
                    train_volume_size=30,
                    base_job_name ="aimed-ppi-extractor")

In [None]:
estimator.fit(inputs)

2019-09-15 11:07:12 Starting - Starting the training job...
2019-09-15 11:07:14 Starting - Launching requested ML instances...
2019-09-15 11:08:11 Starting - Preparing the instances for training......
2019-09-15 11:09:30 Downloading - Downloading input data.....................
2019-09-15 11:13:32 Training - Downloading the training image.........
2019-09-15 11:15:36 Training - Training image download completed. Training in progress.[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-09-15 11:15:37,939 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-09-15 11:15:37,983 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-09-15 11:15:37,984 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-09-15 11:15:38,289 sagemaker-containers INFO     Module main_train does n

[31m2019-09-15 11:23:14,741 - algorithms.PretrainedEmbedderLoader - INFO - The number of words intialised without embbeder is 8[0m
[31m2019-09-15 11:24:35,385 - algorithms.transform_protein_mask - INFO - Running TransformProteinMask [0m
[31m2019-09-15 11:24:39,809 - algorithms.transform_protein_mask - INFO - Completed TransformProteinMask [0m
[31m2019-09-15 11:24:39,809 - algorithms.transform_protein_mask - INFO - Running TransformProteinMask [0m
[31m2019-09-15 11:24:39,813 - algorithms.transform_protein_mask - INFO - Completed TransformProteinMask [0m
[31m2019-09-15 11:24:39,813 - algorithms.transform_sentence_tokeniser - INFO - Running sentence tokenisor [0m
[31m2019-09-15 11:25:16,478 - algorithms.transform_sentence_tokeniser - INFO - Completed  sentence tokenisor [0m
[31m2019-09-15 11:25:16,481 - algorithms.transform_text_index - INFO - Transforming TransformTextToIndex[0m
[31m2019-09-15 11:25:16,588 - algorithms.transform_text_index - INFO - Completed TransformTex

[31m2019-09-15 11:30:27,143 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-09-15 11:30:27,146 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_f3f9f71f-979d-43e9-8cc2-c54da5c02ea9_20190915_113027.csv: [0m
[31m[[2351   57]
 [ 279   74]][0m
[31m2019-09-15 11:30:27,147 - algorithms.Train - INFO - Train set result details: 0.30578512396694213[0m
[31m2019-09-15 11:30:27,148 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-09-15 11:30:35,923 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_c8eac478-1ab0-422d-9abb-347817c03953_20190915_113035.csv: [0m
[31m[[478  17]
 [ 71  24]][0m
[31m2019-09-15 11:30:35,924 - algorithms.Train - INFO - Validation set result details: 0.3529411764705882 [0m
[31m2019-09-15 11:30:35,924 - algorithms.Train - INFO - Snapshotting because the current score 0.3529411764705882 is greater than 0.163

[31m2019-09-15 11:36:08,299 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_97a9c3dd-92da-46bc-9ba4-fe0d04f583d0_20190915_113608.csv: [0m
[31m[[430  65]
 [ 44  51]][0m
[31m2019-09-15 11:36:08,300 - algorithms.Train - INFO - Validation set result details: 0.4834123222748815 [0m
[31m2019-09-15 11:36:08,300 - algorithms.Train - INFO - Snapshotting because the current score 0.4834123222748815 is greater than 0.3975155279503106 [0m
[31m2019-09-15 11:36:08,300 - algorithms.ModelSnapshotCallback - INFO - Snappshotting model to /opt/ml/model/best_snaphsotmodel.pt[0m
[31m2019-09-15 11:36:15,021 - algorithms.Train - INFO - Run    639    10       957     4/87          5% 19.906300 9.665681       0.5605       0.4834[0m
[31m###score: train_loss### 19.906299866735935[0m
[31m###score: val_loss### 9.66568137705326[0m
[31m###score: train_fscore### 0.560530679933665[0m
[31m###score: val_fscore### 0.4834123222748815[0m
[31m20

[31m2019-09-15 11:41:39,421 - algorithms.Train - INFO - Run    964    16      1479     4/87          5% 13.029795 14.403282       0.7382       0.5079[0m
[31m###score: train_loss### 13.02979498077184[0m
[31m###score: val_loss### 14.40328224748373[0m
[31m###score: train_fscore### 0.7382352941176471[0m
[31m###score: val_fscore### 0.5079365079365079[0m
[31m2019-09-15 11:42:23,673 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-09-15 11:42:23,676 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_9ec4365f-eab9-4472-9ec5-f1fe13199e51_20190915_114223.csv: [0m
[31m[[2332   76]
 [ 104  249]][0m
[31m2019-09-15 11:42:23,677 - algorithms.Train - INFO - Train set result details: 0.7345132743362831[0m
[31m2019-09-15 11:42:23,677 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-09-15 11:42:32,474 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/p

### HPO

In [48]:
objective_metric_name ="ValidationFScore"

In [54]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter
hyperparameter_ranges = {'lstmhiddensize': IntegerParameter(40,200), #ContinuousParameter(0.01, 0.2),
                          "fclayersize": IntegerParameter(10,50),
                            "numlayers":IntegerParameter(1,10),
                        "poolingkernelsize":IntegerParameter(2,5),
                         'num_epoch': IntegerParameter(10, 50)}

In [55]:
hyperparameters={ "trainfile":trainfile.split("/")[-1],
    "valfile":valfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
                        "embeddim":"200",
                        "dataset":"PpiAimedDataset"}

In [56]:

estimator = PyTorch(
     entry_point='main_train.py',
                    source_dir = 'source/algorithms',
                    dependencies =['source/algorithms', 'source/datasets'],
                    role=role,
                    framework_version ="1.0.0",
                    py_version='py3',
                   # git_config= git_config,
                    image_name= docker_repo,
                    train_instance_count=1,
                    train_instance_type=instance_type,
                    hyperparameters =hyperparameters,
                    output_path=s3_output_path,
                    metric_definitions=metric_definitions,
                    #train_use_spot_instances = True
                    train_volume_size=30,
                    base_job_name ="aimed-ppi-extractor")

In [57]:
tuner = HyperparameterTuner(estimator,
                            objective_metric_name,
                            hyperparameter_ranges,
                            metric_definitions,
                            max_jobs=9,
                            max_parallel_jobs=3,
                            base_tuning_job_name="hpo-aimed-ppi-extractor")
tuner.fit(inputs)