### Set up

#### 1. Set up accounts and role

In [1]:
# Optional: uncomment to install the pinned SageMaker SDK version (1.39.0).
#!pip install sagemaker==1.39.0

In [2]:
import sagemaker
import boto3

# Session and identity context shared by the cells below.
sagemaker_session = sagemaker.Session()
region = boto3.session.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# The execution role is built from a fixed role name plus the caller's account id.
# Alternative: resolve the role attached to the notebook instance instead.
# role = sagemaker.get_execution_role()
role = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449".format(
    account_id
)


#### 2. Set up image and instance type

In [3]:
# Instance type used for training, and the custom image as "<repository>:<tag>".
instance_type = "ml.p3.8xlarge"
pytorch_custom_image_name = "ppi-extractor:gpu-1.0.0-201910130520"

In [4]:
# Full image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
docker_repo = "{account}.dkr.ecr.{region}.amazonaws.com/{image}".format(
    account=account_id,
    region=region,
    image=pytorch_custom_image_name,
)

#### 3. Configure train, test and validation datasets

In [5]:
# S3 bucket that holds the datasets, embeddings, results and uploaded code.
bucket = 'aegovan-data'

In [6]:
# --- Datasets -----------------------------------------------------------------
# Active dataset: the multiclass train/test/validation splits.
trainfile = "s3://{}/processed_dataset/train_multiclass.json".format(bucket)
testfile = "s3://{}/processed_dataset/test_multiclass.json".format(bucket)
valfile = "s3://{}/processed_dataset/val_multiclass.json".format(bucket)

# Alternative dataset (previously assigned here and then immediately overwritten):
# trainfile = "s3://{}/processed_dataset/train_unique_pub_v6_less_negative.json".format(bucket)
# testfile = "s3://{}/processed_dataset/test_unique_pub_v6_less_negative.json".format(bucket)
# valfile = "s3://{}/processed_dataset/val_unique_pub_v6_less_negative.json".format(bucket)

# --- Word embeddings ----------------------------------------------------------
# Active embedding file; `embed_dim` is sent to the training job as `embeddim`.
embeddingfile = "s3://{}/embeddings/bio_nlp_vec/PubMed-shuffle-win-2.bin.txt".format(bucket)
embed_dim = 200

# Alternative embeddings (previously assigned here and then immediately overwritten):
# embeddingfile = "s3://{}/embeddings/wikipedia-pubmed-and-PMC-w2v.bin.txt".format(bucket)

# Collobert embedding
# embeddingfile = "s3://{}/embeddings/collobert/words_vocab_collabert.txt".format(bucket)
# embed_dim = 50

# --- Output locations ---------------------------------------------------------
s3_output_path = "s3://{}/results/".format(bucket)
s3_code_path = "s3://{}/ppi_code/".format(bucket)

### Start training

In [7]:
# Channel name -> S3 URI mapping handed to `fit()`.
inputs = dict(
    train=trainfile,
    val=valfile,
    test=testfile,
    embedding=embeddingfile,
)

In [8]:
# Earlier hyperparameter set, kept commented out for reference only:
# hyperparameters = {
#     "dataset": "PpiDatasetFactory",
#     "earlystoppingpatience": 20,
#     "trainfile": trainfile.split("/")[-1],
#     "valfile": valfile.split("/")[-1],
#     "embeddingfile": embeddingfile.split("/")[-1],
#     "embeddim": embed_dim,
#     "batchsize": "32",
#     "epochs": "1000",
#     "log-level": "INFO",
#     "lstmhiddensize": 8,
#     "fclayersize": 10,
#     "numlayers": 3,
#     "cnn_output": 100,
#     "poolingkernelsize": 3,
# }

# Hyperparameter set for the `PpiDatasetFactory` dataset with CNN/pooling settings.
# The file entries carry only the object name (the last path segment of each S3 URI).
# NOTE(review): a later cell rebinds `hyperparameters` to the BiLSTM set before
# the estimator is built, so this dict is not what the training job receives.
hyperparameters = {
    "dataset": "PpiDatasetFactory",
    "earlystoppingpatience": 20,
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "32",
    "epochs": "1000",
    "log-level": "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride": 2,
    "cnn_kernel_size": 3,
    "cnn_num_layers": 3,
    "cnn_output": 64,
    "learningrate": 0.001,
}

In [9]:

# Hyperparameters for the `RelationExtractorBiLstmNetworkFactoryNoPos` network
# on the `PpiMulticlassDatasetFactory` dataset. The file entries carry only the
# object name (the last path segment of each S3 URI).
BilstmNetworkFactoryhyperparametersNoPos = {
    "dataset": "PpiMulticlassDatasetFactory",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "valfile": valfile.rsplit("/", 1)[-1],
    "testfile": testfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "64",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 1,
}

In [10]:
# Metric name -> regex pairs passed to the estimator/tuner as `metric_definitions`.
# Each regex has one capture group holding the numeric value that follows a
# "###score: <metric>###" marker. Raw strings keep `\d` a regex token rather
# than an (invalid) Python string escape.
# NOTE(review): the pattern captures plain decimals only; a value printed in
# scientific notation (e.g. 1e-05) would be captured as "1" - confirm the
# training script never logs scores that way.
metric_definitions = [
    {"Name": "TrainLoss", "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss", "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainFScore", "Regex": r"###score: train_fscore### (\d*[.]?\d*)"},
    {"Name": "ValidationFScore", "Regex": r"###score: val_fscore### (\d*[.]?\d*)"},
]

In [11]:
# Record the commit of the local checkout: line 1 of `git log -1` is the commit
# id, line 5 is the commit subject.
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit 08cc232d34fd2b7a68bae0387f4fc1f61c87cd15
    Add sample dataset


In [12]:
# Git repository and branch passed to the estimator as `git_config`.
git_config = {
    "repo": "https://github.com/elangovana/PPI-typed-relation-extractor.git",
    "branch": "master",
}

In [13]:
# Select the configuration for this training run.
base_job_name = "ppimulticlass-bilstm"

# Rebind `hyperparameters` to the set that should be trained; swap in another
# hyperparameter dict here to train a different network.
hyperparameters = BilstmNetworkFactoryhyperparametersNoPos

# `inputs` (the channel -> S3 URI mapping defined earlier) is used unchanged.



In [14]:
# Display the hyperparameters selected for this run.
hyperparameters

{'batchsize': '64',
 'dataset': 'PpiMulticlassDatasetFactory',
 'earlystoppingpatience': 20,
 'embeddim': 200,
 'embeddingfile': 'PubMed-shuffle-win-2.bin.txt',
 'epochs': '1000',
 'fc_drop_out_rate': 0.5,
 'learningrate': 0.001,
 'log-level': 'INFO',
 'lstm_dropout': 0.5,
 'lstm_hidden_size': 400,
 'lstm_num_layers': 1,
 'network': 'RelationExtractorBiLstmNetworkFactoryNoPos',
 'testfile': 'test_multiclass.json',
 'train_val_vocab_merge': 1,
 'trainfile': 'train_multiclass.json',
 'valfile': 'val_multiclass.json'}

In [15]:
# Display the channel -> S3 URI mapping selected for this run.
inputs

{'embedding': 's3://aegovan-data/embeddings/bio_nlp_vec/PubMed-shuffle-win-2.bin.txt',
 'test': 's3://aegovan-data/processed_dataset/test_multiclass.json',
 'train': 's3://aegovan-data/processed_dataset/train_multiclass.json',
 'val': 's3://aegovan-data/processed_dataset/val_multiclass.json'}

In [16]:
from sagemaker.pytorch import PyTorch

# Estimator for the training job: runs `main_train.py` from the git checkout
# inside the custom image, on a single instance.
estimator = PyTorch(
    entry_point="main_train.py",
    source_dir="source/algorithms",
    dependencies=[
        "source/algorithms",
        "source/datasets",
        "source/preprocessor",
        "source/modelnetworks",
    ],
    role=role,
    framework_version="1.0.0",
    py_version="py3",
    git_config=git_config,
    image_name=docker_repo,
    train_instance_count=1,
    train_instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    # train_use_spot_instances=True,
    code_location=s3_code_path,
    train_volume_size=30,
    base_job_name=base_job_name,
)

In [None]:
# Launch the training job with the selected input channels; the job log is
# streamed into the cell output below.
estimator.fit(inputs)

2019-11-02 06:58:34 Starting - Starting the training job...
2019-11-02 06:58:37 Starting - Launching requested ML instances...
2019-11-02 06:59:30 Starting - Preparing the instances for training......
2019-11-02 07:00:26 Downloading - Downloading input data......
2019-11-02 07:02:07 Training - Downloading the training image..........[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-11-02 07:04:06,149 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-11-02 07:04:06,193 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-11-02 07:04:09,208 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-11-02 07:04:09,565 sagemaker-containers INFO     Module main_train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-11-02 07:04:09,565 sagemaker-contai

[31m['--batchsize', '64', '--fc_drop_out_rate', '0.5', '--learningrate', '0.001', '--lstm_dropout', '0.5', '--lstm_hidden_size', '400', '--lstm_num_layers', '1', '--train_val_vocab_merge', '1'][0m
[31m{'dataset': 'PpiMulticlassDatasetFactory', 'network': 'RelationExtractorBiLstmNetworkFactoryNoPos', 'trainfile': 'train_multiclass.json', 'traindir': '/opt/ml/input/data/train', 'valfile': 'val_multiclass.json', 'valdir': '/opt/ml/input/data/val', 'testfile': 'test_multiclass.json', 'testdir': '/opt/ml/input/data/test', 'embeddingfile': 'PubMed-shuffle-win-2.bin.txt', 'embeddingdir': '/opt/ml/input/data/embedding', 'outdir': '/opt/ml/output/data', 'modeldir': '/opt/ml/model', 'embeddim': 200, 'epochs': 1000, 'earlystoppingpatience': 20, 'interaction_type': None, 'log_level': 'INFO'}[0m
[31m{'batchsize': '64', 'fc_drop_out_rate': '0.5', 'learningrate': '0.001', 'lstm_dropout': '0.5', 'lstm_hidden_size': '400', 'lstm_num_layers': '1', 'train_val_vocab_merge': '1'}[0m
[31m2019-11-02 0

[31m2019-11-02 07:09:00,657 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-02 07:09:00,662 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_71fc35fc-ac2d-480b-9748-b814a38b708b_20191102_070900.csv: [0m
[31m[[   6    0    0    0    0   23    0    0]
 [   0    4    0    0    0    0    0    0]
 [   0    0   54    0    0   51    3    0]
 [   0    0    0    5    0    7    0    0]
 [   0    0    0    0   18   29    1    0]
 [   0    0    3    0    0 2336   44    1]
 [   0    0    1    0    0  282  346    0]
 [   0    0    0    0    0    6    0    3]][0m
[31m2019-11-02 07:09:00,666 - algorithms.Train - INFO - Train set result details: 0.6472482844927341[0m
[31m2019-11-02 07:09:00,667 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-02 07:09:00,878 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_84f19547-a835-4e49-9db8-88

[31m2019-11-02 07:09:24,753 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-02 07:09:24,758 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_2c15489b-54bd-466e-803f-b35797d0c015_20191102_070924.csv: [0m
[31m[[  27    0    0    0    0    2    0    0]
 [   0    4    0    0    0    0    0    0]
 [   0    0   95    0    0   12    1    0]
 [   0    0    0   12    0    0    0    0]
 [   0    0    0    0   44    4    0    0]
 [   6    0    6    1    3 2281   87    0]
 [   0    0    3    1    1  121  503    0]
 [   0    0    0    0    0    0    0    9]][0m
[31m2019-11-02 07:09:24,762 - algorithms.Train - INFO - Train set result details: 0.922644152868455[0m
[31m2019-11-02 07:09:24,762 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-02 07:09:24,975 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_1519fe63-1c0e-48b2-8675-5b1

[31m2019-11-02 07:09:56,748 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-02 07:09:56,753 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_485805b4-c491-4bb5-8511-f2dd0e51c39d_20191102_070956.csv: [0m
[31m[[  28    0    0    0    0    1    0    0]
 [   0    4    0    0    0    0    0    0]
 [   0    0   72    0    0   32    4    0]
 [   0    0    0   10    0    1    1    0]
 [   0    0    0    0   45    3    0    0]
 [   2    0    0    0    0 2347   35    0]
 [   0    0    0    0    1  128  500    0]
 [   0    0    0    0    0    0    0    9]][0m
[31m2019-11-02 07:09:56,757 - algorithms.Train - INFO - Train set result details: 0.9287330103995415[0m
[31m2019-11-02 07:09:56,758 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-02 07:09:56,970 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_147810c8-b7a9-46dc-b095-9d

[31m2019-11-02 07:10:20,816 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-02 07:10:20,821 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_5f1989ee-0a2c-43c1-8a1b-f6e7a4499ade_20191102_071020.csv: [0m
[31m[[  28    0    0    0    0    1    0    0]
 [   0    4    0    0    0    0    0    0]
 [   0    0   97    0    0    9    2    0]
 [   0    0    0   11    0    0    1    0]
 [   0    0    0    0   46    2    0    0]
 [   1    0    1    0    1 2341   40    0]
 [   0    0    2    0    1   97  529    0]
 [   0    0    0    0    0    0    0    9]][0m
[31m2019-11-02 07:10:20,825 - algorithms.Train - INFO - Train set result details: 0.9578191548714354[0m
[31m2019-11-02 07:10:20,825 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-02 07:10:21,040 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_6e88c042-a91b-4d39-9785-9a

[31m2019-11-02 07:10:52,989 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-02 07:10:52,994 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_8d94c2d2-4933-47c9-8cd1-f8dfeeaad6d0_20191102_071052.csv: [0m
[31m[[  29    0    0    0    0    0    0    0]
 [   0    4    0    0    0    0    0    0]
 [   0    0  105    0    0    3    0    0]
 [   0    0    0   11    0    0    1    0]
 [   0    0    0    0   48    0    0    0]
 [   0    0    0    0    1 2336   47    0]
 [   0    0    5    0    1   72  551    0]
 [   0    0    0    0    0    0    0    9]][0m
[31m2019-11-02 07:10:52,998 - algorithms.Train - INFO - Train set result details: 0.9713948430530366[0m
[31m2019-11-02 07:10:52,998 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-02 07:10:53,216 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_3d825ebd-cef5-47dd-8940-f1

[31m2019-11-02 07:11:17,133 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-02 07:11:17,138 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_c31edf47-5599-434a-bb5a-a26f5c908889_20191102_071117.csv: [0m
[31m[[  28    0    0    0    0    1    0    0]
 [   0    4    0    0    0    0    0    0]
 [   0    0  107    0    0    0    1    0]
 [   0    0    0   11    0    0    1    0]
 [   0    0    0    0   47    1    0    0]
 [   1    0    2    1    0 2335   45    0]
 [   0    0    3    0    1   66  559    0]
 [   0    0    0    0    0    0    0    9]][0m
[31m2019-11-02 07:11:17,142 - algorithms.Train - INFO - Train set result details: 0.964362476303974[0m
[31m2019-11-02 07:11:17,142 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-02 07:11:17,357 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_0fd1cc6b-2e63-442b-8342-724

[31m2019-11-02 07:14:02,458 - algorithms.PretrainedEmbedderLoaderMinimum - INFO - Total words in original embedding handle is 2231686[0m
[31m2019-11-02 07:14:02,458 - algorithms.PretrainedEmbedderLoaderMinimum - INFO - Total words in final embedding is 5306[0m
[31m2019-11-02 07:14:02,458 - algorithms.PretrainedEmbedderLoaderMinimum - INFO - Total words randomly initialized is 990[0m
[31m2019-11-02 07:14:02,462 - algorithms.transform_sentence_tokeniser - INFO - Running sentence tokenisor [0m
[31m2019-11-02 07:15:45,526 - algorithms.transform_sentence_tokeniser - INFO - Completed  sentence tokenisor [0m
[31m2019-11-02 07:15:45,533 - algorithms.transform_text_index - INFO - Transforming TransformTextToIndex[0m
[31m2019-11-02 07:15:46,125 - algorithms.transform_text_index - INFO - Total number of unknown occurances 34574[0m
[31m2019-11-02 07:15:46,125 - algorithms.transform_text_index - INFO - Completed TransformTextToIndex[0m
[31m2019-11-02 07:15:46,126 - algorithms.trans

[31m2019-11-02 07:16:23,874 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-02 07:16:23,879 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_80ba7b11-0cd4-4a13-85cb-49c06742f470_20191102_071623.csv: [0m
[31m[[  29    0    0    0    0    0    0    0]
 [   0    4    0    0    0    0    0    0]
 [   0    0  106    0    0    0    2    0]
 [   0    0    0   11    0    0    1    0]
 [   0    0    0    0   48    0    0    0]
 [   2    0    2    0    3 2336   41    0]
 [   0    0    1    0    1   73  554    0]
 [   0    0    0    0    0    0    0    9]][0m
[31m2019-11-02 07:16:23,884 - algorithms.Train - INFO - Train set result details: 0.9672396583611771[0m
[31m2019-11-02 07:16:23,884 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-02 07:16:24,104 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_579fc935-c264-4c6f-b15e-7d

### Hyperparameter optimization (HPO)

In [None]:
# Metric the tuner optimizes; it matches the "ValidationFScore" entry in
# `metric_definitions`.
objective_metric_name = "ValidationFScore"

In [None]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter
# Integer search ranges explored by the tuner, keyed by hyperparameter name.
# NOTE(review): these keys (no underscores) differ from the names used by the
# BiLSTM hyperparameter set trained above (e.g. `lstm_hidden_size`,
# `lstm_num_layers`) - confirm they match what the training script expects.
hyperparameter_ranges = {
    "lstmhiddensize": IntegerParameter(2, 200),
    "fclayersize": IntegerParameter(2, 50),
    "numlayers": IntegerParameter(1, 10),
    "poolingkernelsize": IntegerParameter(2, 10),
}

In [None]:
# Static hyperparameters shared by every tuning job; the tuner adds the values
# it samples from `hyperparameter_ranges`.
# `embeddim` uses `embed_dim` so it follows the embedding configured above
# rather than a hard-coded literal.
# NOTE(review): this rebinds `hyperparameters` and pairs `PpiDatasetFactory`
# with whichever train/val files are currently configured - confirm that
# combination is intended.
hyperparameters = {
    "trainfile": trainfile.split("/")[-1],
    "valfile": valfile.split("/")[-1],
    "embeddingfile": embeddingfile.split("/")[-1],
    "embeddim": embed_dim,
    "dataset": "PpiDatasetFactory",
    "earlystoppingpatience": 20,
    "epochs": 100,
}

In [None]:

# Estimator used as the template for the tuning jobs.
# NOTE(review): unlike the training estimator, `git_config` is disabled here
# and "source/modelnetworks" is not listed in `dependencies` - confirm both
# differences are intended.
estimator = PyTorch(
    entry_point="main_train.py",
    source_dir="source/algorithms",
    dependencies=[
        "source/algorithms",
        "source/datasets",
        "source/preprocessor",
    ],
    role=role,
    framework_version="1.0.0",
    py_version="py3",
    # git_config=git_config,
    image_name=docker_repo,
    train_instance_count=1,
    train_instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    code_location=s3_code_path,
    # train_use_spot_instances=True,
    train_volume_size=30,
    base_job_name="hpo-ppi-extractor",
)

In [None]:
# Random-search tuning: up to 50 jobs in total, at most 4 running in parallel.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    strategy="Random",
    max_jobs=50,
    max_parallel_jobs=4,
    base_tuning_job_name="hpo-ppi-extractor",
)

# Start the tuning job on the same input channels as the training run.
tuner.fit(inputs)