### Set up

#### 1. Set up accounts and role

In [1]:
#!pip install sagemaker==1.39.0

In [2]:
import sagemaker
import boto3

# SageMaker session created with default settings.
sagemaker_session = sagemaker.Session()

# The account id is read from the STS caller identity; the region from the default boto3 session.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')
region = boto3.session.Session().region_name

# The execution role is a fixed role name resolved inside the current account
# instead of being looked up at runtime.
#role = sagemaker.get_execution_role()
execution_role_name = "AmazonSageMaker-ExecutionRole-20190118T115449"
role = "arn:aws:iam::{}:role/service-role/{}".format(account_id, execution_role_name)


#### 2. Set up image and instance type

In [3]:
# Custom image reference in "<repository>:<tag>" form, plus the instance type
# that the estimators in this notebook use for training.
image_repository = "ppi-extractor"
image_tag = "gpu-1.0.0-201910130520"
pytorch_custom_image_name = "{}:{}".format(image_repository, image_tag)
instance_type = "ml.p3.8xlarge"

In [4]:
# Full image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
ecr_registry = "{}.dkr.ecr.{}.amazonaws.com".format(account_id, region)
docker_repo = "{}/{}".format(ecr_registry, pytorch_custom_image_name)

#### 3. Configure train, test and validation datasets

In [5]:
# S3 bucket from which the dataset, embedding, result and code locations are built.
bucket = "aegovan-data"

In [6]:
# Every location lives under the same bucket, so build the common prefix once.
s3_root = "s3://{}".format(bucket)

# AIMed training data: the plain file and the preprocessed file.
plain_trainfile = s3_root + "/aimed/AIMedFull.json"
trainfile = s3_root + "/aimed/AIMedFull_preprocessed.json"

#trainfile="s3://{}/aimed/AIMedFull_Ylhsieh.json".format(bucket)

valfile = s3_root + "/aimed/AIMedval_preprocessed.json"
#trainfile = "s3://{}/aimed/AIMedtrain_pubmedoverlap.json".format(bucket)
#valfile="s3://{}/aimed/AIMedval_pubmedoverlap.json".format(bucket)

# PubMed word embeddings (other embedding files are kept commented out for reference).
#embeddingfile="s3://{}/embeddings/PubMed-and-PMC-w2v.bin.txt".format(bucket)
#embeddingfile="s3://{}/embeddings/bio_nlp_vec/PubMed-shuffle-win-30.bin.txt".format(bucket)
embeddingfile = s3_root + "/embeddings/bio_nlp_vec/PubMed-shuffle-win-2.bin.txt"
embed_dim = 200

# Collobert embedding
coll_embeddingfile = s3_root + "/embeddings/collobert/words_vocab_collabert.txt"
coll_embed_dim = 50

# Where training output and uploaded source code are stored.
s3_output_path = s3_root + "/results/"
s3_code_path = s3_root + "/aimed_code"

### Start training

In [7]:
# Commit of the training code. It is stored in hyperparameter sets under "git_commit_id",
# and that entry is what git_config later uses as the commit to check out.
commit_id = "8f5d23c1e352fc9caea3a14a4293bcca3d315078"

In [8]:
# Input channels: preprocessed training file + PubMed embedding file.
pub_inputs = dict(
    train=trainfile,
    # val=valfile,
    embedding=embeddingfile,
)

In [9]:
# Input channels: plain training file + PubMed embedding file.
plain_inputs = dict(
    train=plain_trainfile,
    # val=valfile,
    embedding=embeddingfile,
)

In [10]:
# Input channels: preprocessed training file + Collobert embedding file.
coll_inputs = dict(
    train=trainfile,
    # val=valfile,
    embedding=coll_embeddingfile,
)

In [11]:
# Disabled hyperparameter set: every line of this cell is commented out, so running it
# defines nothing. Kept only as a reference.
# hyperparameters = {
#     "dataset":"PpiAimedDatasetFactory",
#     "trainfile":trainfile.split("/")[-1],
#     "valfile":valfile.split("/")[-1],
#     "embeddingfile":embeddingfile.split("/")[-1],
#     "embeddim":embed_dim,
#     "batchsize": "32",
#     "epochs" : "1000",   
#     "log-level" : "INFO",
#     "lstmhiddensize": 100,
#     "fclayersize": 15,
#     "numlayers":7,
#     "poolingkernelsize":10,
#     "learningrate":.001,
#     "cnn_output":100,
#     "earlystoppingpatience":20
# }

In [12]:
# Hyperparameters for the RelationExtractorCnnNetworkNoPosFactory network with the
# Collobert embedding file. Only file names (last path component) are passed.
choi_CnnNetworkNoPosFactoryhyperparameters = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorCnnNetworkNoPosFactory",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": coll_embeddingfile.rsplit("/", 1)[-1],
    "embeddim": coll_embed_dim,
    "batchsize": "32",
    "epochs": "1000",
    "log-level": "INFO",
    "learningrate": 0.001,
    "cnn_output": 100,
    "earlystoppingpatience": 20,
    "dropout_rate_cnn": 0.2,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 1,
}

In [13]:
# Hyperparameters for the RelationExtractorCnnPosNetworkFactory network with the
# Collobert embedding file. Only file names (last path component) are passed.
choi_CnnPosNetworkFactory = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorCnnPosNetworkFactory",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": coll_embeddingfile.rsplit("/", 1)[-1],
    "embeddim": coll_embed_dim,
    "batchsize": "32",
    "epochs": "1000",
    "log-level": "INFO",
    "learningrate": 0.001,
    "cnn_output": 100,
    "earlystoppingpatience": 20,
    "dropout_rate_cnn": 0.2,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 1,
}

In [14]:
# Hyperparameters for the RelationExtractorBiLstmNetworkFactory network on the
# preprocessed train file, with a separate validation file and the PubMed embeddings.
BilstmNetworkFactoryhyperparameters = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorBiLstmNetworkFactory",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "50",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "pooling_kernel_size": 3,
    "lstm_num_layers": 3,
    "lstm_hidden_size": 64,
    "fc_layer_size": 64,
    "fc_drop_out_rate": 0.5,
}

In [15]:
# Hyperparameters for the RelationExtractorDynamicEntityBiLstmNetworkFactory network
# on the plain AIMed file with the PubMed embeddings.
base_experiment_bilstm_pos = {
    "dataset": "PpiAimedDatasetFactory",
    "network": "RelationExtractorDynamicEntityBiLstmNetworkFactory",
    "trainfile": plain_trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "50",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "pooling_kernel_size": 3,
    "lstm_num_layers": 3,
    "lstm_hidden_size": 64,
    "fc_layer_size": 64,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 0,
    "use_min_dict": 0,
    "fine_tune_embeddings": 0,
}

## For BiLSTM AIMed preprocessed

#### Network F-score using the following hyperparameters

```bash
/usr/bin/python -m main_train_k_fold --batchsize 64 --dataset PpiAimedDatasetPreprocessedFactory --earlystoppingpatience 20 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --learningrate 0.001 --log-level INFO --lstm_dropout 0.5 --lstm_hidden_size 400 --lstm_num_layers 1 --network RelationExtractorBiLstmNetworkFactoryNoPos --train_val_vocab_merge 1 --trainfile AIMedFull_preprocessed.json
```

**precision, recall, fscore, support** for 10 folds

```python
[(0.7528089887640449, 0.7528089887640449, 0.752808988764045, None),
 (0.6915887850467289, 0.7628865979381443, 0.7254901960784315, None),
 (0.7586206896551724, 0.6226415094339622, 0.6839378238341969, None),
 (0.7727272727272727, 0.6296296296296297, 0.6938775510204083, None),
 (0.75, 0.6990291262135923, 0.7236180904522612, None),
 (0.7525773195876289, 0.7525773195876289, 0.752577319587629, None),
 (0.6851851851851852, 0.74, 0.7115384615384615, None),
 (0.7142857142857143, 0.7476635514018691, 0.730593607305936, None),
 (0.7560975609756098, 0.6326530612244898, 0.6888888888888889, None),
 (0.6788990825688074, 0.7789473684210526, 0.7254901960784315, None)]
```

**tn, fp, fn, tp** for 10 folds


```python
[(473, 22, 22, 67), 
 (454, 33, 23, 74), 
 (457, 21, 40, 66), 
 (456, 20, 40, 68), 
 (456, 24, 31, 72), 
 (462, 24, 24, 73),
 (449, 34, 26, 74), 
 (444, 32, 27, 80), 
 (465, 20, 36, 62),
 (453, 35, 21, 74)]
```

See aimed-ppi-extractor-2019-10-20-04-33-12-355-copy-10-20


## BiLstm plain

```python
BilstmNetworkFactoryhyperparametersNoPos_plain = {
    "dataset":"PpiAimedDatasetFactory",
    "network" :"RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile":plain_trainfile.split("/")[-1],
   # "valfile":valfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
    "embeddim":embed_dim,
    "batchsize": "64",
    "epochs" : "1000",  
    "earlystoppingpatience":20,
    "log-level" : "INFO",
    "learningrate":.001,
    "lstm_dropout":0.5,
    "lstm_num_layers" :1,
    "lstm_hidden_size":400,
    "fc_drop_out_rate":0.5,
    "use_min_dict":0,
    "train_val_vocab_merge":1,
    "git_commit_id":commit_id
}
```

```bash


/usr/bin/python -m main_train_k_fold --batchsize 64 --dataset PpiAimedDatasetFactory --earlystoppingpatience 20 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --git_commit_id 8f5d23c1e352fc9caea3a14a4293bcca3d315078 --learningrate 0.001 --log-level INFO --lstm_dropout 0.5 --lstm_hidden_size 400 --lstm_num_layers 1 --network RelationExtractorBiLstmNetworkFactoryNoPos --train_val_vocab_merge 1 --trainfile AIMedFull.json --use_min_dict 0
```

```text

0.659574468	0.613861386	0.635897436
0.598290598	0.714285714	0.651162791
0.583333333	0.7875	0.670212766
0.773333333	0.604166667	0.678362573
0.646017699	0.675925926	0.660633484
0.80952381	0.525773196	0.6375
0.584158416	0.621052632	0.602040816
0.683168317	0.6	0.638888889
0.69	0.69	0.69
0.650943396	0.627272727	0.638888889
		
66.78	64.6	65.04 (mean)
8	7.6	2.6 (std)
```

![image.png](attachment:image.png)


## Resnet plain

```python
SimpleResnetCnnPosNetworkFactoryhyperparameters_plain = {
    "dataset":"PpiAimedDatasetFactory",
    "network" :"RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience" : 50,
    "trainfile":plain_trainfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
    "embeddim":embed_dim,
    "batchsize": "32",
    "epochs" : "1000",   
    "log-level" : "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride":2,
    "cnn_kernel_size":3,
    "cnn_num_layers":2,
    "cnn_output":64,
    "learningrate":.001,
    "weight_decay":.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate" : 0.2,
    "use_min_dict":1,
    "train_val_vocab_merge":1,
    "git_commit_id":commit_id
}
```

```bash
/usr/bin/python -m main_train_k_fold --batchsize 32 --cnn_kernel_size 3 --cnn_num_layers 2 --cnn_output 64 --dataset PpiAimedDatasetFactory --dropout_rate_cnn 0.5 --earlystoppingpatience 50 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --fc_layer_size 256 --git_commit_id 8f5d23c1e352fc9caea3a14a4293bcca3d315078 --input_drop_out_rate 0.2 --learningrate 0.001 --log-level INFO --network RelationExtractorSimpleResnetCnnPosNetworkFactory --pool_stride 2 --pooling_kernel_size 3 --train_val_vocab_merge 1 --trainfile AIMedFull.json --use_min_dict 1 --weight_decay 1e-05
```

```text
(P, R, F1)
0.722891566	0.594059406	0.652173913
0.726190476	0.62244898	0.67032967
0.623529412	0.6625	0.642424242
0.680851064	0.666666667	0.673684211
0.705882353	0.666666667	0.685714286
0.639175258	0.639175258	0.639175258
0.666666667	0.547368421	0.601156069
0.647619048	0.591304348	0.618181818
0.579831933	0.69	0.630136986
0.717391304	0.6	0.653465347
		
67.1	62.8	64.66 (mean)
4.8	4.6	2.8 (std)
```

## Resnet preprocessed

```python
SimpleResnetCnnPosNetworkFactoryhyperparameters = {
    "dataset":"PpiAimedDatasetPreprocessedFactory",
    "network" :"RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience" : 50,
    "trainfile":trainfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
    "embeddim":embed_dim,
    "batchsize": "32",
    "epochs" : "1000",   
    "log-level" : "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride":2,
    "cnn_kernel_size":3,
    "cnn_num_layers":2,
    "cnn_output":64,
    "learningrate":.001,
    "weight_decay":.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate" : 0.2,
    "use_min_dict":1,
    "train_val_vocab_merge":1,
    "git_commit_id":commit_id
}
```

```bash
/usr/bin/python -m main_train_k_fold --batchsize 32 --cnn_kernel_size 3 --cnn_num_layers 2 --cnn_output 64 --dataset PpiAimedDatasetPreprocessedFactory --dropout_rate_cnn 0.5 --earlystoppingpatience 50 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --fc_layer_size 256 --git_commit_id 8f5d23c1e352fc9caea3a14a4293bcca3d315078 --input_drop_out_rate 0.2 --learningrate 0.001 --log-level INFO --network RelationExtractorSimpleResnetCnnPosNetworkFactory --pool_stride 2 --pooling_kernel_size 3 --train_val_vocab_merge 1 --trainfile AIMedFull_preprocessed.json --use_min_dict 1 --weight_decay 1e-05

```

```text
0.7625	0.685393258	0.721893491
0.752941176	0.659793814	0.703296703
0.75	0.70754717	0.72815534
0.770114943	0.62037037	0.687179487
0.747572816	0.747572816	0.747572816
0.711340206	0.711340206	0.711340206
0.724137931	0.63	0.673796791
0.745098039	0.710280374	0.727272727
0.660377358	0.714285714	0.68627451
0.75	0.726315789	0.737967914
		
73.74	69.13	71.25 (mean)
3.3	4.4	2.6 (std)

```

main_train_k_fold --batchsize 64 --dataset PpiAimedDatasetPreprocessedFactory --earlystoppingpatience 20 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --learningrate 0.001 --log-level INFO --lstm_dropout 0.5 --lstm_hidden_size 400 --lstm_num_layers 1 --network RelationExtractorBiLstmNetworkFactoryNoPos --train_val_vocab_merge 1 --trainfile AIMedFull_preprocessed.json --use_min_dict 1


In [16]:
# Hyperparameters for the RelationExtractorBiLstmNetworkFactoryNoPos network on the
# preprocessed AIMed file, pinned to commit_id.
BilstmNetworkFactoryhyperparametersNoPos = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    # "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "64",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "use_min_dict": 0,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
}

In [17]:
# Hyperparameters for the RelationExtractorBiLstmNetworkFactoryNoPos network on the
# plain AIMed file, pinned to commit_id.
BilstmNetworkFactoryhyperparametersNoPos_plain = {
    "dataset": "PpiAimedDatasetFactory",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": plain_trainfile.rsplit("/", 1)[-1],
    # "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "64",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "use_min_dict": 0,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
}

In [18]:


# Hyperparameters for the RelationExtractorBiLstmNetworkFactoryNoPos network with the
# PpiAimedDatasetFactoryYlhsieh dataset factory.
# NOTE(review): this set passes the preprocessed trainfile and has no "git_commit_id"
# entry - confirm both are intended before selecting it.
PpiAimedDatasetFactoryYlhsiehBiLstmNetwork = {
    "dataset": "PpiAimedDatasetFactoryYlhsieh",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    # "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "16",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 1,
}

In [19]:
# Hyperparameters for the RelationExtractorResnetCnnPosNetworkFactory network.
# NOTE(review): pairs the preprocessed dataset factory with the plain training file -
# confirm this combination is intended.
ResnetCnnPosNetworkFactoryhyperparameters = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorResnetCnnPosNetworkFactory",
    "earlystoppingpatience": 20,
    "trainfile": plain_trainfile.rsplit("/", 1)[-1],
    # "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "8",
    "epochs": "1000",
    "log-level": "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride": 2,
    "cnn_kernel_size": 3,
    "cnn_num_layers": 3,
    "cnn_output": 64,
    "learningrate": 0.00001,
    "weight_decay": 0.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate": 0.2,
}

In [20]:
# Hyperparameters for the RelationExtractorSimpleResnetCnnPosNetworkFactory network on
# the preprocessed AIMed file with the PubMed embeddings, pinned to commit_id.
SimpleResnetCnnPosNetworkFactoryhyperparameters = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience": 50,
    "trainfile": trainfile.rsplit("/", 1)[-1],
    # "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "32",
    "epochs": "1000",
    "log-level": "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride": 2,
    "cnn_kernel_size": 3,
    "cnn_num_layers": 2,
    "cnn_output": 64,
    "learningrate": 0.001,
    "weight_decay": 0.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate": 0.2,
    "use_min_dict": 1,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
}

In [21]:
# Hyperparameters for the RelationExtractorSimpleResnetCnnPosNetworkFactory network
# with the Collobert embedding file. Only file names (last path component) are passed.
SimpleResnetCnnPosNetworkFactoryhyperparameters_coll = {
    "dataset":"PpiAimedDatasetFactory",
    "network" :"RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience" : 50,
    "trainfile":trainfile.split("/")[-1],
  #  "valfile":valfile.split("/")[-1],
    "embeddingfile":coll_embeddingfile.split("/")[-1],
    # Take the dimension from the Collobert config constant (currently 50), as the other
    # Collobert hyperparameter sets do, so the embedding file and its dimension stay in sync.
    "embeddim":coll_embed_dim,
    "batchsize": "32",
    "epochs" : "1000",   
    "log-level" : "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride":2,
    "cnn_kernel_size":3,
    "cnn_num_layers":2,
    "cnn_output":64,
    "learningrate":.001,
    "weight_decay":.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate" : 0.2,
   
    "train_val_vocab_merge":1
}

In [22]:
# Second hyperparameter set for the RelationExtractorSimpleResnetCnnPosNetworkFactory
# network; unlike the other Resnet sets it also passes a validation file.
SimpleResnetCnnPosNetworkFactoryhyperparametersv2 = {
    "dataset": "PpiAimedDatasetFactory",
    "network": "RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience": 50,
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "8",
    "epochs": "1000",
    "log-level": "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride": 2,
    "cnn_kernel_size": 3,
    "cnn_num_layers": 3,
    "cnn_output": 32,
    "learningrate": 0.001,
    "weight_decay": 0.00001,
    "fc_layer_size": 128,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate": 0.2,
    "train_val_vocab_merge": 1,
}

In [23]:
# Metrics extracted from the training log. Each regex captures the number printed after
# a "###score: <name>###" marker.
# The patterns are raw strings so that "\d" reaches the regex engine unchanged and
# Python does not warn about an invalid escape sequence.
metric_definitions = [{"Name": "TrainLoss",
                     "Regex": r"###score: train_loss### (\d*[.]?\d*)"}
                    ,{"Name": "ValidationLoss",
                     "Regex": r"###score: val_loss### (\d*[.]?\d*)"}
                    ,{"Name": "TrainFScore",
                     "Regex": r"###score: train_fscore### (\d*[.]?\d*)"}
                   ,{"Name": "ValidationFScore",
                     "Regex": r"###score: val_fscore### (\d*[.]?\d*)"}
                    ]

In [24]:
# Show the latest commit of the git repository the notebook runs in:
# line 1 of `git log -1` (the "commit <hash>" line) and line 5 (the commit message line).
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit e4a1f23bd88db8ff57a9310abd93aff378dbcc5e
    add untyped ppi extraction


In [25]:
# Select the experiment to run: job name prefix, hyperparameter set and input channels.
# The trailing comments list other sets/channels that can be swapped in.
base_job_name="aimed-bilstm-plain"
hyperparameters =BilstmNetworkFactoryhyperparametersNoPos_plain #BilstmNetworkFactoryhyperparametersNoPos_plain # BilstmNetworkFactoryhyperparametersNoPos  #PpiAimedDatasetFactoryYlhsiehBiLstmNetwork #SimpleResnetCnnPosNetworkFactoryhyperparameters
inputs = plain_inputs #pub_inputs #plain_inputs #


In [26]:
# Git source for the training code: repository, branch and the exact commit to check out.
# Several hyperparameter sets in this notebook have no "git_commit_id" entry, so fall back
# to the notebook-level commit_id instead of failing with a KeyError when one is selected.
git_config = {'repo': 'https://github.com/elangovana/PPI-typed-relation-extractor.git',
              'branch': 'master',
              'commit': hyperparameters.get("git_commit_id", commit_id)
             }

In [27]:
# Echo the selected hyperparameter set for inspection.
hyperparameters

{'batchsize': '64',
 'dataset': 'PpiAimedDatasetFactory',
 'earlystoppingpatience': 20,
 'embeddim': 200,
 'embeddingfile': 'PubMed-shuffle-win-2.bin.txt',
 'epochs': '1000',
 'fc_drop_out_rate': 0.5,
 'git_commit_id': '8f5d23c1e352fc9caea3a14a4293bcca3d315078',
 'learningrate': 0.001,
 'log-level': 'INFO',
 'lstm_dropout': 0.5,
 'lstm_hidden_size': 400,
 'lstm_num_layers': 1,
 'network': 'RelationExtractorBiLstmNetworkFactoryNoPos',
 'train_val_vocab_merge': 1,
 'trainfile': 'AIMedFull.json',
 'use_min_dict': 0}

In [28]:
# Echo the selected input channels for inspection.
inputs

{'embedding': 's3://aegovan-data/embeddings/bio_nlp_vec/PubMed-shuffle-win-2.bin.txt',
 'train': 's3://aegovan-data/aimed/AIMedFull.json'}

In [29]:
from sagemaker.pytorch import PyTorch

# Upper bound on the job's run time: four days, expressed in seconds.
max_run_seconds = 60 * 60 * 24 * 4

# All estimator arguments are collected in one place, then passed as keywords.
estimator_settings = dict(
    entry_point='main_train_k_fold.py',
    # entry_point='main_train.py',
    source_dir='source/algorithms',
    dependencies=['source/algorithms', 'source/datasets', 'source/preprocessor', 'source/modelnetworks'],
    role=role,
    framework_version="1.0.0",
    py_version='py3',
    git_config=git_config,
    image_name=docker_repo,
    train_instance_count=1,
    train_instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    # train_use_spot_instances=True,
    train_volume_size=30,
    code_location=s3_code_path,
    train_max_run=max_run_seconds,
    base_job_name=base_job_name,
)

estimator = PyTorch(**estimator_settings)

In [None]:
# Start the training job with the selected input channels.
estimator.fit(inputs)

2019-11-05 10:00:05 Starting - Starting the training job...
2019-11-05 10:00:06 Starting - Launching requested ML instances...
2019-11-05 10:01:03 Starting - Preparing the instances for training......
2019-11-05 10:02:02 Downloading - Downloading input data......
2019-11-05 10:03:25 Training - Downloading the training image.........
2019-11-05 10:05:23 Training - Training image download completed. Training in progress.[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-11-05 10:05:24,861 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-11-05 10:05:24,905 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-11-05 10:05:24,906 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-11-05 10:05:25,178 sagemaker-containers INFO     Module main_train_k_fold does not provi

[31m2019-11-05 10:08:29,787 - algorithms.PretrainedEmbedderLoader - INFO - Total words in embedding is 2231686[0m
[31m2019-11-05 10:08:29,791 - algorithms.PretrainedEmbedderLoader - INFO - The number of words intialised without embbeder is 211[0m
[31m2019-11-05 10:08:29,791 - algorithms.PretrainedEmbedderLoader - INFO - Total words 2231897[0m
[31m2019-11-05 10:08:57,828 - algorithms.transform_sentence_tokeniser - INFO - Running sentence tokenisor [0m
[31mIn order to characterize the signaling pathway through which FGFR3 regulates cell growth, L6 cells lacking any endogenous FGFR were stably transfected with the two different human isoforms, PROTEIN1 IIIb and FGFR3 IIIc, that result from alternative splicing of exon III of the FGFR3 gene encoding the ligand binding domain.[0m
[31mPROTEIN1-related protein: a ligand and specific activator of the tyrosine kinase receptor Flt4.[0m
[31mThe PROTEIN1 receptor (EPOR) is a member of a family of cytokine and growth factor receptors t

[31mThe extracellular domain of the human PROTEIN1 expressed in Chinese hamster ovary cells is a highly glycosylated protein, possessing binding ability for brain-derived neurotrophic factor (BDNF).[0m
[31mNerve growth factor binds independently to two transmembrane receptors, the PROTEIN1 and the p140trk (trkA) tyrosine kinase receptor, which are both co-expressed in the majority of neuronal cells that respond to NGF.[0m
[31mThe gamma chain of the PROTEIN1 is shared with the functional IL-4 receptor and is causatively related to X-linked severe combined immunodeficiency (XSCID), which is ascribed to a profound T cell defect.[0m
[31mCoexpression of the alpha and betaL subunits of the human PROTEIN1 (IFNalpha) receptor is required for the induction of an antiviral state by human IFNalpha.[0m
[31mInterleukin-2 (IL-2) signaling requires the dimerization of the PROTEIN1 receptor beta.(IL-2R beta) and common gamma (gamma c) chains.[0m
[31mSpecifically, substitution of proline for

[31mFGF-7 recognizes one FGFR isoform known as the FGFR2 IIIb isoform or PROTEIN1 (KGFR), whereas FGF-2 binds well to FGFR1, FGFR2, and FGFR4 but interacts poorly with KGFR.[0m
[31mStat3 recruitment by two distinct ligand-induced, tyrosine-phosphorylated docking sites in the PROTEIN1 receptor intracellular domain.[0m
[31mIn this study, we used the two-hybrid assay of protein-protein interaction in the yeast Saccharomyces cerevisiae to study the interaction between human IRS-1 and the PROTEIN1 receptor.[0m
[31mThe death-inducing receptor Fas is activated when cross-linked by the type II membrane protein PROTEIN1 (FasL).[0m
[31mA mutant form of human interferon-gamma (PROTEIN1 SC1) that binds one IFN-gamma receptor alpha chain (IFN-gamma R alpha) has been designed and characterized.[0m
[31mActivation of PROTEIN1 inducing factor mediated by interleukin-1beta converting enzyme.[0m
[31mThe results of his3 activation indicated that p85, N + C-SH2 and C-SH2 interact with both PRO

[31mExpression of PROTEIN1 IIIc in stably transfected L6 cells conferred growth responses to several members of the FGF family including FGF-1, -2, -4, and -6, while FGFR3 IIIb-expressing cells responded only to FGF-1.[0m
[31mThe retinoblastoma-susceptibility gene product binds directly to the human PROTEIN1 TAFII250.[0m
[31mRemarkably, Phe93 forms extensive contacts with a peptide ligand in the crystal structure of the EBP bound to an PROTEIN1-mimetic peptide (EMP1), suggesting that Phe93 is also important for EMP1 binding.[0m
[31mTaken together, our results suggest that IRS-1 may serve as a converging target in the signaling pathways stimulated by receptors that belong to different families, such as the gastrin/CCKB G protein-coupled receptor and the PROTEIN1 receptor.[0m
[31mGastrin stimulates tyrosine phosphorylation of PROTEIN1 receptor substrate 1 and its association with Grb2 and the phosphatidylinositol 3-kinase.[0m
[31mThese results, obtained from a variety of exper

[31m2019-11-05 10:10:10,929 - algorithms.transform_sentence_tokeniser - INFO - Completed  sentence tokenisor [0m
[31m2019-11-05 10:10:10,933 - algorithms.transform_text_index - INFO - Transforming TransformTextToIndex[0m
[31m2019-11-05 10:10:10,960 - algorithms.transform_text_index - INFO - Total number of unknown occurances 52[0m
[31m2019-11-05 10:10:10,960 - algorithms.transform_text_index - INFO - Completed TransformTextToIndex[0m
[31m2019-11-05 10:10:10,960 - algorithms.transform_label_encoder - INFO - Running TransformLabelEncoder[0m
[31m2019-11-05 10:10:10,961 - algorithms.transform_label_encoder - INFO - Complete TransformLabelEncoder[0m
[31m2019-11-05 10:10:12,536 - algorithms.TrainInferencePipeline - INFO - Retrieving key learningrate with default .01, found 0.001[0m
[31m2019-11-05 10:10:12,536 - algorithms.TrainInferencePipeline - INFO - Retrieving key weight_decay with default .0001, found .0001[0m
[31m2019-11-05 10:10:12,536 - algorithms.TrainInferencePipel

[31m2019-11-05 10:11:09,898 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-05 10:11:09,906 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_e474adf3-3932-444a-8fb9-7e805ad46a9c_20191105_101109.csv: [0m
[31m[[4271   80]
 [ 504  395]][0m
[31m2019-11-05 10:11:09,912 - algorithms.Train - INFO - Train set result details: 0.7554853114140988[0m
[31m2019-11-05 10:11:09,912 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-05 10:11:10,034 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_6df65142-6a4e-4fd5-afb3-624ff0f43f10_20191105_101110.csv: [0m
[31m[[458  25]
 [ 81  20]][0m
[31m2019-11-05 10:11:10,035 - algorithms.Train - INFO - Validation set result details: 0.5851272015655578 [0m
[31m2019-11-05 10:11:10,035 - algorithms.Train - INFO - Snapshotting because the current score 0.5851272015655578 is greater than 0.5612

[31m2019-11-05 10:12:33,933 - algorithms.Train - INFO - Train set result details:[0m
[31m2019-11-05 10:12:33,941 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_57a62222-f0bd-4857-9272-e8e4cb8d1433_20191105_101233.csv: [0m
[31m[[4250  101]
 [ 266  633]][0m
[31m2019-11-05 10:12:33,948 - algorithms.Train - INFO - Train set result details: 0.8669354178725123[0m
[31m2019-11-05 10:12:33,948 - algorithms.Train - INFO - Validation set result details:[0m
[31m2019-11-05 10:12:34,066 - algorithms.result_writer - INFO - Confusion matrix, full output in /opt/ml/output/data/predictedvsactual_25474f56-6ec1-4aa5-ab89-77d37f74f65f_20191105_101234.csv: [0m
[31m[[450  33]
 [ 65  36]][0m
[31m2019-11-05 10:12:34,068 - algorithms.Train - INFO - Validation set result details: 0.6626665094895674 [0m
[31m2019-11-05 10:12:34,068 - algorithms.Train - INFO - Run    140    12      1079     4/83          5% 14.183209 4.971600       0.8669 

In [None]:
# Always raises AssertionError.
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue into the
# HPO section below - confirm, and consider an explanatory assertion message.
assert False

### HPO

In [None]:
# Metric passed to the tuner as its objective; the name matches the "ValidationFScore"
# entry in metric_definitions.
objective_metric_name ="ValidationFScore"

In [None]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter
# Integer ranges passed to the tuner, keyed by hyperparameter name.
hyperparameter_ranges = dict(
    lstmhiddensize=IntegerParameter(40, 200),  #ContinuousParameter(0.01, 0.2),
    fclayersize=IntegerParameter(10, 50),
    numlayers=IntegerParameter(1, 10),
    poolingkernelsize=IntegerParameter(2, 10),
)

In [None]:
# Static hyperparameters given to the HPO estimator below.
# Only file names (last path component) are passed.
hyperparameters = {
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": "200",
    "epochs": 100,
    "earlystoppingpatience": 20,
    "dataset": "PpiAimedDatasetFactory",
}

In [None]:

# Estimator used for tuning. It runs main_train.py and, unlike the training estimator,
# does not pass a git_config.
hpo_estimator_settings = dict(
    entry_point='main_train.py',
    source_dir='source/algorithms',
    dependencies=['source/algorithms', 'source/datasets', 'source/preprocessor'],
    role=role,
    framework_version="1.0.0",
    py_version='py3',
    # git_config=git_config,
    image_name=docker_repo,
    train_instance_count=1,
    train_instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    code_location=s3_code_path,
    # train_use_spot_instances=True,
    train_volume_size=30,
    base_job_name="aimed-ppi-extractor",
)

estimator = PyTorch(**hpo_estimator_settings)

In [None]:
# Random-search tuning: at most 50 jobs in total, 7 running at the same time.
tuning_options = dict(
    max_jobs=50,
    max_parallel_jobs=7,
    strategy="Random",
    base_tuning_job_name="hpo-aimed-ppi-extractor",
)

tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    **tuning_options
)

# Start the tuning job on the currently selected input channels.
tuner.fit(inputs)