### Set up

#### 1. Set up accounts and role

In [1]:
#!pip install sagemaker==1.39.0

In [2]:
import sagemaker
import boto3

# Create the SageMaker session and look up the AWS region and account id;
# both are used below to build the ECR image URI and the IAM role ARN.
sagemaker_session = sagemaker.Session()
region = boto3.session.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# The execution role ARN is assembled from a fixed role name in the caller's
# account instead of being looked up with sagemaker.get_execution_role().
role = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449".format(
    account_id
)


#### 2. Setup image and instance type

In [3]:
# Training instance type, and the custom training image given as
# "<repository>:<tag>" (expanded into a full ECR URI in the next cell).
instance_type = "ml.p3.8xlarge"
pytorch_custom_image_name = "ppi-extractor:gpu-1.0.0-201910130520"

In [4]:
# Full ECR URI of the training image:
#   <account_id>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(
    account_id,
    region,
    pytorch_custom_image_name,
)

#### 3. Configure train, test and validation datasets

In [5]:
# S3 bucket that every dataset, embedding, code and results path below is built from.
bucket = 'aegovan-data'

In [6]:
# --- AIMed dataset files ---------------------------------------------------
# Plain and preprocessed variants of the full training file.
plain_trainfile = "s3://{}/aimed/AIMedFull.json".format(bucket)
trainfile = "s3://{}/aimed/AIMedFull_preprocessed.json".format(bucket)
valfile = "s3://{}/aimed/AIMedval_preprocessed.json".format(bucket)

# Separate train / validation files (val_split points at the same object as valfile).
train_split = "s3://{}/aimed/AIMedtrain_preprocessed.json".format(bucket)
val_split = "s3://{}/aimed/AIMedval_preprocessed.json".format(bucket)

# Disabled alternatives for the dataset files:
#   trainfile = "s3://{}/aimed/AIMedFull_Ylhsieh.json".format(bucket)
#   trainfile = "s3://{}/aimed/AIMedtrain_pubmedoverlap.json".format(bucket)
#   valfile   = "s3://{}/aimed/AIMedval_pubmedoverlap.json".format(bucket)

# --- Word embeddings -------------------------------------------------------
embeddingfile = "s3://{}/embeddings/bio_nlp_vec/PubMed-shuffle-win-2.bin.txt".format(bucket)
embed_dim = 200

# Disabled alternatives for the embedding file:
#   embeddingfile = "s3://{}/embeddings/PubMed-and-PMC-w2v.bin.txt".format(bucket)
#   embeddingfile = "s3://{}/embeddings/bio_nlp_vec/PubMed-shuffle-win-30.bin.txt".format(bucket)

# Collobert embedding
coll_embeddingfile = "s3://{}/embeddings/collobert/words_vocab_collabert.txt".format(bucket)
coll_embed_dim = 50

# --- Output locations ------------------------------------------------------
s3_output_path = "s3://{}/results/".format(bucket)
s3_code_path = "s3://{}/aimed_code".format(bucket)

### Start training

In [7]:
# Git commit of the training code. It is recorded in the "git_commit_id"
# hyperparameter of several configurations below, which git_config reads
# to select the commit to check out.
commit_id = 'ba7f135dbbadaffb8aa998757a717ae875fe378e'

In [8]:
# Values used for the "labelfieldname" and "docidfieldname" hyperparameters
# in the configurations below.
labelid = "isValid"
docid = "docid"

In [9]:
# Input channels: preprocessed full training file + PubMed embeddings
# (no "val" channel).
pub_inputs = dict(
    train=trainfile,
    embedding=embeddingfile,
)



In [10]:
# Input channels: plain full training file + PubMed embeddings
# (no "val" channel).
plain_inputs = dict(
    train=plain_trainfile,
    embedding=embeddingfile,
)

In [11]:
# Input channels: preprocessed full training file + Collobert embeddings
# (no "val" channel).
coll_inputs = dict(
    train=trainfile,
    embedding=coll_embeddingfile,
)

In [12]:
# Input channels for the separate train / validation files + PubMed embeddings.
train_split_inputs = dict(
    train=train_split,
    val=val_split,
    embedding=embeddingfile,
)

In [13]:
# hyperparameters = {
#     "dataset":"PpiAimedDatasetFactory",
#     "trainfile":trainfile.split("/")[-1],
#     "valfile":valfile.split("/")[-1],
#     "embeddingfile":embeddingfile.split("/")[-1],
#     "embeddim":embed_dim,
#     "batchsize": "32",
#     "epochs" : "1000",   
#     "log-level" : "INFO",
#     "lstmhiddensize": 100,
#     "fclayersize": 15,
#     "numlayers":7,
#     "poolingkernelsize":10,
#     "learningrate":.001,
#     "cnn_output":100,
#     "earlystoppingpatience":20
# }

In [14]:
# Hyperparameters for RelationExtractorCnnNetworkNoPosFactory using the
# Collobert embeddings. File hyperparameters carry only the object name
# (last path component) of the corresponding S3 URI.
choi_CnnNetworkNoPosFactoryhyperparameters = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorCnnNetworkNoPosFactory",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": coll_embeddingfile.rsplit("/", 1)[-1],
    "embeddim": coll_embed_dim,
    "batchsize": "32",
    "epochs": "1000",
    "log-level": "INFO",
    "learningrate": 0.001,
    "cnn_output": 100,
    "earlystoppingpatience": 20,
    "dropout_rate_cnn": 0.2,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 1,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [15]:
# Hyperparameters for RelationExtractorCnnPosNetworkFactory using the
# Collobert embeddings. File hyperparameters carry only the object name
# (last path component) of the corresponding S3 URI.
choi_CnnPosNetworkFactory = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorCnnPosNetworkFactory",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": coll_embeddingfile.rsplit("/", 1)[-1],
    "embeddim": coll_embed_dim,
    "batchsize": "32",
    "epochs": "1000",
    "log-level": "INFO",
    "learningrate": 0.001,
    "cnn_output": 100,
    "earlystoppingpatience": 20,
    "dropout_rate_cnn": 0.2,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 1,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [16]:
# Hyperparameters for RelationExtractorBiLstmNetworkFactory with an explicit
# validation file. File hyperparameters carry only the object name
# (last path component) of the corresponding S3 URI.
BilstmNetworkFactoryhyperparameters = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorBiLstmNetworkFactory",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "50",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "pooling_kernel_size": 3,
    "lstm_num_layers": 3,
    "lstm_hidden_size": 64,
    "fc_layer_size": 64,
    "fc_drop_out_rate": 0.5,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [17]:
# Hyperparameters for RelationExtractorDynamicEntityBiLstmNetworkFactory on
# the plain training file. File hyperparameters carry only the object name
# (last path component) of the corresponding S3 URI.
base_experiment_bilstm_pos = {
    "dataset": "PpiAimedDatasetFactory",
    "network": "RelationExtractorDynamicEntityBiLstmNetworkFactory",
    "trainfile": plain_trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "50",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "pooling_kernel_size": 3,
    "lstm_num_layers": 3,
    "lstm_hidden_size": 64,
    "fc_layer_size": 64,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 0,
    "use_min_dict": 0,
    "fine_tune_embeddings": 0,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

## For BiLSTM AIMed preprocessed

#### Network F-score using the following hyperparameters

```bash
/usr/bin/python -m main_train_k_fold --batchsize 64 --dataset PpiAimedDatasetPreprocessedFactory --earlystoppingpatience 20 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --learningrate 0.001 --log-level INFO --lstm_dropout 0.5 --lstm_hidden_size 400 --lstm_num_layers 1 --network RelationExtractorBiLstmNetworkFactoryNoPos --train_val_vocab_merge 1 --trainfile AIMedFull_preprocessed.json
```

**precision, recall, fscore, support** for 10 folds

```json
[(0.7528089887640449, 0.7528089887640449, 0.752808988764045, None),
 (0.6915887850467289, 0.7628865979381443, 0.7254901960784315, None),
 (0.7586206896551724, 0.6226415094339622, 0.6839378238341969, None),
 (0.7727272727272727, 0.6296296296296297, 0.6938775510204083, None),
 (0.75, 0.6990291262135923, 0.7236180904522612, None),
 (0.7525773195876289, 0.7525773195876289, 0.752577319587629, None),
 (0.6851851851851852, 0.74, 0.7115384615384615, None),
 (0.7142857142857143, 0.7476635514018691, 0.730593607305936, None),
 (0.7560975609756098, 0.6326530612244898, 0.6888888888888889, None),
 (0.6788990825688074, 0.7789473684210526, 0.7254901960784315, None)]
```

**tn, fp, fn, tp** for 10 folds


```json
[(473, 22, 22, 67), 
 (454, 33, 23, 74), 
 (457, 21, 40, 66), 
 (456, 20, 40, 68), 
 (456, 24, 31, 72), 
 (462, 24, 24, 73),
 (449, 34, 26, 74), 
 (444, 32, 27, 80), 
 (465, 20, 36, 62),
 (453, 35, 21, 74)]
```

See aimed-ppi-extractor-2019-10-20-04-33-12-355-copy-10-20


## BiLstm plain

```python
BilstmNetworkFactoryhyperparametersNoPos_plain = {
    "dataset":"PpiAimedDatasetFactory",
    "network" :"RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile":plain_trainfile.split("/")[-1],
   # "valfile":valfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
    "embeddim":embed_dim,
    "batchsize": "64",
    "epochs" : "1000",  
    "earlystoppingpatience":20,
    "log-level" : "INFO",
    "learningrate":.001,
    "lstm_dropout":0.5,
    "lstm_num_layers" :1,
    "lstm_hidden_size":400,
    "fc_drop_out_rate":0.5,
    "use_min_dict":0,
    "train_val_vocab_merge":1,
    "git_commit_id":commit_id
}
```

```bash


/usr/bin/python -m main_train_k_fold --batchsize 64 --dataset PpiAimedDatasetFactory --earlystoppingpatience 20 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --git_commit_id 8f5d23c1e352fc9caea3a14a4293bcca3d315078 --learningrate 0.001 --log-level INFO --lstm_dropout 0.5 --lstm_hidden_size 400 --lstm_num_layers 1 --network RelationExtractorBiLstmNetworkFactoryNoPos --train_val_vocab_merge 1 --trainfile AIMedFull.json --use_min_dict 0
```

```text

0.659574468	0.613861386	0.635897436
0.598290598	0.714285714	0.651162791
0.583333333	0.7875	0.670212766
0.773333333	0.604166667	0.678362573
0.646017699	0.675925926	0.660633484
0.80952381	0.525773196	0.6375
0.584158416	0.621052632	0.602040816
0.683168317	0.6	0.638888889
0.69	0.69	0.69
0.650943396	0.627272727	0.638888889
		
66.78	64.6	65.04 (mean)
8	7.6	2.6 (std)
```

![image.png](attachment:image.png)


## Resnet plain

```python
BilstmNetworkFactoryhyperparametersNoPos_plain = {
    "dataset":"PpiAimedDatasetPreprocessedFactory",
    "network" :"RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile":plain_trainfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
    "embeddim":embed_dim,
    "batchsize": "64",
    "epochs" : "1000",  
    "earlystoppingpatience":20,
    "log-level" : "INFO",
    "learningrate":.001,
    "lstm_dropout":0.5,
    "lstm_num_layers" :1,
    "lstm_hidden_size":400,
    "fc_drop_out_rate":0.5,
    "use_min_dict":0,
    "train_val_vocab_merge":1,
    "git_commit_id":commit_id
}
```

```bash
/usr/bin/python -m main_train_k_fold --batchsize 32 --cnn_kernel_size 3 --cnn_num_layers 2 --cnn_output 64 --dataset PpiAimedDatasetFactory --dropout_rate_cnn 0.5 --earlystoppingpatience 50 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --fc_layer_size 256 --git_commit_id 8f5d23c1e352fc9caea3a14a4293bcca3d315078 --input_drop_out_rate 0.2 --learningrate 0.001 --log-level INFO --network RelationExtractorSimpleResnetCnnPosNetworkFactory --pool_stride 2 --pooling_kernel_size 3 --train_val_vocab_merge 1 --trainfile AIMedFull.json --use_min_dict 1 --weight_decay 1e-05
```

```text
(P, R, F1)
0.722891566	0.594059406	0.652173913
0.726190476	0.62244898	0.67032967
0.623529412	0.6625	0.642424242
0.680851064	0.666666667	0.673684211
0.705882353	0.666666667	0.685714286
0.639175258	0.639175258	0.639175258
0.666666667	0.547368421	0.601156069
0.647619048	0.591304348	0.618181818
0.579831933	0.69	0.630136986
0.717391304	0.6	0.653465347
		
67.1	62.8	64.66(mean)
4.8	4.6	2.8 (STd) 
```

## Resnet preprocessed

```python
SimpleResnetCnnPosNetworkFactoryhyperparameters = {
    "dataset":"PpiAimedDatasetPreprocessedFactory",
    "network" :"RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience" : 50,
    "trainfile":trainfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
    "embeddim":embed_dim,
    "batchsize": "32",
    "epochs" : "1000",   
    "log-level" : "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride":2,
    "cnn_kernel_size":3,
    "cnn_num_layers":2,
    "cnn_output":64,
    "learningrate":.001,
    "weight_decay":.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate" : 0.2,
    "use_min_dict":1,
    "train_val_vocab_merge":1,
    "git_commit_id":commit_id
}
```

```bash
/usr/bin/python -m main_train_k_fold --batchsize 32 --cnn_kernel_size 3 --cnn_num_layers 2 --cnn_output 64 --dataset PpiAimedDatasetPreprocessedFactory --dropout_rate_cnn 0.5 --earlystoppingpatience 50 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --fc_layer_size 256 --git_commit_id 8f5d23c1e352fc9caea3a14a4293bcca3d315078 --input_drop_out_rate 0.2 --learningrate 0.001 --log-level INFO --network RelationExtractorSimpleResnetCnnPosNetworkFactory --pool_stride 2 --pooling_kernel_size 3 --train_val_vocab_merge 1 --trainfile AIMedFull_preprocessed.json --use_min_dict 1 --weight_decay 1e-05

```

```text
0.7625	0.685393258	0.721893491
0.752941176	0.659793814	0.703296703
0.75	0.70754717	0.72815534
0.770114943	0.62037037	0.687179487
0.747572816	0.747572816	0.747572816
0.711340206	0.711340206	0.711340206
0.724137931	0.63	0.673796791
0.745098039	0.710280374	0.727272727
0.660377358	0.714285714	0.68627451
0.75	0.726315789	0.737967914
		
73.74	69.13	71.25 (mean)
3.3	4.4	2.6 (std)

```

```bash
main_train_k_fold --batchsize 64 --dataset PpiAimedDatasetPreprocessedFactory --earlystoppingpatience 20 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --learningrate 0.001 --log-level INFO --lstm_dropout 0.5 --lstm_hidden_size 400 --lstm_num_layers 1 --network RelationExtractorBiLstmNetworkFactoryNoPos --train_val_vocab_merge 1 --trainfile AIMedFull_preprocessed.json --use_min_dict 1
```


### Resnet Nooverlap  preprocessed 

- Run: aimed-resnet-nooverlap-2019-11-23-09-39-25-707
    ```bash
    python -m main_train_k_fold --batchsize 32 --cnn_kernel_size 3 --cnn_num_layers 2 --cnn_output 64 --dataset PpiAimedDatasetPreprocessedFactory --docidfieldname docid --dropout_rate_cnn 0.5 --earlystoppingpatience 50 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --fc_layer_size 256 --git_commit_id 21a539b999abfbad643c2727aa1d475cc47d545c --input_drop_out_rate 0.2 --labelfieldname isValid --learningrate 0.001 --log-level INFO --network RelationExtractorSimpleResnetCnnPosNetworkFactory --pool_stride 2 --pooling_kernel_size 3 --train_val_vocab_merge 1 --trainfile AIMedFull_preprocessed.json --use_min_dict 1 --weight_decay 1e-05
    ```

- Results

    ```text
          P       R       F
   mean 57.06	55.69	56.09
   std   7.75	 5.88	 5.77
    ```

- Confusion matrix

    ```text
    tn	fp	fn	tp
    460	40	43	65
    443	57	57	72
    361	39	43	51
    463	41	57	51
    425	39	33	47
    455	38	51	41
    613	94	49	73
    472	34	35	56
    344	25	43	62
    369	22	28	43
    ```


-  Precision recall f1

    ```text
    0.619047619	0.601851852	0.610328638
    0.558139535	0.558139535	0.558139535
    0.566666667	0.542553191	0.554347826
    0.554347826	0.472222222	0.51
    0.546511628	0.5875	0.56626506
    0.518987342	0.445652174	0.479532164
    0.437125749	0.598360656	0.505190311
    0.622222222	0.615384615	0.61878453
    0.712643678	0.59047619	0.645833333
    0.661538462	0.605633803	0.632352941
    ```





### BiLSTM Nooverlap  preprocessed 

- Run: aimed-bilstm-nooverlap-2019-11-23-09-36-48-443/algo-1-1574501940

    ```bash
   python -m main_train_k_fold --batchsize 64 --dataset PpiAimedDatasetPreprocessedFactory --docidfieldname docid --earlystoppingpatience 20 --embeddim 200 --embeddingfile PubMed-shuffle-win-2.bin.txt --epochs 1000 --fc_drop_out_rate 0.5 --git_commit_id 21a539b999abfbad643c2727aa1d475cc47d545c --labelfieldname isValid --learningrate 0.001 --log-level INFO --lstm_dropout 0.5 --lstm_hidden_size 400 --lstm_num_layers 1 --network RelationExtractorBiLstmNetworkFactoryNoPos --train_val_vocab_merge 1 --trainfile AIMedFull_preprocessed.json --use_min_dict 0
    ```

- Results

    ```text
          P       R       F
   mean 56.38	57.15	56.38
   std  6.2 	8.8	    5.9
    ```

- Confusion matrix

    ```text
    tn	fp	fn	tp
    441	59	39	69
    445	55	54	75
    366	34	54	40
    474	30	50	58
    429	35	38	42
    446	47	34	58
    633	74	60	62
    454	52	25	66
    336	33	45	60
    364	27	23	48
    ```


-  Precision recall f1

    ```text
    0.5390625	0.638888889	0.584745763
    0.576923077	0.581395349	0.579150579
    0.540540541	0.425531915	0.476190476
    0.659090909	0.537037037	0.591836735
    0.545454545	0.525	0.535031847
    0.552380952	0.630434783	0.588832487
    0.455882353	0.508196721	0.480620155
    0.559322034	0.725274725	0.631578947
    0.64516129	0.571428571	0.606060606
    0.64	0.676056338	0.657534247
    ```



In [18]:
# Hyperparameters for RelationExtractorBiLstmNetworkFactoryNoPos on the
# preprocessed full training file (no "valfile"). File hyperparameters carry
# only the object name (last path component) of the corresponding S3 URI.
BilstmNetworkFactoryhyperparametersNoPos = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "64",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "use_min_dict": 0,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [19]:
# Same settings as the NoPos BiLSTM configuration on the preprocessed full
# training file, but without a "docidfieldname" entry. File hyperparameters
# carry only the object name (last path component) of the S3 URI.
BilstmNetworkFactoryhyperparametersNoPos_overlap = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "64",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "use_min_dict": 0,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
    "labelfieldname": labelid,
}

In [20]:
# NoPos BiLSTM configuration on the preprocessed full training file with
# lstm_num_layers raised to 6. File hyperparameters carry only the object
# name (last path component) of the corresponding S3 URI.
BilstmNetworkFactoryhyperparametersNoPos_inclayers = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "64",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 6,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "use_min_dict": 0,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [36]:
# NoPos BiLSTM configuration that trains on the separate train / validation
# files (train_split, val_split); it has no "docidfieldname" entry. File
# hyperparameters carry only the object name of the corresponding S3 URI.
BilstmNetworkFactoryhyperparametersNoPos_split = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": train_split.rsplit("/", 1)[-1],
    "valfile": val_split.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "64",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "use_min_dict": 0,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
    "labelfieldname": labelid,
}

In [22]:
# NoPos BiLSTM configuration on the plain (not preprocessed) training file
# with the PpiAimedDatasetFactory dataset. File hyperparameters carry only
# the object name (last path component) of the corresponding S3 URI.
BilstmNetworkFactoryhyperparametersNoPos_plain = {
    "dataset": "PpiAimedDatasetFactory",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": plain_trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "64",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "use_min_dict": 0,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [23]:


# NoPos BiLSTM configuration for the PpiAimedDatasetFactoryYlhsieh dataset.
# File hyperparameters carry only the object name of the corresponding S3 URI.
# NOTE(review): "trainfile" is taken from `trainfile`, whose active value is
# AIMedFull_preprocessed.json (the AIMedFull_Ylhsieh.json assignment is
# commented out) — confirm this is the file this dataset factory expects.
# NOTE(review): "docidfieldname" is Python None; how the SDK / training script
# serialises and parses None is not visible here — verify.
# This set has no "git_commit_id" entry.
PpiAimedDatasetFactoryYlhsiehBiLstmNetwork = {
    "dataset": "PpiAimedDatasetFactoryYlhsieh",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "16",
    "epochs": "1000",
    "earlystoppingpatience": 20,
    "log-level": "INFO",
    "learningrate": 0.001,
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 1,
    "docidfieldname": None,
    "labelfieldname": labelid,
}

In [24]:
# Hyperparameters for RelationExtractorResnetCnnPosNetworkFactory.
# File hyperparameters carry only the object name of the corresponding S3 URI.
# NOTE(review): this pairs PpiAimedDatasetPreprocessedFactory with the plain
# training file, whereas the other configurations pair that dataset with the
# preprocessed file — confirm the combination is intended.
# This set has no "git_commit_id" entry.
ResnetCnnPosNetworkFactoryhyperparameters = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorResnetCnnPosNetworkFactory",
    "earlystoppingpatience": 20,
    "trainfile": plain_trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "8",
    "epochs": "1000",
    "log-level": "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride": 2,
    "cnn_kernel_size": 3,
    "cnn_num_layers": 3,
    "cnn_output": 64,
    "learningrate": 0.00001,
    "weight_decay": 0.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate": 0.2,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [25]:
# Hyperparameters for RelationExtractorSimpleResnetCnnPosNetworkFactory on the
# preprocessed full training file (no "valfile"). File hyperparameters carry
# only the object name (last path component) of the corresponding S3 URI.
SimpleResnetCnnPosNetworkFactoryhyperparameters = {
    "dataset": "PpiAimedDatasetPreprocessedFactory",
    "network": "RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience": 50,
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "32",
    "epochs": "1000",
    "log-level": "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride": 2,
    "cnn_kernel_size": 3,
    "cnn_num_layers": 2,
    "cnn_output": 64,
    "learningrate": 0.001,
    "weight_decay": 0.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate": 0.2,
    "use_min_dict": 1,
    "train_val_vocab_merge": 1,
    "git_commit_id": commit_id,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [26]:
# Hyperparameters for RelationExtractorSimpleResnetCnnPosNetworkFactory using
# the Collobert embeddings. File hyperparameters carry only the object name
# (last path component) of the corresponding S3 URI.
# "embeddim" now reads coll_embed_dim (same value, 50) instead of a literal,
# so it stays in step with coll_embeddingfile like the other Collobert sets.
# NOTE(review): this pairs PpiAimedDatasetFactory with the preprocessed
# training file — confirm the combination is intended.
# This set has no "git_commit_id" entry.
SimpleResnetCnnPosNetworkFactoryhyperparameters_coll = {
    "dataset":"PpiAimedDatasetFactory",
    "network" :"RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience" : 50,
    "trainfile":trainfile.split("/")[-1],
  #  "valfile":valfile.split("/")[-1],
    "embeddingfile":coll_embeddingfile.split("/")[-1],
    "embeddim":coll_embed_dim,
    "batchsize": "32",
    "epochs" : "1000",   
    "log-level" : "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride":2,
    "cnn_kernel_size":3,
    "cnn_num_layers":2,
    "cnn_output":64,
    "learningrate":.001,
    "weight_decay":.00001,
    "fc_layer_size": 256,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate" : 0.2,
    "train_val_vocab_merge":1,
    "docidfieldname":docid,
    "labelfieldname":labelid
}

In [27]:
# Second variant of the SimpleResnet CNN configuration, with an explicit
# validation file. File hyperparameters carry only the object name of the
# corresponding S3 URI.
# NOTE(review): this pairs PpiAimedDatasetFactory with the preprocessed
# training / validation files — confirm the combination is intended.
# This set has no "git_commit_id" entry.
SimpleResnetCnnPosNetworkFactoryhyperparametersv2 = {
    "dataset": "PpiAimedDatasetFactory",
    "network": "RelationExtractorSimpleResnetCnnPosNetworkFactory",
    "earlystoppingpatience": 50,
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": embed_dim,
    "batchsize": "8",
    "epochs": "1000",
    "log-level": "INFO",
    "dropout_rate_cnn": 0.5,
    "pooling_kernel_size": 3,
    "pool_stride": 2,
    "cnn_kernel_size": 3,
    "cnn_num_layers": 3,
    "cnn_output": 32,
    "learningrate": 0.001,
    "weight_decay": 0.00001,
    "fc_layer_size": 128,
    "fc_drop_out_rate": 0.5,
    "input_drop_out_rate": 0.2,
    "train_val_vocab_merge": 1,
    "docidfieldname": docid,
    "labelfieldname": labelid,
}

In [28]:
# Metric definitions handed to the estimator: each entry names a metric and
# gives the regex whose first capture group holds its value in the job log.
# The patterns are raw strings so that "\d" reaches the regex engine verbatim
# rather than relying on Python keeping an invalid string escape (which emits
# a DeprecationWarning / SyntaxWarning on newer interpreters).
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainFScore",
     "Regex": r"###score: train_fscore### (\d*[.]?\d*)"},
    {"Name": "ValidationFScore",
     "Regex": r"###score: val_fscore### (\d*[.]?\d*)"},
]

In [29]:
# Print the first and the fifth line of `git log -1` for the local checkout
# (in the recorded output: the commit hash and the commit message), for
# comparison with commit_id.
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit ba7f135dbbadaffb8aa998757a717ae875fe378e
    update - notebook Add Bert variance


In [38]:
# Select the experiment to launch: job-name prefix, hyperparameter set and
# input channels.
#
# Other hyperparameter sets that can be assigned here:
#   SimpleResnetCnnPosNetworkFactoryhyperparameters
#   BilstmNetworkFactoryhyperparametersNoPos
#   BilstmNetworkFactoryhyperparametersNoPos_inclayers
#   BilstmNetworkFactoryhyperparametersNoPos_overlap
#   BilstmNetworkFactoryhyperparametersNoPos_plain
#   PpiAimedDatasetFactoryYlhsiehBiLstmNetwork
# Other input channel sets: pub_inputs, plain_inputs
base_job_name = "aimed-bilstm-split"
hyperparameters = BilstmNetworkFactoryhyperparametersNoPos_split
inputs = train_split_inputs


In [39]:
# Git source for the training code. The commit comes from the selected
# hyperparameter set when it records one; several sets in this notebook have
# no "git_commit_id" key, so fall back to the notebook-level commit_id rather
# than failing with a KeyError.
git_config = {
    'repo': 'https://github.com/elangovana/PPI-typed-relation-extractor.git',
    'branch': 'master',
    'commit': hyperparameters.get("git_commit_id", commit_id),
}

In [40]:
# Display the selected hyperparameter set as the cell output.
hyperparameters 

{'dataset': 'PpiAimedDatasetPreprocessedFactory',
 'network': 'RelationExtractorBiLstmNetworkFactoryNoPos',
 'trainfile': 'AIMedtrain_preprocessed.json',
 'valfile': 'AIMedval_preprocessed.json',
 'embeddingfile': 'PubMed-shuffle-win-2.bin.txt',
 'embeddim': 200,
 'batchsize': '64',
 'epochs': '1000',
 'earlystoppingpatience': 20,
 'log-level': 'INFO',
 'learningrate': 0.001,
 'lstm_dropout': 0.5,
 'lstm_num_layers': 1,
 'lstm_hidden_size': 400,
 'fc_drop_out_rate': 0.5,
 'use_min_dict': 0,
 'train_val_vocab_merge': 1,
 'git_commit_id': 'ba7f135dbbadaffb8aa998757a717ae875fe378e',
 'labelfieldname': 'isValid'}

In [41]:
# Display the selected input channels as the cell output.
inputs

{'train': 's3://aegovan-data/aimed/AIMedtrain_preprocessed.json',
 'val': 's3://aegovan-data/aimed/AIMedval_preprocessed.json',
 'embedding': 's3://aegovan-data/embeddings/bio_nlp_vec/PubMed-shuffle-win-2.bin.txt'}

In [42]:
from sagemaker.pytorch import PyTorch

# Estimator for a single training job: runs main_train.py from the git
# checkout described by git_config inside the custom image (docker_repo),
# with the hyperparameter set selected above.
estimator = PyTorch(
    # Alternative entry point: 'main_train_k_fold.py'
    entry_point="main_train.py",
    source_dir="source/algorithms",
    dependencies=[
        "source/algorithms",
        "source/datasets",
        "source/preprocessor",
        "source/modelnetworks",
        "source/metrics",
    ],
    role=role,
    framework_version="1.0.0",
    py_version="py3",
    git_config=git_config,
    image_name=docker_repo,
    train_instance_count=1,
    train_instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    # train_use_spot_instances=True,  (disabled)
    train_volume_size=30,
    code_location=s3_code_path,
    train_max_run=4 * 24 * 60 * 60,  # four days, expressed in seconds
    base_job_name=base_job_name,
)

In [43]:
# Submit ten training jobs with the same estimator and inputs; wait=False
# submits each job without blocking until it finishes.
for i in range(10):
    estimator.fit(inputs, wait=False)

In [None]:
# Always raises AssertionError, so a "Run All" stops at this cell and the HPO
# cells below do not execute.
# NOTE(review): presumably a deliberate guard — confirm.
assert False

### HPO

In [None]:
# Metric the tuner uses as its objective; the name matches the
# "ValidationFScore" entry in metric_definitions.
objective_metric_name = 'ValidationFScore'

In [None]:
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)

# Integer search ranges handed to the tuner.
# NOTE(review): these keys ("lstmhiddensize", "fclayersize", "numlayers",
# "poolingkernelsize") are spelled differently from the keys used in the
# hyperparameter sets above ("lstm_hidden_size", "fc_layer_size",
# "lstm_num_layers", "pooling_kernel_size") — confirm which names the
# training script accepts.
hyperparameter_ranges = {
    "lstmhiddensize": IntegerParameter(40, 200),
    "fclayersize": IntegerParameter(10, 50),
    "numlayers": IntegerParameter(1, 10),
    "poolingkernelsize": IntegerParameter(2, 10),
}

In [None]:
# Static hyperparameters for the tuning jobs; this rebinds `hyperparameters`,
# replacing the set selected in the training section. File hyperparameters
# carry only the object name (last path component) of the corresponding S3 URI.
hyperparameters = {
    "trainfile": trainfile.rsplit("/", 1)[-1],
    "valfile": valfile.rsplit("/", 1)[-1],
    "embeddingfile": embeddingfile.rsplit("/", 1)[-1],
    "embeddim": "200",
    "epochs": 100,
    "earlystoppingpatience": 20,
    "dataset": "PpiAimedDatasetFactory",
}

In [None]:

# Estimator used as the template for the tuning jobs; this rebinds `estimator`.
# Unlike the training estimator it passes no git_config, so source_dir and
# dependencies are not taken from a git checkout.
estimator = PyTorch(
    entry_point="main_train.py",
    source_dir="source/algorithms",
    dependencies=[
        "source/algorithms",
        "source/datasets",
        "source/preprocessor",
    ],
    role=role,
    framework_version="1.0.0",
    py_version="py3",
    # git_config=git_config,  (disabled)
    image_name=docker_repo,
    train_instance_count=1,
    train_instance_type=instance_type,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    code_location=s3_code_path,
    # train_use_spot_instances=True,  (disabled)
    train_volume_size=30,
    base_job_name="aimed-ppi-extractor",
)

In [None]:
# Random-search tuning job: up to 50 training jobs, at most 7 in parallel,
# with ValidationFScore (objective_metric_name) as the objective.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=50,
    max_parallel_jobs=7,
    strategy="Random",
    base_tuning_job_name="hpo-aimed-ppi-extractor",
)

# NOTE(review): `inputs` is still whatever was selected in the training
# section (train_split_inputs there), while the HPO hyperparameters name
# trainfile's object (AIMedFull_preprocessed.json) — confirm the channels
# contain the files the hyperparameters refer to.
tuner.fit(inputs)