### Set up

#### 1. Set  up  accounts and role

In [16]:
!pip install sagemaker==1.39.0

[31mawsebcli 3.12.4 has requirement requests<=2.9.1,>=2.6.1, but you'll have requests 2.20.1 which is incompatible.[0m
[31mspacy 2.1.8 has requirement numpy>=1.15.0, but you'll have numpy 1.14.1 which is incompatible.[0m
[31mblis 0.2.4 has requirement numpy>=1.15.0, but you'll have numpy 1.14.1 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
account_id =  boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name


#role = sagemaker.get_execution_role()
role="arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449".format(account_id)


#### 2. Setup image and instance type

In [2]:
pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201909140804"
instance_type = "ml.p3.2xlarge" 

In [3]:
docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train/ test and validation datasets

In [4]:
bucket = "aegovan-data"

In [5]:
trainfile = "s3://{}/processed_dataset/train_unique_pub_v6_less_negative.json".format(bucket)
testfile= "s3://{}/processed_dataset/test_unique_pub_v6_less_negative.json".format(bucket)
valfile="s3://{}/processed_dataset/val_unique_pub_v6_less_negative.json".format(bucket)
embeddingfile="s3://{}/embeddings/wikipedia-pubmed-and-PMC-w2v.bin.txt".format(bucket)
s3_output_path= "s3://{}/results/".format(bucket)

### Start training

In [6]:
inputs = {
    "train" : trainfile,
    "val" :valfile,
    "embedding" : embeddingfile
}

In [9]:
hyperparameters = {
    "dataset":"PPIDataset",
    "trainfile":trainfile.split("/")[-1],
    "valfile":valfile.split("/")[-1],
    "embeddingfile":embeddingfile.split("/")[-1],
    "embeddim":200,
    "batchsize": "32",
    "epochs" : "1000",   
    "log-level" : "INFO"
   
}

In [10]:
metric_definitions = [{"Name": "TrainLoss",
                     "Regex": "###score: train_loss### (\d*[.]?\d*)"}
                    ,{"Name": "ValidationLoss",
                     "Regex": "###score: val_loss### (\d*[.]?\d*)"}
                    ,{"Name": "TrainFScore",
                     "Regex": "###score: train_fscore### (\d*[.]?\d*)"}
                   ,{"Name": "ValidationFScore",
                     "Regex": "###score: val_fscore### (\d*[.]?\d*)"}
                    ]

In [18]:
git_config = {'repo': 'https://github.com/elangovana/PPI-typed-relation-extractor.git',
              'branch': 'master',
              'commit': 'c050a2818efd6dc6d2adb3dac07c102b94c92eab'}

In [41]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
     entry_point='main_train.py',
                    source_dir = 'source/algorithms',
                    dependencies =['source/algorithms'],
                    role=role,
                    framework_version ="1.0.0",
                    py_version='py3',
                   # git_config= git_config,
                    image_name= docker_repo,
                    train_instance_count=1,
                    train_instance_type=instance_type,
                    hyperparameters =hyperparameters,
                    output_path=s3_output_path,
                    metric_definitions=metric_definitions,
                    #train_use_spot_instances = True
                    train_volume_size=30,
                    base_job_name ="ppi-extractor")

In [42]:
estimator.fit(inputs)

2019-09-14 09:50:29 Starting - Starting the training job...
2019-09-14 09:50:32 Starting - Launching requested ML instances...
2019-09-14 09:51:25 Starting - Preparing the instances for training...
2019-09-14 09:52:00 Downloading - Downloading input data..................
2019-09-14 09:55:41 Training - Downloading the training image..........[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-09-14 09:57:55,191 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-09-14 09:57:55,215 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-09-14 09:57:58,227 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-09-14 09:57:58,563 sagemaker-containers INFO     Module main_train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-09-14 09:57:58,563 sagemak

[31m2019-09-14 09:58:02,434 sagemaker-containers ERROR    ExecuteUserScriptError:[0m
[31mCommand "/usr/bin/python -m main_train --batchsize 32 --dataset PPIDataset --embeddim 200 --embeddingfile wikipedia-pubmed-and-PMC-w2v.bin.txt --epochs 1000 --log-level INFO --trainfile train_unique_pub_v6_less_negative.json --valfile val_unique_pub_v6_less_negative.json"[0m
[31musage: main_train.py [-h] --dataset
                     {PPIDataset,PpiAimedDataset,PpiNoInteractionDataset}
                     --trainfile TRAINFILE [--traindir TRAINDIR] --valfile
                     VALFILE [--valdir VALDIR] --embeddingfile EMBEDDINGFILE
                     [--embeddingdir EMBEDDINGDIR] [--outdir OUTDIR]
                     --embeddim EMBEDDIM [--epochs EPOCHS]
                     [--interaction-type INTERACTION_TYPE]
                     [--log-level {DEBUG,ERROR,WARN,INFO}][0m
[31mmain_train.py: error: unrecognized arguments: --batchsize 32[0m

2019-09-14 09:58:06 Uploading - Uploading g

UnexpectedStatusException: Error for Training job ppi-extractor-2019-09-14-09-50-18-575: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/bin/python -m main_train --batchsize 32 --dataset PPIDataset --embeddim 200 --embeddingfile wikipedia-pubmed-and-PMC-w2v.bin.txt --epochs 1000 --log-level INFO --trainfile train_unique_pub_v6_less_negative.json --valfile val_unique_pub_v6_less_negative.json"
usage: main_train.py [-h] --dataset
                     {PPIDataset,PpiAimedDataset,PpiNoInteractionDataset}
                     --trainfile TRAINFILE [--traindir TRAINDIR] --valfile
                     VALFILE [--valdir VALDIR] --embeddingfile EMBEDDINGFILE
                     [--embeddingdir EMBEDDINGDIR] [--outdir OUTDIR]
                     --embeddim EMBEDDIM [--epochs EPOCHS]
                     [--interaction-type INTERACTION_TYPE]
                     [--log-level {DEBUG,ERROR,WARN,INFO}]
main_train.py: error: unrecognized arguments: --batchsize 32