# BERT NER on SageMaker using PyTorch

This uses the Biocreative II gene mention dataset https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/task-1a-gene-mention-tagging/






In [1]:
import sys, os
import logging

# Add the local "src" directory to the module search path.
sys.path.append("src")

# Route log records to stdout so they show up in the cell output.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level="INFO",
    handlers=[stdout_handler],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

### Bucket and role set up

In [2]:
import sagemaker, boto3

from sagemaker import get_execution_role
# SageMaker session built from the default AWS configuration.
sm_session = sagemaker.session.Session()

# Look up the AWS account id of the current credentials through STS.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')

# The execution role ARN is assembled from the account id and a fixed role name.
# Alternative when running inside SageMaker: role = get_execution_role()
role_name = "AmazonSageMaker-ExecutionRole-20181222T162635"
role = "arn:aws:iam::{}:role/service-role/{}".format(account_id, role_name)


2020-10-04 19:51:53,090 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2020-10-04 19:51:53,256 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
# S3 bucket and key prefix under which the data and all job artifacts live.
data_bucket = "aegovan-data"

data_bucket_prefix = "Biocreative-gene-mention"

# Training inputs: the training file and the class file share one folder.
s3_uri_data = "s3://{}/{}/train data".format(data_bucket, data_bucket_prefix)
s3_uri_train = "{}/{}".format(s3_uri_data, "train.in")
s3_uri_classes = "{}/{}".format(s3_uri_data, "GENE.eval")

# Test file. The template needs a third placeholder; without it the "test.in"
# argument is silently ignored and the URI points at the folder instead.
s3_uri_test = "s3://{}/{}/test data/{}".format(data_bucket, data_bucket_prefix, "test.in")

# Destinations for the training output, the uploaded source code and the checkpoints.
s3_output_path = "s3://{}/{}/output".format(data_bucket, data_bucket_prefix)
s3_code_path = "s3://{}/{}/code".format(data_bucket, data_bucket_prefix)
s3_checkpoint = "s3://{}/{}/checkpoint".format(data_bucket, data_bucket_prefix)

## Train

This section shows how to train BERT on SageMaker using Spot instances.

In [4]:
# Input name -> S3 URI for a run on the full dataset.
inputs_full = {"train": s3_uri_train, "class": s3_uri_classes}

# Inputs actually handed to the estimator; the full dataset by default.
inputs = inputs_full

In [5]:
# Local checkpoint directory; used both as the estimator's checkpoint_local_path
# and as the "checkpointdir" hyperparameter.
sm_localcheckpoint_dir = "/opt/ml/checkpoints/"

In [6]:
# Instance type the training job runs on.
instance_type = "ml.p3.2xlarge"

# GPU count per supported instance type; used to scale the batch size.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [17]:
# Hyperparameters handed to the training entry point.
hp = {
    "epochs": 30,
    "earlystoppingpatience": 5,
    # Increasing batch size might end up with CUDA OOM error, increase grad accumulation instead
    "batch": 8 * instance_type_gpu_map[instance_type],
    # Only the file names (last path segment of the S3 URIs) are passed on.
    "trainfile": s3_uri_train.split("/")[-1],
    "classfile": s3_uri_classes.split("/")[-1],
    # The number of steps to accumulate gradients for
    "gradaccumulation": 4,
    # Listed once: a repeated key in a dict literal silently overrides the earlier entry.
    "log-level": "INFO",
    # This param depends on your model max pos embedding size or when large you might end up with CUDA OOM error
    "maxseqlen": 512,
    # Make sure the lr is quite small, as this is a pretrained model..
    "lr": 0.00001,
    # Use finetuning (set to 1), if you only want to change the weights in the final classification layer..
    "finetune": 0,
    "checkpointdir": sm_localcheckpoint_dir,
    # Checkpoints once every n epochs
    "checkpointfreq": 2,
}



In [18]:
# Display the resolved hyperparameters as a sanity check before launching the job.
hp

{'epochs': 30,
 'earlystoppingpatience': 5,
 'batch': 8,
 'trainfile': 'train.in',
 'classfile': 'GENE.eval',
 'gradaccumulation': 4,
 'log-level': 'INFO',
 'maxseqlen': 512,
 'lr': 1e-05,
 'finetune': 0,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2}

In [19]:
# Display the S3 inputs that will be passed to estimator.fit().
inputs

{'train': 's3://aegovan-data/Biocreative-gene-mention/train data/train.in',
 'class': 's3://aegovan-data/Biocreative-gene-mention/train data/GENE.eval'}

In [20]:
# Metrics extracted from the training log. Each regex captures the number that
# follows a "###score: <name>###" marker printed by the training script.
# Raw strings keep "\d" as a regex token instead of an invalid string escape.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainScore",
     "Regex": r"###score: train_score### (\d*[.]?\d*)"},
    {"Name": "ValidationScore",
     "Regex": r"###score: val_score### (\d*[.]?\d*)"},
]

In [21]:
# Toggle for spot training; set to False to train without spot instances.
use_spot = True

# Training time limit (2 days) and extra allowance for waiting on spot capacity
# (5 minutes), both in seconds.
train_max_run_secs = 2 * 24 * 60 * 60
spot_wait_sec = 5 * 60

# A wait limit is only set when spot instances are used.
max_wait_time_secs = (train_max_run_secs + spot_wait_sec) if use_spot else None

# Local mode: no spot instances, and a smaller dataset.
if instance_type == 'local':
    use_spot = False
    max_wait_time_secs = 0
    wait = True
    # NOTE(review): inputs_sample is not defined in any visible cell of this
    # notebook, so this line raises NameError in local mode - define it first.
    inputs = inputs_sample


In [22]:
# Job type label; also used as the base name for the training job.
job_type = "bc2-ner-bert"
base_name = str(job_type)

In [24]:
from sagemaker.pytorch import PyTorch

# Estimator arguments, grouped by concern.
estimator_args = dict(
    # Training code and runtime.
    entry_point='main.py',
    source_dir='src',
    framework_version="1.4.0",
    py_version='py3',
    hyperparameters=hp,
    # Identity, naming and S3 locations.
    role=role,
    base_job_name=base_name,
    output_path=s3_output_path,
    code_location=s3_code_path,
    # Compute resources.
    instance_count=1,
    instance_type=instance_type,
    volume_size=30,
    # Metrics and debugger.
    metric_definitions=metric_definitions,
    debugger_hook_config=False,
    # Spot training limits and checkpoint locations.
    use_spot_instances=use_spot,
    max_run=train_max_run_secs,
    max_wait=max_wait_time_secs,
    checkpoint_s3_uri=s3_checkpoint,
    checkpoint_local_path=sm_localcheckpoint_dir,
)

estimator = PyTorch(**estimator_args)

# Launch the training job and block until it finishes.
estimator.fit(inputs, wait=True)

2020-10-04 20:16:02,884 - sagemaker - INFO - Creating training-job with name: bc2-ner-bert-2020-10-04-09-15-57-317
2020-10-04 09:16:06 Starting - Starting the training job...
2020-10-04 09:16:08 Starting - Launching requested ML instances...
2020-10-04 09:17:06 Starting - Preparing the instances for training......
2020-10-04 09:18:09 Downloading - Downloading input data...
2020-10-04 09:18:43 Training - Downloading the training image.....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-10-04 09:19:59,616 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-10-04 09:19:59,641 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-10-04 09:20:02,677 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-10-04 09:20:02,989 sagemaker-containers INFO     Module default_user

[34m2020-10-04 09:20:36,949 - transformers.configuration_utils - INFO - loading configuration file /opt/ml/checkpoints/config.json[0m
[34m2020-10-04 09:20:36,949 - transformers.configuration_utils - INFO - Model config BertConfig {
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 28996[0m
[34m}
[0m
[34m2020-10-04 09:20:36,950 - transformers.modeling_utils - INFO - loading weights file /opt/ml/checkpoints/pytorch_mod

[34m2020-10-04 09:28:11,756 - trainer - INFO - Validation set result details: 0.43967669377363616 [0m
[34m2020-10-04 09:28:11,812 - trainer - INFO - Snapshotting because the current score 0.43967669377363616 is greater than 0.355314511632622 [0m
[34m2020-10-04 09:28:11,812 - trainer - INFO - Snapshot model to /opt/ml/model/best_snaphsotmodel.pt[0m
[34m2020-10-04 09:28:11,813 - transformers.configuration_utils - INFO - Configuration saved in /opt/ml/model/config.json[0m
[34m2020-10-04 09:28:12,271 - transformers.modeling_utils - INFO - Model weights saved in /opt/ml/model/pytorch_model.bin[0m
[34m2020-10-04 09:28:12,272 - trainer - INFO - Run    451     1       750     9/375         2% 0.043748 0.023825       0.3525       0.4397[0m
[34m###score: train_loss### 0.043748344724377[0m
[34m###score: val_loss### 0.02382524897996336[0m
[34m###score: train_score### 0.3524729960204662[0m
[34m###score: val_score### 0.43967669377363616[0m
[34mThe current process just got forked

[34m2020-10-04 09:37:21,464 - trainer - INFO - Train set result details:[0m
[34m2020-10-04 09:37:22,201 - trainer - INFO - Train set result details: 0.5287243203346045[0m
[34m2020-10-04 09:37:22,201 - trainer - INFO - Validation set result details:[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34m2020-10-04 09:39:28,706 - trainer - INFO - Validation set result details: 0.5591105689253631 [0m
[34m2020

[34m2020-10-04 09:46:59,465 - trainer - INFO - Validation set result details: 0.5875483911912165 [0m
[34m2020-10-04 09:46:59,521 - trainer - INFO - Snapshotting because the current score 0.5875483911912165 is greater than 0.5866368769058906 [0m
[34m2020-10-04 09:46:59,522 - trainer - INFO - Snapshot model to /opt/ml/model/best_snaphsotmodel.pt[0m
[34m2020-10-04 09:46:59,522 - transformers.configuration_utils - INFO - Configuration saved in /opt/ml/model/config.json[0m
[34m2020-10-04 09:46:59,983 - transformers.modeling_utils - INFO - Model weights saved in /opt/ml/model/pytorch_model.bin[0m
[34m2020-10-04 09:46:59,984 - trainer - INFO - Checkpoint model to /opt/ml/checkpoints/[0m
[34m2020-10-04 09:46:59,985 - transformers.configuration_utils - INFO - Configuration saved in /opt/ml/checkpoints/config.json[0m
[34m2020-10-04 09:47:00,475 - transformers.modeling_utils - INFO - Model weights saved in /opt/ml/checkpoints/pytorch_model.bin[0m
[34m2020-10-04 09:47:00,476 - tra

[34m2020-10-04 09:56:08,980 - trainer - INFO - Train set result details:[0m
[34m2020-10-04 09:56:09,702 - trainer - INFO - Train set result details: 0.6797318205260444[0m
[34m2020-10-04 09:56:09,702 - trainer - INFO - Validation set result details:[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34m2020-10-04 09:58:16,326 - trainer - INFO - Validation set result details: 0.6160117495429067 [0m
[34m2020


2020-10-04 10:04:18 Stopping - Stopping the training job[34m2020-10-04 10:05:46,989 - trainer - INFO - Validation set result details: 0.6027381500230096 [0m
[34m2020-10-04 10:05:46,990 - trainer - INFO - Run   2706    11      4500     9/375         2% 0.016705 0.016400       0.6923       0.6027[0m
[34m###score: train_loss### 0.016705091308491925[0m
[34m###score: val_loss### 0.01639985454439496[0m
[34m###score: train_score### 0.6922777417261519[0m
[34m###score: val_score### 0.6027381500230096[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got forked. Disabling parallelism to avoid deadlocks...[0m
[34mThe current process just got for

## Deploy BERT model

#### Inference container
Ideally, the serving container should already have all the required dependencies installed, to reduce start-up time and ensure that the runtime environment is consistent. This can be implemented using a custom Docker image.

For this demo, to keep things simple, we let the PyTorch container (script mode) install the dependencies during start-up. As a result, you will see some of the initial ping requests fail until all dependencies are installed.


In [25]:
import sagemaker
# Name of the training job launched above (see the "Creating training-job" log line).
training_job = "bc2-ner-bert-2020-10-04-09-15-57-317"

# Re-create an estimator object bound to that existing training job.
estimator = sagemaker.estimator.Estimator.attach(training_job_name=training_job)


2020-10-04 10:07:37 Starting - Preparing the instances for training
2020-10-04 10:07:37 Downloading - Downloading input data
2020-10-04 10:07:37 Training - Training image download completed. Training in progress.
2020-10-04 10:07:37 Stopping - Stopping the training job
2020-10-04 10:07:37 Uploading - Uploading generated training model
2020-10-04 10:07:37 Stopped - Training job stopped


In [None]:
from sagemaker.pytorch import PyTorchModel
from sagemaker import get_execution_role
# `role` comes from the bucket-and-role setup cell; the former `role = role`
# self-assignment was a no-op and has been removed.

# Model data location reported by the attached training job.
model_uri = estimator.model_data

model = PyTorchModel(model_data=model_uri,
                     role=role,
                     framework_version='1.4.0',
                     py_version="py3",
                     entry_point='serve.py',
                     source_dir='src')

# Create the endpoint; the returned predictor is used to invoke it.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.p3.2xlarge')

2020-10-05 00:35:41,222 - sagemaker - INFO - Creating model with name: pytorch-inference-2020-10-04-13-35-41-222
2020-10-05 00:35:45,969 - sagemaker - INFO - Creating endpoint with name pytorch-inference-2020-10-04-13-35-43-250
--------

### Invoke API

In [27]:
class Predictor:
    """Pass-through serializer and JSON deserializer for the endpoint.

    The same class is assigned to both ``predictor.serializer`` and
    ``predictor.deserializer``.
    """

    def serialize(self, x):
        """Return the request payload ``x`` unchanged."""
        return x

    def deserialize(self, x, content_type):
        """Parse a UTF-8 encoded JSON response stream.

        Args:
            x: Readable byte stream holding the response body.
            content_type: Content type of the response (not used).

        Returns:
            The Python object decoded from the JSON document.
        """
        # Imported locally so the class does not depend on a later cell's
        # `import json` having been executed first.
        import json

        try:
            return json.loads(x.read().decode("utf-8"))
        finally:
            # Release the response stream once it has been consumed.
            x.close()

In [28]:
# Plug the pass-through serializer / JSON deserializer into the endpoint client,
# using one separate instance for each role.
predictor.serializer, predictor.deserializer = Predictor(), Predictor()

In [29]:
# A single example sentence, kept as a one-element list.
data = [
    "Hailey-Hailey disease is caused by mutations in  ATP2C1  encoding a novel Ca(2+) pump.",
]

In [35]:
import json

from datasets.biocreative_dataset import BiocreativeDataset
# Number of sentences sent to the endpoint per request.
batch_size = 5

d = BiocreativeDataset("tmp/train.in", None)
for batch_start in range(0, len(d), batch_size):
    # Clamp the end index so a final, shorter batch does not index past the
    # end of the dataset when its length is not a multiple of batch_size.
    batch_end = min(batch_start + batch_size, len(d))

    # d[j][0][0] is presumably the sentence text - confirm against BiocreativeDataset.
    batch_sentences = [d[j][0][0] for j in range(batch_start, batch_end)]

    # One sentence per line, UTF-8 encoded.
    data_bytes = "\n".join(batch_sentences).encode("utf-8")
    predictions = predictor.predict(
        data_bytes,
        initial_args={"Accept": "text/json", "ContentType": "text/csv"},
    )

    # Print every token whose predicted entity is not "O".
    for sentence_tokens in predictions:
        for token in sentence_tokens:
            if token["entity"] != "O":
                print(token["entity"], token["raw_token"])






I-GENE [PAD]
B-GENE [PAD]
I-GENE ##ity
I-GENE [PAD]
I-GENE [PAD]
B-GENE [PAD]
B-GENE %
I-GENE ##virus
B-GENE [PAD]
I-GENE [PAD]
B-GENE [PAD]
B-GENE [PAD]
B-GENE [PAD]
B-GENE [PAD]
B-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
B-GENE [PAD]
B-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
B-GENE [PAD]
B-GENE Thus
B-GENE as
B-GENE -
I-GENE 2
I-GENE ,
I-GENE Sur
B-GENE )
I-GENE ,
I-GENE Y
I-GENE ,
I-GENE Sur
I-GENE ##f
B-GENE ,
I-GENE and
I-GENE Y
I-GENE ##15
I-GENE ##17
I-GENE ##2
B-GENE Sur
I-GENE ##f
I-GENE -
I-GENE 5
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
B-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
B-GENE [PAD]
B-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE [PAD]
I-GENE )
I-GENE and
I-GENE 2
I-GENE (
I-GENE /
I-GENE th
I-GENE p
I-GENE [PAD]
B-GENE [PAD]
I-GENE [PAD]
B-GENE Zhu
B-GENE [PAD]
I-GENE ##F
I-GENE ##H
I-GEN

## Delete endpoint

In [36]:
# Remove the deployed endpoint now that the predictions are done.
predictor.delete_endpoint()

2020-10-05 00:29:40,004 - sagemaker - INFO - Deleting endpoint configuration with name: pytorch-inference-2020-10-04-10-16-55-684
2020-10-05 00:29:42,109 - sagemaker - INFO - Deleting endpoint with name: pytorch-inference-2020-10-04-10-16-55-684
