# BERT text classification on the BC3 ACT dataset using PyTorch





In [1]:
!pip install -r requirements_notebook.txt



In [2]:
import sys, os, shutil
import logging

# Add the local "src" directory to the module search path.
sys.path.append("src")

# Send log records at INFO level and above to stdout, each prefixed with a
# timestamp, the logger name and the level.
_log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(level="INFO", format=_log_format, handlers=[_stdout_handler])

In [None]:
# When False, the objects under the S3 checkpoint prefix are deleted before
# training (see the cell that calls delete_s3_objects); set to True to keep them.
resume_from_checkpoint=False

In [None]:
import boto3

def delete_s3_objects(s3_uri):
    """Delete every S3 object whose key starts with the prefix in ``s3_uri``.

    The listing is paginated, so all matching objects are removed rather than
    only the first page of results. The key of each object is printed as it
    is deleted.

    Args:
        s3_uri: Location of the form ``s3://<bucket>/<key prefix>``. An empty
            prefix matches every object in the bucket.
    """
    # Local import keeps the helper self-contained.
    import boto3

    # Everything before the first "/" is the bucket, the remainder is the prefix.
    bucket, _, prefix = s3_uri.replace("s3://", "").partition("/")

    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # A page without matches carries no "Contents" entry.
        for item in page.get("Contents", []):
            print("Deleting {}".format(item["Key"]))
            client.delete_object(Bucket=bucket, Key=item["Key"])

In [None]:
# Unless resuming, remove whatever is stored under the S3 checkpoint prefix.
# NOTE(review): s3_checkpoint is assigned further down, in the cell that builds
# the S3 URIs, so on a freshly restarted kernel this cell raises NameError
# unless that cell has been run first. Consider moving this cell below it.
if not resume_from_checkpoint:
    delete_s3_objects(s3_checkpoint)

### Bucket and role set up

In [3]:
import sagemaker, boto3
from sagemaker import get_execution_role
# Two SageMaker sessions are created here; the later cells use sm_session.
sm_session = sagemaker.session.Session()
sagemaker_session = sagemaker.Session()

# Account id of the calling identity and the region of the default boto3 session.
_sts_client = boto3.client('sts')
account_id = _sts_client.get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# The execution role ARN is assembled from the account id and a fixed role name.
# Alternative left disabled by the author: role = sagemaker.get_execution_role()
_role_arn_template = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
role = _role_arn_template.format(account_id)


2021-01-20 20:26:47,819 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2021-01-20 20:26:48,084 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2021-01-20 20:26:48,248 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [4]:
# All artefacts live in the session's default bucket under one common prefix.
data_bucket = sm_session.default_bucket()
data_bucket_prefix = "bert-bc3ast-classify"
_bucket_and_prefix = (data_bucket, data_bucket_prefix)

# Dataset objects.
# NOTE(review): the keys end in .csv although the files uploaded to them are
# written locally as .tsv and read back with a tab delimiter.
s3_uri_data = "s3://{}/{}/data".format(*_bucket_and_prefix)
s3_uri_train = "{}/{}".format(s3_uri_data, "train.csv")
s3_uri_val = "{}/{}".format(s3_uri_data, "dev.csv")
s3_uri_test = "{}/{}".format(s3_uri_data, "test.csv")

# Training job output, uploaded source code and checkpoints.
s3_output_path = "s3://{}/{}/output".format(*_bucket_and_prefix)
s3_code_path = "s3://{}/{}/code".format(*_bucket_and_prefix)
s3_checkpoint = "s3://{}/{}/checkpoint".format(*_bucket_and_prefix)

In [5]:
# When True, the dataset-preparation cell below preprocesses the raw files,
# splits them and uploads the results to S3; set to False to skip that step.
prepare_dataset = True

## Prepare dataset

In [6]:
# Local working directories: raw files are read from raw_data_dir and the
# processed files are written to a sub-directory of it.
raw_data_dir = "tmp"
processed_out_dir = os.path.join(raw_data_dir, "processd")

# Recreate the output directory so that every run starts with it empty.
_stale_output_exists = os.path.exists(processed_out_dir)
if _stale_output_exists:
    shutil.rmtree(processed_out_dir)
os.makedirs(processed_out_dir, exist_ok=True)


In [7]:
from utils.bc3ast_preprocess import BC3ASTPreprocess
from s3_util import S3Util

if prepare_dataset:
    # Raw record files and their gold-standard label files, read from raw_data_dir.
    train_file = os.path.join(raw_data_dir, "bc3_act_all_records.tsv")
    train_label_file = os.path.join(raw_data_dir, "bc3_act_gold_standard.tsv")
    test_file = os.path.join(raw_data_dir, "bc3_act_all_records_test.tsv")
    test_label_file = os.path.join(raw_data_dir, "bc3_act_gold_standard_test.tsv")

    # Files produced by preprocessing, written to processed_out_dir.
    processed_train_val_file = os.path.join(processed_out_dir, "train_full.tsv")
    processed_train_file = os.path.join(processed_out_dir, "train.tsv")
    processed_val_file = os.path.join(processed_out_dir, "val.tsv")
    processed_test_file = os.path.join(processed_out_dir, "test.tsv")

    # Process the full training data, then divide it into train and validation
    # files (split=0.8 is presumably the training fraction; confirm in
    # BC3ASTPreprocess.split).
    BC3ASTPreprocess().process(train_file, train_label_file, processed_train_val_file)
    BC3ASTPreprocess().split(processed_train_val_file, processed_train_file, processed_val_file, split=0.8)

    # The test data only needs processing.
    BC3ASTPreprocess().process(test_file, test_label_file, processed_test_file)

    # Upload each processed file to its S3 destination, in train/val/test order.
    for local_path, s3_destination in (
        (processed_train_file, s3_uri_train),
        (processed_val_file, s3_uri_val),
        (processed_test_file, s3_uri_test),
    ):
        S3Util().upload_file(local_path, s3_destination)
    
    

2021-01-20 20:26:53,613 - utils.bc3ast_preprocess - INFO - Writing to <_io.TextIOWrapper name='tmp/processd/train_full.tsv' mode='w' encoding='UTF-8'>
2021-01-20 20:26:54,228 - utils.bc3ast_preprocess - INFO - Writing to <_io.TextIOWrapper name='tmp/processd/test.tsv' mode='w' encoding='UTF-8'>
Uploading file tmp/processd/train.tsv to s3://sagemaker-us-east-2-324346001917/bert-bc3ast-classify/data/train.csv in 9.874878 seconds
Uploading file tmp/processd/val.tsv to s3://sagemaker-us-east-2-324346001917/bert-bc3ast-classify/data/dev.csv in 7.431086 seconds
Uploading file tmp/processd/test.tsv to s3://sagemaker-us-east-2-324346001917/bert-bc3ast-classify/data/test.csv in 14.291215 seconds


## Train

This section shows how to train BERT on SageMaker using managed Spot instances.

In [8]:
# Input channels for the training job, each pointing at one S3 object.
inputs_full = {"train": s3_uri_train, "val": s3_uri_val}

# Channels handed to estimator.fit(); this is the same dict object as inputs_full.
inputs = inputs_full

In [9]:
# Checkpoint directory: passed to the training script as the "checkpointdir"
# hyperparameter and to the estimator as checkpoint_local_path.
sm_localcheckpoint_dir="/opt/ml/checkpoints/"

In [10]:
# Instance type used for the training job.
instance_type = "ml.p3.2xlarge"

# Per-instance-type multiplier (named as a GPU count) used to scale the batch size.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [11]:
# Hyperparameters forwarded to the training script.
hp = {
    "epochs": 30,
    "earlystoppingpatience": 3,
    # Batch size scales with the GPU count of the instance type. A larger batch
    # may hit CUDA OOM errors; raise "gradaccumulation" instead.
    "batch": instance_type_gpu_map[instance_type] * 8,
    # File names only: the last path segment of the corresponding S3 URIs.
    "trainfile": s3_uri_train.rsplit("/", 1)[-1],
    "valfile": s3_uri_val.rsplit("/", 1)[-1],
    "datasetfactory": "datasets.bc3ast_dataset_factory.BC3ASTDatasetFactory",
    # Number of steps over which gradients are accumulated.
    "gradaccumulation": 4,
    "log-level": "INFO",
    # Bounded by the model's maximum position-embedding size; large values may
    # also lead to CUDA OOM errors.
    "maxseqlen": 512,
    # Keep the learning rate small because the model is pretrained.
    "lr": 0.00001,
    # Set to 1 to update only the weights of the final classification layer.
    "finetune": 0,
    "checkpointdir": sm_localcheckpoint_dir,
    # A checkpoint is written once every n epochs.
    "checkpointfreq": 2,
}



In [12]:
# Display the hyperparameters that will be passed to the estimator.
hp

{'epochs': 30,
 'earlystoppingpatience': 3,
 'batch': 8,
 'trainfile': 'train.csv',
 'valfile': 'dev.csv',
 'datasetfactory': 'datasets.bc3ast_dataset_factory.BC3ASTDatasetFactory',
 'gradaccumulation': 4,
 'log-level': 'INFO',
 'maxseqlen': 512,
 'lr': 1e-05,
 'finetune': 0,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2}

In [13]:
# Display the input channels that will be passed to estimator.fit().
inputs

{'train': 's3://sagemaker-us-east-2-324346001917/bert-bc3ast-classify/data/train.csv',
 'val': 's3://sagemaker-us-east-2-324346001917/bert-bc3ast-classify/data/dev.csv'}

In [14]:
# Metric definitions passed to the estimator. Each pattern matches one of the
# "###score: <name>### <value>" lines printed by the training job and captures
# the value. Raw strings are used so that "\d" reaches the regex engine as
# written instead of being parsed as an invalid string escape.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainScore",
     "Regex": r"###score: train_score### (\d*[.]?\d*)"},
    {"Name": "ValidationScore",
     "Regex": r"###score: val_score### (\d*[.]?\d*)"},
]

In [15]:
# Set to True to run the training job on Spot instances.
use_spot = True
train_max_run_secs = 2 * 24 * 60 * 60  # two days
spot_wait_sec = 5 * 60  # five minutes
# Passed to the estimator as max_wait: the run limit plus the extra wait allowance.
max_wait_time_secs = train_max_run_secs + spot_wait_sec

if not use_spot:
    # Without Spot there is no wait limit and checkpointing is switched off.
    max_wait_time_secs = None
    s3_checkpoint = None
    sm_localcheckpoint_dir = None
    # The default keeps this cell re-runnable once the key has been removed.
    hp.pop("checkpointdir", None)

# Local mode: no Spot instances, and a smaller dataset.
if instance_type == 'local':
    use_spot = False
    max_wait_time_secs = 0
    wait = True
    # NOTE(review): inputs_sample is not defined in any cell visible in this
    # notebook, so this branch raises NameError unless it is defined elsewhere.
    inputs = inputs_sample
    
    


In [16]:
# Base name for the training job; passed to the estimator as base_job_name.
job_type = "bert-sst2-classification"
base_name = "{}".format(job_type)

In [17]:
from sagemaker.pytorch import PyTorch

# Collect the estimator settings in one place before building it.
_estimator_settings = dict(
    # Training script and the directory it is packaged from.
    entry_point='main.py',
    source_dir='src',
    role=role,
    framework_version="1.4.0",
    py_version='py3',
    # Compute resources.
    instance_count=1,
    instance_type=instance_type,
    volume_size=30,
    # What the job receives and where its results are stored.
    hyperparameters=hp,
    output_path=s3_output_path,
    code_location=s3_code_path,
    metric_definitions=metric_definitions,
    debugger_hook_config=False,
    base_job_name=base_name,
    # Spot usage, time limits and checkpoint locations.
    use_spot_instances=use_spot,
    max_run=train_max_run_secs,
    max_wait=max_wait_time_secs,
    checkpoint_s3_uri=s3_checkpoint,
    checkpoint_local_path=sm_localcheckpoint_dir,
)
estimator = PyTorch(**_estimator_settings)

# Start the training job on the configured input channels, with wait=True.
estimator.fit(inputs, wait=True)

2021-01-20 20:27:30,161 - sagemaker - INFO - Creating training-job with name: bert-sst2-classification-2021-01-20-09-27-26-442
2021-01-20 09:27:32 Starting - Starting the training job...
2021-01-20 09:27:33 Starting - Launching requested ML instances...
2021-01-20 09:28:37 Starting - Preparing the instances for training......
2021-01-20 09:29:45 Downloading - Downloading input data...
2021-01-20 09:30:20 Training - Downloading the training image......
2021-01-20 09:31:35 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-01-20 09:31:37,557 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-01-20 09:31:37,582 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-01-20 09:31:37,586 sagemaker_pytorch_container.training INFO     Invoking us

[34m{'trainfile': 'train.csv', 'traindir': '/opt/ml/input/data/train', 'valfile': 'dev.csv', 'valdir': '/opt/ml/input/data/val', 'datasetfactory': 'datasets.bc3ast_dataset_factory.BC3ASTDatasetFactory', 'outdir': '/opt/ml/output/data', 'modeldir': '/opt/ml/model', 'checkpointdir': '/opt/ml/checkpoints/', 'checkpointfreq': '2', 'earlystoppingpatience': 3, 'epochs': 30, 'gradaccumulation': 4, 'batch': 8, 'lr': 1e-05, 'finetune': 0, 'maxseqlen': 512, 'log_level': 'INFO'}[0m
[34m2021-01-20 09:31:50,435 - builder - INFO - Retrieving Tokeniser[0m
[34m2021-01-20 09:31:50,518 - transformers.file_utils - INFO - https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt not found in cache or force_download set to True, downloading to /tmp/tmpgor2xbls[0m
[34m2021-01-20 09:31:50,636 - transformers.file_utils - INFO - copying /tmp/tmpgor2xbls to cache at /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2

[34m2021-01-20 09:35:27,935 - bert_train - INFO - Train set result details:[0m
[34m2021-01-20 09:35:27,937 - bert_train - INFO - Train set result details: 0.831140350877193[0m
[34m2021-01-20 09:35:27,937 - bert_train - INFO - Validation set result details:[0m
[34m2021-01-20 09:35:33,316 - bert_train - INFO - Validation set result details: 0.8157894736842105 [0m
[34m2021-01-20 09:35:33,316 - bert_train - INFO - Snapshotting because the current score 0.8157894736842105 is greater than 0.7850877192982456 [0m
[34m2021-01-20 09:35:33,316 - bert_train - INFO - Snapshot model to /opt/ml/model/best_snaphsotmodel.pt[0m
[34m2021-01-20 09:35:33,890 - bert_train - INFO - Checkpoint model to /opt/ml/checkpoints/checkpoint.pt[0m
[34m2021-01-20 09:35:34,440 - bert_train - INFO - Run    211     2       684     9/228         4% 0.051172 0.052492       0.8311       0.8158[0m
[34m###score: train_loss### 0.05117241614253113[0m
[34m###score: val_loss### 0.05249242824420594[0m
[34m###sc

[34m2021-01-20 09:45:50,566 - bert_train - INFO - Train set result details:[0m
[34m2021-01-20 09:45:50,568 - bert_train - INFO - Train set result details: 0.9906798245614035[0m
[34m2021-01-20 09:45:50,568 - bert_train - INFO - Validation set result details:[0m
[34m2021-01-20 09:45:55,991 - bert_train - INFO - Validation set result details: 0.8706140350877193 [0m
[34m2021-01-20 09:45:55,991 - bert_train - INFO - Checkpoint model to /opt/ml/checkpoints/checkpoint.pt[0m
[34m2021-01-20 09:45:56,534 - bert_train - INFO - Run    833    11      2736     9/228         4% 0.005537 0.057184       0.9907       0.8706[0m
[34m###score: train_loss### 0.00553677411619247[0m
[34m###score: val_loss### 0.05718375396865763[0m
[34m###score: train_score### 0.9906798245614035[0m
[34m###score: val_score### 0.8706140350877193[0m
[34m2021-01-20 09:46:59,589 - bert_train - INFO - Train set result details:[0m
[34m2021-01-20 09:46:59,592 - bert_train - INFO - Train set result details: 0.990

## Deploy BERT model

#### Inference container
Ideally, the serving container should already have all the required dependencies installed, to reduce start-up time and to ensure that the runtime environment is consistent. This can be implemented using a custom Docker image.

For this demo, to keep things simple, we let the PyTorch container's script mode install the dependencies during start-up. As a result, some of the initial ping requests will fail until all dependencies are installed.


In [19]:
from sagemaker.pytorch import PyTorchModel
from sagemaker import get_execution_role

# Model artifact reported by the trained estimator.
model_uri = estimator.model_data

# Wrap the artifact in a model that is served by src/serve.py.
_model_settings = dict(
    model_data=model_uri,
    role=role,
    py_version="py3",
    framework_version='1.4.0',
    entry_point='serve.py',
    source_dir='src',
)
model = PyTorchModel(**_model_settings)

# Create an endpoint backed by one ml.p3.2xlarge instance.
predictor = model.deploy(instance_type='ml.p3.2xlarge', initial_instance_count=1)

2021-01-20 21:05:28,168 - sagemaker - INFO - Creating model with name: pytorch-inference-2021-01-20-10-05-28-167
2021-01-20 21:05:32,255 - sagemaker - INFO - Creating endpoint with name pytorch-inference-2021-01-20-10-05-29-534
----------------!

### Invoke API

In [20]:
import json


class TextSerDes:
    """Serializer and deserializer for the text classification endpoint.

    ``serialize`` turns texts into newline-separated UTF-8 bytes and
    ``deserialize`` parses a UTF-8 encoded JSON response body.
    """

    def serialize(self, x):
        """Encode the given texts as one UTF-8 payload, one text per line.

        Args:
            x: An iterable of strings, or a single string. A single string is
                treated as one text rather than being split into characters.

        Returns:
            bytes: The texts joined with a newline and UTF-8 encoded.
        """
        if isinstance(x, str):
            x = [x]
        data_bytes = "\n".join(x).encode("utf-8")
        return data_bytes

    def deserialize(self, x, content_type):
        """Read a response stream and parse its content as JSON.

        Args:
            x: Readable binary stream holding the response body. It is closed
                once it has been read.
            content_type: Content type of the response (not used).

        Returns:
            The object decoded from the JSON payload.
        """
        try:
            payload = x.read().decode("utf-8")
        finally:
            # Release the underlying stream even if reading or decoding fails.
            x.close()
        return json.loads(payload)

In [21]:
def chunk_predict(predictor, data, chunk_size=50):
    """Run predictions over ``data`` in consecutive slices of ``chunk_size``.

    The predictor's serializer and deserializer are replaced with
    ``TextSerDes`` instances before any request is sent.

    Args:
        predictor: Object exposing ``predict(data, initial_args=...)``.
        data: Sliceable sequence of inputs.
        chunk_size: Maximum number of inputs sent per request.

    Returns:
        list: The results of all requests, concatenated in request order.
    """
    predictor.serializer = TextSerDes()
    predictor.deserializer = TextSerDes()

    collected = []
    for start in range(0, len(data), chunk_size):
        batch = data[start:start + chunk_size]
        batch_result = predictor.predict(
            batch,
            initial_args={"Accept": "text/json", "ContentType": "text/csv"},
        )
        collected.extend(batch_result)
    return collected

In [22]:
from s3_util import S3Util
import csv, io

def load_test_csv(s3_uri):
    """Download a tab-separated file from S3 and split it into three columns.

    Each row is read as text, label and record id, in that order. Blank lines
    are skipped.

    Args:
        s3_uri: S3 location of the file, handed to ``S3Util.download_object``.

    Returns:
        tuple: ``(inputs, actuals, ids)``, three lists of equal length holding
        the first, second and third column of every row.
    """
    data = S3Util().download_object(s3_uri).decode("utf-8")

    csv_reader = csv.reader(io.StringIO(data), delimiter='\t',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)

    inputs, actuals, ids = [], [], []
    for row in csv_reader:
        # csv.reader yields an empty list for a blank line.
        if not row:
            continue
        text, label, record_id = row[0], row[1], row[2]
        inputs.append(text)
        actuals.append(label)
        ids.append(record_id)
    return inputs, actuals, ids
        

def write_predictions_csv(test_data, predictions, output_file):
    """Write ``test_data`` and ``predictions`` side by side to a tab-separated file.

    Row *i* holds ``test_data[i]`` followed by ``predictions[i]``. If the two
    iterables differ in length, the surplus items of the longer one are not
    written.

    Args:
        test_data: Iterable of values for the first column.
        predictions: Iterable of values for the second column.
        output_file: Path of the file to create or overwrite.
    """
    # newline="" leaves line endings to the csv module, as its documentation
    # requires. UTF-8 matches the encoding used when the test file is read.
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        csv_writer = csv.writer(f, delimiter='\t',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerows(zip(test_data, predictions))

In [23]:


# Load the test set from S3 and send its texts to the endpoint in chunks.
test_data, test_labels, test_ids = load_test_csv(s3_uri_test)
response = chunk_predict(predictor, test_data)

# The first key of each response item is taken as the predicted label
# (assumes each item is a dict; confirm against the output of serve.py).
predictions_label = [list(scores)[0] for scores in response]

# Store one "<id>\t<predicted label>" row per test record.
write_predictions_csv(test_ids, predictions_label, "bc3act-output.csv")

In [24]:
from sklearn.metrics import accuracy_score

# Accuracy of the predicted labels against the labels from the test file.
accuracy_score(y_true=test_labels, y_pred=predictions_label)

0.8206666666666667

In [25]:
from sklearn.metrics import f1_score

# F1 score with the label "1" treated as the positive class.
f1_score(y_true=test_labels, y_pred=predictions_label, pos_label="1")

0.5871066768994628

## Delete endpoint

In [26]:
# Remove the endpoint created by model.deploy() above once it is no longer needed.
predictor.delete_endpoint()

2021-01-20 21:17:38,808 - sagemaker - INFO - Deleting endpoint configuration with name: pytorch-inference-2021-01-20-10-05-29-534
2021-01-20 21:17:41,506 - sagemaker - INFO - Deleting endpoint with name: pytorch-inference-2021-01-20-10-05-29-534
