# Train SNLI on SageMaker using PyTorch



In [1]:
import sys, os
import logging

# Make modules under ./src importable from this notebook.
sys.path.append("src")

# Send INFO-and-above log records to stdout so they show up in the cell output.
logging.basicConfig(
    level="INFO",
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
)

### Bucket and role set up

In [2]:
import sagemaker, boto3
from sagemaker import get_execution_role
# SageMaker session built from the default AWS credentials/region.
sm_session = sagemaker.session.Session()

# Look up the caller's AWS account id through STS.
sts_client = boto3.client("sts")
account_id = sts_client.get_caller_identity()["Account"]

# Execution role ARN assembled from the account id and a fixed role name.
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190508T110816"

2024-03-16 21:29:44,898 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/aparnaelangovan/Library/Application Support/sagemaker/config.yaml
2024-03-16 21:29:45,311 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2024-03-16 21:29:45,492 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
# Bucket and key prefix under which all data and artifacts for this notebook live.
data_bucket = "aegovan-data"
data_bucket_prefix = "nli"

# SNLI dataset files (train / dev / test splits).
s3_uri_data = f"s3://{data_bucket}/{data_bucket_prefix}/snli"
s3_uri_train = f"{s3_uri_data}/snli_1.0_train.jsonl"
s3_uri_val = f"{s3_uri_data}/snli_1.0_dev.jsonl"
s3_uri_test = f"{s3_uri_data}/snli_1.0_test.jsonl"

# Locations for job output, uploaded source code and checkpoints.
s3_output_path = f"s3://{data_bucket}/{data_bucket_prefix}/output"
s3_code_path = f"s3://{data_bucket}/{data_bucket_prefix}/code"
s3_checkpoint = f"s3://{data_bucket}/{data_bucket_prefix}/checkpoint"

## Train

This section shows how to train BERT on SageMaker, optionally using Spot instances (controlled by the `use_spot` flag below).

In [4]:
# Channel name -> S3 location of the full SNLI train and validation files.
inputs_full = dict(train=s3_uri_train, val=s3_uri_val)

# Training on the full dataset can take 4-5 hours. For a quick test, point
# `inputs` at a smaller sample instead (no `inputs_sample` is defined in the
# visible part of this notebook, so one would have to be created first).
inputs = inputs_full

In [5]:
# Checkpoint directory inside the training container; passed to the script as "checkpointdir".
sm_localcheckpoint_dir = "/opt/ml/checkpoints/"

In [6]:
# Instance type used for the training job.
instance_type = "ml.p3.2xlarge"

# GPUs per supported instance type; used to scale the batch size.
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}

In [7]:
# Hyperparameters passed to the training entry point.
hp = {
    "epochs": 10,
    "earlystoppingpatience": 3,
    # Increasing batch size might end up with CUDA OOM error, increase grad accumulation instead.
    # The per-GPU batch of 8 is scaled by the GPU count of the instance type; instance
    # types missing from the map (e.g. 'local') are treated as 1 GPU instead of raising KeyError.
    "batch": 8 * instance_type_gpu_map.get(instance_type, 1),
    # Only the file names are passed; the files themselves arrive through the input channels.
    "trainfile": s3_uri_train.split("/")[-1],
    "valfile": s3_uri_val.split("/")[-1],
    # The number of steps to accumulate gradients for
    "gradaccumulation": 4,
    "log-level": "INFO",
    # This param depends on your model max pos embedding size or when large you might end up with CUDA OOM error
    "maxseqlen": 512,
    # Make sure the lr is quite small, as this is a pretrained model..
    "lr": 0.00001,
    # Use finetuning (set to 1), if you only want to change the weights in the final classification layer..
    "finetune": 0,
    "checkpointdir": sm_localcheckpoint_dir,
    # Checkpoints once every n epochs
    "checkpointfreq": 2,
}



In [8]:
# Display the hyperparameter dictionary that is passed to the estimator.
hp

{'epochs': 10,
 'earlystoppingpatience': 3,
 'batch': 8,
 'trainfile': 'snli_1.0_train.jsonl',
 'valfile': 'snli_1.0_dev.jsonl',
 'gradaccumulation': 4,
 'log-level': 'INFO',
 'maxseqlen': 512,
 'lr': 1e-05,
 'finetune': 0,
 'checkpointdir': '/opt/ml/checkpoints/',
 'checkpointfreq': 2}

In [9]:
# Display the input channel mapping that is passed to estimator.fit.
inputs

{'train': 's3://aegovan-data/nli/snli/snli_1.0_train.jsonl',
 'val': 's3://aegovan-data/nli/snli/snli_1.0_dev.jsonl'}

In [10]:
# Metrics scraped from the training job's log output. Each regex captures the
# number printed after a "###score: <metric>###" marker.
# Raw strings keep `\d` as a regex token rather than an invalid string escape.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainScore",
     "Regex": r"###score: train_score### (\d*[.]?\d*)"},
    {"Name": "ValidationScore",
     "Regex": r"###score: val_score### (\d*[.]?\d*)"},
]

In [11]:
# Set to True to run the training job on spot instances.
use_spot = False

# Maximum training time (2 days) and the extra time allowed for waiting on spot capacity.
train_max_run_secs = 2 * 24 * 60 * 60
spot_wait_sec = 5 * 60
max_wait_time_secs = train_max_run_secs + spot_wait_sec

# The wait limit is only set for spot training.
if not use_spot:
    max_wait_time_secs = None

# During local mode, no spot.., use smaller dataset
if instance_type == 'local':
    use_spot = False
    max_wait_time_secs = 0
    wait = True
    # Use a smaller dataset to run locally when one has been defined. Falling back
    # to the current `inputs` avoids a NameError when `inputs_sample` is not defined.
    try:
        inputs = inputs_sample
    except NameError:
        logging.warning("inputs_sample is not defined; local mode will use the current inputs")


In [12]:
# Prefix for the generated training job name.
job_type = "snli-classification"
base_name = job_type

In [None]:
from sagemaker.pytorch import PyTorch

# Spot-only settings. Managed spot training needs a wait limit, and syncing
# checkpoints to S3 lets an interrupted job be resumed. They are passed only when
# `use_spot` is True, so the on-demand configuration is unchanged.
spot_kwargs = {}
if use_spot:
    spot_kwargs = {
        "max_wait": max_wait_time_secs,
        "checkpoint_s3_uri": s3_checkpoint,
        "checkpoint_local_path": sm_localcheckpoint_dir,
    }

estimator = PyTorch(
    #entry_point='main_train_k_fold.py',
    entry_point='main.py',
    source_dir='../src',
    role=role,
    framework_version="1.12.0",
    py_version='py38',
    instance_count=1,
    instance_type=instance_type,
    hyperparameters=hp,
    output_path=s3_output_path,
    metric_definitions=metric_definitions,
    volume_size=30,
    code_location=s3_code_path,
    debugger_hook_config=False,
    base_job_name=base_name,
    use_spot_instances=use_spot,
    max_run=train_max_run_secs,
    **spot_kwargs,
)

# Start the training job and block until it finishes, streaming its logs.
estimator.fit(inputs, wait=True)

2024-03-16 21:29:47,373 - sagemaker.image_uris - INFO - image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
2024-03-16 21:29:47,403 - sagemaker - INFO - Creating training-job with name: snli-classification-2024-03-17-04-29-46-732
2024-03-17 04:29:47 Starting - Starting the training job......
2024-03-17 04:30:24 Starting - Preparing the instances for training...
2024-03-17 04:31:09 Downloading - Downloading input data...
2024-03-17 04:31:44 Downloading - Downloading the training image..................
2024-03-17 04:34:55 Training - Training image download completed. Training in progress...

## Deploy BERT model

#### Inference container
Ideally, the serving container should already have all the required dependencies installed, to reduce start-up time and ensure that the runtime environment is consistent. This can be implemented using a custom Docker image.

But for this demo, to keep things simple, we let the PyTorch container (script mode) install the dependencies during start-up. As a result, you will see some of the initial ping requests fail until all dependencies are installed.


In [None]:
from sagemaker.pytorch import PyTorchModel

# Reuse the `role` ARN built during the bucket/role set-up instead of calling
# get_execution_role() again, so training and hosting use the same role.
# NOTE(review): get_execution_role() presumably cannot resolve a role when the
# notebook runs with local shared credentials - confirm.

# Model artifact produced by the training job.
model_uri = estimator.model_data

# NOTE(review): the training cell uses source_dir='../src' while this one uses
# 'src' - confirm which path is correct relative to the notebook.
model = PyTorchModel(model_data=model_uri,
                     role=role,
                     py_version="py38",
                     framework_version='1.12.0',
                     entry_point='serve.py',
                     source_dir='src')

# Create a real-time endpoint backed by a single GPU instance.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.p3.2xlarge')

### Invoke API

In [None]:
# Sample sentences sent to the endpoint, one per element.
data = [
    "Q-workshop is a Polish company located in Poznań that specializes in designand production of polyhedral dice",
    "ET is a sci-fi directed by steven spielberg",
]

In [None]:
import json


class TextSerDes:
    """Serializer/deserializer pair for the text endpoint.

    Requests are sent as newline-separated UTF-8 text; responses are parsed
    as UTF-8 encoded JSON.
    """

    # Defaults used by the SageMaker predictor when the caller does not supply
    # explicit "ContentType" / "Accept" request arguments. They mirror the
    # values this notebook passes through `initial_args`.
    CONTENT_TYPE = "text/csv"
    ACCEPT = ("text/json",)

    def serialize(self, x):
        """Join the strings in ``x`` with newlines and return them as UTF-8 bytes."""
        return "\n".join(x).encode("utf-8")

    def deserialize(self, x, content_type):
        """Read the response stream ``x`` and return the decoded JSON payload.

        ``content_type`` is accepted for interface compatibility and is not used.
        The stream is closed after reading, when it supports ``close()``.
        """
        try:
            return json.loads(x.read().decode("utf-8"))
        finally:
            # Release the underlying response body even if parsing fails.
            close = getattr(x, "close", None)
            if callable(close):
                close()

In [None]:

# Plug the custom text codec into the predictor for both request and response.
predictor.serializer = TextSerDes()
predictor.deserializer = TextSerDes()

# Explicit request headers sent along with the invocation.
request_args = {"Accept": "text/json", "ContentType": "text/csv"}
response = predictor.predict(data, initial_args=request_args)

response

## Delete endpoint

In [None]:
# Tear down the endpoint that was created by model.deploy.
predictor.delete_endpoint()