In [25]:
!pip install smdebug

Collecting smdebug
  Downloading smdebug-1.0.12-py2.py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.1/270.1 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyinstrument==3.4.2 (from smdebug)
  Downloading pyinstrument-3.4.2-py2.py3-none-any.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.3/83.3 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyinstrument-cext>=0.2.2 (from pyinstrument==3.4.2->smdebug)
  Downloading pyinstrument_cext-0.2.4.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyinstrument-cext
  Building wheel for pyinstrument-cext (setup.py) ... [?25ldone
[?25h  Created wheel for pyinstrument-cext: filename=pyinstrument_cext-0.2.4-cp310-cp310-linux_x86_64.whl size=6299 sha256=e9cacf537737d5bbb75ed0e7e274d0c5dc9b39cd0f0d1bc7169759c3c052b6b2
  Stored in directory: /home/ec2-user/.cache/pip/wheels/0f/8b/7a/5f7fd1dd6d3cbb3d

In [1]:
import sagemaker
import boto3
import os

We can now declare some global and environment variables and upload the data to a bucket we'd made previously in S3. 

In [2]:
# Shared SageMaker session used by every cell below
session = sagemaker.Session()

# Project bucket; exported so `!aws ...` shell cells can use ${DEFAULT_S3_BUCKET}
bucket = "aws-bin-image-project"
os.environ["DEFAULT_S3_BUCKET"] = bucket
print(f"Default Bucket: {bucket}")

region = session.boto_region_name
print(f"AWS Region: {region}")

role = sagemaker.get_execution_role()
print(f"RoleArn: {role}")

# S3 locations for model artefacts and job output, exported as environment variables
os.environ.update(
    {
        'SM_MODEL_DIR': f's3://{bucket}/model/',
        'SM_OUTPUT_DATA_DIR': f's3://{bucket}/output/',
    }
)

# One input channel per dataset split, each pointing at its own S3 prefix
data_channels = {
    split: f"s3://{bucket}/data/{split}" for split in ("train", "test", "valid")
}

Default Bucket: aws-bin-image-project
AWS Region: us-east-1
RoleArn: arn:aws:iam::273181410894:role/service-role/AmazonSageMaker-ExecutionRole-20230907T124178


In [5]:
# Upload each local dataset split (train / test / valid) to the matching data/<split>/
# prefix of the project bucket. ${DEFAULT_S3_BUCKET} is expanded by the shell from the
# environment variable exported in the configuration cell.
!aws s3 sync ./binImages/train s3://${DEFAULT_S3_BUCKET}/data/train/
!aws s3 sync ./binImages/test s3://${DEFAULT_S3_BUCKET}/data/test/
!aws s3 sync ./binImages/valid s3://${DEFAULT_S3_BUCKET}/data/valid/

upload: binImages/train/1/00009.jpg to s3://aws-bin-image-project/data/train/1/00009.jpg
upload: binImages/train/1/00213.jpg to s3://aws-bin-image-project/data/train/1/00213.jpg
upload: binImages/train/1/00232.jpg to s3://aws-bin-image-project/data/train/1/00232.jpg
upload: binImages/train/1/00014.jpg to s3://aws-bin-image-project/data/train/1/00014.jpg
upload: binImages/train/1/00024.jpg to s3://aws-bin-image-project/data/train/1/00024.jpg
upload: binImages/train/1/00214.jpg to s3://aws-bin-image-project/data/train/1/00214.jpg
upload: binImages/train/1/00397.jpg to s3://aws-bin-image-project/data/train/1/00397.jpg
upload: binImages/train/1/00403.jpg to s3://aws-bin-image-project/data/train/1/00403.jpg
upload: binImages/train/1/00148.jpg to s3://aws-bin-image-project/data/train/1/00148.jpg
upload: binImages/train/1/00100.jpg to s3://aws-bin-image-project/data/train/1/00100.jpg
upload: binImages/train/1/00218.jpg to s3://aws-bin-image-project/data/train/1/00218.jpg
upload: binImages/tra

### Hyperparameter Tuning
To perform hyperparameter optimisation (HPO), we specify a tuner, our hyperparameter search space, and the objective metric we want to optimise. We then specify a script (in this case, `hpo.py`) to pass to our tuner that contains instructions on how to carry out the HPO. 

The cell below specifies the hyperparameter ranges that we wish to search through. It also defines our objective metric and its metric definition (a regex that SageMaker applies to the logs emitted by `hpo.py` in order to extract the metric value). The maximum number of jobs and the number of parallel jobs are passed to the tuner in the cell after that.

In [6]:
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

from sagemaker.pytorch import PyTorch


# The tuner minimises the test loss reported by the training script
objective_type = "Minimize"
objective_metric_name = "Average test loss"

# Each regex captures a metric value from a line of the training job's log output
metric_definitions = [
    {"Name": metric_name, "Regex": log_pattern}
    for metric_name, log_pattern in (
        (objective_metric_name, "Testing Loss: ([0-9\\.]+)"),
        ("Accuracy", "Testing Accuracy: ([0-9\\.]+)"),
    )
]

# Search space: learning rate, batch size and the `beta1` hyperparameter
lr_range = ContinuousParameter(0.0001, 0.1)
batch_size_choices = CategoricalParameter([64, 128, 256, 512])
beta1_range = ContinuousParameter(0.6, 0.999)

hyperparameter_ranges = {
    "lr": lr_range,
    "batch-size": batch_size_choices,
    "beta1": beta1_range,
}


In [7]:
# Estimator template: every tuning trial launches `hpo.py` with this configuration
estimator = PyTorch(
    role=role,
    entry_point="./scripts/hpo.py",
    base_job_name='bin-image-hpo',
    framework_version="1.8",
    py_version='py36',
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

# Tuner: at most 10 trials in total, 2 running at any one time
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=10,
    max_parallel_jobs=2,
)

In [9]:
# To smoke-test the estimator on its own before tuning, run instead:
# estimator.fit(data_channels, wait=True)

# Launch the tuning job and block until every trial has finished
tuner.fit(inputs=data_channels, wait=True)

# Keep the job name so the results can be looked up in later cells
tuning_job_description = tuner.describe()
tuner_name = tuning_job_description['HyperParameterTuningJobName']
print(f'Tuning job submitted: {tuner_name}.')

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating hyperparameter tuning job with name: pytorch-training-230907-1257


...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!
Tuning job submitted: pytorch-training-230907-1257.


We ran 10 jobs and can access the results as follows.

In [12]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

# One row per trial. The objective is minimised, so sorting in ascending order of
# FinalObjectiveValue puts the best trial first.
tuning_results = HyperparameterTuningJobAnalytics(tuner_name).dataframe()
tuning_results.sort_values('FinalObjectiveValue')

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Unnamed: 0,batch-size,beta1,lr,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,"""128""",0.995252,0.001058,pytorch-training-230907-1257-010-9676b72d,Completed,1.55353,2023-09-07 13:34:31+00:00,2023-09-07 13:42:05+00:00,454.0
3,"""128""",0.995194,0.002506,pytorch-training-230907-1257-007-749d52f4,Completed,1.567634,2023-09-07 13:26:03+00:00,2023-09-07 13:33:52+00:00,469.0
5,"""128""",0.82896,0.009644,pytorch-training-230907-1257-005-ab7976c1,Completed,1.574448,2023-09-07 13:17:12+00:00,2023-09-07 13:24:45+00:00,453.0
1,"""64""",0.997786,0.015981,pytorch-training-230907-1257-009-fa69eea9,Completed,1.577441,2023-09-07 13:34:29+00:00,2023-09-07 13:41:47+00:00,438.0
9,"""128""",0.838729,0.027425,pytorch-training-230907-1257-001-64473c5b,Completed,1.580934,2023-09-07 12:59:10+00:00,2023-09-07 13:07:33+00:00,503.0
2,"""512""",0.951731,0.000537,pytorch-training-230907-1257-008-fe8412ae,Completed,1.583123,2023-09-07 13:26:04+00:00,2023-09-07 13:34:03+00:00,479.0
7,"""64""",0.99822,0.030278,pytorch-training-230907-1257-003-27fdafe4,Completed,1.588823,2023-09-07 13:08:23+00:00,2023-09-07 13:15:46+00:00,443.0
6,"""512""",0.946745,0.001199,pytorch-training-230907-1257-004-611e41ad,Completed,1.589437,2023-09-07 13:08:25+00:00,2023-09-07 13:16:23+00:00,478.0
4,"""512""",0.98522,0.001299,pytorch-training-230907-1257-006-57a5148e,Completed,1.601627,2023-09-07 13:17:14+00:00,2023-09-07 13:25:08+00:00,474.0
8,"""512""",0.988918,0.005219,pytorch-training-230907-1257-002-815cdd81,Completed,1.752347,2023-09-07 12:59:15+00:00,2023-09-07 13:08:06+00:00,531.0


We can now obtain the best model and the corresponding best set of hyperparameters from the search. We'll train this model on our entire training set to obtain a classifier that we can perform inference with.

In [11]:
# Estimator attached to the tuner's best training job
best_estimator = tuner.best_estimator()

# Show the hyperparameters of the best trained model (tuned values plus the
# bookkeeping entries SageMaker adds, as seen in the output below)
best_estimator.hyperparameters()


2023-09-07 13:42:07 Starting - Found matching resource for reuse
2023-09-07 13:42:07 Downloading - Downloading input data
2023-09-07 13:42:07 Training - Training image download completed. Training in progress.
2023-09-07 13:42:07 Uploading - Uploading generated training model
2023-09-07 13:42:07 Completed - Resource retained for reuse


{'_tuning_objective_metric': '"Average test loss"',
 'batch-size': '"128"',
 'beta1': '0.9952515870960148',
 'lr': '0.0010578396135173058',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"bin-image-hpo-2023-09-07-12-57-51-446"',
 'sagemaker_program': '"hpo.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-273181410894/bin-image-hpo-2023-09-07-12-57-51-446/source/sourcedir.tar.gz"'}

## Model Training, Profiling and Debugging
Now that we have a good set of hyperparameters, we train the model on the full training set for a large number of epochs. We set up SageMaker debugging and profiling by configuring the following set of rules and adding appropriate hooks to the entry point script (in this case `train.py`). We pass the rules to the estimator together with a debugger hook configuration, whose save intervals specify how often the debug tensors are extracted.

In [13]:
from sagemaker.debugger import (
    Rule, 
    ProfilerRule, 
    DebuggerHookConfig, 
    ProfilerConfig, 
    FrameworkProfile, 
    TensorBoardOutputConfig,
    CollectionConfig,
    rule_configs)

# Built-in rules evaluated against the training job: `Rule` entries are debugger rules
# that inspect saved tensors, `ProfilerRule` entries analyse profiling data.
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.poor_weight_initialization()),
]

profiler_config = ProfilerConfig(
    # System monitoring interval of 500 ms (i.e. how often system metrics are sampled,
    # not the total monitoring duration); framework profiling covers 10 steps
    system_monitor_interval_millis=500, 
    framework_profile_params=FrameworkProfile(num_steps=10)
)

# Debugger hook: tensors are written under the project bucket's output/ prefix
debugger_hook_config = DebuggerHookConfig(
    s3_output_path=f"s3://{bucket}/output/",
    collection_configs=[
        CollectionConfig(
            name="CrossEntropyLoss_output", 
            parameters={
                # Capture tensors whose name matches the loss-output pattern
                "include_regex": "CrossEntropyLoss_output.*",
                # Save interval of 1 in both training and evaluation modes
                "train.save_interval": "1", 
                "eval.save_interval": "1", 
            }
        )
    ]
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [14]:
# Query the best trial's hyperparameters once and reuse the result
best_hyperparameters = best_estimator.hyperparameters()

hyperparameters = {
    # The batch size comes back wrapped in double quotes (e.g. '"128"'); strip them
    'batch-size': best_hyperparameters['batch-size'].replace('"', ''),
    **{key: best_hyperparameters[key] for key in ('beta1', 'lr')},
    # Fixed settings for the final, full-data training run
    'epochs': '20',
    'train-proportion': '1.0',
}

hyperparameters

{'batch-size': '128',
 'beta1': '0.9952515870960148',
 'lr': '0.0010578396135173058',
 'epochs': '20',
 'train-proportion': '1.0'}

In [18]:
# Final training job: `train.py` on a GPU instance with the tuned hyperparameters,
# with profiling and debugger rules attached
estimator = PyTorch(
    role=role,
    entry_point="./scripts/train.py",
    base_job_name='bin-image-training',
    framework_version="1.8",
    py_version='py36',
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    hyperparameters=hyperparameters,
    # Debugger / profiler settings
    rules=rules,
    debugger_hook_config=debugger_hook_config,
    profiler_config=profiler_config,
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [19]:
# Start the training job on the three data channels and block until it finishes
estimator.fit(inputs=data_channels, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: bin-image-training-2023-09-07-14-11-49-935


Using provided s3_resource
2023-09-07 14:11:50 Starting - Starting the training job...
2023-09-07 14:12:15 Starting - Preparing the instances for trainingVanishingGradient: InProgress
Overfit: InProgress
Overtraining: InProgress
PoorWeightInitialization: InProgress
LowGPUUtilization: InProgress
ProfilerReport: InProgress
......
2023-09-07 14:13:15 Downloading - Downloading input data......
2023-09-07 14:14:16 Training - Downloading the training image..................
2023-09-07 14:17:20 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-09-07 14:17:28,064 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-09-07 14:17:28,092 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-09-07 14:17:28,096 sagemaker_pytorch_container.traini

We can examine the status of the debugger rule evaluations as follows.

In [23]:
import pprint

training_job_name = estimator.latest_training_job.name
print(f"Training jobname: {training_job_name}")

client = estimator.sagemaker_session.sagemaker_client
description = client.describe_training_job(TrainingJobName=training_job_name)

# Fields that only add noise to the report. They are filtered out of a copy rather
# than popped from the response, so `description` is left intact and a status entry
# that lacks one of these optional fields does not raise KeyError.
hidden_fields = {'LastModifiedTime', 'RuleEvaluationJobArn'}

for status in description.get('DebugRuleEvaluationStatuses', []):
    pprint.pprint(
        {field: value for field, value in status.items() if field not in hidden_fields}
    )

Training jobname: bin-image-training-2023-09-07-14-11-49-935
{'RuleConfigurationName': 'VanishingGradient',
 'RuleEvaluationStatus': 'NoIssuesFound'}
{'RuleConfigurationName': 'Overfit', 'RuleEvaluationStatus': 'NoIssuesFound'}
{'RuleConfigurationName': 'Overtraining',
 'RuleEvaluationStatus': 'IssuesFound',
 'StatusDetails': 'RuleEvaluationConditionMet: Evaluation of the rule '
                  'Overtraining at step 145 resulted in the condition being '
                  'met\n'}
{'RuleConfigurationName': 'PoorWeightInitialization',
 'RuleEvaluationStatus': 'IssuesFound',
 'StatusDetails': 'RuleEvaluationConditionMet: Evaluation of the rule '
                  'PoorWeightInitialization at step 0 resulted in the '
                  'condition being met\n'}


From this we can see that two of the rules have been triggered: Overtraining and PoorWeightInitialization. 

We could try a different weight initialisation, modify our dataset to include augmented data, and modify the structure of the network (e.g. by adding dropout layers) to address these issues.

The ProfilerReport rule runs by default and provides a comprehensive training report regarding system bottlenecks and framework profiling.

In [5]:
# S3 prefix under which the rule outputs of the latest training job are stored
rule_output_path = "".join(
    (estimator.output_path, estimator.latest_training_job.job_name, "/rule-output")
)

In [6]:
# Recursively copy everything under the rule-output prefix (including the profiler
# report) into the current working directory. `{rule_output_path}` is filled in by
# IPython from the Python variable of the same name.
! aws s3 cp {rule_output_path} . --recursive

download: s3://sagemaker-us-east-1-273181410894/bin-image-training-2023-09-07-14-11-49-935/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb to ProfilerReport/profiler-output/profiler-report.ipynb
download: s3://sagemaker-us-east-1-273181410894/bin-image-training-2023-09-07-14-11-49-935/rule-output/ProfilerReport/profiler-output/profiler-report.html to ProfilerReport/profiler-output/profiler-report.html
download: s3://sagemaker-us-east-1-273181410894/bin-image-training-2023-09-07-14-11-49-935/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json to ProfilerReport/profiler-output/profiler-reports/Dataloader.json
download: s3://sagemaker-us-east-1-273181410894/bin-image-training-2023-09-07-14-11-49-935/rule-output/ProfilerReport/profiler-output/profiler-reports/IOBottleneck.json to ProfilerReport/profiler-output/profiler-reports/IOBottleneck.json
download: s3://sagemaker-us-east-1-273181410894/bin-image-training-2023-09-07-14-11-49-935/rule-output/Pro

In [7]:
import IPython

# Render the downloaded profiler report inline in the notebook
profiler_report_html = "./ProfilerReport/profiler-output/profiler-report.html"
IPython.display.HTML(filename=profiler_report_html)

Unnamed: 0,Description,Recommendation,Number of times rule triggered,Number of datapoints,Rule parameters
LowGPUUtilization,"Checks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size.","Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size.",3,1284,threshold_p95:70  threshold_p5:10  window:500  patience:1000
BatchSize,"Checks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization.","The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.",2,1283,cpu_threshold_p95:70  gpu_threshold_p95:70  gpu_memory_threshold_p95:70  patience:1000  window:500
GPUMemoryIncrease,Measures the average GPU memory footprint and triggers if there is a large increase.,Choose a larger instance type with more memory if footprint is close to maximum available memory.,2,1284,increase:5  patience:1000  window:10
Dataloader,"Checks how many data loaders are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.",Change the number of data loader processes.,1,11,min_threshold:70  max_threshold:200
LoadBalancing,"Detects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused with regard to other GPUs, resulting in reducing the efficiency of data parallelization.",Choose a different distributed training strategy or a different distributed training framework.,0,1284,threshold:0.2  patience:1000
MaxInitializationTime,Checks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.,"Initialization takes too long. If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.",0,394,threshold:20
CPUBottleneck,"Checks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.",Consider increasing the number of data loaders or applying data pre-fetching.,0,1287,threshold:50  cpu_threshold:90  gpu_threshold:10  patience:1000
IOBottleneck,Checks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.,"Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance.",0,1287,threshold:50  io_threshold:50  gpu_threshold:10  patience:1000
StepOutlier,"Detects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues.","Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.",0,394,threshold:3  mode:None  n_outliers:10  stddev:3

Unnamed: 0,mean,max,p99,p95,p50,min
Step Durations in [s],0.16,14.74,2.89,0.01,0.01,0.01


We can see from this that the LowGPUUtilization rule was triggered a few times (as was the BatchSize rule); we could re-run the training with a larger batch size to address this.

Now that we have a working model, we can deploy it to an endpoint and provide a test image for inference.

### Model Deploying and Querying
The following deploys our trained model to an endpoint. 

In [3]:
# Re-attach to the completed training job by name, so this section can be run without
# re-training (the reset execution counts suggest it was run in a fresh kernel).
# NOTE: this rebinds `estimator` to the attached job.
estimator = sagemaker.estimator.Estimator.attach("bin-image-training-2023-09-07-14-11-49-935")


2023-09-07 14:25:26 Starting - Preparing the instances for training
2023-09-07 14:25:26 Downloading - Downloading input data
2023-09-07 14:25:26 Training - Training image download completed. Training in progress.
2023-09-07 14:25:26 Uploading - Uploading generated training model
2023-09-07 14:25:26 Completed - Training job completed


In [13]:
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.predictor import Predictor

# Requests carry the raw JPEG bytes unchanged; responses are decoded from JSON
json_deserializer = sagemaker.deserializers.JSONDeserializer()
jpeg_serializer = sagemaker.serializers.IdentitySerializer(content_type="image/jpeg")

class ImagePredictor(Predictor):
    """Predictor that sends raw JPEG bytes and decodes a JSON response.

    Args:
        endpoint_name: Name of the deployed endpoint to invoke.
        sagemaker_session: Session used to talk to the endpoint. It is forwarded
            to the base ``Predictor`` unchanged (previously the notebook-global
            ``session`` was used and this argument was silently ignored).
    """

    def __init__(self, endpoint_name, sagemaker_session=None):
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=jpeg_serializer,
            deserializer=json_deserializer,
        )

# Wrap the trained artefact with the inference script so it can be served
pytorch_model = PyTorchModel(
    role=role,
    model_data=estimator.model_data,
    entry_point='./scripts/inference.py',
    framework_version="1.8",
    py_version='py36',
    predictor_cls=ImagePredictor,
)

# Deploy to a single-instance endpoint; `predictor` is an ImagePredictor bound to it
predictor = pytorch_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
)

-----!

We can then test this by passing the endpoint an image from our local directory. The endpoint returns an array of 5 numbers, one score per class (note that some of them are negative, so they are not probabilities).

In [14]:
from PIL import Image
import io

# Read the test image as raw bytes; the predictor sends them as image/jpeg
test_image_path = "./binImages/test/1/00194.jpg"
with open(test_image_path, 'rb') as image_file:
    payload = image_file.read()

prediction = predictor.predict(payload)

In [18]:
print(f"Prediction: {prediction};\nNumber of classes: {len(prediction[0])}")

# This is the last use of the real-time endpoint (the batch transform below builds its
# own model), so tear it down here rather than leaving it running indefinitely.
predictor.delete_endpoint()

Prediction: [[0.4696963429450989, 0.4951171278953552, 0.1369757056236267, -0.43281447887420654, -0.974503219127655]];
Number of classes: 5


## Batch processing

Now that we've confirmed that we can make predictions, we can set up a batch transform to perform inference on multiple images in one call. We do this by creating a PyTorch model from our trained model, passing it `inference.py`, as above, and creating a transformer object that we can then call the `.transform` method on.

In [21]:
# Create PyTorchModel from saved model artifact
pytorch_model = PyTorchModel(
    model_data=estimator.model_data,
    role=role,
    py_version='py36',
    framework_version="1.8",
    entry_point='./scripts/inference.py',
    predictor_cls=ImagePredictor
)

# Built from the `bucket` config variable instead of repeating the bucket name, so the
# notebook keeps working if the bucket is changed in the configuration cell.
batch_transform_output_path = f"s3://{bucket}/batch_transform/output"

# Transformer that runs the model over a whole S3 prefix and writes the results to
# `batch_transform_output_path`
transformer = pytorch_model.transformer(
    instance_count = 1,
    instance_type = 'ml.m5.large',
    output_path = batch_transform_output_path
)

We'll upload six test images to S3, perform a batch transform, then read the results out. 

In [20]:
# Upload the local batch-transform test images to the prefix the transform job reads.
# NOTE(review): the recorded output below shows uploads to batch_transform/ rather than
# batch_transform/images/, so it appears to predate this command - re-run to refresh it.
!aws s3 sync ./testBatchTransform s3://aws-bin-image-project/batch_transform/images

upload: testBatchTransform/00046.jpg to s3://aws-bin-image-project/batch_transform/00046.jpg
upload: testBatchTransform/00288.jpg to s3://aws-bin-image-project/batch_transform/00288.jpg
upload: testBatchTransform/00279.jpg to s3://aws-bin-image-project/batch_transform/00279.jpg
upload: testBatchTransform/00112.jpg to s3://aws-bin-image-project/batch_transform/00112.jpg
upload: testBatchTransform/00194.jpg to s3://aws-bin-image-project/batch_transform/00194.jpg
upload: testBatchTransform/00130.jpg to s3://aws-bin-image-project/batch_transform/00130.jpg


In [22]:
# Perform batch transform
# The input prefix is built from the `bucket` config variable instead of repeating the
# bucket name, so it follows any change made in the configuration cell.
s3_input_path = f"s3://{bucket}/batch_transform/images"

# Every object under the prefix is sent to the model as a JPEG request;
# block until the transform job has finished.
transformer.transform(
    data=s3_input_path,
    data_type="S3Prefix",
    content_type="image/jpeg",
    wait=True,
)

INFO:sagemaker:Creating transform job with name: pytorch-inference-2023-09-08-13-25-56-174


.............................[34m2023-09-08 13:30:47,195 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...[0m
[34m2023-09-08 13:30:47,446 [INFO ] main org.pytorch.serve.ModelServer - [0m
[34mTorchserve version: 0.4.0[0m
[34mTS Home: /opt/conda/lib/python3.6/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 2[0m
[34mMax heap size: 974 M[0m
[34mPython executable: /opt/conda/bin/python3.6[0m
[34mConfig file: /etc/sagemaker-ts.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mMetrics address: http://127.0.0.1:8082[0m
[34mModel Store: /.sagemaker/ts/models[0m
[34mInitial Models: model.mar[0m
[34mLog dir: /logs[0m
[34mMetrics dir: /logs[0m
[34mNetty threads: 0[0m
[34mNetty client threads: 0[0m
[34mDefault workers per model: 2[0m
[34mBlacklist Regex: N/A[0m
[

We can examine the details of the batch transform as follows.

In [25]:
# Low-level SageMaker client taken from the shared session
client = session.sagemaker_client

# Look up the most recent transform job by name and pretty-print its full description
transform_job_name = transformer.latest_transform_job.name
job_info = client.describe_transform_job(TransformJobName=transform_job_name)
pprint.pprint(job_info)

{'CreationTime': datetime.datetime(2023, 9, 8, 13, 25, 56, 623000, tzinfo=tzlocal()),
 'DataProcessing': {'InputFilter': '$',
                    'JoinSource': 'None',
                    'OutputFilter': '$'},
 'ModelName': 'pytorch-inference-2023-09-08-13-22-40-778',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '845',
                                      'content-type': 'application/x-amz-json-1.1',
                                      'date': 'Fri, 08 Sep 2023 13:37:33 GMT',
                                      'x-amzn-requestid': '6862ad53-481f-4111-babf-217c0acbd168'},
                      'HTTPStatusCode': 200,
                      'RequestId': '6862ad53-481f-4111-babf-217c0acbd168',
                      'RetryAttempts': 0},
 'TransformEndTime': datetime.datetime(2023, 9, 8, 13, 30, 59, 378000, tzinfo=tzlocal()),
 'TransformInput': {'CompressionType': 'None',
                    'ContentType': 'image/jpeg',
                    'DataSource': {'S3DataSource': {'S3Da

In [35]:
# Download the batch transform results (one "<image name>.out" file per input image,
# as listed in the output below) from the output prefix into ./BatchTransformPreds
!aws s3 sync s3://aws-bin-image-project/batch_transform/output/ ./BatchTransformPreds 

download: s3://aws-bin-image-project/batch_transform/output/00046.jpg.out to BatchTransformPreds/00046.jpg.out
download: s3://aws-bin-image-project/batch_transform/output/00112.jpg.out to BatchTransformPreds/00112.jpg.out
download: s3://aws-bin-image-project/batch_transform/output/00130.jpg.out to BatchTransformPreds/00130.jpg.out
download: s3://aws-bin-image-project/batch_transform/output/00279.jpg.out to BatchTransformPreds/00279.jpg.out
download: s3://aws-bin-image-project/batch_transform/output/00288.jpg.out to BatchTransformPreds/00288.jpg.out
download: s3://aws-bin-image-project/batch_transform/output/00194.jpg.out to BatchTransformPreds/00194.jpg.out


In [40]:
# Examine results
import json

local_path = "./BatchTransformPreds"
prediction_suffix = ".out"

# Each prediction file is named "<input object name>.out". Iterate in sorted order so
# the report is deterministic, and skip anything that is not a prediction file (e.g. a
# checkpoint folder created by Jupyter), which would otherwise crash `open`/`json.load`.
for file_name in sorted(os.listdir(local_path)):
    if not file_name.endswith(prediction_suffix):
        continue
    # Recover the original image name by dropping the ".out" suffix
    img_name = file_name[:-len(prediction_suffix)]
    with open(os.path.join(local_path, file_name), "r") as prediction_file:
        pred = json.load(prediction_file)
    print(f"{img_name} predictions: {pred}")

00194.jpg predictions: [[0.4696963429450989, 0.4951171278953552, 0.1369757056236267, -0.43281447887420654, -0.974503219127655]]
00112.jpg predictions: [[0.08330731838941574, 0.42631062865257263, 0.14380434155464172, -0.16868776082992554, -0.5581206679344177]]
00130.jpg predictions: [[-0.40971165895462036, 0.33617570996284485, 0.27284786105155945, 0.19134843349456787, -0.4453326165676117]]
00046.jpg predictions: [[-0.13286170363426208, 0.23365354537963867, 0.1055876761674881, -0.09916656464338303, -0.3965054154396057]]
00288.jpg predictions: [[-0.8448575735092163, 0.38474535942077637, 0.36406901478767395, 0.3260776698589325, -0.5248528718948364]]
00279.jpg predictions: [[0.5841971635818481, 0.7196459174156189, 0.3047943115234375, -0.40039733052253723, -1.2733790874481201]]
