# Train and deploy a model for the Amazon Bin Image counting problem

This notebook contains code to perform hyperparameter tuning, training, and debugging of a multi-class image classifier for counting objects in images of bins in Amazon distribution centers. During the EDA phase the problem was reduced to a multi-class classification task (see the EDA for details).


The model is implemented using PyTorch. The training data comes directly from the public dataset at https://github.com/awslabs/open-data-docs/tree/main/docs/aft-vbi-pds. The original dataset contains >500,000 images, but in this project a randomly selected subsample of 50,000 images is used. See EDA for details. 

## Install debugging tools

Run only once.

In [None]:
!pip install smdebug 

## Create S3 bucket for the models

Run only once.

In [1]:
import boto3 

In [2]:
# Create the private bucket named 'amazon-bin-images-models'.
client = boto3.client('s3')

bucket_kwargs = {
    "ACL": "private",
    "Bucket": "amazon-bin-images-models",
}

# S3 only accepts a request without a location constraint in us-east-1; in
# every other region the constraint must name the client's region.
region = client.meta.region_name
if region and region != "us-east-1":
    bucket_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": region}

try:
    response = client.create_bucket(**bucket_kwargs)
except client.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the cell after the bucket exists is not an error.
    response = None

## Import useful modules

In [1]:
import io
import json
import os
import sys

import boto3
import IPython
import numpy as np
import pandas as pd
import sagemaker
from PIL import Image
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig, ProfilerConfig, FrameworkProfile
from sagemaker.predictor import Predictor
from sagemaker.pytorch import PyTorch
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts

[2022-01-17 22:47:54.673 datascience-1-0-ml-t3-medium-1abf3407f667f989be9d86559395:377 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None


## Hyperparameter Tuning

The code below sets up a series of hyperparameter sweeps to find the set that achieves the best model performance. The latter is quantified by a series of metrics computed from each dataset: 
- Average training loss.
- Training accuracy.
- Training RMSE.
- Average validation loss.
- Validation accuracy.
- Validation RMSE.
- Average testing loss.
- Testing accuracy.
- Testing RMSE.

**The average validation loss is the metric to minimize to perform hyperparameter tuning.**

In [2]:
# Search space handed to the tuner: one entry per tunable hyperparameter.
hyperparameter_ranges = dict(
    learning_rate=ContinuousParameter(0.001, 0.1),
    batch_size=CategoricalParameter([32, 64, 128]),
    epochs=IntegerParameter(2, 50),
)

# Objective of the tuning job; the name is one of the metrics defined below.
objective_metric_name, objective_type = "Validation Loss", "Minimize"

# One metric per (dataset split, statistic) pair. Each regex captures the
# run of digits/dots that follows "<split> <statistic>: ".
metric_definitions = [
    {"Name": f"{split} {stat}", "Regex": f"{split} {stat}: ([0-9\\.]+)"}
    for split in ("Training", "Validation", "Testing")
    for stat in ("Loss", "Accuracy", "RMSE")
]

In [3]:
# PyTorch estimator that runs `hyperparameter_opt.py` from the current
# directory; it is the template the tuner launches its training jobs from.
estimator = PyTorch(
    entry_point="hyperparameter_opt.py",
    source_dir="./",
    role=sagemaker.get_execution_role(),
    framework_version="1.8",
    py_version="py36",
    instance_type="ml.m5.large",
    instance_count=2,
    base_job_name='amazon-bin-images-pytorch',
)
# Tuner over `hyperparameter_ranges`: at most 8 training jobs in total,
# 2 of them running concurrently, optimising the objective metric.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    objective_type=objective_type,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=8,
    max_parallel_jobs=2,
)

In [4]:
# NOTE(review): these SM_* variables are set in the notebook process only;
# confirm whether the remote training containers ever see them.
os.environ.update(
    {
        'SM_CHANNEL_TRAINING': 's3://amazon-bin-images-sub',
        'SM_MODEL_DIR': 's3://amazon-bin-images-models',
        'SM_OUTPUT_DATA_DIR': 's3://amazon-bin-images-models',
    }
)

# Launch the tuning job with the image bucket as the "training" channel.
tuner.fit(inputs={"training": "s3://amazon-bin-images-sub"})

..................................................................................................................................................................__s

Job ended with status 'Stopped' rather than 'Completed'. This could mean the job timed out or stopped early for some other reason: Consider checking whether it completed as you expect.





## Describe tuning results

In [None]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

# Analyse the tuning job launched by `tuner.fit` in this session rather than a
# hard-coded job name left over from an earlier run. To inspect a different
# job, assign its name to `tuning_job_name` instead.
tuning_job_name = tuner.latest_tuning_job.name

exp = HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=tuning_job_name,
)

jobs = exp.dataframe()

# The objective (validation loss) is minimised, so sort ascending to list the
# best training job first.
jobs.sort_values('FinalObjectiveValue', ascending=True)

## Select best model hyperparameters

In [None]:
# If kernel dies - retrieve from completed training job
#best_training_job_name=''
#best_estimator = sagemaker.estimator.Estimator.attach(best_training_job_name)

In [None]:
# Estimator for the best training job found by the tuner.
best_estimator = tuner.best_estimator()

In [None]:
# Display the hyperparameters recorded for the best estimator.
best_estimator.hyperparameters()

In [None]:
# Fetch the best job's hyperparameters once instead of once per key.
best_hps = best_estimator.hyperparameters()

# The values come back as strings (batch_size is even wrapped in literal
# double quotes, hence the quote stripping). Decode all three to numbers so
# they are forwarded to the final training job with consistent types.
hyperparameters = {
    "batch_size": int(str(best_hps['batch_size']).replace('"', '')),
    "learning_rate": float(str(best_hps['learning_rate']).replace('"', '')),
    "epochs": int(str(best_hps['epochs']).replace('"', '')),
}

## Profiler and Debugger settings

In [None]:
# Debugger rules, in order: vanishing gradient, overfit, overtraining and
# poor weight initialization ...
rules = [
    Rule.sagemaker(make_rule_config())
    for make_rule_config in (
        rule_configs.vanishing_gradient,
        rule_configs.overfit,
        rule_configs.overtraining,
        rule_configs.poor_weight_initialization,
    )
]
# ... followed by the profiler report rule.
rules.append(ProfilerRule.sagemaker(rule_configs.ProfilerReport()))

In [None]:
# Debugger hook: the same save interval ("1") for the train and eval modes.
hook_config = DebuggerHookConfig(
    hook_parameters=dict.fromkeys(
        ("train.save_interval", "eval.save_interval"), "1"
    )
)

# Profiler: system metrics sampled every 500 ms, framework profile with num_steps=1.
framework_profile = FrameworkProfile(num_steps=1)
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=framework_profile,
)

## Train model with best hyperparameters, debugger and profiler

In [None]:
# Final training estimator: runs `train.py` with the selected hyperparameters
# and with the debugger rules, debugger hook and profiler attached.
mi_estimator = PyTorch(
    entry_point="train.py",
    source_dir="./",
    role=sagemaker.get_execution_role(),
    framework_version="1.8",
    py_version="py36",
    instance_type="ml.m5.large",
    instance_count=2,
    base_job_name='amazon-bin-images-pytorch',
    hyperparameters=hyperparameters,
    # Debugger and Profiler parameters
    rules=rules,
    debugger_hook_config=hook_config,
    profiler_config=profiler_config,
)

In [None]:
# NOTE(review): these SM_* variables are set in the notebook process only;
# confirm whether the remote training containers ever see them.
os.environ.update(
    {
        'SM_CHANNEL_TRAINING': 's3://amazon-bin-images-sub',
        'SM_MODEL_DIR': 's3://amazon-bin-images-models',
        'SM_OUTPUT_DATA_DIR': 's3://amazon-bin-images-models',
    }
)

# Train the final model with the image bucket as the "training" channel.
mi_estimator.fit(inputs={"training": "s3://amazon-bin-images-sub"})

## Locate model artifacts and deploy

In [None]:
# Location of the model artifact produced by the training job above.
model_location = mi_estimator.model_data

In [None]:
# Requests are sent unchanged with content type image/jpeg; responses are
# decoded from JSON.
jpeg_serializer = sagemaker.serializers.IdentitySerializer("image/jpeg")
json_deserializer = sagemaker.deserializers.JSONDeserializer()


class ImagePredictor(Predictor):
    """Predictor that posts JPEG bytes and returns the JSON-decoded response.

    Args:
        endpoint_name: Name of the endpoint to invoke.
        sagemaker_session: Session forwarded to ``Predictor``; ``None`` is
            passed through unchanged.
    """

    def __init__(self, endpoint_name, sagemaker_session=None):
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=jpeg_serializer,
            deserializer=json_deserializer,
        )

In [None]:
# Wrap the trained artifact in a deployable model served by `inference.py`.
# The role is resolved here the same way as for the estimators, because no
# `role` variable is defined anywhere in the notebook.
mi_pytorch_model = PyTorchModel(
    model_data=model_location,
    role=sagemaker.get_execution_role(),
    entry_point='inference.py',
    py_version='py36',
    framework_version='1.8',
    predictor_cls=ImagePredictor,
)

In [None]:
# Deploy the model to a single ml.m5.large instance and keep the predictor.
predictor = mi_pytorch_model.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
)

## Test deployed model 

In [None]:
# Path of a local JPEG image to send to the endpoint; replace the placeholder.
test_image_path = 'ENTER SOMETHING HERE'

# Read in binary mode: the next cells wrap `img_bytes` in io.BytesIO and post
# it as image/jpeg, which both require bytes rather than str.
with open(test_image_path, 'rb') as image_file:
    img_bytes = image_file.read()
type(img_bytes)

In [None]:
# Decode the in-memory bytes with PIL and display the image as the cell output.
Image.open(io.BytesIO(img_bytes))

In [None]:
# Send the raw image bytes to the endpoint, tagging the request as image/jpeg.
response = predictor.predict(
    img_bytes,
    initial_args={"ContentType": "image/jpeg"},
)

In [None]:
# Index of the largest value along axis 1 of the response.
# Assumes `response` is a 2-D (batch, classes) array of scores -- TODO confirm
# against inference.py.
np.argmax(response, 1)