## Hyperparameter Tuning in SageMaker

In [2]:
import sagemaker
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

sagemaker_session = sagemaker.Session()

# bucket = sagemaker_session.default_bucket()
bucket = 'edgarin-mlend'
prefix = "hpo-ex/pytorch-cifar-data"

role = sagemaker.get_execution_role()

In [23]:
from torchvision.datasets import CIFAR10
from torchvision import transforms

# Download data locally
local_dir = 'data'
CIFAR10.mirrors = ["https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/CIFAR10/"]
CIFAR10(
    local_dir,
    download=True,
    transform=transforms.Compose(
        [transforms.ToTensor()]
    )
)

Files already downloaded and verified


Dataset CIFAR10
    Number of datapoints: 50000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
           )

In [16]:
# TODONE: Upload the data to an S3 bucket. You can use the sagemaker_session object, boto3 or the AWS CLI
s3_inputs_path = sagemaker_session.upload_data(path='./data', bucket=bucket, key_prefix=prefix)
print("input spec (in this case, just an S3 path): {}".format(s3_inputs_path))

input spec (in this case, just an S3 path): s3://edgarin-mlend/hpo-ex/pytorch-cifar-data


In [6]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="cifar.py",
    role=role,
    py_version='py36',
    framework_version="1.8",
    instance_count=1,
    instance_type="ml.m5.large"
)

In [27]:
#TODONE: Initialise your hyperparameters
hyperparameter_ranges = {
    'batch-size': CategoricalParameter([32, 128, 512, 2048]),
    'lr': ContinuousParameter(0.01, 0.1),
    'epochs': IntegerParameter(4, 8),
    'momentum': ContinuousParameter(0.8, 1)
}


"\nhyperparameter_ranges = {\n    'batch-size': CategoricalParameter([128, 512, 2048]),\n}\n"

In [28]:
objective_metric_name = "average test loss"
objective_type = "Minimize"
metric_definitions = [{"Name": "average test loss", "Regex": "Test set: Average loss: ([0-9\\.]+)"}]

In [None]:
#TODO: Create your HyperparameterTuner Object
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=16,
    max_parallel_jobs=4,
    objective_type=objective_type
)

In [None]:
#TODONE: Train your model
tuner.fit({'training': s3_inputs_path})

...........................

In [4]:
# ME: Create tuner 'offline'. Only works if it's the first tuner's deployment
tuner = HyperparameterTuner.attach('pytorch-training-220113-1641')

In [1]:
predictor = tuner.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

'\n2022-01-13 17:14:07 Starting - Preparing the instances for training\n2022-01-13 17:14:07 Downloading - Downloading input data\n2022-01-13 17:14:07 Training - Training image download completed. Training in progress.\n2022-01-13 17:14:07 Uploading - Uploading generated training model\n2022-01-13 17:14:07 Completed - Training job completed\n'

In [13]:
# Me get estimator offline, if training job already exists
from sagemaker.pytorch import PyTorch
estimator = PyTorch.attach('pytorch-training-220113-1641-016-bdd2c065', sagemaker_session=sagemaker_session)


2022-01-13 17:14:07 Starting - Preparing the instances for training
2022-01-13 17:14:07 Downloading - Downloading input data
2022-01-13 17:14:07 Training - Training image download completed. Training in progress.
2022-01-13 17:14:07 Uploading - Uploading generated training model
2022-01-13 17:14:07 Completed - Training job completed


In [14]:
# Me: Deploy estimator
predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.t2.medium")
predictor

INFO:sagemaker:Creating model with name: pytorch-training-2022-01-15-20-25-33-241
INFO:sagemaker:Creating endpoint with name pytorch-training-2022-01-15-20-25-33-241


-------------!

<sagemaker.pytorch.model.PyTorchPredictor at 0x7fa55f23f208>

In [18]:
# ME: offline predictor if endpoint already exists
from sagemaker.serializers import IdentitySerializer, NumpySerializer
from sagemaker.deserializers import NumpyDeserializer
from sagemaker.predictor import Predictor

predictor = Predictor(
    endpoint_name='pytorch-training-2022-01-15-20-13-38-974',
    sagemaker_session=sagemaker_session,
    serializer=NumpySerializer(), # Also works for tensors
    deserializer=NumpyDeserializer()
)

## Query the Endpoint

In [19]:
# This is to simulate getting the images from the trainloader
import torch
from torchvision.datasets import CIFAR10
from torchvision import transforms
def _get_test_data_loader(batch_size, training_dir):    
    dataset = CIFAR10(
        training_dir,
        train=False,
        transform=transforms.Compose(
            [transforms.ToTensor()]
        ),
    )

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True
    )

def get_sample_batch_from_data_loader(batch_size=3):
    data_loader = _get_test_data_loader(batch_size, './data')
    images, labels = next(iter(data_loader))    
    print('image batch shape: ', images.shape)
    return images
    

In [20]:
images = get_sample_batch_from_data_loader()

image batch shape:  torch.Size([3, 3, 32, 32])


In [21]:
outputs = predictor.predict(images)
type(outputs), outputs

(numpy.ndarray,
 array([[-1.88285112, -2.5368638 , -1.82982254, -1.24045777, -4.34741306,
         -2.26212573, -3.22548103, -3.42421889, -4.64631653, -2.11977243],
        [-4.43299055, -7.13682318, -2.11528444, -1.27023113, -3.08074617,
         -0.94650877, -3.73306227, -2.07049942, -7.17226076, -6.78401566],
        [-3.53452444, -8.12713909, -2.49943805, -1.48746967, -4.09650993,
         -0.53728586, -5.00073576, -2.99834251, -6.0757885 , -5.95453644]]))

In [22]:
preds = outputs.argmax(axis=1)
preds

array([3, 5, 5])

## Now the same query but without a dataloader (directly from filesystem)

In [23]:
import gzip 
import numpy as np
import random
import os

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

def get_sample_np_image_from_pickle():
    file = 'data/cifar-10-batches-py/data_batch_1'
    raw_data=unpickle(file)
    image = np.reshape(raw_data[b'data'][0], (3, 32, 32))
    print('image shape: ', image.shape)    
    return image

In [24]:
image = get_sample_np_image_from_pickle()
np_images = np.array([image]).astype(np.float32) # From CIFAR pickle
type(np_images), np_images.shape

image shape:  (3, 32, 32)


(numpy.ndarray, (1, 3, 32, 32))

In [25]:
# TODO: Query the endpoint
outputs = predictor.predict(np_images)
print(outputs)

[[-309.27151489 -474.49102783 -169.92550659    0.         -776.70776367
   -19.04278564 -167.51652527  -81.43013    -490.99118042 -597.09423828]]


In [26]:
preds = outputs.argmax(axis=1)
preds

array([3])

In [27]:
# Now same but sending a Pytorch Tensor (more practical as it doesn't need non standard types)
pt_input_data = torch.tensor([image]).float()

In [28]:
# TODO: Query the endpoint
outputs = predictor.predict(np_images)
print(outputs)

[[-309.27151489 -474.49102783 -169.92550659    0.         -776.70776367
   -19.04278564 -167.51652527  -81.43013    -490.99118042 -597.09423828]]


In [29]:
preds = outputs.argmax(axis=1)
preds

array([3])

### Cleanup

After you have finished with this exercise, remember to delete the prediction endpoint to release the instance associated with it