# Training a Pre-Trained ResNet34 on CIFAR100 with SageMaker Managed Spot Training

In [1]:
import sagemaker
import uuid

# One session object is shared by every SageMaker call in this notebook.
sagemaker_session = sagemaker.Session()
print(f'SageMaker version: {sagemaker.__version__}')

# Default bucket of the session, plus a key prefix for this experiment.
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/pytorch-resnet34-cifar100'

# Execution role; handed to the estimator further down.
role = sagemaker.get_execution_role()

# A random 8-character suffix gives each notebook run its own checkpoint
# location in the bucket.
checkpoint_suffix = uuid.uuid4().hex[:8]
checkpoint_s3_path = f's3://{bucket}/checkpoint-{checkpoint_suffix}'

print(f'Checkpointing Path: {checkpoint_s3_path}')

D:\Projects\GitHub\TSAI_EMLO1.0\Session09_AWSSagemakerAndLargeScaleModelTraining


In [None]:
import os
import subprocess

# Default to CPU local mode; switched to GPU local mode only if a GPU is detected.
instance_type = 'local'

try:
    # A zero exit status from `nvidia-smi` is taken as evidence of a usable GPU.
    gpu_present = subprocess.call('nvidia-smi') == 0
except OSError:
    # `nvidia-smi` is not installed / not on PATH (typical for CPU-only
    # machines). subprocess.call raises instead of returning a status in that
    # case, so treat it as "no GPU" rather than crashing the cell.
    gpu_present = False

if gpu_present:
    ## Set type to GPU if one is present
    instance_type = 'local_gpu'

print("Instance type = " + instance_type)

### Download the CIFAR100 dataset

In [None]:
from utils_cifar import get_train_data_loader, get_test_data_loader, imshow, classes

# Build the training and test loaders via the project helpers in `utils_cifar`.
# NOTE(review): presumably these download the dataset on first use (see the
# section heading) — confirm in utils_cifar.
trainloader = get_train_data_loader()
testloader = get_test_data_loader()

### Data Preview

In [None]:
import numpy as np
import torchvision, torch

# Fetch one batch from the training loader.
# Use the built-in next(): `iterator.next()` is not part of the Python 3
# iterator protocol and is not available on recent PyTorch DataLoader iterators.
dataiter = iter(trainloader)
images, labels = next(dataiter)

# show the whole batch as a single image grid
imshow(torchvision.utils.make_grid(images))

# print the labels of (at most) the first four images; the guard avoids an
# IndexError if the loader yields a batch smaller than four
num_preview = min(4, len(labels))
print(' '.join('%9s' % classes[labels[j]] for j in range(num_preview)))

### Upload data

In [None]:
# Upload the local `data` directory to the session bucket under the
# `data/cifar100` key prefix; the return value is what `fit` receives below.
inputs = sagemaker_session.upload_data(
    path='data',
    bucket=bucket,
    key_prefix='data/cifar100',
)

## Training

In [2]:
# Managed Spot Training switches; all three values are passed to the
# estimator constructor in the next cell.
use_spot_instances = True

# Limits forwarded as `max_run` / `max_wait` (seconds, per the SageMaker
# Estimator API). `max_wait` is only set when spot instances are requested.
max_run = 600
max_wait = 1200 if use_spot_instances else None

In [3]:
from sagemaker.pytorch import PyTorch

# Hyperparameters forwarded to the entry-point script.
hyperparameters = {'epochs': 4}

# Estimator describing the training job: script, container version,
# cluster shape, checkpoint location and spot-training limits.
spot_estimator = PyTorch(
    entry_point='train.py',
    role=role,
    framework_version='1.7.1',
    py_version='py3',
    instance_count=2,
    instance_type='ml.p3.8xlarge',
    base_job_name='cifar100-pytorch-resnet34-spot-1',
    hyperparameters=hyperparameters,
    # S3 location defined in the first cell (unique per notebook run).
    checkpoint_s3_uri=checkpoint_s3_path,
    debugger_hook_config=False,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
)

# Launch training on the data uploaded earlier.
spot_estimator.fit(inputs)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Namespace(batch_size=128, dir='D:/Projects/GitHub/TSAI_EMLO1.0/Session09_AWSSagemakerAndLargeScaleModelTraining\\data', epochs=1, gpus=1, model_dir='D:/Projects/GitHub/TSAI_EMLO1.0/Session09_AWSSagemakerAndLargeScaleModelTraining\\model', output_data_dir='D:/Projects/GitHub/TSAI_EMLO1.0/Session09_AWSSagemakerAndLargeScaleModelTraining\\output')
Files already downloaded and verified
Files already downloaded and verified


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type   | Params
-----------------------------------
0 | network | ResNet | 21.3 M
-----------------------------------
21.3 M    Trainable params
0         Non-trainable params
21.3 M    Total params
85.344    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# Deploy the trained model to prepare for predictions

In [None]:
from sagemaker.pytorch import PyTorchModel

# Deploy the model trained by `spot_estimator` to a single ml.m4.xlarge
# instance and keep the returned predictor for the inference cell.
predictor = spot_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

## Invoking the endpoint

In [None]:
# get one batch of test images
# Use the built-in next(): `iterator.next()` is not part of the Python 3
# iterator protocol and is not available on recent PyTorch DataLoader iterators.
dataiter = iter(testloader)
images, labels = next(dataiter)

# Only (at most) the first four labels are printed; the guard avoids an
# IndexError if the loader yields a batch smaller than four.
num_preview = min(4, len(labels))

# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%4s' % classes[labels[j]] for j in range(num_preview)))

# Send the batch to the deployed endpoint as a NumPy array.
outputs = predictor.predict(images.numpy())

# Index of the largest value along dim 1 is taken as the predicted class.
_, predicted = torch.max(torch.from_numpy(np.array(outputs)), 1)

print('Predicted: ', ' '.join('%4s' % classes[predicted[j]]
                              for j in range(num_preview)))

# Clean-up

To avoid incurring extra charges to your AWS account, let's delete the endpoint we created:

In [None]:
# Tear down the endpoint behind `predictor` so it stops incurring charges.
predictor.delete_endpoint()