In [1]:
import os
import sys
import tarfile
from six.moves import urllib
from ipywidgets import FloatProgress
from IPython.display import display

DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'


def cifar10_download(data_dir='/tmp/cifar10_data', print_progress=True):
    """Download the CIFAR-10 tarball from DATA_URL and extract it into data_dir.

    The function is a no-op (apart from a message) when
    ``data_dir/cifar-10-batches-py`` already exists. If the archive file is
    already present in ``data_dir`` the download is skipped and only the
    extraction is performed.

    Args:
        data_dir: Directory that receives the archive and its extracted
            contents. It is created when missing.
        print_progress: When true, an ipywidgets ``FloatProgress`` bar is
            displayed and updated during the download. When false, no widget
            is created at all.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlretrieve``, ``os`` or ``tarfile`` raise
        on network, filesystem or archive errors; nothing is swallowed.
    """
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    if os.path.exists(os.path.join(data_dir, 'cifar-10-batches-py')):
        print('cifar dataset already downloaded')
        return

    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(data_dir, filename)

    if not os.path.exists(filepath):
        # Only build and show the widget when progress reporting was requested.
        progress_bar = None
        if print_progress:
            progress_bar = FloatProgress(min=0, max=100)
            display(progress_bar)
        sys.stdout.write('\r>> Downloading %s ' % (filename))

        def _progress(count, block_size, total_size):
            # urlretrieve may report a non-positive total size when the server
            # does not announce one; skip the update instead of dividing by it.
            if progress_bar is None or total_size <= 0:
                return
            # The last block is usually partial, so count * block_size can
            # overshoot the total; clamp to the widget's maximum.
            progress_bar.value = min(100.0, 100.0 * count * block_size / total_size)

        # Download to a temporary name and rename on success, so an interrupted
        # transfer never leaves a truncated archive that a later call would
        # mistake for a complete one.
        partial_path = filepath + '.part'
        try:
            urllib.request.urlretrieve(DATA_URL, partial_path, _progress)
            os.rename(partial_path, filepath)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

        # Single-argument writes/prints behave the same on Python 2 and 3
        # (a bare print() / multi-argument print(...) shows tuples on Python 2).
        sys.stdout.write('\n')
        statinfo = os.stat(filepath)
        print('Successfully downloaded %s %d bytes.' % (filename, statinfo.st_size))

    # NOTE(review): members are extracted without path validation; this relies
    # on the archive served at DATA_URL being trustworthy.
    with tarfile.open(filepath, 'r:gz') as archive:
        archive.extractall(data_dir)

In [2]:
import os
import sagemaker
from sagemaker import get_execution_role

# Session object reused below for the default bucket lookup, the dataset
# upload and the Estimator.
sagemaker_session = sagemaker.Session()

# Execution role that is later passed to the Estimator as `role`.
role = get_execution_role()
# Default bucket of this session; used to build the training channel URI and
# the output location.
bucket = sagemaker_session.default_bucket()

In [3]:
# Download the CIFAR-10 dataset (default data_dir: /tmp/cifar10_data)
cifar10_download()

FloatProgress(value=0.0)

>> Downloading cifar-10-python.tar.gz ()
('Successfully downloaded', 'cifar-10-python.tar.gz', 170498071, 'bytes.')


In [4]:
# Upload the extracted CIFAR-10 batches directory to S3 under the
# 'cifar10_data' key prefix; the value returned by upload_data is shown as
# the cell output.
sagemaker_session.upload_data(path='/tmp/cifar10_data/cifar-10-batches-py', key_prefix='cifar10_data')

's3://sagemaker-us-west-2-500842391574/cifar10_data'

In [5]:
# Training container image provided by the instructor
training_image = '500842391574.dkr.ecr.us-west-2.amazonaws.com/horovod:latest'
#hosting_image = '<<TO BE PROVIDED>>'

# Input data channel: S3 prefix that holds the uploaded CIFAR-10 data
channels = dict(training=''.join(['s3://', bucket, '/cifar10_data']))

# Optimized training hyperparameters
hyperparameters = dict(learning_rate=.0001, epochs=20, batch_size=32)

# S3 location handed to the Estimator as its output path
output_location = "s3://{0}".format(bucket)

In [6]:
channels

{'training': 's3://sagemaker-us-west-2-500842391574/cifar10_data'}

In [7]:
from sagemaker.estimator import Estimator
# Estimator for the Horovod training image: two ml.p3.2xlarge instances,
# with output written to `output_location`.
horovod_estimator = Estimator(
    training_image,
    sagemaker_session=sagemaker_session,
    role=role,
    hyperparameters=hyperparameters,
    train_instance_type='ml.p3.2xlarge',
    train_instance_count=2,
    output_path=output_location,
)

In [8]:
# Start training; `channels` maps the 'training' channel name to the S3 data prefix
horovod_estimator.fit(channels)

INFO:sagemaker:Creating training-job with name: horovod-2018-09-05-15-27-17-532


.......................
[32mCreating SageMaker trainer environment:[0m
[32mTrainerEnvironment(input_dir='/opt/ml/input', input_config_dir='/opt/ml/input/config', model_dir='/opt/ml/model', output_dir='/opt/ml/output', hyperparameters={'epochs': '20', 'learning_rate': '0.0001', 'batch_size': '32'}, resource_config={'current_host': 'algo-2', 'network_interface_name': 'ethwe', 'hosts': ['algo-1', 'algo-2']}, input_data_config={'training': {'TrainingInputMode': 'File', 'RecordWrapperType': 'None', 'S3DistributionType': 'FullyReplicated'}}, output_data_dir='/opt/ml/output/data', hosts=['algo-1', 'algo-2'], channel_dirs={'training': '/opt/ml/input/data/training'}, current_host='algo-2', available_gpus=1, available_cpus=8)[0m
[32mWorker node algo-2 is waiting for MPI to start training process[0m
[31mCreating SageMaker trainer environment:[0m
[31mTrainerEnvironment(input_dir='/opt/ml/input', input_config_dir='/opt/ml/input/config', model_dir='/opt/ml/model', output_dir='/opt/ml/output


Billable seconds: 2117
