In [1]:
# Prerequisite: uncomment the next line to install sagemaker-containers if it is missing.
#!pip install sagemaker-containers

In [2]:
# Sample hyperparameters hopefully provided by SageMaker Estimator() call
hyperparameters = {"epochs": 20, "learning_rate": 0.01}

# Global variables
_MPI_SCRIPT = "./mpi_script.sh"
_MPI_IS_RUNNING = "./mpi_is_running"
_MPI_IS_FINISHED = "./mpi_is_finished"

In [3]:
import os
import sys
import textwrap
import stat
import sagemaker_containers.beta.framework as framework

# Build the training environment through sagemaker_containers, seeding it with
# the sample hyperparameters defined above, and print it for inspection.
# Presumably this stands in for the `env` object a training entry point's
# __main__ would receive -- TODO confirm.
env = framework.training_env(hyperparameters=hyperparameters)
print(env)

{'network_interface_name': 'ethwe', 'log_level': 20, 'model_dir': '/home/ec2-user/sagemaker_local/jobs/1535993635.62/opt/ml/model', 'num_gpus': 0, 'channel_input_dirs': {}, 'input_config_dir': '/home/ec2-user/sagemaker_local/jobs/1535993635.62/opt/ml/input/config', 'num_cpus': 4, 'input_data_config': {}, 'output_data_dir': '/home/ec2-user/sagemaker_local/jobs/1535993635.62/opt/ml/output/data', 'hosts': [u'ip-172-16-48-46'], 'output_dir': '/home/ec2-user/sagemaker_local/jobs/1535993635.62/opt/ml/output', 'module_dir': '/home/ec2-user/sagemaker_local/jobs/1535993635.62/opt/ml/code', 'hyperparameters': {'epochs': 20, 'learning_rate': 0.01}, 'module_name': 'None', 'current_host': u'ip-172-16-48-46', 'input_dir': '/home/ec2-user/sagemaker_local/jobs/1535993635.62/opt/ml/input', 'job_name': None, 'resource_config': {u'current_host': u'ip-172-16-48-46', u'hosts': [u'ip-172-16-48-46']}, 'framework_module': None}


In [4]:
# Convert the hyperparameters held by `env` into a list of command-line
# argument strings (e.g. '--epochs', '20') and display the result.
# NOTE(review): this rebinds `hyperparameters` from the dict defined in an
# earlier cell to a list, so re-running the cell that passes `hyperparameters`
# to training_env() after this one would hand it the list instead of the dict.
hyperparameters = framework.mapping.to_cmd_args(env.hyperparameters)
hyperparameters

[u'--epochs', u'20', u'--learning_rate', u'0.01']

In [5]:
# Sample to create mpirun script for each worker
def _create_mpi_script(env):
    """
    Write an executable bash launcher script for the training command.

    The script is written to the path held in ``_MPI_SCRIPT``. When executed
    it performs these steps in order:

    1. creates the marker file ``/mpi_is_running``;
    2. runs ``train.py`` with the current Python interpreter, passing the
       hyperparameters and input channels from ``env`` as command-line
       arguments;
    3. creates the marker file ``/mpi_is_finished``;
    4. exits with the exit code of the training command.

    For distributed training the intent is that the 'master node' runs mpirun
    with this script, while processes on worker nodes use the two marker files
    to determine whether training is still running and when to exit.

    Every argument is shell-quoted before being written, so values containing
    whitespace or shell metacharacters reach ``train.py`` as a single,
    unmodified argument. Arguments made only of shell-safe characters are
    written unchanged.

    NOTE(review): the marker paths inside the script are hard-coded at the
    filesystem root, whereas the module-level ``_MPI_IS_RUNNING`` and
    ``_MPI_IS_FINISHED`` constants point at the working directory -- confirm
    which location is intended.

    Args:
        env (TrainingEnv): an instance of the training environment. Only
            ``env.hyperparameters`` and ``env.channel_input_dirs`` are read.
    """
    # shlex.quote exists on Python 3; pipes.quote is the Python 2 equivalent.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # Both helpers return flat lists of command-line argument strings.
    hyperparameters = framework.mapping.to_cmd_args(env.hyperparameters)
    channels = framework.mapping.to_cmd_args(env.channel_input_dirs)
    # TODO: check whether framework.modules.download_and_install(env.module_dir)
    # is needed here.

    python_cmd = [sys.executable, 'train.py']
    python_cmd.extend(hyperparameters)
    python_cmd.extend(channels)

    # Quote each token so the shell cannot split or reinterpret it.
    command_line = ' '.join(quote(arg) for arg in python_cmd)

    script_lines = [
        '#!/usr/bin/env bash',
        'touch /mpi_is_running',
        command_line,
        # Capture the training exit code before `touch` overwrites $?.
        'EXIT_CODE=$?',
        'touch /mpi_is_finished',
        'exit ${EXIT_CODE}',
    ]
    content = '\n'.join(script_lines) + '\n'

    # build MPI script
    with open(_MPI_SCRIPT, 'w') as script_file:
        script_file.write(content)

    # Add the owner-execute bit while keeping the existing permission bits.
    current_mode = os.stat(_MPI_SCRIPT).st_mode
    os.chmod(_MPI_SCRIPT, current_mode | stat.S_IEXEC)

__Sample: building the training command by hand__

In [6]:
# Training command: interpreter, script name, then the command-line arguments
# currently bound to `hyperparameters`.
python_cmd = [sys.executable, 'train.py'] + list(hyperparameters)

In [7]:
python_cmd

['/home/ec2-user/anaconda3/envs/tensorflow_p27/bin/python',
 'train.py',
 u'--epochs',
 u'20',
 u'--learning_rate',
 u'0.01']

__Test: generate the script with the function and inspect it__

In [8]:
_create_mpi_script(env)

In [9]:
!cat ./mpi_script.sh

#!/usr/bin/env bash
touch /mpi_is_running
/home/ec2-user/anaconda3/envs/tensorflow_p27/bin/python train.py --epochs 20 --learning_rate 0.01
EXIT_CODE=$?
touch /mpi_is_finished
exit ${EXIT_CODE}


---

---

In [10]:
import os
import sys
import tarfile
from six.moves import urllib
from ipywidgets import FloatProgress
from IPython.display import display

# Location of the CIFAR-10 binary-format tarball.
DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'


def cifar10_download(data_dir='/tmp/cifar10_data', print_progress=True):
    """Download the CIFAR-10 tarball from DATA_URL and extract it.

    The function is safe to call repeatedly:

    * if ``data_dir/cifar-10-batches-bin`` already exists, it prints a
      message and returns without doing anything else;
    * if the tarball is already present in ``data_dir``, the download is
      skipped and only the extraction is performed.

    The download goes to a temporary ``.part`` file that is renamed into
    place only after it completes, so an interrupted download is never
    mistaken for a finished tarball on a later call.

    Args:
        data_dir (str): directory that receives the tarball and the extracted
            files. It is created when missing.
        print_progress (bool): when True, show an ipywidgets progress bar
            that is updated while downloading.

    Returns:
        None
    """
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    if os.path.exists(os.path.join(data_dir, 'cifar-10-batches-bin')):
        print('cifar dataset already downloaded')
        return

    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(data_dir, filename)

    if not os.path.exists(filepath):
        report_hook = None
        if print_progress:
            progress_bar = FloatProgress(min=0, max=100)
            display(progress_bar)

            def _progress(count, block_size, total_size):
                # total_size is not positive when the server does not report
                # a size; the final block can also overshoot the total, so
                # the percentage is capped at 100.
                if total_size > 0:
                    percent = 100.0 * count * block_size / total_size
                    progress_bar.value = min(100.0, percent)

            report_hook = _progress

        sys.stdout.write('\r>> Downloading %s ' % (filename))

        partial_path = filepath + '.part'
        try:
            urllib.request.urlretrieve(DATA_URL, partial_path, report_hook)
            os.rename(partial_path, filepath)
        finally:
            # Only present if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        # Single-string writes behave the same under Python 2 and Python 3.
        sys.stdout.write('\n')
        statinfo = os.stat(filepath)
        print('Successfully downloaded %s %d bytes.'
              % (filename, statinfo.st_size))

    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(filepath, 'r:gz') as archive:
        if hasattr(tarfile, 'data_filter'):
            # Where the interpreter supports extraction filters, refuse
            # members that would be written outside data_dir.
            archive.extractall(data_dir, filter='data')
        else:
            archive.extractall(data_dir)

In [30]:
import os
import sagemaker
from sagemaker import get_execution_role

# Session object through which the later cells upload data and run training.
sagemaker_session = sagemaker.Session()

# Execution role passed to the estimator, and the session's default bucket,
# which the later cells use for both the training data and the model output.
role = get_execution_role()
bucket = sagemaker_session.default_bucket()

INFO:sagemaker:Created S3 bucket: sagemaker-us-west-2-500842391574


In [31]:
# Download the CIFAR-10 dataset into the default directory
# (returns immediately if it has already been extracted there).
cifar10_download()

cifar dataset already downloaded


In [32]:
# Upload the local dataset directory to S3 under the 'cifar10_data' key prefix.
# No bucket is passed, so this presumably targets the session's default
# bucket -- the same one the training channel is built from later.
sagemaker_session.upload_data(path='/tmp/cifar10_data', key_prefix='cifar10_data')

In [38]:
# Container image used for the training job (value supplied by the instructor).
training_image = '500842391574.dkr.ecr.us-west-2.amazonaws.com/horovod:latest'
# No hosting image is configured in this notebook:
#hosting_image = '<<PROVIDED BY INSTRUCTOR>>'

# Output of trained model: the root of the bucket.
output_location = "s3://{}".format(bucket)

# Training data channel: the 'cifar10_data' prefix inside the same bucket.
channels = dict(training='{}/cifar10_data'.format(output_location))

# Optimized training parameters.
# NOTE(review): the key here is 'learning-rate', while the sample dict near the
# top of the notebook uses 'learning_rate' -- confirm which spelling the
# training script expects.
hyperparameters = {
    'learning-rate': 0.0001,
    'epochs': 12,
}

In [39]:
channels

{'training': 's3://sagemaker-us-west-2-500842391574/cifar10_data'}

In [40]:
from sagemaker.estimator import Estimator
# SageMaker estimator for the custom training image: two ml.p3.2xlarge
# instances, the hyperparameters configured above, and model output written
# to `output_location`.
horovod_estimator = Estimator(
    training_image,
    role=role,
    output_path=output_location,
    train_instance_count=2,
    train_instance_type='ml.p3.2xlarge',
    hyperparameters=hyperparameters,
    sagemaker_session=sagemaker_session
)

In [41]:
# Start training, passing `channels` as the input data configuration.
# NOTE(review): the recorded run of this cell failed inside the container with
# "ImportError: No module named training", raised from sagemaker_containers'
# _trainer.py while importing the framework module. The training image
# presumably needs fixing before this cell can succeed -- TODO confirm.
horovod_estimator.fit(channels)

INFO:sagemaker:Creating training-job with name: horovod-2018-09-03-17-27-34-717


.......................
[31mNo handlers could be found for logger "sagemaker-containers"[0m



ValueError: Error training horovod-2018-09-03-17-27-34-717: Failed Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/sagemaker_containers/_trainer.py", line 47, in train
    framework = importlib.import_module(framework_name)
  File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
    __import__(name)
ImportError: No module named training

No module named training