In [1]:
#!pip install sagemaker-containers

In [2]:
# Stand-in for the hyperparameters a SageMaker Estimator() call would supply
hyperparameters = dict(epochs=20, learning_rate=0.01)

# Notebook-wide path constants
# Location the generated MPI wrapper script is written to.
_MPI_SCRIPT = './mpi_script.sh'
# Presumably the marker files signalling MPI state -- confirm against the
# code that reads them.
_MPI_IS_RUNNING = './mpi_is_running'
_MPI_IS_FINISHED = './mpi_is_finished'

In [3]:
import os
import sys
import textwrap
import stat
import sagemaker_containers.beta.framework as framework

# Build the training environment object (`env`) from the sample hyperparameters
# defined earlier and print it for inspection.
# (Original note: "`env` from __main__" -- presumably the real entry point is
# handed this object by the framework rather than building it; confirm.)
env = framework.training_env(hyperparameters=hyperparameters)
print(env)

{'network_interface_name': 'ethwe', 'log_level': 20, 'model_dir': '/opt/ml/model', 'num_gpus': 0, 'channel_input_dirs': {}, 'input_config_dir': '/opt/ml/input/config', 'num_cpus': 4, 'input_data_config': {}, 'output_data_dir': '/opt/ml/output/data', 'hosts': [u'algo-1', u'algo-2'], 'output_dir': '/opt/ml/output', 'module_dir': '/opt/ml/code', 'hyperparameters': {'epochs': 20, 'learning_rate': 0.01}, 'module_name': 'None', 'current_host': u'algo-1', 'input_dir': '/opt/ml/input', 'job_name': None, 'resource_config': {u'hosts': [u'algo-1', u'algo-2'], u'current_host': u'algo-1'}, 'framework_module': None}


In [4]:
# Convert the hyperparameter mapping held by `env` into command-line tokens.
# NOTE: this rebinds the notebook-level name `hyperparameters` (a dict until
# now) to that token list, so any cell reading `hyperparameters` behaves
# differently depending on whether this cell has already run.
hyperparameters = framework.mapping.to_cmd_args(env.hyperparameters)
hyperparameters

[u'--epochs', u'20', u'--learning_rate', u'0.01']

In [5]:
# Sample to create mpirun script for each worker
def _create_mpi_script(env):
    """Write the bash wrapper script that MPI launches to run training.

    The script is written to ``_MPI_SCRIPT`` and marked executable for its
    owner. When executed, the script:

    1. touches ``/mpi_is_running``,
    2. runs ``train.py`` with the current Python interpreter, passing the
       hyperparameters and input channels as command-line arguments,
    3. touches ``/mpi_is_finished``, and
    4. exits with the exit code of the training command.

    For distributed training the 'master node' runs mpirun with this script.
    Worker nodes use the ``mpi_is_running`` file to determine whether training
    (started by MPI from the master node) is still running, and the
    ``mpi_is_finished`` file to determine when to exit.

    Every token of the training command is shell-quoted before being written,
    so values containing spaces or shell metacharacters reach ``train.py`` as
    single, literal arguments.

    NOTE(review): the marker paths are hard-coded in the script text, whereas
    the notebook-level ``_MPI_IS_RUNNING`` / ``_MPI_IS_FINISHED`` constants
    point at ``./...``; confirm which location the polling processes expect.

    Args:
        env (TrainingEnv): an instance of the training environment.
    """
    # shlex.quote is the Python 3 spelling; pipes.quote is the Python 2 one.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # return list of cmd args
    hyperparameters = framework.mapping.to_cmd_args(env.hyperparameters)
    channels = framework.mapping.to_cmd_args(env.channel_input_dirs)
    #framework.modules.download_and_install(env.module_dir) # <-- check if needed

    python_cmd = [sys.executable, 'train.py']
    python_cmd.extend(hyperparameters)
    python_cmd.extend(channels)

    # Quote each token individually; tokens made only of shell-safe
    # characters are returned unchanged, so simple commands look the same.
    train_command = ' '.join(quote(arg) for arg in python_cmd)

    # The exit code is captured before the second `touch` so the script
    # reports the training command's status, not the status of `touch`.
    content = '\n'.join([
        '#!/usr/bin/env bash',
        'touch /mpi_is_running',
        train_command,
        'EXIT_CODE=$?',
        'touch /mpi_is_finished',
        'exit ${EXIT_CODE}',
        '',  # trailing newline
    ])

    # build MPI script
    with open(_MPI_SCRIPT, 'w') as w:
        w.write(content)

    # change permissions on script: add the owner-execute bit to the current mode
    st = os.stat(_MPI_SCRIPT)
    os.chmod(_MPI_SCRIPT, st.st_mode | stat.S_IEXEC)

__Sample__

In [6]:
# Assemble the training command the same way `_create_mpi_script` does
# (input channels omitted). The CLI tokens are derived from `env` directly
# rather than from the notebook-level `hyperparameters` name, which is a dict
# or a token list depending on which cells have been executed.
python_cmd = [sys.executable, 'train.py']
python_cmd.extend(framework.mapping.to_cmd_args(env.hyperparameters))

In [7]:
# Display the assembled command tokens.
python_cmd

['/home/ec2-user/anaconda3/envs/python2/bin/python',
 'train.py',
 u'--epochs',
 u'20',
 u'--learning_rate',
 u'0.01']

__Test using the function__

In [8]:
# Write the wrapper script to `_MPI_SCRIPT` for the environment built earlier.
_create_mpi_script(env)

In [9]:
!cat ./mpi_script.sh

#!/usr/bin/env bash
touch /mpi_is_running
/home/ec2-user/anaconda3/envs/python2/bin/python train.py --epochs 20 --learning_rate 0.01
EXIT_CODE=$?
touch /mpi_is_finished
exit ${EXIT_CODE}
