# Distributed Training with SageMaker and Gluon

This lab demonstrates how to perform distributed training on multiple hosts using Gluon and SageMaker. There are two main steps:

1. Choose the version of kvstore to use when creating the Gluon Trainer. For distributed training it is either 'dist_sync', 'dist_device_sync', 'dist_async'. See there refence (https://mxnet.incubator.apache.org/api/python/kvstore/kvstore.html#mxnet.kvstore.create) for details.
2. Specify more than 1 instance when creating a SageMaker MXNet model.

In [1]:
import mxnet as mx
from mxnet import nd
import os
import boto3
from sagemaker.mxnet import MXNet
from sagemaker import get_execution_role

role = get_execution_role()

s3 = boto3.client('s3')

bucket_name = 'eduthie-sagemaker-1'
prefix = 'distributed_training_gluon_lab/'

local_dir = '/tmp'

In [2]:
num_inputs = 2
num_outputs = 1
num_examples = 10000000

def real_fn(X):
    return 2 * X[:, 0] - 3.4 * X[:, 1] + 4.2

X = nd.random_normal(shape=(num_examples, num_inputs))
noise = 0.01 * nd.random_normal(shape=(num_examples,))
y = real_fn(X) + noise

In [3]:
def save_and_upload(X,y,target_folder,i):
    file_name = '{}'.format(i)
    local_path = os.path.join(local_dir,file_name)
    mx.nd.save(local_path,{'X':X, 'y':y})
    print('Created local file {}'.format(local_path))
    upload_filename = '{}/{}/{}'.format(prefix,target_folder,file_name)
    print('Uploading to {}'.format(upload_filename))
    s3.upload_file(local_path, bucket_name, upload_filename)

def split_and_upload(X,y,k,target_folder):
    n = len(X)
    assert (n//k)*k == n
    idx = list(range(0, n+1, n//k))
    X_shards = [X[idx[i]:idx[i+1]] for i in range(k)]
    y_shards = [y[idx[i]:idx[i+1]] for i in range(k)]
    
    for X,y,i in zip(X_shards,y_shards,range(k)):
        save_and_upload(X,y,target_folder,i)

In [4]:
train_frac = 0.9
split_index = int(num_examples*train_frac)
X_train = X[0:split_index]
X_test = X[split_index:]
y_train = y[0:split_index]
y_test = y[split_index:]
print(len(X_train))
print(len(X_test))

split_and_upload(X_train,y_train,5,'train')
save_and_upload(X_test,y_test,'test',0)

9000000
1000000
Created local file /tmp/0
Uploading to distributed_training_gluon_lab//train/0
Created local file /tmp/1
Uploading to distributed_training_gluon_lab//train/1
Created local file /tmp/2
Uploading to distributed_training_gluon_lab//train/2
Created local file /tmp/3
Uploading to distributed_training_gluon_lab//train/3
Created local file /tmp/4
Uploading to distributed_training_gluon_lab//train/4
Created local file /tmp/0
Uploading to distributed_training_gluon_lab//test/0


In [5]:
from multiple_regression import train

In [6]:
channel_input_dirs = {'train':'./data'}
hyperparameters = {'batch_size':64, 'epochs':10, 'learning_rate':0.01}
train(hyperparameters=hyperparameters,channel_input_dirs=channel_input_dirs,num_gpus=1,hosts=['alg-1','alg-2'],
      current_host='alg-1')

Train file path ./data/0
Number of examples 100
kvstore device
Epoch 0, loss: 0.330997190475
Epoch 1, loss: 0.35462594986
Epoch 2, loss: 0.340337524414
Epoch 3, loss: 0.344433574677
Epoch 4, loss: 0.337782287598
Epoch 5, loss: 0.340681056976
Epoch 6, loss: 0.34321428299
Epoch 7, loss: 0.328452281952
Epoch 8, loss: 0.341133003235
Epoch 9, loss: 0.334118156433


In [None]:
mnist_estimator_10 = MXNet(entry_point='multiple_regression.py',
    role=role,
    train_instance_count=10, 
    train_instance_type='ml.p3.2xlarge',
    hyperparameters={'batch_size':64, 'epochs':10, 'learning_rate':0.00000001})

mnist_estimator_1 = MXNet(entry_point='multiple_regression.py',
    role=role,
    train_instance_count=1, 
    train_instance_type='ml.p3.2xlarge',
    hyperparameters={'batch_size':64, 'epochs':10, 'learning_rate':0.00000001})

train_data_location = 's3://{}/{}train'.format(bucket_name,prefix)
test_data_location = 's3://{}/{}test'.format(bucket_name,prefix)

mnist_estimator_1.fit({'train': train_data_location, 'test': test_data_location},wait=False)
mnist_estimator_10.fit({'train': train_data_location, 'test': test_data_location},wait=False)

INFO:sagemaker:Creating training-job with name: sagemaker-mxnet-2018-07-12-08-25-42-331


.........................
[32m2018-07-12 08:29:41,331 INFO - root - running container entrypoint[0m
[32m2018-07-12 08:29:41,331 INFO - root - starting train task[0m
[32m2018-07-12 08:29:41,351 INFO - container_support.training - Training starting[0m
[31m2018-07-12 08:29:37,926 INFO - root - running container entrypoint[0m
[31m2018-07-12 08:29:37,926 INFO - root - starting train task[0m
[31m2018-07-12 08:29:37,948 INFO - container_support.training - Training starting[0m
[32m2018-07-12 08:29:45,269 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'enable_cloudwatch_metrics': False, 'available_gpus': 1, 'channels': {u'test': {u'TrainingInputMode': u'File', u'RecordWrapperType': u'None', u'S3DistributionType': u'FullyReplicated'}, u'train': {u'TrainingInputMode': u'File', u'RecordWrapperType': u'None', u'S3DistributionType': u'FullyReplicated'}}, '_ps_verbose': 0, 'resource_config': {u'current_host': u'algo-2', u'network_interface_name': u'ethwe', u'hosts': [u'algo-1'

  for idx, event in sagemaker.logs.multi_stream_iter(client, log_group, stream_names, positions):


[32mEpoch 0, loss: 0.0244133305157[0m
[31mEpoch 0, loss: 0.0243529557301[0m
[32mEpoch 1, loss: 1.09735581624e-06[0m
[31mEpoch 1, loss: 1.09551817263e-06[0m
[32mEpoch 2, loss: 7.82107433146e-07[0m
[31mEpoch 2, loss: 7.81440616914e-07[0m
[32mEpoch 3, loss: 7.82098567742e-07[0m
[31mEpoch 3, loss: 7.81441063522e-07[0m
[31mEpoch 4, loss: 7.81441730187e-07[0m
[32mEpoch 4, loss: 7.82099877873e-07[0m
