In [1]:
import logging
from os import path as op
import os

import mxnet as mx
import numpy as np

# Directory handed to prep_data as the search root for data.npz:
# the current working directory with a trailing slash.
data_path = "{}/".format(os.getcwd())
# Mini-batch size used by both data iterators and by the Speedometer callback.
batch_size = 100
# Module-level device counts (one GPU, no CPUs).
num_gpus = 1
num_cpus = 0

def prep_data(data_path):
    """
    Load data.npz, standardize the images and wrap them in MXNet iterators.

    Parameters
    ----------
    data_path: directory that is searched (via find_file) for data.npz.

    Returns
    -------
    (train_iter, val_iter): two mx.io.NDArrayIter objects built with the
    module-level batch_size; only the training iterator is shuffled.

    Raises
    ------
    IOError: if no data.npz is found under data_path.
    """
    npz_path = find_file(data_path, 'data.npz')
    if npz_path is None:
        # Fail with a clear message instead of letting np.load(None) blow up.
        raise IOError("data.npz not found under {!r}".format(data_path))

    data = np.load(npz_path)
    x_train = data['x_train'].astype('float32')
    y_train = data['y_train']
    x_test = data['x_test'].astype('float32')
    y_test = data['y_test']

    # Capture the training statistics BEFORE normalizing, then apply the same
    # statistics to both splits. Recomputing them from the already-normalized
    # training array would give mean ~0 / std ~1 and leave x_test unscaled.
    train_mean = np.mean(x_train)
    train_std = np.std(x_train)
    x_train = (x_train - train_mean) / train_std
    x_test = (x_test - train_mean) / train_std

    img_rows = 256
    img_cols = 256

    # NOTE(review): reshape only reinterprets the buffer as (N, 3, H, W). If the
    # arrays are stored channels-last (N, H, W, 3) a transpose would be needed
    # instead - confirm against the code that writes data.npz.
    x_train = x_train.reshape(x_train.shape[0], 3, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 3, img_rows, img_cols)
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    train_iter = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    val_iter = mx.io.NDArrayIter(x_test, y_test, batch_size)

    return train_iter, val_iter

def find_file(root_path, file_name):
    """
    Locate a file by name somewhere below a directory.

    Parameters
    ----------
    root_path: directory at which the os.walk traversal starts.
    file_name: name of the file to look for (e.g. data.npz).

    Returns
    -------
    The joined path of the first directory (in os.walk order) that contains
    file_name, or None when no directory contains it.
    """
    hits = (
        os.path.join(folder, file_name)
        for folder, _subdirs, entries in os.walk(root_path)
        if file_name in entries
    )
    # The generator is lazy, so the walk stops at the first match.
    return next(hits, None)

def mx_lenet():
    """
    Assemble the LeNet-style MXNet symbol used for training.

    The graph is: two (5x5 convolution -> tanh -> 2x2 max-pool, stride 2)
    stages with 20 and 50 filters, a flatten, a 500-unit fully connected
    layer with tanh, a 2-unit fully connected layer, and a SoftmaxOutput
    named 'softmax'.

    Returns
    -------
    The SoftmaxOutput symbol at the head of the graph.
    """
    def conv_stage(inputs, filters):
        # One convolution / tanh / max-pool stage; operators are created in
        # this exact order.
        conv = mx.sym.Convolution(data=inputs, kernel=(5, 5), num_filter=filters)
        act = mx.sym.Activation(data=conv, act_type="tanh")
        return mx.sym.Pooling(data=act, pool_type="max", kernel=(2, 2), stride=(2, 2))

    net = mx.sym.var('data')
    for filters in (20, 50):
        net = conv_stage(net, filters)

    # Fully connected head.
    net = mx.sym.flatten(data=net)
    net = mx.sym.FullyConnected(data=net, num_hidden=500)
    net = mx.sym.Activation(data=net, act_type="tanh")
    logits = mx.sym.FullyConnected(data=net, num_hidden=2)
    return mx.sym.SoftmaxOutput(data=logits, name='softmax')


def train(num_cpus, num_gpus, **kwargs):
    """
    Train the image classification neural net and return the fitted module.

    Parameters
    ----------
    num_cpus: number of CPUs; forwarded to get_train_context. On a GPU
        machine use num_cpus = 0 and num_gpus = 1, and vice versa.
    num_gpus: number of GPUs; forwarded to get_train_context.
    **kwargs: extra keyword arguments from the caller. If it contains a
        'channel_input_dirs' mapping whose 'training' directory holds
        data.npz, the data is loaded from there; otherwise the module-level
        data_path is searched.

    Returns
    -------
    The mx.mod.Module after fit() has run.
    """
    # Prefer the training channel directory handed in by the caller over the
    # working-directory default, but only when data.npz is actually in it.
    channel_dirs = kwargs.get('channel_input_dirs') or {}
    training_dir = channel_dirs.get('training')
    if training_dir and find_file(training_dir, 'data.npz'):
        search_root = training_dir
    else:
        search_root = data_path

    train_iter, val_iter = prep_data(search_root)
    lenet = mx_lenet()
    lenet_model = mx.mod.Module(
        symbol=lenet,
        context=get_train_context(num_cpus, num_gpus))
    # Raise the root logger to DEBUG so fit() progress messages are emitted.
    logging.getLogger().setLevel(logging.DEBUG)
    # NOTE(review): the job log kept with this notebook shows train/validation
    # accuracy fixed at 0.5 for every epoch; the learning rate and the default
    # weight initializer may need tuning - confirm before relying on the model.
    lenet_model.fit(train_iter,
                    eval_data=val_iter,
                    optimizer='sgd',
                    optimizer_params={'learning_rate': 0.1},
                    eval_metric='acc',
                    batch_end_callback=mx.callback.Speedometer(batch_size, 16),
                    num_epoch=50)
    return lenet_model


def get_train_context(num_cpus, num_gpus):
    """
    Choose the MXNet device context for training and report the choice.

    This is the single definition of the function; it was previously defined
    twice, with the second definition silently replacing the first.

    Parameters
    ----------
    num_cpus: number of CPUs; only printed when the CPU context is chosen.
        On a GPU machine use num_cpus = 0 and num_gpus = 1, and vice versa.
    num_gpus: number of GPUs; any value greater than 0 selects the GPU context.

    Returns
    -------
    mx.gpu() when num_gpus > 0, otherwise mx.cpu().
    """
    if num_gpus > 0:
        print("It's {} instance".format(num_gpus))
        return mx.gpu()
    print("It's {} instance".format(num_cpus))
    return mx.cpu()

  import OpenSSL.SSL


In [2]:
from sagemaker.mxnet import MXNet
from sagemaker import get_execution_role

# Configure a SageMaker MXNet estimator that runs the mx_lenet_sagemaker.py
# entry point on one ml.p2.xlarge instance under the notebook's execution role.
mxnet_estimator = MXNet(
    "mx_lenet_sagemaker.py",
    train_instance_count=1,
    train_instance_type="ml.p2.xlarge",
    role=get_execution_role(),
)
# Start the training job with the data stored at this S3 prefix.
mxnet_estimator.fit("s3://ds-skynet/MX-Net/data")

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-552819999234
INFO:sagemaker:Creating training-job with name: sagemaker-mxnet-py2-gpu-2018-01-11-14-27-17-798


...................................................................................
[31mexecuting startup script (first run)[0m
[31m2018-01-11 14:34:04,904 INFO - root - running container entrypoint[0m
[31m2018-01-11 14:34:04,904 INFO - root - starting train task[0m
[31m2018-01-11 14:34:06,669 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'enable_cloudwatch_metrics': False, 'available_gpus': 1, 'channels': {u'training': {u'TrainingInputMode': u'File', u'RecordWrapperType': u'None', u'S3DistributionType': u'FullyReplicated'}}, '_ps_verbose': 0, 'resource_config': {u'current_host': u'algo-1', u'hosts': [u'algo-1']}, 'user_script_name': u'mx_lenet_sagemaker.py', 'input_config_dir': '/opt/ml/input/config', 'channel_dirs': {u'training': u'/opt/ml/input/data/training'}, 'code_dir': '/opt/ml/code', 'output_data_dir': '/opt/ml/output/data/', 'output_dir': '/opt/ml/output', 'model_dir': '/opt/ml/model', 'hyperparameters': {u'sagemaker_program': u'mx_lenet_sagemaker.py', u'sag

  for idx, event in sagemaker.logs.multi_stream_iter(client, log_group, stream_names, positions):


[31m2018-01-11 14:34:20,540 INFO - root - Epoch[0] Train-accuracy=0.500000[0m
[31m2018-01-11 14:34:20,541 INFO - root - Epoch[0] Time cost=3.190[0m
[31m2018-01-11 14:34:22,306 INFO - root - Epoch[0] Validation-accuracy=0.500000[0m
[31m2018-01-11 14:34:23,952 INFO - root - Epoch[1] Train-accuracy=0.500000[0m
[31m2018-01-11 14:34:23,952 INFO - root - Epoch[1] Time cost=1.646[0m
[31m2018-01-11 14:34:25,696 INFO - root - Epoch[1] Validation-accuracy=0.500000[0m
[31m2018-01-11 14:34:27,343 INFO - root - Epoch[2] Train-accuracy=0.500000[0m
[31m2018-01-11 14:34:27,343 INFO - root - Epoch[2] Time cost=1.647[0m
[31m2018-01-11 14:34:29,102 INFO - root - Epoch[2] Validation-accuracy=0.500000[0m
[31m2018-01-11 14:34:30,751 INFO - root - Epoch[3] Train-accuracy=0.500000[0m
[31m2018-01-11 14:34:30,752 INFO - root - Epoch[3] Time cost=1.650[0m
[31m2018-01-11 14:34:32,507 INFO - root - Epoch[3] Validation-accuracy=0.500000[0m
[31m2018-01-11 14:34:34,156 INFO - root - Epoch[4] 

KeyboardInterrupt: 