In [1]:
%%file mx_lenet_sagemaker.py
import logging
from os import path as op
import os

import mxnet as mx
import numpy as np

data_path = os.getcwd()+ "/"
batch_size = 50
num_cpus = 0
num_gpus = 1

def prep_data(data_path):
    """
    Convert numpy array to mx Nd-array.
    Parameters
    ----------
    path: the directory that save data.npz.
    """
    data = np.load(find_file(data_path, 'data.npz'))
    x_train = data['x_train']
    y_train = data['y_train'][:,:1] ## only take the second column of y_train
    x_test = data['x_test']
    y_test = data['y_test'][:,:1]
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')

    x_train -= np.mean(x_train)
    x_train /= np.std(x_train)
    x_test -= np.mean(x_train)
    x_test /= np.std(x_train)

    img_rows = 256
    img_cols = 256

    x_train = x_train.reshape(x_train.shape[0], 3, img_rows, img_cols) ## reshape it to (448, ) instead of (448,1)
    x_test = x_test.reshape(x_test.shape[0], 3, img_rows, img_cols)
    y_train = y_train.reshape(y_train.shape[0], )
    y_test = y_test.reshape(y_test.shape[0], )
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    train_iter = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    val_iter = mx.io.NDArrayIter(x_test, y_test, batch_size)

    return train_iter, val_iter

def find_file(root_path, file_name):
    """
    Searching for data.npz at its root director, and return a full path for the file.
    Parameters
    ----------
    root_path: the root directory for data.npz.
    file_name: refers to data.npz
    """
    for root, dirs, files in os.walk(root_path):
        if file_name in files:
            return os.path.join(root, file_name)

def mx_lenet():
    """Building a two layer LeNet Convolutional Neural Net using MXNet."""
    data = mx.sym.var('data')
    # first conv layer
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
    tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
    pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # second conv layer
    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
    tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
    pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # first fullc layer
    flatten = mx.sym.flatten(data=pool2)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
    tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")
    # second fullc
    fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=2)
    # softmax loss
    return mx.sym.SoftmaxOutput(data=fc2, name='softmax')


def train(num_cpus, num_gpus, **kwargs):
    """
    Train the image classification neural net.
    Parameters
    ----------
    num_cpus: If train the model on an aws GPS machine, num_cpus = 0 and num_gpus = 1, vice versa.
    num_gpus: apply to the same rule above
    """
    train_iter, val_iter = prep_data(data_path)
    lenet = mx_lenet()
    lenet_model = mx.mod.Module(
        symbol=lenet,
        context=get_train_context(num_cpus, num_gpus))
    logging.getLogger().setLevel(logging.DEBUG)
    lenet_model.fit(train_iter,
                    eval_data=val_iter,
                    optimizer='sgd',
                    optimizer_params={'learning_rate': 0.1},
                    eval_metric='acc',
                    batch_end_callback=mx.callback.Speedometer(batch_size, 16),
                    num_epoch=80)
    return lenet_model


def get_train_context(num_cpus, num_gpus):
    """
    Define the model training instance.
    Parameters
    ----------
    num_cpus: If train the model on an aws GPS machine, num_cpus = 0 and num_gpus = 1, vice versa.
    num_gpus: apply to the same rule above
    """
    if num_gpus > 0:
        return mx.gpu()
    return mx.cpu()

def get_train_context(num_cpus, num_gpus):
    if num_gpus > 0:
        print("It's {} instance".format(num_gpus))
        return mx.gpu()
    print("It's {} instance".format(num_cpus))
    return mx.cpu()

Overwriting mx_lenet_sagemaker.py


In [2]:
from sagemaker.mxnet import MXNet
from sagemaker import get_execution_role

mxnet_estimator = MXNet("mx_lenet_sagemaker.py", 
                        role=get_execution_role(), 
                        train_instance_type="ml.p2.xlarge", 
                        train_instance_count=1)
mxnet_estimator.fit("s3://ds-skynet/MX-Net/data") ## give your s3 bucket address here.

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-552819999234
INFO:sagemaker:Creating training-job with name: sagemaker-mxnet-py2-gpu-2018-01-11-18-52-53-404


.................................................................................
[31mexecuting startup script (first run)[0m
[31m2018-01-11 18:59:37,357 INFO - root - running container entrypoint[0m
[31m2018-01-11 18:59:37,357 INFO - root - starting train task[0m
[31m2018-01-11 18:59:39,209 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'enable_cloudwatch_metrics': False, 'available_gpus': 1, 'channels': {u'training': {u'TrainingInputMode': u'File', u'RecordWrapperType': u'None', u'S3DistributionType': u'FullyReplicated'}}, '_ps_verbose': 0, 'resource_config': {u'current_host': u'algo-1', u'hosts': [u'algo-1']}, 'user_script_name': u'mx_lenet_sagemaker.py', 'input_config_dir': '/opt/ml/input/config', 'channel_dirs': {u'training': u'/opt/ml/input/data/training'}, 'code_dir': '/opt/ml/code', 'output_data_dir': '/opt/ml/output/data/', 'output_dir': '/opt/ml/output', 'model_dir': '/opt/ml/model', 'hyperparameters': {u'sagemaker_program': u'mx_lenet_sagemaker.py', u'sagem

[31m2018-01-11 19:01:04,348 INFO - root - Epoch[25] Train-accuracy=0.586667[0m
[31m2018-01-11 19:01:04,348 INFO - root - Epoch[25] Time cost=1.512[0m
[31m2018-01-11 19:01:05,764 INFO - root - Epoch[25] Validation-accuracy=0.413333[0m
[31m2018-01-11 19:01:07,274 INFO - root - Epoch[26] Train-accuracy=0.526667[0m
[31m2018-01-11 19:01:07,274 INFO - root - Epoch[26] Time cost=1.510[0m
[31m2018-01-11 19:01:08,692 INFO - root - Epoch[26] Validation-accuracy=0.413333[0m
[31m2018-01-11 19:01:10,203 INFO - root - Epoch[27] Train-accuracy=0.635556[0m
[31m2018-01-11 19:01:10,204 INFO - root - Epoch[27] Time cost=1.511[0m
[31m2018-01-11 19:01:11,636 INFO - root - Epoch[27] Validation-accuracy=0.586667[0m
[31m2018-01-11 19:01:13,144 INFO - root - Epoch[28] Train-accuracy=0.553333[0m
[31m2018-01-11 19:01:13,144 INFO - root - Epoch[28] Time cost=1.508[0m
[31m2018-01-11 19:01:14,570 INFO - root - Epoch[28] Validation-accuracy=0.413333[0m
[31m2018-01-11 19:01:16,076 INFO - root

[31m2018-01-11 19:02:48,419 INFO - root - Epoch[60] Validation-accuracy=0.413333[0m
[31m2018-01-11 19:02:49,932 INFO - root - Epoch[61] Train-accuracy=0.628889[0m
[31m2018-01-11 19:02:49,932 INFO - root - Epoch[61] Time cost=1.513[0m
[31m2018-01-11 19:02:51,354 INFO - root - Epoch[61] Validation-accuracy=0.413333[0m
[31m2018-01-11 19:02:52,875 INFO - root - Epoch[62] Train-accuracy=0.633333[0m
[31m2018-01-11 19:02:52,875 INFO - root - Epoch[62] Time cost=1.521[0m
[31m2018-01-11 19:02:54,304 INFO - root - Epoch[62] Validation-accuracy=0.413333[0m
[31m2018-01-11 19:02:55,909 INFO - root - Epoch[63] Train-accuracy=0.640000[0m
[31m2018-01-11 19:02:55,909 INFO - root - Epoch[63] Time cost=1.605[0m
[31m2018-01-11 19:02:57,333 INFO - root - Epoch[63] Validation-accuracy=0.413333[0m
[31m2018-01-11 19:02:58,859 INFO - root - Epoch[64] Train-accuracy=0.608889[0m
[31m2018-01-11 19:02:58,860 INFO - root - Epoch[64] Time cost=1.527[0m
[31m2018-01-11 19:03:00,298 INFO - root