In [1]:
%%file mx_lenet_sagemaker.py
import logging
from os import path as op
import os

import mxnet as mx
import numpy as np

data_path = os.getcwd()+ "/"
batch_size = 50
num_cpus = 0
num_gpus = 1

def prep_data(data_path):
    """
    Convert numpy array to mx Nd-array.
    Parameters
    ----------
    path: the directory that save data.npz.
    """
    data = np.load(find_file(data_path, 'data.npz'))
    x_train = data['x_train']
    y_train = data['y_train'][:,:1] ## only take the second column of y_train
    x_test = data['x_test']
    y_test = data['y_test'][:,:1]
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')

    x_train -= np.mean(x_train)
    x_train /= np.std(x_train)
    x_test -= np.mean(x_train)
    x_test /= np.std(x_train)

    img_rows = 256
    img_cols = 256

    x_train = x_train.reshape(x_train.shape[0], 3, img_rows, img_cols) ## reshape it to (448, ) instead of (448,1)
    x_test = x_test.reshape(x_test.shape[0], 3, img_rows, img_cols)
    y_train = y_train.reshape(y_train.shape[0], )
    y_test = y_test.reshape(y_test.shape[0], )
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    train_iter = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    val_iter = mx.io.NDArrayIter(x_test, y_test, batch_size)

    return train_iter, val_iter

def find_file(root_path, file_name):
    """
    Searching for data.npz at its root director, and return a full path for the file.
    Parameters
    ----------
    root_path: the root directory for data.npz.
    file_name: refers to data.npz
    """
    for root, dirs, files in os.walk(root_path):
        if file_name in files:
            return os.path.join(root, file_name)

def mx_lenet():
    """Building a three layer LeNet sytle Convolutional Neural Net using MXNet."""
    data = mx.sym.var('data')
    data_dp = mx.symbol.Dropout(data, p = 0.2) ## 20% of the input that gets dropped out during training time
    # first conv layer
    conv1 = mx.sym.Convolution(data=data_dp, kernel=(5, 5), num_filter=20)
    tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
    pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # second conv layer
    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
    tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
    pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    
    # third conv layer
    conv3 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
    tanh3 = mx.sym.Activation(data=conv2, act_type="tanh")
    pool3 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    
    # first fullc layer
    flatten = mx.sym.flatten(data=pool3)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
    tanh4 = mx.sym.Activation(data=fc1, act_type="tanh")
    # second fullc
    fc2 = mx.sym.FullyConnected(data=tanh4, num_hidden=2)
    # softmax loss
    return mx.sym.SoftmaxOutput(data=fc2, name='softmax')


def train(num_cpus, num_gpus, **kwargs):
    """
    Train the image classification neural net.
    Parameters
    ----------
    num_cpus: If train the model on an aws GPS machine, num_cpus = 0 and num_gpus = 1, vice versa.
    num_gpus: apply to the same rule above
    """
    train_iter, val_iter = prep_data(data_path)
    lenet = mx_lenet()
    lenet_model = mx.mod.Module(
        symbol=lenet,
        context=get_train_context(num_cpus, num_gpus))
    logging.getLogger().setLevel(logging.DEBUG)
    lenet_model.fit(train_iter,
                    eval_data=val_iter,
                    optimizer='sgd',
                    optimizer_params={'learning_rate': 0.1},
                    eval_metric='acc',
                    batch_end_callback=mx.callback.Speedometer(batch_size, 16),
                    num_epoch=50)
    return lenet_model


def get_train_context(num_cpus, num_gpus):
    """
    Define the model training instance.
    Parameters
    ----------
    num_cpus: If train the model on an aws GPS machine, num_cpus = 0 and num_gpus = 1, vice versa.
    num_gpus: apply to the same rule above
    """
    if num_gpus > 0:
        return mx.gpu()
    return mx.cpu()

def get_train_context(num_cpus, num_gpus):
    if num_gpus > 0:
        print("It's {} instance".format(num_gpus))
        return mx.gpu()
    print("It's {} instance".format(num_cpus))
    return mx.cpu()

Overwriting mx_lenet_sagemaker.py


In [2]:
from sagemaker.mxnet import MXNet
from sagemaker import get_execution_role

mxnet_estimator = MXNet("mx_lenet_sagemaker.py", 
                        role=get_execution_role(), 
                        train_instance_type="ml.p2.xlarge", 
                        train_instance_count=1)
mxnet_estimator.fit("s3://ds-skynet/MX-Net/data") ## give your s3 bucket address here.

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-552819999234
INFO:sagemaker:Creating training-job with name: sagemaker-mxnet-py2-gpu-2018-01-12-18-34-02-618


....................................................................................
[31mexecuting startup script (first run)[0m
[31m2018-01-12 18:40:57,573 INFO - root - running container entrypoint[0m
[31m2018-01-12 18:40:57,573 INFO - root - starting train task[0m
[31m2018-01-12 18:40:59,284 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'enable_cloudwatch_metrics': False, 'available_gpus': 1, 'channels': {u'training': {u'TrainingInputMode': u'File', u'RecordWrapperType': u'None', u'S3DistributionType': u'FullyReplicated'}}, '_ps_verbose': 0, 'resource_config': {u'current_host': u'algo-1', u'hosts': [u'algo-1']}, 'user_script_name': u'mx_lenet_sagemaker.py', 'input_config_dir': '/opt/ml/input/config', 'channel_dirs': {u'training': u'/opt/ml/input/data/training'}, 'code_dir': '/opt/ml/code', 'output_data_dir': '/opt/ml/output/data/', 'output_dir': '/opt/ml/output', 'model_dir': '/opt/ml/model', 'hyperparameters': {u'sagemaker_program': u'mx_lenet_sagemaker.py', u'sa

[31m2018-01-12 18:43:16,002 INFO - root - Epoch[17] Batch [16]#011Speed: 273.47 samples/sec#011accuracy=0.774118[0m
[31m2018-01-12 18:43:18,378 INFO - root - Epoch[17] Train-accuracy=0.760000[0m
[31m2018-01-12 18:43:18,378 INFO - root - Epoch[17] Time cost=5.372[0m
[31m2018-01-12 18:43:20,078 INFO - root - Epoch[17] Validation-accuracy=0.537500[0m
[31m2018-01-12 18:43:23,071 INFO - root - Epoch[18] Batch [16]#011Speed: 273.65 samples/sec#011accuracy=0.787059[0m
[31m2018-01-12 18:43:25,450 INFO - root - Epoch[18] Train-accuracy=0.756923[0m
[31m2018-01-12 18:43:25,451 INFO - root - Epoch[18] Time cost=5.372[0m
[31m2018-01-12 18:43:27,143 INFO - root - Epoch[18] Validation-accuracy=0.742500[0m
[31m2018-01-12 18:43:30,138 INFO - root - Epoch[19] Batch [16]#011Speed: 273.51 samples/sec#011accuracy=0.756471[0m
[31m2018-01-12 18:43:32,519 INFO - root - Epoch[19] Train-accuracy=0.770769[0m
[31m2018-01-12 18:43:32,519 INFO - root - Epoch[19] Time cost=5.376[0m
[31m2018-01

[31m2018-01-12 18:45:59,426 INFO - root - Epoch[40] Batch [16]#011Speed: 271.68 samples/sec#011accuracy=0.803529[0m
[31m2018-01-12 18:46:01,821 INFO - root - Epoch[40] Train-accuracy=0.758462[0m
[31m2018-01-12 18:46:01,821 INFO - root - Epoch[40] Time cost=5.411[0m
[31m2018-01-12 18:46:03,546 INFO - root - Epoch[40] Validation-accuracy=0.542500[0m
[31m2018-01-12 18:46:06,558 INFO - root - Epoch[41] Batch [16]#011Speed: 272.02 samples/sec#011accuracy=0.770588[0m
[31m2018-01-12 18:46:08,952 INFO - root - Epoch[41] Train-accuracy=0.790769[0m
[31m2018-01-12 18:46:08,952 INFO - root - Epoch[41] Time cost=5.406[0m
[31m2018-01-12 18:46:10,665 INFO - root - Epoch[41] Validation-accuracy=0.582500[0m
[31m2018-01-12 18:46:13,682 INFO - root - Epoch[42] Batch [16]#011Speed: 271.59 samples/sec#011accuracy=0.794118[0m
[31m2018-01-12 18:46:16,077 INFO - root - Epoch[42] Train-accuracy=0.769231[0m
[31m2018-01-12 18:46:16,077 INFO - root - Epoch[42] Time cost=5.411[0m
[31m2018-01