In [1]:
%load_ext watermark
%watermark -a 'Christian Schuhegger' -u -d -v -p numpy,scipy,pandas,matplotlib,seaborn,mxnet

Christian Schuhegger 
last updated: 2019-02-27 

CPython 3.6.8
IPython 7.3.0

numpy 1.14.6
scipy 1.2.0
pandas 0.24.1
matplotlib 3.0.2
seaborn 0.9.0
mxnet 1.3.1


In [2]:
%matplotlib inline
import numpy as np, scipy, scipy.stats as stats, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import sklearn, sklearn.pipeline, sklearn.model_selection, sklearn.preprocessing
import sklearn.metrics
import logging, time, datetime, tqdm
import mxnet as mx
from mxnet import gluon, nd, autograd, metric


# Display configuration for pandas and numpy output in this notebook.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
# All numpy print options are set in a single call. The former
# `np.core.arrayprint._line_width = 180` assignment was dropped: it pokes a
# private attribute that numpy >= 1.14 (this notebook records 1.14.6) does not
# read, and it contradicted linewidth=1000 set here.
np.set_printoptions(edgeitems=10, linewidth=1000, suppress=True)

sns.set()

In [3]:
from IPython.display import display, HTML

from IPython.display import display_html
def display_side_by_side(*args):
    """Render several tables next to each other in the notebook output.

    Each positional argument is converted with its ``to_html()`` method; a
    ``numpy.ndarray`` is wrapped in a ``pandas.DataFrame`` first. The
    concatenated HTML is emitted through ``display_html(..., raw=True)``.
    """
    fragments = []
    for df in args:
        if isinstance(df, np.ndarray):
            df = pd.DataFrame(df)
        fragments.append(df.to_html())
    # Style only the opening <table ...> tags. Replacing the bare word 'table'
    # would also rewrite the closing </table> tags and any header or cell
    # whose text happens to contain "table".
    html_str = ''.join(fragments).replace('<table', '<table style="display:inline"')
    display_html(html_str, raw=True)

# Stylesheet snippet that sets `flex-direction: row` on `.output` elements.
# NOTE(review): CSS is not referenced anywhere else in the visible source;
# confirm whether it is applied elsewhere or is a leftover.
CSS = """
.output {
    flex-direction: row;
}
"""

def display_graphs_side_by_side(*args):
    """Show objects that provide ``_repr_svg_()`` in one single-row HTML table.

    Every argument becomes one ``<td>`` cell containing its SVG markup; the
    assembled table is emitted through ``display_html(..., raw=True)``.
    """
    cells = ''.join('<td>' + graph._repr_svg_() + '</td>' for graph in args)
    display_html('<table><tr>' + cells + '</tr></table>', raw=True)
    

# Inject a style rule that forces `.container` elements to 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))

In [4]:
# Root logging at DEBUG with "time:logger:level: message" records, plus a
# named logger for this notebook.
logging.basicConfig(
    format='%(asctime)s:%(name)s:%(levelname)s: %(message)s',
    level=logging.DEBUG,
)
log = logging.getLogger('std')

In [5]:
# Synthetic regression data: N_subjects samples with N_in standard-normal features.
N_in       = 1000
# The earlier `N_subjects = 260 * 10` was dead code (overwritten on the next
# line); substitute 260 * 10 here for a quick, low-memory run.
N_subjects = 100000
# W has two rows of N_in coefficients: row 0 is added per feature, row 1
# multiplies the feature. Fixed RandomState seeds make W and X reproducible.
W = stats.norm(loc=0, scale=1).rvs(size=(2,N_in), random_state=np.random.RandomState(42))
X = stats.norm(loc=0, scale=1).rvs(size=(N_subjects,N_in), random_state=np.random.RandomState(43))
# Target: sum over all features of (W[1] * x + W[0]); one value per sample.
y = np.sum(W[1:,:] * X + W[0,:], axis=1)

In [6]:
y.shape, X.shape

((100000,), (100000, 1000))

In [7]:
pd.Series(y).describe()

count    100000.000000
mean         19.234805
std          31.516783
min        -147.380827
25%          -2.102370
50%          19.273695
75%          40.475011
max         147.253509
dtype: float64

In [8]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.1, random_state = 42)

In [9]:
def to_gluon_iter(x_in, y_in, batch_size=256, shuffle=False):
    """Wrap feature and label arrays in a Gluon ``DataLoader``.

    Parameters
    ----------
    x_in, y_in : array-like
        Features and labels; both are converted with ``nd.array``.
    batch_size : int
        Number of samples per batch.
    shuffle : bool
        Whether the loader shuffles samples. Defaults to ``False``, which
        matches the previous hard-coded ``shuffle=None`` (falsy) behaviour.

    Returns
    -------
    The ``mx.gluon.data.DataLoader`` built over an ``ArrayDataset`` of the
    two converted arrays.
    """
    dataset = mx.gluon.data.ArrayDataset(nd.array(x_in), nd.array(y_in))
    # A real bool is passed instead of None. (`last_batch='rollover'` was
    # considered by the original author but left disabled.)
    return mx.gluon.data.DataLoader(dataset, batch_size=batch_size, shuffle=bool(shuffle))

def to_sym_iter(x_in, y_in, batch_size=256, shuffle=False, label_name='lin_reg_label'):
    """Build an ``mx.io.NDArrayIter`` over the given features and labels.

    Parameters
    ----------
    x_in, y_in : array-like
        Features and labels, passed through to ``NDArrayIter`` unchanged.
    batch_size : int
        Number of samples per batch.
    shuffle : bool
        Whether the iterator shuffles samples. Defaults to ``False``, which
        matches the previous hard-coded ``shuffle=None`` (falsy) behaviour.
    label_name : str
        Name under which the labels are exposed. The default is the label
        variable name used by the symbolic model in this notebook.
    """
    return mx.io.NDArrayIter(x_in, y_in, batch_size, shuffle=bool(shuffle), label_name=label_name)

In [10]:
class DataIterLoader():
    """Adapter that lets a resettable batch iterator be used in a ``for`` loop
    yielding ``(data, label)`` pairs.

    The wrapped object must provide ``reset()`` and ``__next__()``; every
    batch it returns must carry exactly one entry in ``.data`` and one in
    ``.label``.
    """

    def __init__(self, data_iter):
        self.data_iter = data_iter

    def __iter__(self):
        # Rewind the wrapped iterator each time a new loop starts.
        self.data_iter.reset()
        return self

    def __next__(self):
        batch = next(self.data_iter)
        n_data, n_label = len(batch.data), len(batch.label)
        assert n_data == n_label == 1
        return batch.data[0], batch.label[0]

    def next(self):
        # Python 2 spelling of the iterator protocol.
        return self.__next__()

# See:
#  Appendix: Upgrading from Module DataIter to Gluon DataLoader
#  https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/datasets.html
# Batch size shared by every iterator created below.
batch_size=256
# Alternative kept for reference: native Gluon DataLoaders instead of wrapped NDArrayIters.
# gluon_train_iter = to_gluon_iter(X_train, y_train, batch_size=batch_size)
# gluon_valid_iter = to_gluon_iter(X_test , y_test, batch_size=batch_size)

# Module-style iterators over the train and test splits.
sym_train_iter = to_sym_iter(X_train, y_train, batch_size=batch_size)
sym_valid_iter  = to_sym_iter(X_test, y_test, batch_size=batch_size)

# Expose the same iterators to the Gluon loop as (data, label) pairs.
# NOTE: these wrappers keep referring to the iterator objects created just
# above, even though the names sym_train_iter / sym_valid_iter are rebound
# in a later cell.
gluon_train_iter = DataIterLoader(sym_train_iter)
gluon_valid_iter = DataIterLoader(sym_valid_iter)

In [11]:
def create_aux():
    """Return the training settings shared by both experiments.

    A fresh tuple ``(epochs, model_ctx, loss_function, init_function,
    optimizer)`` is built on every call, so each experiment gets its own
    loss, initializer and optimizer objects.
    """
    return (
        20,                      # epochs
        mx.cpu(),                # model_ctx
        mx.gluon.loss.L2Loss(),  # loss_function
        mx.init.Xavier(),        # init_function
        mx.optimizer.Adam(),     # optimizer
    )

In [12]:
def create_gluon_model():
    """Build the Gluon MLP: Dense(300, relu) -> Dense(100, relu) -> Dense(1).

    Returns
    -------
    An uninitialised ``mx.gluon.nn.HybridSequential`` with prefix ``'MLP_'``;
    the caller is responsible for initialising its parameters.
    """
    ACTIVATION = 'relu'
    net = mx.gluon.nn.HybridSequential(prefix='MLP_')
    with net.name_scope():
        net.add(
            mx.gluon.nn.Dense(300, activation=ACTIVATION, prefix='fc-1_'),
            mx.gluon.nn.Dense(100, activation=ACTIVATION, prefix='fc-2_'),
            # The trailing underscore matches the other prefixes. Gluon joins
            # prefixes and parameter names verbatim, so 'predictions' alone
            # produced fused names such as 'MLP_predictionsweight'.
            mx.gluon.nn.Dense(1, activation=None, prefix='predictions_')
        )
    return net

In [13]:
def create_sym_model():
    """Build the symbolic MLP: FC(300) -> relu -> FC(100) -> relu -> FC(1),
    topped with a ``LinearRegressionOutput`` bound to the 'lin_reg_label'
    variable.

    Returns the output symbol.
    """
    activation = 'relu'

    net = mx.sym.Variable('data')
    label = mx.sym.Variable('lin_reg_label')

    # Hidden layers: fully connected followed by the activation, numbered from 1.
    for idx, width in enumerate((300, 100), start=1):
        net = mx.sym.FullyConnected(net, name='fc{}'.format(idx), num_hidden=width)
        net = mx.sym.Activation(net, name='relu{}'.format(idx), act_type=activation)

    # Single-unit output layer feeding the regression head.
    net = mx.sym.FullyConnected(net, name='fc3', num_hidden=1)
    return mx.sym.LinearRegressionOutput(data=net, label=label, name="lro")

In [14]:
epochs, model_ctx, loss_function, init_function, optimizer = create_aux()

In [15]:
# Fresh Module-style iterators. NOTE: the Gluon loop below iterates
# gluon_train_iter, which wraps the iterator objects created in the earlier
# cell, not the ones rebound here.
sym_train_iter = to_sym_iter(X_train, y_train, batch_size=batch_size)
sym_valid_iter  = to_sym_iter(X_test, y_test, batch_size=batch_size)
gluon_model = create_gluon_model()
# gluon_model.hybridize()
gluon_model.hybridize(static_shape=True, static_alloc=True)
gluon_model.collect_params().initialize(init_function, ctx=model_ctx)

trainer = gluon.Trainer(gluon_model.collect_params(), optimizer)

# NOTE(review): nr_batches and total are not used in the visible remainder of
# this cell (presumably intended for a progress bar) — confirm before removing.
nr_batches = len(X_train) // batch_size
total = epochs * (nr_batches + 1)

# Wall-clock timing of the whole Gluon training run (time1 .. time2).
time1 = time.time()
for e in range(epochs):
    for i, (x_, y_) in enumerate(gluon_train_iter):
        x_ = x_.as_in_context(model_ctx)
        y_ = y_.as_in_context(model_ctx)
        # Forward pass and loss are recorded so backward() can compute gradients.
        with autograd.record():
            output = gluon_model(x_)
            loss = loss_function(output, y_)

        loss.backward()
        # Mean loss of the current mini-batch; after the inner loop this holds
        # the value for the last batch of the epoch.
        last_batch_loss = nd.mean(loss).asscalar()
        # The trainer step is given the number of samples in this batch.
        trainer.step(x_.shape[0])
    
    # Per epoch: [seconds since start, epoch index, loss of the last batch].
    t = time.time()
    print([t-time1, e, last_batch_loss])

time2 = time.time()

[1.002150297164917, 0, 2.3317626]
[2.068141460418701, 1, 0.8200017]
[3.078101634979248, 2, 0.41898423]
[4.136557340621948, 3, 0.25386566]
[5.115137577056885, 4, 0.17628984]
[6.105294942855835, 5, 0.13901676]
[7.126128673553467, 6, 0.12758017]
[8.11776351928711, 7, 0.19650489]
[9.117296934127808, 8, 1.0679522]
[10.12504529953003, 9, 3.6427248]
[11.126052141189575, 10, 1.3626083]
[12.109097480773926, 11, 0.6898772]
[13.0963454246521, 12, 1.2826223]
[14.081398725509644, 13, 1.0957555]
[15.096967458724976, 14, 1.1497743]
[16.069961309432983, 15, 1.9431003]
[17.076117515563965, 16, 1.5676603]
[18.11342215538025, 17, 2.5785687]
[19.103195428848267, 18, 2.671261]
[20.170690536499023, 19, 2.2330596]


In [16]:
print('time: {}'.format(time2-time1))

time: 20.17090630531311


In [17]:
# Batched inference on the test features with the trained Gluon model.
gluon_predict_iter = mx.gluon.data.DataLoader(mx.gluon.data.ArrayDataset(nd.array(X_test)), batch_size=batch_size)
# One prediction slot per test sample, filled batch by batch.
y_gluon_pred  = nd.zeros(X_test.shape[0])
for i, (data) in enumerate(gluon_predict_iter):
    data   = data.as_in_context(model_ctx)
    output = gluon_model(data)
    # Batch i occupies [i * batch_size, i * batch_size + len(output)); sizing the
    # slice by output.shape[0] copes with a shorter final batch. Column 0 of the
    # output is stored as the prediction.
    y_gluon_pred[i * batch_size : i * batch_size + output.shape[0]] = output[:,0]

# Mean squared error on the test split; the bare `s` displays it as the cell result.
s = sklearn.metrics.mean_squared_error(y_test, y_gluon_pred.asnumpy())
s

4.396654576967551

In [18]:
sklearn.metrics.explained_variance_score(y_test, y_gluon_pred.asnumpy())

0.9955717688296912

In [19]:
epochs, model_ctx, loss_function, init_function, optimizer = create_aux()

In [20]:
# Symbolic counterpart of the Gluon experiment: same architecture, trained through mx.mod.Module.
sym_model = create_sym_model()

# data/label names must match the variables declared in create_sym_model and the
# label_name used by to_sym_iter.
sym_model_module = mx.mod.Module(symbol = sym_model, data_names = ['data'], label_names = ['lin_reg_label'], context = model_ctx)

# One tenth of the total number of batches over all epochs.
# NOTE(review): this value is larger than the number of batches in a single epoch,
# and the recorded output shows no Speedometer lines — presumably the callback
# counts batches per epoch and never fires; confirm whether that is intended.
freq = int((len(X_train) * epochs / batch_size) // 10)
batch_end_callback = mx.callback.Speedometer(batch_size, frequent=freq, auto_reset=False)

# Wall-clock timing of the whole Module training run (time1 .. time2).
time1 = time.time()

# Train on sym_train_iter and evaluate on sym_valid_iter with the optimizer,
# initializer and epoch count from create_aux; 'mse' is the reported metric.
sym_model_module.fit(sym_train_iter, 
                     sym_valid_iter,
                     optimizer=optimizer,
                     initializer=init_function,
                     num_epoch=epochs,
                     eval_metric='mse',
                     batch_end_callback=batch_end_callback
                    )
time2 = time.time()

  optimizer_params=optimizer_params)
2019-02-27 11:29:38,909:root:INFO: Epoch[0] Train-mse=148.861828
2019-02-27 11:29:38,910:root:INFO: Epoch[0] Time cost=1.035
2019-02-27 11:29:38,975:root:INFO: Epoch[0] Validation-mse=4.877652
2019-02-27 11:29:40,034:root:INFO: Epoch[1] Train-mse=2.899106
2019-02-27 11:29:40,035:root:INFO: Epoch[1] Time cost=1.058
2019-02-27 11:29:40,115:root:INFO: Epoch[1] Validation-mse=3.584615
2019-02-27 11:29:41,233:root:INFO: Epoch[2] Train-mse=1.507336
2019-02-27 11:29:41,234:root:INFO: Epoch[2] Time cost=1.119
2019-02-27 11:29:41,310:root:INFO: Epoch[2] Validation-mse=3.203393
2019-02-27 11:29:42,440:root:INFO: Epoch[3] Train-mse=0.907041
2019-02-27 11:29:42,441:root:INFO: Epoch[3] Time cost=1.130
2019-02-27 11:29:42,505:root:INFO: Epoch[3] Validation-mse=3.049417
2019-02-27 11:29:43,548:root:INFO: Epoch[4] Train-mse=0.602118
2019-02-27 11:29:43,549:root:INFO: Epoch[4] Time cost=1.043
2019-02-27 11:29:43,607:root:INFO: Epoch[4] Validation-mse=3.011585
2019-0

In [21]:
print(time2-time1)

22.70542311668396


In [22]:
# Predict on the validation iterator with the trained Module, then score against y_test.
y_sym_pred = sym_model_module.predict(sym_valid_iter)
# Mean squared error on the test split; the bare `s` displays it as the cell result.
s = sklearn.metrics.mean_squared_error(y_test, y_sym_pred.asnumpy())
s

2.5199532238884363

In [23]:
sklearn.metrics.explained_variance_score(y_test, y_sym_pred.asnumpy())

0.9974339473895943