In [126]:
from mxnet import np, npx, autograd
from mxnet.gluon import nn, Trainer
from mxnet.gluon.loss import L2Loss
from mxnet import initializer
from d2l import mxnet as d2l
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
from pdb import set_trace
npx.set_np()

## Download Data

In [2]:
base_train_df = pd.read_csv(d2l.download('kaggle_house_train'))
score_df = pd.read_csv(d2l.download('kaggle_house_test'))

## Setup Pipeline

In [3]:
train_df, test_df = train_test_split(base_train_df, train_size=0.85)

In [4]:
cat_cols = train_df.columns[train_df.dtypes == 'object'].tolist()
numeric_cols = [col for col in train_df.columns[train_df.dtypes != 'object'].tolist() if col != 'Id']
cat_transformer = Pipeline(steps=[('impute', SimpleImputer(strategy='constant', fill_value='missing')),
                                  ('onehot', OneHotEncoder(handle_unknown='ignore'))])
numeric_transformer = Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                                      ('scale', StandardScaler())])
preprocessor = ColumnTransformer([('cat', cat_transformer, cat_cols),
                                  ('numeric', numeric_transformer, numeric_cols)])
pipe = Pipeline(steps=[('preprocess', preprocessor)])
pipe.fit(train_df)
train_X = pipe.transform(train_df).toarray()
train_y = train_df.SalePrice.to_numpy()
test_X = pipe.transform(test_df).toarray()
test_y = test_df.SalePrice.to_numpy()

In [5]:
def convert_to_mxnet_array(*arr):
    output_list = []
    for array in arr:
        output_list.append(np.array(array).astype('float32'))
    return output_list

In [6]:
train_X, train_y, test_X, test_y = convert_to_mxnet_array(train_X, train_y, test_X, test_y)

## Define Model

In [222]:
net = nn.Sequential()
with net.name_scope():
    net.add(nn.Dense(256, activation='relu'))
    net.add(nn.Dropout(0.5))
    net.add(nn.Dense(256, activation='relu'))
    net.add(nn.Dropout(0.1))
    net.add(nn.Dense(1))
net.initialize(initializer.Normal(sigma=0.01))

## Define Loss

In [8]:
loss_fn = L2Loss()

In [239]:
def rmsle(yhat, y):
    log_yhat = np.log(yhat)
    log_y = np.log(y)
    return np.sqrt(L2Loss()(log_yhat, log_y).mean())

## Define Training Loop

In [223]:
lr = 1
wd = 10
num_epochs = 50
optimizer = Trainer(net.collect_params(), optimizer='adam', optimizer_params={'learning_rate': lr, 'wd': wd})
net.collect_params('.*bias').setattr('wd_mult', 0)
for epoch in range(num_epochs):
    with autograd.record():
        yhat = net(train_X).squeeze()
        loss = loss_fn(yhat, train_y)
    loss.backward()
    optimizer.step(batch_size=train_X.shape[0])

In [240]:
rmsle(net(test_X), test_y)

array(0.13793813)

In [224]:
loss_fn(net(test_X), test_y)

array([1.7696462e+07, 8.0826672e+07, 9.5196608e+07, 2.6123574e+07,
       6.7614342e+08, 4.4429156e+07, 3.6750576e+07, 8.9169300e+06,
       2.6867070e+06, 9.6389888e+07, 3.1910062e+05, 3.0337182e+07,
       1.1799840e+06, 2.4409278e+08, 4.9052995e+06, 1.4967662e+08,
       2.3177304e+07, 2.6373508e+06, 1.6102591e+07, 6.0623728e+07,
       5.8405610e+06, 7.5160490e+06, 2.0727544e+07, 1.6197209e+07,
       4.0214192e+07, 2.1611254e+07, 1.0528298e+08, 4.8029264e+07,
       6.2785446e+08, 6.7300844e+05, 2.1020188e+06, 4.5757316e+07,
       4.8714004e+07, 1.9913670e+07, 7.0581458e+09, 3.1139612e+05,
       9.0582579e+08, 3.4602232e+07, 9.7083184e+07, 1.2210571e+08,
       3.7017904e+07, 6.7296685e+08, 7.2630056e+07, 9.6010392e+07,
       1.3169884e+06, 2.7419210e+07, 8.0027062e+05, 9.4668800e+05,
       9.0105100e+06, 2.4919594e+07, 2.4912260e+06, 2.0907090e+08,
       4.0909843e+08, 1.6217203e+08, 2.0867112e+07, 2.9546335e+06,
       2.4071500e+07, 7.6771538e+05, 1.8930688e+08, 1.8314587e

In [236]:
np.concatenate([net(test_X), test_y[:, np.newaxis]], axis=1)

array([[ 129800.805,  135750.   ],
       [ 140785.7  ,  153500.   ],
       [  98298.305,   84500.   ],
       [ 185228.22 ,  178000.   ],
       [ 126773.45 ,   90000.   ],
       [ 146926.47 ,  137500.   ],
       [ 192673.28 ,  184100.   ],
       [ 179223.02 ,  175000.   ],
       [ 192318.06 ,  190000.   ],
       [ 195384.52 ,  181500.   ],
       [ 247529.12 ,  248328.   ],
       [ 277210.62 ,  285000.   ],
       [ 267536.22 ,  266000.   ],
       [ 147805.08 ,  169900.   ],
       [ 373032.2  ,  369900.   ],
       [  43698.17 ,   61000.   ],
       [ 185808.42 ,  179000.   ],
       [ 228296.67 ,  226000.   ],
       [ 105674.96 ,  100000.   ],
       [ 126988.76 ,  138000.   ],
       [ 150417.77 ,  147000.   ],
       [ 243877.12 ,  240000.   ],
       [ 445218.56 ,  438780.   ],
       [ 145691.61 ,  140000.   ],
       [ 108968.19 ,  100000.   ],
       [ 125925.62 ,  132500.   ],
       [ 254510.89 ,  240000.   ],
       [ 115800.945,  106000.   ],
       [ 225435.98 ,

In [131]:
loss_fn(net(train_X), train_y)

array([1.2960500e+10, 7.2601252e+09, 9.6605000e+09, ..., 4.2049999e+10,
       1.9503124e+10, 9.1125002e+09])

In [103]:
(161000 - 21.87)**2

25913958338.2969

In [102]:
net(train_X)[0]

array([21.866129])

In [113]:
net(train_X).squeeze()

array([21.866129, 22.09983 , 21.852669, ..., 22.80056 , 22.50327 ,
       22.429173])

In [134]:
(train_y[0:2] - net(train_X).squeeze()[0:2])**2/2

array([1.296050e+10, 7.260125e+09])

In [110]:
train_y[0:2]

array([161000., 120500.])

In [133]:
L2Loss?

[0;31mInit signature:[0m [0mL2Loss[0m[0;34m([0m[0mweight[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m [0mbatch_axis[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Calculates the mean squared error between `label` and `pred`.

.. math:: L = \frac{1}{2} \sum_i \vert {label}_i - {pred}_i \vert^2.

`label` and `pred` can have arbitrary shape as long as they have the same
number of elements.

Parameters
----------
weight : float or None
    Global scalar weight for loss.
batch_axis : int, default 0
    The axis that represents mini-batch.


Inputs:
    - **pred**: prediction tensor with arbitrary shape
    - **label**: target tensor with the same size as pred.
    - **sample_weight**: element-wise weighting tensor. Must be broadcastable
      to the same shape as pred. For example, if pred has shape (64, 10)
      and you want to weigh each sample in the batch separately,
      sample_weight should ha