In [1]:
from mxnet import np, npx, autograd
npx.set_np()
from d2l import mxnet as d2l
from pdb import set_trace

## Create Data

In [2]:
# Problem size: the number of features equals the number of samples.
p, n = 200, 200
feature_shape = (n, p)
weight_shape = (p, 1)

# Ground-truth linear model y = Xw + b, with every quantity drawn from a
# standard normal.  The draw order (X, then w, then b) is kept fixed.
X = np.random.normal(size=feature_shape)
w = np.random.normal(size=weight_shape)
b = np.random.normal()
y = np.dot(X, w) + b

In [3]:
# Randomly partition the sample indices into a training part (the first
# train_size_fraction of the shuffled order) and a test part (the rest).
dataset_size = len(X)
train_size_fraction = 0.1

# np.random.shuffle permutes its argument in place and returns None, so the
# index array has to be bound to a name *before* shuffling; assigning the
# call's return value would leave shuffled_indices as None and make the
# slicing below fail.  An integer dtype is requested explicitly because the
# values are used as array indices.
shuffled_indices = np.arange(dataset_size, dtype='int32')
np.random.shuffle(shuffled_indices)

# Single split point shared by both slices so the two parts cannot drift apart.
train_size = int(train_size_fraction*dataset_size)
train_indices = shuffled_indices[:train_size]
test_indices = shuffled_indices[train_size:]

In [4]:
# Each dataset is a (features, labels) pair restricted to its index subset.
train_ds = (X[train_indices, :], y[train_indices])
test_ds = (X[test_indices, :], y[test_indices])

In [5]:
def get_iterator(ds, batch_size=100):
    """Yield consecutive mini-batches from a (features, labels) pair.

    Batches are taken in storage order (no shuffling); the final batch is
    shorter when the dataset size is not a multiple of ``batch_size``.

    Args:
        ds: Indexable pair whose element 0 holds the features and element 1
            the labels; both are indexed along their first axis.
        batch_size: Maximum number of rows per batch.

    Yields:
        Tuples ``(features_batch, labels_batch)``.
    """
    total = len(ds[0])
    batch_starts = range(0, total, batch_size)
    for lo in batch_starts:
        # Clamp the upper bound so the last batch stops at the dataset end.
        hi = min(lo + batch_size, total)
        rows = range(lo, hi)
        yield ds[0][rows], ds[1][rows]

In [6]:
# get_iterator returns a generator, so each of these can be consumed only
# once; code that needs a fresh pass must call get_iterator again.
train_dl = get_iterator(train_ds)
test_dl = get_iterator(test_ds)

## Define Net

In [7]:
def net(X, w, b):
    """Linear model: return the matrix product of ``X`` and ``w`` plus ``b``."""
    return np.dot(X, w) + b

## Define Loss

In [8]:
def loss_fn(yhat, y, w, lmbd=0.1):
    """Element-wise squared error plus an L2 weight penalty.

    Args:
        yhat: Predictions.
        y: Targets, broadcast-compatible with ``yhat``.
        w: Weights to penalise.
        lmbd: Penalty coefficient applied to every squared weight.

    Returns:
        ``(yhat - y)**2`` with the scalar ``sum(lmbd * w**2)`` added to each
        element, so the result keeps the shape of the squared error.
    """
    residual = yhat - y
    # Scale each squared weight first, then reduce to a single scalar.
    penalty = (lmbd * w**2).sum()
    return residual**2 + penalty

## Define SGD

In [9]:
def sgd(w, b, lr, batch_size):
    """Apply one gradient-descent step to ``w`` and ``b`` in place.

    Each parameter is overwritten (via slice assignment, so the same array
    object is kept) with ``param - lr * param.grad / batch_size``.

    Args:
        w: Weight array exposing a ``.grad`` attribute.
        b: Bias array exposing a ``.grad`` attribute.
        lr: Learning rate.
        batch_size: Divisor applied to the gradient.
    """
    for param in (w, b):
        param[:] = param - lr*param.grad/batch_size

## Define Training Loop

In [10]:
def get_ds_loss(ds, w, b):
    """Mean squared error of the linear model over a whole dataset.

    The dataset is traversed in batches with ``get_iterator``; squared
    residuals are summed across batches and divided by the number of rows
    seen, so no L2 penalty is included in the reported value.

    Args:
        ds: ``(features, labels)`` pair accepted by ``get_iterator``.
        w: Model weights passed to ``net``.
        b: Model bias passed to ``net``.

    Returns:
        Sum of squared residuals divided by the total number of rows.
    """
    squared_error_total = 0
    rows_seen = 0
    for batch_features, batch_labels in get_iterator(ds):
        residual = net(batch_features, w, b) - batch_labels
        squared_error_total += (residual**2).sum()
        rows_seen += batch_labels.shape[0]
    return squared_error_total/rows_seen

In [21]:
# Trainable parameters: weights start from standard-normal draws, the bias
# from zero.  attach_grad() is called on each one right after creation so
# that a gradient is available on its .grad attribute after backward().
what = np.random.normal(size=(p, 1))
what.attach_grad()

bhat = np.zeros(1)
bhat.attach_grad()

In [22]:
# Mini-batch gradient descent on the L2-regularised squared loss.
# Each epoch makes one full pass over train_ds, then reports the plain
# (unregularised) mean squared error on both the training and test sets.
num_epochs = 100
lr = 0.003
for epoch in range(num_epochs):
    # A fresh generator is created every epoch because generators are single-pass.
    for features, labels in get_iterator(train_ds):
        batch_size = features.shape[0]
        # Only the forward pass and loss are recorded for differentiation.
        with autograd.record():
            yhat = net(features, what, bhat)
            # lmbd=3 overrides loss_fn's default penalty coefficient of 0.1.
            loss = loss_fn(yhat, labels, what, lmbd=3)
        # loss has one entry per sample; backward() fills what.grad / bhat.grad.
        loss.backward()
        # sgd divides the gradients by batch_size before applying the step.
        sgd(what, bhat, lr, batch_size)
    print(f'Epoch: {epoch}, train_loss: {get_ds_loss(train_ds, what, bhat)}, test_loss: {get_ds_loss(test_ds, what, bhat)}')

Epoch: 0, train_loss: 366.4363, test_loss: 435.57132
Epoch: 1, train_loss: 312.25183, test_loss: 424.059
Epoch: 2, train_loss: 267.39398, test_loss: 413.49216
Epoch: 3, train_loss: 230.12769, test_loss: 403.74088
Epoch: 4, train_loss: 199.05779, test_loss: 394.6988
Epoch: 5, train_loss: 173.05994, test_loss: 386.27805
Epoch: 6, train_loss: 151.2259, test_loss: 378.40594
Epoch: 7, train_loss: 132.82059, test_loss: 371.022
Epoch: 8, train_loss: 117.24751, test_loss: 364.07544
Epoch: 9, train_loss: 104.02136, test_loss: 357.52335
Epoch: 10, train_loss: 92.74638, test_loss: 351.32953
Epoch: 11, train_loss: 83.09896, test_loss: 345.46286
Epoch: 12, train_loss: 74.81373, test_loss: 339.8966
Epoch: 13, train_loss: 67.67244, test_loss: 334.60764
Epoch: 14, train_loss: 61.495197, test_loss: 329.57568
Epoch: 15, train_loss: 56.13313, test_loss: 324.78293
Epoch: 16, train_loss: 51.462715, test_loss: 320.21362
Epoch: 17, train_loss: 47.3812, test_loss: 315.85373
Epoch: 18, train_loss: 43.80281, te