# Dropout


In [1]:
import d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

## Dropout from Scratch

In [2]:
def dropout(X, drop_prob):
    """Randomly zero elements of ``X`` and rescale the ones that remain.

    Each element is kept when an independent uniform sample from
    ``nd.random.uniform(0, 1, X.shape)`` exceeds ``drop_prob``; kept
    elements are divided by ``1 - drop_prob``.

    Args:
        X: Input array; must provide ``shape`` and ``zeros_like()``.
        drop_prob: Probability of dropping an element, within ``[0, 1]``.

    Returns:
        An array shaped like ``X`` with dropped entries set to zero.

    Raises:
        AssertionError: If ``drop_prob`` lies outside ``[0, 1]``.
    """
    assert 0 <= drop_prob <= 1
    if drop_prob == 1:
        # Nothing survives; returning here also avoids dividing by zero below.
        return X.zeros_like()
    keep_prob = 1.0 - drop_prob
    # Entries are 1 where the element is kept and 0 where it is dropped.
    keep_mask = nd.random.uniform(0, 1, X.shape) > drop_prob
    return keep_mask * X / keep_prob

## Sanity Test 

In [3]:
# A 2x8 matrix holding the values 0..15.
X = nd.arange(16).reshape((2, 8))
# Print the result for: keep everything, drop about half, drop everything.
for p in (0, 0.5, 1):
    print(dropout(X, p))


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

[[ 0.  0.  0.  0.  8. 10. 12.  0.]
 [16.  0. 20. 22.  0.  0.  0. 30.]]
<NDArray 2x8 @cpu(0)>

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>


### Defining Model Parameters


In [4]:
# Layer widths: 784 inputs, hidden layers of 1024 and 2048 units, 10 outputs.
num_inputs, num_hiddens1, num_hiddens2, num_outputs = 784, 1024, 2048, 10

# Weights are drawn from a normal distribution with scale 0.01.
W1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens1))
W2 = nd.random.normal(scale=0.01, shape=(num_hiddens1, num_hiddens2))
W3 = nd.random.normal(scale=0.01, shape=(num_hiddens2, num_outputs))

# Biases start at zero, one entry per unit of the corresponding layer.
b1, b2, b3 = (nd.zeros(width)
              for width in (num_hiddens1, num_hiddens2, num_outputs))

# Allocate gradient buffers for every trainable parameter.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

### Define the Model

In [5]:
# Drop probabilities for the two hidden layers.
# NOTE: with both set to 0.0 the dropout steps below are effectively a no-op.
drop_prob1 = 0.0
drop_prob2 = 0.0


def net(X):
    """Forward pass of the three-layer perceptron with optional dropout.

    The input is reshaped to ``(-1, num_inputs)`` and passed through two
    ReLU hidden layers and a linear output layer. Dropout is applied
    after each hidden layer only while ``autograd.is_training()`` is true.

    Args:
        X: Input batch; reshaped to ``(-1, num_inputs)``.

    Returns:
        The output-layer values ``nd.dot(H2, W3) + b3``.
    """
    # Query the mode once; dropout must stay off outside of training.
    training = autograd.is_training()

    flat = X.reshape((-1, num_inputs))

    # First fully connected layer, then its dropout layer.
    hidden1 = (nd.dot(flat, W1) + b1).relu()
    if training:
        hidden1 = dropout(hidden1, drop_prob1)

    # Second fully connected layer, then its dropout layer.
    hidden2 = (nd.dot(hidden1, W2) + b2).relu()
    if training:
        hidden2 = dropout(hidden2, drop_prob2)

    return nd.dot(hidden2, W3) + b3

### Training and Testing

In [6]:
# Training hyperparameters.
num_epochs = 10
lr = 0.5
batch_size = 256

loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Train the from-scratch model, passing its parameter list and learning rate.
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 0.9413, train acc 0.645, test acc 0.808
epoch 2, loss 0.5028, train acc 0.810, test acc 0.845
epoch 3, loss 0.4372, train acc 0.837, test acc 0.863
epoch 4, loss 0.3904, train acc 0.854, test acc 0.869
epoch 5, loss 0.3703, train acc 0.863, test acc 0.865
epoch 6, loss 0.3456, train acc 0.872, test acc 0.871
epoch 7, loss 0.3291, train acc 0.878, test acc 0.881
epoch 8, loss 0.3103, train acc 0.883, test acc 0.886
epoch 9, loss 0.2992, train acc 0.888, test acc 0.888
epoch 10, loss 0.2893, train acc 0.892, test acc 0.885


## Dropout in Gluon

In [7]:
# Same architecture as the from-scratch model, built from Gluon layers.
net = nn.Sequential()
net.add(nn.Dense(num_hiddens1, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # Dropout after the first fully connected layer.
net.add(nn.Dense(num_hiddens2, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # Dropout after the second fully connected layer.
net.add(nn.Dense(num_outputs))

# Initialize parameters with init.Normal(sigma=0.01).
net.initialize(init.Normal(sigma=0.01))

### Training

In [8]:
# Plain SGD over all of the network's parameters, reusing the earlier `lr`.
sgd_options = {'learning_rate': lr}
trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_options)

# The two positional slots that held `params` and `lr` in the earlier call
# are None here; the trainer is passed instead.
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer)

epoch 1, loss 0.8574, train acc 0.677, test acc 0.813
epoch 2, loss 0.4920, train acc 0.817, test acc 0.854
epoch 3, loss 0.4295, train acc 0.840, test acc 0.861
epoch 4, loss 0.3912, train acc 0.854, test acc 0.867
epoch 5, loss 6.1379, train acc 0.705, test acc 0.254
epoch 6, loss 7.2570, train acc 0.322, test acc 0.522
epoch 7, loss 1.4614, train acc 0.476, test acc 0.176
epoch 8, loss 1.5300, train acc 0.453, test acc 0.588
epoch 9, loss 0.6826, train acc 0.738, test acc 0.770
epoch 10, loss 0.5558, train acc 0.794, test acc 0.829
