## Dropout 丢弃法

设丢弃概率为$p$，丢弃法将隐藏单元$h_i$替换为$h'_i=\frac{\xi_i}{1-p} h_i$，其中随机变量$\xi_i$满足$P(\xi_i=0)=p$，$P(\xi_i=1)=1-p$。由于$E(\xi_i)=1-p$，丢弃法不改变其输入的期望值，即$E(h'_i)=\frac{E(\xi_i)}{1-p} h_i = h_i$。

In [14]:
import torch
import torch.nn as nn
import numpy as np

In [15]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell
def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file backing a dotted module name.

    Only the last component of ``fullname`` is used, so ``"foo.bar"`` is
    looked up as ``bar.ipynb`` in each directory of ``path``.  When that file
    does not exist, underscores in the notebook name are replaced by spaces
    so that ``import Foo_Bar`` can find ``Foo Bar.ipynb``.

    Args:
        fullname: Fully qualified module name.
        path: Iterable of directories to search.  When empty or ``None`` the
            name is resolved relative to the current working directory.

    Returns:
        The path of the first matching notebook, or ``None`` if there is none.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    for d in path:
        nb_path = os.path.join(d, name + ".ipynb")
        if os.path.isfile(nb_path):
            return nb_path
        # Let ``import Notebook_Name`` find "Notebook Name.ipynb".  Only the
        # file name is rewritten: the directory may legitimately contain
        # underscores and must be left untouched.
        nb_path = os.path.join(d, name.replace("_", " ") + ".ipynb")
        if os.path.isfile(nb_path):
            return nb_path
    return None
        
class NotebookLoader(object):
    """Module loader that imports a Jupyter notebook by running its code cells.

    The loader keeps a reference to the running ``InteractiveShell`` so that
    cell sources can be passed through IPython's input transformer before
    being executed.
    """
    def __init__(self, path=None):
        """Remember the shell instance and the directories to search."""
        self.shell = InteractiveShell.instance()
        self.path = path

    def load_module(self, fullname):
        """Import a notebook as a module.

        The notebook is located with ``find_notebook``, read as an nbformat
        version 4 document, and every code cell is executed in the namespace
        of a freshly created module.  The module is always re-created, even
        if ``fullname`` is already present in ``sys.modules``.

        Args:
            fullname: Fully qualified name of the module to load.

        Returns:
            The populated module object, which is also stored in
            ``sys.modules[fullname]``.

        Raises:
            ImportError: If no notebook can be found for ``fullname``.
            Exception: Whatever a notebook cell raises; in that case the
                previous ``sys.modules`` entry (if any) is restored.
        """
        path = find_notebook(fullname, self.path)
        if path is None:
            # Fail with the exception type the import machinery expects
            # instead of a TypeError from opening ``None``.
            raise ImportError(
                "no Jupyter notebook found for module %r" % fullname,
                name=fullname,
            )

        print("importing Jupyter notebook from %s" % path)

        # Load the notebook object.
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        # Create the module and register it before running any cell so that
        # code inside the notebook can already see it in sys.modules.
        previous = sys.modules.get(fullname)
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # Extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's namespace.
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # Transform the input to executable Python.
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # Run the code in the module.
                    exec(code, mod.__dict__)
        except BaseException:
            # Do not leave a half-initialised module behind: put back what
            # was registered before this load started.
            if previous is None:
                sys.modules.pop(fullname, None)
            else:
                sys.modules[fullname] = previous
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod
class NotebookFinder(object):
    """Meta-path finder that locates Jupyter notebooks.

    Both the legacy ``find_module`` hook and the ``find_spec`` hook are
    provided; the latter is the only one the import system consults on
    Python 3.12 and newer.
    """
    def __init__(self):
        # One NotebookLoader per distinct search path, created on demand.
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname`` or ``None`` if no notebook matches.

        Args:
            fullname: Fully qualified module name being imported.
            path: Directories to search, or ``None``.
        """
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None

        # Lists aren't hashable, so the search path is collapsed into one
        # string; every empty path (None, [] ...) shares the ``None`` key.
        key = os.path.sep.join(path) if path else None

        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]

    def find_spec(self, fullname, path=None, target=None):
        """Return a module spec for ``fullname`` or ``None``.

        Thin wrapper around ``find_module`` that wraps the loader in a
        ``ModuleSpec``.  ``target`` is accepted for protocol compatibility
        and ignored.
        """
        # Imported locally so the class does not depend on a file-level import.
        from importlib.util import spec_from_loader

        loader = self.find_module(fullname, path)
        if loader is None:
            return None
        return spec_from_loader(fullname, loader)

# Register the notebook finder at the end of sys.meta_path, i.e. behind the
# finders that are already on the list.
sys.meta_path.append(NotebookFinder())

In [16]:
import d2l

In [17]:
def drop_out(X,drop_prob):
    """Apply inverted dropout to ``X``.

    Every element is zeroed independently with probability ``drop_prob``;
    the surviving elements are divided by ``1 - drop_prob`` so that the
    expected value of each output element equals the input element.

    Args:
        X: Input tensor; it is converted to ``torch.float`` first.
        drop_prob: Probability of dropping an element, in ``[0, 1]``.

    Returns:
        A float tensor with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # Every unit is dropped; return early so we never divide by zero below.
    if keep_prob == 0:
        return torch.zeros_like(X)
    # An element is KEPT when its uniform sample falls below keep_prob.
    # rand_like samples on the same device as X, so the multiplication below
    # also works for tensors that do not live on the default device.
    mask = (torch.rand_like(X) < keep_prob).float()

    return mask * X / keep_prob

In [18]:
# Demo input: the integers 0..15 arranged as a 2x8 tensor.
X=torch.arange(16).view(2,8)

In [19]:
# drop_prob=0 gives keep_prob=1: nothing is dropped and X comes back as floats.
drop_out(X,0)

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])

In [20]:
# drop_prob=0.5: each element is either zeroed or divided by keep_prob=0.5,
# i.e. doubled (compare the output below).
drop_out(X,0.5)

tensor([[ 0.,  0.,  0.,  0.,  8., 10.,  0.,  0.],
        [16., 18., 20.,  0., 24., 26., 28., 30.]])

In [21]:
# Layer sizes: 784 inputs, 10 outputs, and two hidden layers that both
# produce 256 outputs.
num_inputs = 784
num_outputs = 10
num_hiddens1 = 256
num_hiddens2 = 256

# Weights are sampled with NumPy from a normal distribution (mean 0,
# std 0.01) and turned into float tensors that track gradients; biases
# start at zero.  The weights are created in the order W1, W2, W3.
W1 = torch.tensor(
    np.random.normal(0, 0.01, size=(num_inputs, num_hiddens1)),
    dtype=torch.float,
    requires_grad=True,
)
b1 = torch.zeros(num_hiddens1, requires_grad=True)

W2 = torch.tensor(
    np.random.normal(0, 0.01, size=(num_hiddens1, num_hiddens2)),
    dtype=torch.float,
    requires_grad=True,
)
b2 = torch.zeros(num_hiddens2, requires_grad=True)

W3 = torch.tensor(
    np.random.normal(0, 0.01, size=(num_hiddens2, num_outputs)),
    dtype=torch.float,
    requires_grad=True,
)
b3 = torch.zeros(num_outputs, requires_grad=True)

# All trainable tensors, in layer order.
params = [W1, b1, W2, b2, W3, b3]


In [22]:
# The hidden layer next to the input gets the smaller drop probability;
# the later hidden layer can use a larger one.
drop_prob1 = 0.2
drop_prob2 = 0.5


def net(X,is_training=True):
    """Forward pass of the two-hidden-layer MLP with optional dropout.

    Args:
        X: Input batch; it is reshaped to ``(-1, num_inputs)``.
        is_training: When true, ``drop_out`` is applied after each hidden
            ReLU (with ``drop_prob1`` and ``drop_prob2`` respectively).

    Returns:
        The output-layer values ``H2 @ W3 + b3``.
    """
    flat = X.view(-1, num_inputs)

    hidden1 = torch.relu(torch.matmul(flat, W1) + b1)
    if is_training:
        hidden1 = drop_out(hidden1, drop_prob1)

    hidden2 = torch.relu(torch.matmul(hidden1, W2) + b2)
    if is_training:
        hidden2 = drop_out(hidden2, drop_prob2)

    return torch.matmul(hidden2, W3) + b3

In [23]:
def evaluate_accuracy(data_iter,net):
    """Return the fraction of samples in ``data_iter`` that ``net`` gets right.

    Dropout must be inactive while measuring accuracy, so:

    * a ``torch.nn.Module`` is switched to evaluation mode for the duration
      of the call and then put back into whatever mode it was in before;
    * a plain function that declares an ``is_training`` argument is called
      with ``is_training=False``;
    * any other callable is called with the batch only.

    Args:
        data_iter: Iterable of ``(X, y)`` batches.
        net: Model to evaluate (module or callable).

    Returns:
        Number of correct predictions divided by the number of labels.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    is_module = isinstance(net, torch.nn.Module)

    # Only real function arguments count, not local variables that happen
    # to be called ``is_training``; callables without ``__code__`` are
    # simply called with the batch.
    code = getattr(net, '__code__', None)
    takes_is_training = (
        not is_module
        and code is not None
        and 'is_training'
        in code.co_varnames[:code.co_argcount + code.co_kwonlyargcount]
    )

    acc_sum = 0.0  # number of correctly classified samples
    n = 0          # number of samples seen

    was_training = net.training if is_module else False
    if is_module:
        net.eval()  # evaluation mode: turns dropout off
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for X, y in data_iter:
                if takes_is_training:
                    y_hat = net(X, is_training=False)
                else:
                    y_hat = net(X)
                acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]  # y holds one label per sample in the batch
    finally:
        if is_module:
            # Restore the caller's mode instead of forcing training mode.
            net.train(was_training)
    return acc_sum / n

In [24]:
def train_ch3(net,train_iter,test_iter,loss,epochs,batch_size,params=None,lr=None,optimizer=None):
    """Train ``net`` and print loss and accuracy after every epoch.

    Args:
        net: Callable model; ``net(X)`` must return per-class scores.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` batches passed to
            ``evaluate_accuracy`` at the end of each epoch.
        loss: Callable ``loss(y_hat, y)``; its result is summed.
        epochs: Number of passes over ``train_iter``.
        batch_size: Forwarded to ``d2l.sgd`` when no optimizer is given.
        params: Tensors updated by ``d2l.sgd`` when no optimizer is given.
        lr: Learning rate forwarded to ``d2l.sgd``.
        optimizer: Optional optimizer; when given it is used for both
            gradient clearing and the update step, and ``params`` / ``lr``
            are ignored.

    The printed "loss" is the sum of the per-batch loss values divided by
    the number of training samples seen in the epoch.
    """
    for epoch in range(epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)  # predictions for this batch
            l = loss(y_hat, y).sum()

            # Clear the gradients left over from the previous step.
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None:
                # Check each parameter on its own: some may not have a
                # gradient yet even when the first one does.
                for param in params:
                    if param.grad is not None:
                        param.grad.data.zero_()

            l.backward()

            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                optimizer.step()

            train_l_sum += l.item()  # accumulate the loss
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()  # correct predictions
            # y holds one label per sample in the batch
            n += y.shape[0]
        # End of epoch: measure accuracy on the test data.
        test_acc = evaluate_accuracy(test_iter, net)
        print("Epoch %d, loss %.4f, train accuracy %.3f, test accuracy %.3f" %(epoch+1,train_l_sum/n,train_acc_sum/n,test_acc))
            

In [25]:
# Hyper-parameters.  Original author's note: the learning rate is set
# fairly large because the hand-written SGD is used here.
# NOTE(review): presumably d2l.sgd divides the gradient by batch_size on top
# of the averaging done by CrossEntropyLoss -- confirm against d2l.
num_epochs = 5
lr = 100.0
batch_size = 256

loss = torch.nn.CrossEntropyLoss()
train_iter, test_iter = d2l.load_data_from_fmnist(batch_size)
train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
          params=params, lr=lr)

Epoch 1, loss 0.0045, train accuracy 0.557, test accuracy 0.752
Epoch 2, loss 0.0023, train accuracy 0.786, test accuracy 0.825
Epoch 3, loss 0.0019, train accuracy 0.821, test accuracy 0.820
Epoch 4, loss 0.0017, train accuracy 0.838, test accuracy 0.790
Epoch 5, loss 0.0016, train accuracy 0.849, test accuracy 0.847


### 简洁实现

In [28]:
# Concise version of the model built from torch.nn layers: two hidden
# Linear+ReLU stages, each followed by nn.Dropout.  The output width uses
# num_outputs like every other layer size instead of a literal 10.
net = nn.Sequential(
        d2l.FlattenLayer(),  # custom reshape layer provided by d2l
        nn.Linear(num_inputs,num_hiddens1),
        nn.ReLU(),
        nn.Dropout(drop_prob1),
        nn.Linear(num_hiddens1,num_hiddens2),
        nn.ReLU(),
        nn.Dropout(drop_prob2),
        nn.Linear(num_hiddens2,num_outputs)
        )

In [29]:
# Overwrite every tensor returned by net.parameters() with samples from a
# normal distribution (mean 0, std 0.01).
for param in net.parameters():
    nn.init.normal_(param,mean=0,std=0.01)

In [31]:
# Train the nn.Sequential model with torch's SGD optimizer; params and lr
# keep their None defaults because the optimizer performs the update.
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
          optimizer=optimizer)

Epoch 1, loss 0.0046, train accuracy 0.546, test accuracy 0.741
Epoch 2, loss 0.0023, train accuracy 0.787, test accuracy 0.744
Epoch 3, loss 0.0019, train accuracy 0.823, test accuracy 0.809
Epoch 4, loss 0.0017, train accuracy 0.837, test accuracy 0.837
Epoch 5, loss 0.0016, train accuracy 0.847, test accuracy 0.845
