In [1]:
import torch
import matplotlib.pyplot as plt
from torch.nn import init
import torch.utils.data as Data
import torch.nn as nn
import numpy as np

# Hyper-parameters for the batch-normalization experiment.
SEED=1            # fixed seed so a fresh "Restart & Run All" reproduces the same numbers
N_SAMPLES=200
BATCH_SIZE=64
EPOCH=12
LR=0.03
N_HIDDEN=8
ACTIVATION=torch.tanh
B_INIT=-0.2       # use a bad bias constant initializer

# Seed the global NumPy and PyTorch generators: the noise drawn below, and any
# later draws from these global generators, then repeat from run to run.
np.random.seed(SEED)
torch.manual_seed(SEED)

# training data: y = x^2 - 5 + noise, as column vectors of shape (N_SAMPLES, 1)
x=np.linspace(-7,10,N_SAMPLES)[:,np.newaxis]
noise=np.random.normal(0,2,x.shape) # Gaussian noise with mean 0 and standard deviation 2
y=np.square(x)-5+noise

# test data: same curve and input range, with an independent noise draw
test_x=np.linspace(-7,10,200)[:,np.newaxis]
noise=np.random.normal(0,2,test_x.shape)
test_y=np.square(test_x)-5+noise

# NumPy produces float64 arrays; convert everything to float32 tensors
train_x,train_y=torch.from_numpy(x).float(),torch.from_numpy(y).float()
test_x=torch.from_numpy(test_x).float()
test_y=torch.from_numpy(test_y).float()

# Pair the training inputs with their targets and serve them in shuffled mini-batches.
train_dataset = Data.TensorDataset(train_x, train_y)
train_loader = Data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)

# Preview the noisy training samples.
plt.scatter(
    train_x.numpy(),
    train_y.numpy(),
    c='#FF9359',
    s=50,
    alpha=0.2,
    label='train',
)
plt.legend(loc='upper left')

class Net(nn.Module):
    """Stack of 10-unit fully connected layers with optional batch normalization.

    Args:
        batch_normalization: when True, the input and every hidden
            pre-activation are passed through ``nn.BatchNorm1d`` layers.
        n_hidden: number of hidden layers; defaults to the module-level
            ``N_HIDDEN`` when None.
        activation: callable applied after every hidden layer; defaults to
            the module-level ``ACTIVATION`` when None.
        b_init: constant every bias is initialised to; defaults to the
            module-level ``B_INIT`` when None.
    """
    def __init__(self,batch_normalization=False,n_hidden=None,activation=None,b_init=None):
        super(Net,self).__init__()
        self.do_bn=batch_normalization
        # Fall back to the notebook-wide settings only when no override is given.
        self.n_hidden=N_HIDDEN if n_hidden is None else n_hidden
        self.activation=ACTIVATION if activation is None else activation
        self.b_init=B_INIT if b_init is None else b_init
        self.fcs=[]   # fully connected hidden layers, in forward order
        self.bns=[]   # batch-norm layers matching self.fcs (empty when do_bn is False)
        self.bn_input=nn.BatchNorm1d(1,momentum=0.5)   # batch norm for the raw input

        for i in range(self.n_hidden):   # build the hidden layers, first layer first
            input_size=1 if i==0 else 10
            fc=nn.Linear(input_size,10)
            # setattr registers the layer as a sub-module (like self.fc0=fc), so its
            # parameters show up in parameters(); the plain list alone would not do that.
            setattr(self,'fc%i'%i,fc)
            self._set_init(fc)
            self.fcs.append(fc)
            if self.do_bn:
                bn=nn.BatchNorm1d(10,momentum=0.5)
                setattr(self,'bn%i'%i,bn)
                self.bns.append(bn)

        self.predict=nn.Linear(10,1)   # output layer
        self._set_init(self.predict)

    def _set_init(self,layer):
        """Initialise ``layer``: weights ~ N(0, 1), every bias set to ``self.b_init``."""
        init.normal_(layer.weight,mean=0,std=1)
        init.constant_(layer.bias,self.b_init)

    def forward(self,x):
        """Run the network on ``x``.

        Returns:
            A tuple ``(out, layer_input, pre_activation)``:
            ``out`` is the output of the final linear layer;
            ``layer_input`` holds the (optionally normalized) network input
            followed by each hidden layer's activation;
            ``pre_activation`` holds the raw input followed by each hidden
            layer's linear output taken before batch norm and activation.
        """
        pre_activation=[x]   # raw input, before any normalization
        # The normalized tensor must be assigned back to x; calling bn_input
        # without using its result would leave the input un-normalized.
        if self.do_bn:x=self.bn_input(x)
        layer_input=[x]      # what the first hidden layer actually receives
        for i,fc in enumerate(self.fcs):
            x=fc(x)
            pre_activation.append(x)
            if self.do_bn:x=self.bns[i](x)
            x=self.activation(x)
            layer_input.append(x)
        out=self.predict(x)

        return out ,layer_input,pre_activation
            
# Two networks trained side by side: index 0 without batch normalization, index 1 with it.
nets = [Net(batch_normalization=use_bn) for use_bn in (False, True)]

# Each network gets its own Adam optimizer; both are scored with the same MSE criterion.
opts = [torch.optim.Adam(model.parameters(), lr=LR) for model in nets]
loss_func = nn.MSELoss()

def plot_histogram(l_in, l_in_bn, pre_ac, pre_ac_bn):
    """Redraw the per-layer histograms on the module-level ``axs`` grid.

    Column ``i`` of the grid shows entry ``i`` of each argument:
    row 0 is ``pre_ac`` and row 1 is ``pre_ac_bn`` (pre-activations of the
    plain and batch-normalized networks), row 2 is ``l_in`` and row 3 is
    ``l_in_bn`` (the corresponding layer inputs / activations).
    """
    plain_color = '#FF9359'
    bn_color = '#74BCFF'
    for i in range(axs.shape[1]):
        ax_pa, ax_pa_bn, ax, ax_bn = axs[0, i], axs[1, i], axs[2, i], axs[3, i]
        for a in (ax_pa, ax_pa_bn, ax, ax_bn):
            a.clear()

        # Column 0 uses a (-7, 10) window on all four rows; the other columns
        # use (-4, 4) for pre-activations and (-1, 1) for activations.
        if i == 0:
            p_range = (-7, 10)
            the_range = (-7, 10)
        else:
            p_range = (-4, 4)
            the_range = (-1, 1)

        ax_pa.set_title('L' + str(i))
        ax_pa.hist(pre_ac[i].data.numpy().ravel(), bins=10, range=p_range, color=plain_color, alpha=0.5)
        ax_pa_bn.hist(pre_ac_bn[i].data.numpy().ravel(), bins=10, range=p_range, color=bn_color, alpha=0.5)
        ax.hist(l_in[i].data.numpy().ravel(), bins=10, range=the_range, color=plain_color)
        ax_bn.hist(l_in_bn[i].data.numpy().ravel(), bins=10, range=the_range, color=bn_color)

        # Strip every tick first, then put x ticks back on the two batch-norm rows.
        for a in (ax_pa, ax, ax_pa_bn, ax_bn):
            a.set_yticks(())
            a.set_xticks(())
        ax_pa_bn.set_xticks(p_range)
        ax_bn.set_xticks(the_range)

        # Row labels live on column 0 and are re-applied on every pass.
        axs[0, 0].set_ylabel('PreAct')
        axs[1, 0].set_ylabel('BN PreAct')
        axs[2, 0].set_ylabel('Act')
        axs[3, 0].set_ylabel('BN Act')
    plt.pause(0.01)


# Histogram grid read by plot_histogram: 4 rows x (input + N_HIDDEN layers) columns.
f, axs = plt.subplots(4, N_HIDDEN + 1, figsize=(10, 5))
plt.ion()  # interactive mode, so plt.pause() in plot_histogram can refresh the figure
plt.show()

losses=[[] for _ in nets]   # per-network test loss, one entry per epoch

for epoch in range(EPOCH):
    print('Epoch',epoch)
    layer_inputs,pre_acts=[],[]
    # Evaluate both networks on the test set before this epoch's training.
    for net, l in zip(nets, losses):
        net.eval()   # eval mode: batch-norm layers use their running statistics
        with torch.no_grad():   # evaluation only, so no autograd graph is needed
            pred, layer_input, pre_act = net(test_x)
            l.append(loss_func(pred, test_y).item())
        layer_inputs.append(layer_input)
        pre_acts.append(pre_act)
        net.train()  # back to training mode for the mini-batch updates below
    plot_histogram(*layer_inputs, *pre_acts)

    for step, (b_x, b_y) in enumerate(train_loader):
        for net, opt in zip(nets, opts):     # same mini-batch for each network
            pred, _, _ = net(b_x)
            loss = loss_func(pred, b_y)
            opt.zero_grad()
            loss.backward()
            opt.step()

plt.ioff()

# Plot the test loss recorded at the start of every epoch.
# plt.figure() without a number always opens a new figure; a fixed number
# could re-select an already existing figure (e.g. the histogram grid).
plt.figure()
plt.plot(losses[0], c='#FF9359', lw=3, label='Original')
plt.plot(losses[1], c='#74BCFF', lw=3, label='Batch Normalization')
plt.xlabel('epoch');plt.ylabel('test loss');plt.ylim((0, 2000));plt.legend(loc='best')

# Final evaluation: eval mode freezes the batch-norm running mean/variance.
for net in nets:
    net.eval()
with torch.no_grad():
    preds = [net(test_x)[0] for net in nets]
plt.figure()
plt.plot(test_x.numpy(), preds[0].numpy(), c='#FF9359', lw=4, label='Original')
plt.plot(test_x.numpy(), preds[1].numpy(), c='#74BCFF', lw=4, label='Batch Normalization')
# These points are the test set, so they are labelled accordingly.
plt.scatter(test_x.numpy(), test_y.numpy(), c='r', s=50, alpha=0.2, label='test')
plt.legend(loc='best')
plt.show()

<Figure size 640x480 with 1 Axes>

<Figure size 1000x500 with 36 Axes>

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

In [None]:
# Dropout: reducing overfitting
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt

SEED=1            # fixed seed so the noisy samples repeat across runs
N_SAMPLES=20
N_HIDDEN=300

torch.manual_seed(SEED)   # seeds the global PyTorch generator used by torch.normal below

# Tensors are used directly: the Variable wrapper is deprecated and simply
# returns the tensor, so wrapping adds nothing.

# training data: y = x + 0.3 * standard normal noise, column vectors of shape (N_SAMPLES, 1)
x=torch.unsqueeze(torch.linspace(-1,1,N_SAMPLES),1)
y=x+0.3*torch.normal(torch.zeros(N_SAMPLES,1),torch.ones(N_SAMPLES,1))   # torch.normal(mean, std)

# test data: same inputs, independent noise draw
test_x=torch.unsqueeze(torch.linspace(-1,1,N_SAMPLES),1)
test_y=test_x+0.3*torch.normal(torch.zeros(N_SAMPLES,1),torch.ones(N_SAMPLES,1))

# Show the training and test samples together before any fitting.
for inputs, targets, colour, tag in (
    (x, y, 'magenta', 'train'),
    (test_x, test_y, 'cyan', 'test'),
):
    plt.scatter(inputs.data.numpy(), targets.data.numpy(), c=colour, s=50, alpha=0.5, label=tag)
plt.legend(loc='upper left')
plt.ylim((-2.5, 2.5))
plt.show()

# Baseline: two hidden layers of N_HIDDEN units and no regularisation.
net_overfitting=torch.nn.Sequential(
    torch.nn.Linear(1,N_HIDDEN),
    torch.nn.ReLU(),
    torch.nn.Linear(N_HIDDEN,N_HIDDEN),
    torch.nn.ReLU(),
    torch.nn.Linear(N_HIDDEN,1)
)

# Same architecture with a Dropout layer after each hidden Linear layer.
net_dropped=torch.nn.Sequential(
    torch.nn.Linear(1,N_HIDDEN),
    torch.nn.Dropout(0.5),   # drop 50% of the units during training
    torch.nn.ReLU(),
    torch.nn.Linear(N_HIDDEN,N_HIDDEN),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(N_HIDDEN,1)
)

print(net_overfitting)
print(net_dropped)   # the dropout model is named net_dropped; net_dropout does not exist

# One Adam optimizer per network; both are trained against the same MSE criterion.
optimizer_ofit=torch.optim.Adam(net_overfitting.parameters(),lr=0.01)
optimizer_drop=torch.optim.Adam(net_dropped.parameters(),lr=0.01)
loss_func=torch.nn.MSELoss()

plt.ion()   # interactive mode, so the figure can be refreshed while training

for t in range(500):
    # One optimisation step for the plain network.
    pred_ofit=net_overfitting(x)
    loss_ofit=loss_func(pred_ofit,y)
    optimizer_ofit.zero_grad()
    loss_ofit.backward()
    optimizer_ofit.step()

    # One optimisation step for the dropout network.
    pred_drop=net_dropped(x)
    loss_drop=loss_func(pred_drop,y)
    optimizer_drop.zero_grad()
    loss_drop.backward()
    optimizer_drop.step()

    # Only every 10th step is visualised.
    if t%10!=0:
        continue

    # Eval mode disables dropout, so the test predictions are deterministic.
    net_overfitting.eval()
    net_dropped.eval()

    plt.cla()
    test_pred_ofit=net_overfitting(test_x)
    test_pred_drop=net_dropped(test_x)
    test_loss_ofit=loss_func(test_pred_ofit, test_y).data.numpy()
    test_loss_drop=loss_func(test_pred_drop, test_y).data.numpy()
    test_x_np=test_x.data.numpy()

    plt.scatter(x.data.numpy(), y.data.numpy(), c='magenta', s=50, alpha=0.3, label='train')
    plt.scatter(test_x_np, test_y.data.numpy(), c='cyan', s=50, alpha=0.3, label='test')
    plt.plot(test_x_np, test_pred_ofit.data.numpy(), 'r-', lw=3, label='overfitting')
    plt.plot(test_x_np, test_pred_drop.data.numpy(), 'b--', lw=3, label='dropout(50%)')
    plt.text(0, -1.2, 'overfitting loss=%.4f' % test_loss_ofit, fontdict={'size': 20, 'color':  'red'})
    plt.text(0, -1.5, 'dropout loss=%.4f' % test_loss_drop, fontdict={'size': 20, 'color': 'blue'})
    plt.legend(loc='upper left')
    plt.ylim((-2.5, 2.5))
    plt.pause(0.1)

    # Back to training mode so dropout is active again for the next updates.
    net_overfitting.train()
    net_dropped.train()

plt.ioff()
plt.show()