In [1]:
import torch
from torch import nn

In [26]:
def batch_norm(X,gamma,beta,moving_mean,moving_var,eps,momentum):
    if not torch.is_grad_enabled():
        X_hat=(X-moving_mean) / torch.sqrt(moving_var+eps)
    else:
        assert len(X.shape) in (2,4)
        if len(X.shape)==2:
            mean=X.mean(dim=0)
            var=((X-mean)**2).mean(dim=0)
        else:
            mean=X.mean(dim=(0,2,3),keepdim=True)
            var=((X-mean)**2).mean(dim=(0,2,3),keepdim=True)
        X_hat=(X-mean)/torch.sqrt(var+eps)
        moving_mean=momentum*moving_mean+(1.0-momentum)*mean
        moving_var=momentum*moving_var+(1.0-momentum)*var
    Y=gamma*X_hat+beta
    return Y,moving_mean.data,moving_var.data
        

In [27]:
class BatchNorm(nn.Module):
    def __init__(self,num_features,num_dims):
        super().__init__()
        if num_dims==2:
            shape=(1,num_features)
        else:
            shape=(1,num_features,1,1)
        self.gamma=nn.Parameter(torch.ones(shape))
        self.beta=nn.Parameter(torch.zeros(shape))
        
        self.moving_mean=torch.zeros(shape)
        self.moving_var=torch.ones(shape)
        
    def forward(self,X):
        if self.moving_mean.device != X.device:
            self.moving_mean=self.moving_mean.to(X.device)
            self.moving_var=self.moving_var.to(X.device)
        Y,self.moving_mean,self.moving_var=batch_norm(X,self.gamma,self.beta,self.moving_mean,self.moving_var,eps=1e-5,momentum=0.9)
        return Y

In [28]:
net=nn.Sequential(
    nn.Conv2d(1,6,kernel_size=5),BatchNorm(6,num_dims=4),nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2,stride=2),
    nn.Conv2d(6,16,kernel_size=5),BatchNorm(16,num_dims=4),nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2,stride=2),nn.Flatten(),
    nn.Linear(16*4*4,120),BatchNorm(120,num_dims=2),nn.Sigmoid(),
    nn.Linear(120,84),BatchNorm(84,num_dims=2),nn.Sigmoid(),
    nn.Linear(84,10)
)

In [29]:
import torchvision
from torchvision import transforms

In [30]:
batch_size=256
mnist_train=torchvision.datasets.FashionMNIST(
    root='F:\study\ml\DataSet\FashionMNIST',train=True,
    download=True,transform=transforms.ToTensor())
mnist_test=torchvision.datasets.FashionMNIST(
    root='F:\study\ml\DataSet\FashionMNIST',train=False,
    download=True,transform=transforms.ToTensor())

In [31]:
train_iter=torch.utils.data.DataLoader(mnist_train,batch_size,shuffle=True)
test_iter=torch.utils.data.DataLoader(mnist_test,batch_size,shuffle=False)

In [32]:
def evaluate_accuracy_gpu(net,data_iter,device=None):
    if isinstance(net,nn.Module):
        net.eval()
        if not device:
            device=next(iter(net.parameters())).device
    l_sum=[]
    for X,y in data_iter:
        y_hat=net(X)
        l_sum.append( (sum(torch.argmax(y_hat,dim=1).reshape(y.shape) == y)/y.shape[0]).item())
    return sum(l_sum)/len(l_sum)

In [33]:
def train_ch6(net,train_iter,test_iter,num_epochs,lr,device):
    def init_weight(m):
        if type(m)==nn.Linear or type(m)==nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weight)
    print('training on',device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss=nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        tr_l=[]
        tr_acc=[]
        net.train()
        for X,y in train_iter:
            optimizer.zero_grad()
            X,y=X.to(device),y.to(device)
            y_hat=net(X)
#             print(y_hat)
            l=loss(y_hat,y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                tr_l.append(l.item())
#                 print(y.shape)
#                 print(y_hat.shape)
#                 print(y_hat)
#                 print(y)
#                 print(torch.argmax(y_hat,dim=1))
                tr_acc.append( (sum(torch.argmax(y_hat,dim=1).reshape(y.shape) == y)/y.shape[0]).item())
        test_acc=evaluate_accuracy_gpu(net,test_iter)
        print('epoch : ',epoch ,' train loss : ',sum(tr_l)/len(tr_l),' train acc : ', sum(tr_acc)/len(tr_acc),'test acc : ',test_acc)

In [34]:
lr, num_epochs, batch_size = 1.0, 10, 256
train_ch6(net, train_iter, test_iter, num_epochs, lr, 'cpu')

training on cpu
epoch :  0  train loss :  0.742442106693349  train acc :  0.7330562944107867 test acc :  0.7291015625
epoch :  1  train loss :  0.49389492007012065  train acc :  0.8191988031914894 test acc :  0.83701171875
epoch :  2  train loss :  0.4033650502245477  train acc :  0.853407579787234 test acc :  0.85224609375
epoch :  3  train loss :  0.3601266277597306  train acc :  0.8678025265957446 test acc :  0.8529296875
epoch :  4  train loss :  0.3331690793341779  train acc :  0.8779421542553192 test acc :  0.85576171875
epoch :  5  train loss :  0.31196352199037025  train acc :  0.8865359042553191 test acc :  0.84599609375
epoch :  6  train loss :  0.2955932251316436  train acc :  0.8908300087807026 test acc :  0.88388671875
epoch :  7  train loss :  0.28437234112556947  train acc :  0.8952349289934686 test acc :  0.8599609375
epoch :  8  train loss :  0.27344608154702693  train acc :  0.8999168882978723 test acc :  0.8734375
epoch :  9  train loss :  0.2651952926782852  train a