In [1]:
import torch
from torch import nn
from d2l import torch as d2l

## **NiN（Network in Network）的思路非常有趣。在 AlexNet、VGG 等网络中，最后一个卷积层的输出会被展平后接入全连接层，这会带来非常多的参数，而参数过多正是导致过拟合的重要原因之一。NiN 可以缓解这个问题。在一个 NiN 块中，我们在普通卷积层之后使用两个 1×1 的卷积层；1×1 卷积相当于在每个像素位置上对所有输入通道做一次全连接，因此输出的每个特征点融合了输入各通道在对应位置上的信息。经过多个 NiN 块之后，最关键的一点来了：我们用自适应平均池化把最后输出的每个通道的空间维度压缩到 1×1，再 reshape 成二维（批量大小 × 通道数），并把每个输出通道视为一个类别的得分，从而不再需要全连接层。**
* **NiN 块由一个普通卷积层和两个 1×1 卷积层构成。普通卷积层负责提取空间特征，1×1 卷积层负责融合不同通道的信息；每个卷积层之后都接 ReLU 激活函数以引入非线性。**

In [2]:
# Mini-batch size shared by the training and test iterators.
batch_size = 128
# Load Fashion-MNIST with every image resized to 224x224.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)

In [3]:
def nin_block(in_channels, out_channels, kernel_size, strides, padding):
    """Build one NiN block.

    The block is a regular convolution followed by two 1x1 convolutions,
    with a ReLU after each of the three convolutions.

    Args:
        in_channels: Number of channels of the block input.
        out_channels: Number of channels produced by every convolution in
            the block.
        kernel_size: Kernel size of the first (spatial) convolution.
        strides: Stride of the first convolution.
        padding: Padding of the first convolution.

    Returns:
        An ``nn.Sequential`` holding the six layers in order.
    """
    # Spatial convolution that changes the channel count.
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size, strides, padding),
        nn.ReLU(),
    ]
    # Two 1x1 convolutions that mix information across channels.
    for _ in range(2):
        layers += [nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU()]
    return nn.Sequential(*layers)


In [4]:
# NiN network: three NiN blocks, each followed by 3x3 max pooling with
# stride 2, then dropout and a final NiN block that feeds global average
# pooling.
net = nn.Sequential(
    nin_block(1, 96, 11, 4, 0),
    nn.MaxPool2d(3, stride=2),
    nin_block(96, 256, 5, 1, 2),
    nn.MaxPool2d(3, stride=2),
    nin_block(256, 384, 3, 1, 1),
    nn.MaxPool2d(3, stride=2),
    nn.Dropout(p=0.5),
    # There are 10 label classes, so the last block emits 10 channels.
    nin_block(384, 10, 3, 1, 1),
    nn.AdaptiveAvgPool2d((1, 1)),
    # Turn the 4-D output into a 2-D output of shape (batch size, 10).
    nn.Flatten(),
)

In [6]:
# Feed one random single-channel 224x224 input through the network and
# print the output shape after each top-level layer.
x = torch.rand(1, 1, 224, 224)
for layer in net:
    x = layer(x)
    print(type(layer).__name__, 'output_size:', x.shape)

Sequential output_size: torch.Size([1, 96, 54, 54])
MaxPool2d output_size: torch.Size([1, 96, 26, 26])
Sequential output_size: torch.Size([1, 256, 26, 26])
MaxPool2d output_size: torch.Size([1, 256, 12, 12])
Sequential output_size: torch.Size([1, 384, 12, 12])
MaxPool2d output_size: torch.Size([1, 384, 5, 5])
Dropout output_size: torch.Size([1, 384, 5, 5])
Sequential output_size: torch.Size([1, 10, 5, 5])
AdaptiveAvgPool2d output_size: torch.Size([1, 10, 1, 1])
Flatten output_size: torch.Size([1, 10])


In [49]:
# Number of batches in train_iter, and that count integer-divided by 5.
(len(train_iter), len(train_iter) // 5)

(469, 93)

In [50]:
def train_simple_NiN( net , train_iter ,  test_iter , num_epochs , lr , device ):
    """Train ``net`` with Adam and cross-entropy loss, printing progress.

    Within each epoch the running average training loss and accuracy are
    printed roughly three times and once more after the last batch. After
    each epoch the test accuracy is computed with
    ``d2l.evaluate_accuracy_gpu`` and printed.

    Args:
        net: The ``nn.Module`` to train; it is moved to ``device`` in place.
        train_iter: Sized iterable yielding ``(X, y)`` training batches.
        test_iter: Iterable of batches passed to ``d2l.evaluate_accuracy_gpu``.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate handed to ``torch.optim.Adam``.
        device: Device on which the network and the batches are placed.

    Returns:
        None. All results are reported through ``print``.
    """
    print('training on ', device )
    net.to(device)
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr)
    num_batch = len(train_iter)
    # Report about three times per epoch. Clamp to 1 so that a loader with
    # fewer than three batches does not trigger a modulo-by-zero.
    log_every = max(1, num_batch // 3)
    for epoch in range(num_epochs):
        # Accumulates: summed loss, number of correct predictions, sample count.
        metrics = d2l.Accumulator(3)
        # Re-enter training mode every epoch: the evaluation helper called at
        # the end of the previous epoch (d2l.evaluate_accuracy_gpu) puts the
        # module in eval mode, which would otherwise leave Dropout disabled
        # for all later epochs.
        net.train()
        for i, (X, y) in enumerate(train_iter):
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                # The loss is a batch mean, so scale it back to a batch sum.
                metrics.add(l * X.shape[0], d2l.accuracy(y_hat, y), y.numel())
            if (i + 1) % log_every == 0 or i == num_batch - 1:
                # Running averages over the samples seen so far in this epoch.
                train_loss = metrics[0] / metrics[2]
                train_acc = metrics[1] / metrics[2]
                print( f'num_batch:{i+1} , avg_loss:{train_loss:.2f} , avg_accuracy:{train_acc:.2f}' )
        test_acc = d2l.evaluate_accuracy_gpu(net, test_iter, device)
        print( f'epochs:{epoch} ,test_acc:{test_acc:.2f} ')
        



In [51]:
# Hyperparameters for the hand-written training loop.
self_lr = 0.001
self_num_epochs = 10
train_simple_NiN(
    net,
    train_iter,
    test_iter,
    num_epochs=self_num_epochs,
    lr=self_lr,
    device=d2l.try_gpu(),
)

training on  cuda:0


KeyboardInterrupt: 

In [None]:
# Same learning rate and epoch count, this time passed to d2l.train_ch6
# instead of the hand-written loop.
lr = 0.001
num_epochs = 10
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())