In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models

### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Fully connected block: a Linear layer followed by a role-specific tail.

    The tail depends on how the block is used:

    * hidden layer (``is_output=False``):
      Linear -> Dropout -> BatchNorm1d -> LeakyReLU
    * single-unit output (``is_output=True`` and ``out_channels == 1``):
      Linear -> Sigmoid
    * multi-class output (``is_output=True`` otherwise):
      Linear -> Softmax over the last (class) axis

    Args:
        in_channels: Size of each input feature vector.
        out_channels: Size of each output feature vector.
        bias: Whether the Linear layer learns an additive bias.
        dropout: Drop probability for the hidden-layer Dropout; ignored when
            ``is_output`` is True.
        is_output: Build an output head (Sigmoid / Softmax) instead of a
            hidden block.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super(LinearBNAC, self).__init__()
        if is_output and out_channels==1:
            self.linear = nn.Sequential(
                nn.Linear(in_channels, out_channels, bias=bias),
                nn.Sigmoid()
            )
        elif is_output:
            self.linear = nn.Sequential(
                nn.Linear(in_channels, out_channels, bias=bias),
                # nn.Linear always acts on the last axis, so the class scores
                # live there too. dim=-1 equals the previous dim=1 for
                # (batch, features) input and also normalises correctly for
                # unbatched (features,) or (batch, seq, features) input.
                nn.Softmax(dim=-1)
            )
        else:
            self.linear = nn.Sequential(
                nn.Linear(in_channels, out_channels, bias=bias),
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True)
            )

    def forward(self, x):
        """Run ``x`` through the Sequential stack built in ``__init__``."""
        out = self.linear(x)

        return out

In [3]:
class Model(nn.Module):
    """Four-stage multilayer perceptron: input -> 128 -> 64 -> 32 -> output.

    Every hidden stage is a ``LinearBNAC`` block (Linear, Dropout,
    BatchNorm1d, LeakyReLU). The hidden stages previously mixed one
    ``LinearBNAC`` with two bare ``nn.Linear`` layers; consecutive Linear
    layers without a nonlinearity in between collapse into a single affine
    map, so those stages added parameters but no representational power.

    Args:
        input_dimention: Number of input features per sample (spelling kept
            because callers pass it by keyword).
        output_classes: Width of the output head. ``1`` yields a Sigmoid
            head, any other value a Softmax head.
    """

    def __init__(self, input_dimention, output_classes=1):
        super(Model, self).__init__()
        self.layer1 = LinearBNAC(input_dimention, 128)
        self.layer2 = LinearBNAC(128, 64)
        self.layer3 = LinearBNAC(64, 32)
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Feed ``x`` through the three hidden blocks and the output head."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.output(x)

        return x

### 準備輸入資料、優化器、標籤資料、模型輸出

In [4]:
# 10-way classifier over 256-dimensional feature vectors.
model = Model(256, output_classes=10)
# Adam with its default hyperparameters, driving every parameter of the model.
optimizer = optim.Adam(params=model.parameters())

In [5]:
# Synthetic mini-batch: 4 samples with 256 features each, drawn from N(0, 1).
batch_size, input_features = 4, 256
dummy_input = torch.randn((batch_size, input_features))

# One integer class index per sample, stored as int64.
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [6]:
# Forward pass of the dummy batch. The output head ends in Softmax, so each
# row of `output` is a probability distribution over the 10 classes.
# NOTE(review): model.eval() is never called, so Dropout is presumably active
# here and repeated runs give different values — confirm this is intended.
output = model(dummy_input)
print(output)

tensor([[0.1135, 0.1016, 0.0895, 0.1025, 0.1016, 0.0793, 0.0998, 0.0846, 0.1183,
         0.1094],
        [0.1097, 0.0740, 0.1077, 0.1046, 0.0987, 0.1123, 0.0842, 0.1017, 0.1153,
         0.0918],
        [0.1154, 0.0894, 0.0954, 0.1160, 0.0879, 0.1067, 0.0852, 0.0845, 0.1113,
         0.1082],
        [0.1048, 0.1028, 0.1129, 0.1033, 0.0959, 0.0871, 0.0917, 0.0993, 0.1092,
         0.0929]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [7]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [8]:
# Negative log-likelihood loss with default settings. The model already
# applies Softmax, so the loss is fed log(probabilities) instead of using
# CrossEntropyLoss, which would apply a second (log-)softmax.
criterion = NLLLoss()

In [9]:
# NLLLoss consumes log-probabilities, so take the log of the Softmax output.
# Clamp to the smallest positive normal float first: a probability that
# underflows to exactly 0 would otherwise give log(0) = -inf and poison the
# loss and its gradients with inf/NaN.
log_probs = torch.log(torch.clamp(output, min=torch.finfo(output.dtype).tiny))
loss = criterion(log_probs, target)

### 完成 back propagation 並更新權重

In [10]:
# Back-propagate from the scalar loss: fills .grad on the parameters that
# contributed to it. The weights themselves are not modified by this call.
loss.backward()

In [11]:
# Snapshot after backward(): weights of the first Linear layer are still the
# initial values, while .grad now holds the freshly computed gradient.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0328,  0.0592,  0.0289,  ...,  0.0292,  0.0281,  0.0100],
        [-0.0042,  0.0019,  0.0056,  ..., -0.0375, -0.0101, -0.0070],
        [ 0.0466,  0.0006,  0.0539,  ...,  0.0495,  0.0426,  0.0214],
        ...,
        [ 0.0258, -0.0082,  0.0410,  ..., -0.0603,  0.0051, -0.0051],
        [ 0.0265, -0.0183,  0.0157,  ..., -0.0511, -0.0568, -0.0195],
        [ 0.0611, -0.0573, -0.0114,  ...,  0.0309, -0.0080, -0.0067]],
       requires_grad=True)


grad : tensor([[-3.4733e-02,  3.0867e-02,  3.2351e-02,  ...,  4.5061e-02,
          4.5378e-02,  5.0161e-02],
        [-6.7546e-08, -1.5124e-07, -3.6441e-07,  ..., -8.6993e-07,
         -6.3536e-07, -1.1110e-06],
        [-4.2012e-03,  8.0197e-03,  8.9579e-03,  ...,  1.5647e-02,
          3.2069e-03,  1.1460e-02],
        ...,
        [-2.8559e-06, -6.0911e-06, -1.3630e-05,  ..., -3.2068e-05,
         -1.9547e-05, -3.7257e-05],
        [ 1.4774e-02, -9.3768e-03, -5.2261e-03,  ..., -1.0312e-03,
       

In [12]:
# Apply one Adam update using the gradients produced by backward(). This
# changes the weights; the stored .grad tensors are left as they are.
optimizer.step()

In [13]:
# Snapshot after optimizer.step(): the weights have moved, but .grad still
# holds the same gradient because nothing has cleared it yet.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0318,  0.0582,  0.0279,  ...,  0.0282,  0.0271,  0.0090],
        [-0.0033,  0.0029,  0.0066,  ..., -0.0365, -0.0091, -0.0060],
        [ 0.0476, -0.0004,  0.0529,  ...,  0.0485,  0.0416,  0.0204],
        ...,
        [ 0.0268, -0.0072,  0.0420,  ..., -0.0593,  0.0061, -0.0041],
        [ 0.0255, -0.0173,  0.0167,  ..., -0.0501, -0.0558, -0.0185],
        [ 0.0601, -0.0563, -0.0104,  ...,  0.0319, -0.0070, -0.0077]],
       requires_grad=True)


grad : tensor([[-3.4733e-02,  3.0867e-02,  3.2351e-02,  ...,  4.5061e-02,
          4.5378e-02,  5.0161e-02],
        [-6.7546e-08, -1.5124e-07, -3.6441e-07,  ..., -8.6993e-07,
         -6.3536e-07, -1.1110e-06],
        [-4.2012e-03,  8.0197e-03,  8.9579e-03,  ...,  1.5647e-02,
          3.2069e-03,  1.1460e-02],
        ...,
        [-2.8559e-06, -6.0911e-06, -1.3630e-05,  ..., -3.2068e-05,
         -1.9547e-05, -3.7257e-05],
        [ 1.4774e-02, -9.3768e-03, -5.2261e-03,  ..., -1.0312e-03,
       

### 清空 gradient

In [14]:
# Clear the gradients of every parameter managed by the optimizer so the next
# backward() does not accumulate on top of this step's values.
# NOTE(review): recent PyTorch releases default to set_to_none=True, in which
# case .grad becomes None instead of the zero tensor printed below — confirm
# against the installed version.
optimizer.zero_grad()

In [15]:
# Snapshot after optimizer.zero_grad(): the weights keep their updated values
# and the gradient has been cleared.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0318,  0.0582,  0.0279,  ...,  0.0282,  0.0271,  0.0090],
        [-0.0033,  0.0029,  0.0066,  ..., -0.0365, -0.0091, -0.0060],
        [ 0.0476, -0.0004,  0.0529,  ...,  0.0485,  0.0416,  0.0204],
        ...,
        [ 0.0268, -0.0072,  0.0420,  ..., -0.0593,  0.0061, -0.0041],
        [ 0.0255, -0.0173,  0.0167,  ..., -0.0501, -0.0558, -0.0185],
        [ 0.0601, -0.0563, -0.0104,  ...,  0.0319, -0.0070, -0.0077]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
