In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Linear layer followed by a tail that depends on the block's role.

    The submodules are stored in ``self.linear`` (an ``nn.Sequential``):

    * hidden block (``is_output=False``):
      Linear -> Dropout -> BatchNorm1d -> LeakyReLU
    * output block with a single unit (``is_output=True``, ``out_channels == 1``):
      Linear -> Sigmoid
    * output block with several units (``is_output=True``):
      Linear -> Softmax over dim 1

    Args:
        in_channels: Number of input features of the Linear layer.
        out_channels: Number of output features of the Linear layer.
        bias: Whether the Linear layer has a bias term.
        dropout: Drop probability; only used by hidden blocks.
        is_output: Selects the output tail (Sigmoid/Softmax) instead of the
            hidden tail.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()

        # Every variant starts with the same Linear layer at index 0.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]

        if not is_output:
            stages += [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            stages.append(nn.Sigmoid())
        else:
            stages.append(nn.Softmax(dim=1))

        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the Linear layer and its tail."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """MLP made of three hidden LinearBNAC blocks and one output block.

    Feature widths: input_dimention -> 128 -> 64 -> 32 -> output_classes.
    The last block is built with ``is_output=True``, so its activation is
    chosen by LinearBNAC based on ``output_classes``.

    Args:
        input_dimention: Number of features in each input row.
        output_classes: Width of the output block (defaults to 1).
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        self.layer1 = LinearBNAC(input_dimention, 128)
        self.layer2 = LinearBNAC(128, 64)
        self.layer3 = LinearBNAC(64, 32)
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Feed ``x`` through layer1, layer2, layer3 and output, in that order."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [17]:
# 10-class classifier over 256-feature inputs.
model = Model(input_dimention=256, output_classes=10)

# Adam over all model parameters, with learning rate and weight decay both 1e-3.
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-3,
)

In [18]:
batch_size = 4
input_features = 256

# Random stand-in for a mini-batch, shape (batch_size, input_features).
dummy_input = torch.randn(batch_size, input_features)

# Class-index labels, one per row of dummy_input. They are written as integer
# literals because the tensor is torch.long; float literals such as `9.` would
# have to be converted implicitly.
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

  target = torch.tensor([9., 5., 4., 4.], dtype=torch.long)


In [19]:
# Forward pass on the dummy batch. The model's last block was built with
# is_output=True and 10 classes, so it ends in Softmax(dim=1): each row of
# `output` holds class probabilities, not raw logits.
output = model(dummy_input)
print(output)

tensor([[0.0873, 0.1630, 0.1030, 0.0581, 0.0688, 0.0928, 0.1172, 0.0794, 0.0998,
         0.1304],
        [0.0745, 0.1713, 0.1188, 0.0960, 0.0664, 0.1576, 0.0818, 0.0686, 0.0605,
         0.1045],
        [0.0921, 0.1022, 0.0569, 0.0800, 0.0331, 0.1429, 0.1450, 0.0895, 0.2009,
         0.0573],
        [0.1026, 0.1387, 0.0787, 0.0899, 0.0652, 0.0949, 0.1724, 0.0877, 0.0683,
         0.1017]], grad_fn=<SoftmaxBackward0>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟與原理需要相同

In [20]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [21]:
# Negative log-likelihood loss. It is fed log-probabilities in the next cell,
# because the model already applies Softmax itself.
criterion = NLLLoss()

In [22]:
# The model emits probabilities, so take their log before handing them to NLLLoss.
# Clamp to the smallest positive normal value first: if a softmax probability
# underflows to exactly 0, log(0) = -inf would make the loss infinite and the
# gradients NaN. Non-zero probabilities pass through the clamp unchanged.
loss = criterion(torch.log(output.clamp_min(torch.finfo(output.dtype).tiny)), target)

### 完成back propagation並更新梯度

In [23]:
# Back-propagate the loss; this fills the `.grad` of the model parameters
# (the next cell prints layer1's weight gradient).
loss.backward()

In [24]:
# State of the first Linear layer right after backward(): the weights are still
# the initial ones, and `.grad` now holds the freshly computed gradient.
print(f'weight : {model.layer1.linear[0].weight}')
print(end='\n\n')
print(f'grad : {model.layer1.linear[0].weight.grad}')

weight : Parameter containing:
tensor([[-0.0117, -0.0499,  0.0005,  ..., -0.0091, -0.0426,  0.0455],
        [ 0.0399,  0.0129, -0.0463,  ...,  0.0577, -0.0294, -0.0001],
        [-0.0487,  0.0520, -0.0123,  ..., -0.0055, -0.0191, -0.0345],
        ...,
        [-0.0213, -0.0605, -0.0242,  ...,  0.0586, -0.0527, -0.0565],
        [ 0.0481, -0.0156,  0.0054,  ..., -0.0406,  0.0202,  0.0330],
        [ 0.0326, -0.0622,  0.0541,  ..., -0.0419, -0.0377,  0.0122]],
       requires_grad=True)


grad : tensor([[ 1.9297e-03, -1.6583e-03, -1.1734e-03,  ...,  5.9098e-03,
         -1.8262e-03,  4.4991e-03],
        [-3.6286e-04, -2.6425e-04,  8.4127e-05,  ..., -1.8569e-04,
         -3.8107e-04, -3.9645e-05],
        [ 4.2613e-02,  1.7764e-01,  3.1588e-03,  ..., -1.0676e-01,
          1.3270e-01, -9.7836e-02],
        ...,
        [ 5.7389e-02,  4.1178e-02,  3.0519e-03,  ..., -3.5413e-02,
          1.1378e-01, -5.6627e-02],
        [-4.2246e-02, -3.7635e-02,  2.7219e-02,  ..., -2.3230e-02,
       

In [25]:
# Apply one Adam update to the model parameters using the gradients computed
# by backward().
optimizer.step()

In [26]:
# Same layer after optimizer.step(): the weights have moved, while `.grad` still
# holds the gradient from backward() (step() does not clear it).
print(f'weight : {model.layer1.linear[0].weight}')
print(end='\n\n')
print(f'grad : {model.layer1.linear[0].weight.grad}')

weight : Parameter containing:
tensor([[-0.0127, -0.0489,  0.0015,  ..., -0.0101, -0.0416,  0.0445],
        [ 0.0409,  0.0139, -0.0473,  ...,  0.0587, -0.0284,  0.0009],
        [-0.0497,  0.0510, -0.0133,  ..., -0.0045, -0.0201, -0.0335],
        ...,
        [-0.0223, -0.0615, -0.0252,  ...,  0.0596, -0.0537, -0.0555],
        [ 0.0491, -0.0146,  0.0044,  ..., -0.0396,  0.0212,  0.0340],
        [ 0.0316, -0.0632,  0.0531,  ..., -0.0409, -0.0387,  0.0132]],
       requires_grad=True)


grad : tensor([[ 1.9297e-03, -1.6583e-03, -1.1734e-03,  ...,  5.9098e-03,
         -1.8262e-03,  4.4991e-03],
        [-3.6286e-04, -2.6425e-04,  8.4127e-05,  ..., -1.8569e-04,
         -3.8107e-04, -3.9645e-05],
        [ 4.2613e-02,  1.7764e-01,  3.1588e-03,  ..., -1.0676e-01,
          1.3270e-01, -9.7836e-02],
        ...,
        [ 5.7389e-02,  4.1178e-02,  3.0519e-03,  ..., -3.5413e-02,
          1.1378e-01, -5.6627e-02],
        [-4.2246e-02, -3.7635e-02,  2.7219e-02,  ..., -2.3230e-02,
       

### 清空 gradient

In [27]:
# Clear the gradients so they do not accumulate into the next backward() call.
# NOTE(review): the recorded output below shows zero-filled grads; depending on
# the PyTorch version's `set_to_none` default, `.grad` may be None instead — confirm.
optimizer.zero_grad()

In [28]:
# Same layer after optimizer.zero_grad(): the weights keep their updated values
# and the gradient has been cleared.
print(f'weight : {model.layer1.linear[0].weight}')
print(end='\n\n')
print(f'grad : {model.layer1.linear[0].weight.grad}')

weight : Parameter containing:
tensor([[-0.0127, -0.0489,  0.0015,  ..., -0.0101, -0.0416,  0.0445],
        [ 0.0409,  0.0139, -0.0473,  ...,  0.0587, -0.0284,  0.0009],
        [-0.0497,  0.0510, -0.0133,  ..., -0.0045, -0.0201, -0.0335],
        ...,
        [-0.0223, -0.0615, -0.0252,  ...,  0.0596, -0.0537, -0.0555],
        [ 0.0491, -0.0146,  0.0044,  ..., -0.0396,  0.0212,  0.0340],
        [ 0.0316, -0.0632,  0.0531,  ..., -0.0409, -0.0387,  0.0132]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
