In [4]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [5]:
class LinearBNAC(nn.Module):
    """A fully connected layer followed by a role-dependent tail.

    The tail appended after ``nn.Linear`` depends on the arguments:

    * hidden layer (``is_output=False``): Dropout -> BatchNorm1d -> LeakyReLU
    * output layer with a single unit: Sigmoid
    * output layer with several units: Softmax over dimension 1

    Args:
        in_channels: Number of input features of the linear layer.
        out_channels: Number of output features of the linear layer.
        bias: Whether the linear layer has a bias term.
        dropout: Dropout probability; only used when ``is_output`` is False.
        is_output: Build an output head instead of a hidden block.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()
        # The linear projection is always the first stage of the pipeline.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages.extend([
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ])
        elif out_channels == 1:
            stages.append(nn.Sigmoid())
        else:
            stages.append(nn.Softmax(dim=1))
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the stored ``nn.Sequential`` pipeline to ``x`` and return the result."""
        return self.linear(x)

In [6]:
class Model(nn.Module):
    """Fully connected network: three hidden LinearBNAC stages and one output stage.

    Args:
        input_dimention: Number of features in each input sample.
        output_classes: Width of the output stage; it is passed to LinearBNAC
            together with ``is_output=True``.
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        # Hidden widths shrink 128 -> 64 -> 32; the stages are registered under
        # the attribute names layer1, layer2 and layer3.
        widths = (input_dimention, 128, 64, 32)
        for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, 'layer{}'.format(index), LinearBNAC(fan_in, fan_out))
        self.output = LinearBNAC(widths[-1], output_classes, is_output=True)

    def forward(self, x):
        """Pass ``x`` through layer1, layer2, layer3 and the output stage, in order."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [7]:
# Classifier over 256-feature inputs with a 10-way output head.
model = Model(256, output_classes=10)
# Adam over every parameter of the model, with weight decay enabled.
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [8]:
batch_size = 4
input_features = 256
# Random stand-in for a real mini-batch, shaped (batch_size, input_features).
dummy_input = torch.randn(batch_size, input_features)

# One class index per sample, written as integers because the tensor is
# created with dtype=torch.long.
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)
# Fail early if batch_size is changed without updating the hard-coded labels.
assert target.shape == (batch_size,), "target must hold exactly one label per sample"

In [9]:
# Forward pass on the dummy batch. No model.eval() call appears before this
# point, so the module runs in its default training mode.
output = model(dummy_input)
# Show the Softmax output of the model for each sample in the batch.
print(output)

tensor([[0.0767, 0.1559, 0.0704, 0.0837, 0.0898, 0.1215, 0.1144, 0.0999, 0.0863,
         0.1013],
        [0.0882, 0.0661, 0.0965, 0.1918, 0.0649, 0.1566, 0.0680, 0.0966, 0.0781,
         0.0933],
        [0.0951, 0.1426, 0.0670, 0.1848, 0.0609, 0.1000, 0.1120, 0.0876, 0.0503,
         0.0996],
        [0.1500, 0.0733, 0.1128, 0.1429, 0.0537, 0.0859, 0.1112, 0.0902, 0.1083,
         0.0716]], grad_fn=<SoftmaxBackward0>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟與原理需要相同
* 如果欲使用 LogSoftmax（搭配 NLLLoss）或 CrossEntropyLoss（直接輸入未經 softmax 的 logits），可以將 nn.Softmax 從模型中移除

In [10]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [11]:
# Negative log-likelihood loss, averaged over the batch ('mean' is the default reduction).
criterion = NLLLoss(reduction='mean')

In [12]:
# NLLLoss consumes log-probabilities, while the model emits Softmax
# probabilities, so take the log first. Clamp to the smallest positive normal
# float so that a probability which underflowed to 0 gives a large finite loss
# instead of log(0) = -inf, which would make the gradients non-finite.
log_probs = torch.log(output.clamp_min(torch.finfo(output.dtype).tiny))
loss = criterion(log_probs, target)

### 完成 back propagation 並以梯度更新權重

In [13]:
# Backpropagate from the scalar loss; this fills in the .grad attribute of the
# model parameters (printed in the next cell).
loss.backward()

In [14]:
# Inspect the first nn.Linear of layer1 after backward(): its weight and its gradient.
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0446,  0.0082, -0.0173,  ...,  0.0399,  0.0332, -0.0362],
        [-0.0303,  0.0310,  0.0191,  ..., -0.0477, -0.0176, -0.0123],
        [-0.0099,  0.0209,  0.0314,  ..., -0.0114,  0.0314,  0.0276],
        ...,
        [ 0.0527, -0.0160, -0.0030,  ..., -0.0336, -0.0623, -0.0210],
        [-0.0526,  0.0045,  0.0158,  ...,  0.0139,  0.0176,  0.0476],
        [-0.0596, -0.0317, -0.0177,  ...,  0.0103, -0.0516, -0.0594]],
       requires_grad=True)


grad : tensor([[-0.0105, -0.0091,  0.0691,  ...,  0.0725,  0.0131,  0.0019],
        [ 0.0080,  0.0071,  0.0099,  ..., -0.0480, -0.0463, -0.0009],
        [ 0.0055,  0.0057, -0.0261,  ..., -0.0230,  0.0064, -0.0041],
        ...,
        [ 0.0356,  0.0198, -0.0116,  ..., -0.0248,  0.0284, -0.0326],
        [-0.0024, -0.0012,  0.0017,  ...,  0.0013, -0.0027,  0.0022],
        [-0.0021, -0.0122,  0.0096,  ...,  0.0365,  0.0034,  0.0033]])


In [15]:
# Apply one optimizer update to the parameters using the gradients stored by
# backward(). The next cell's output shows changed weights and unchanged gradients.
optimizer.step()

In [16]:
# Inspect the same layer after optimizer.step(): its weight and its gradient.
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0436,  0.0092, -0.0183,  ...,  0.0389,  0.0322, -0.0372],
        [-0.0313,  0.0300,  0.0181,  ..., -0.0467, -0.0166, -0.0113],
        [-0.0109,  0.0199,  0.0324,  ..., -0.0104,  0.0304,  0.0286],
        ...,
        [ 0.0517, -0.0170, -0.0020,  ..., -0.0326, -0.0633, -0.0200],
        [-0.0516,  0.0055,  0.0148,  ...,  0.0129,  0.0186,  0.0466],
        [-0.0586, -0.0307, -0.0187,  ...,  0.0093, -0.0526, -0.0604]],
       requires_grad=True)


grad : tensor([[-0.0105, -0.0091,  0.0691,  ...,  0.0725,  0.0131,  0.0019],
        [ 0.0080,  0.0071,  0.0099,  ..., -0.0480, -0.0463, -0.0009],
        [ 0.0055,  0.0057, -0.0261,  ..., -0.0230,  0.0064, -0.0041],
        ...,
        [ 0.0356,  0.0198, -0.0116,  ..., -0.0248,  0.0284, -0.0326],
        [-0.0024, -0.0012,  0.0017,  ...,  0.0013, -0.0027,  0.0022],
        [-0.0021, -0.0122,  0.0096,  ...,  0.0365,  0.0034,  0.0033]])


### 清空 gradient

In [17]:
# Clear the stored gradients so they do not accumulate into the next backward().
# In the recorded run below, .grad is None afterwards.
optimizer.zero_grad()

In [18]:
# Inspect the same layer after zero_grad(): its weight and its gradient.
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0436,  0.0092, -0.0183,  ...,  0.0389,  0.0322, -0.0372],
        [-0.0313,  0.0300,  0.0181,  ..., -0.0467, -0.0166, -0.0113],
        [-0.0109,  0.0199,  0.0324,  ..., -0.0104,  0.0304,  0.0286],
        ...,
        [ 0.0517, -0.0170, -0.0020,  ..., -0.0326, -0.0633, -0.0200],
        [-0.0516,  0.0055,  0.0148,  ...,  0.0129,  0.0186,  0.0466],
        [-0.0586, -0.0307, -0.0187,  ...,  0.0093, -0.0526, -0.0604]],
       requires_grad=True)


grad : None
