In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np

## nn.Module
* Base class for all neural network modules
* 只要在nn.Module的子類中定義了forward函數，backward函數就會被自動實現（利用Autograd）
* nn.Conv2d 本身也是 nn.Module 的子類別(此時我們可以先不用理解 nn.Conv2d 做了什麼，只需了解其包含一些參數與操作)

In [68]:
class Model(nn.Module):
    """Two stacked 5x5 convolutions, each followed by a ReLU.

    conv1 maps 1 input channel to 20 channels; conv2 maps 20 channels to 20.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=20, kernel_size=5)

    def forward(self, x):
        """Apply conv1 -> ReLU -> conv2 -> ReLU to `x` and return the result."""
        hidden = F.relu(self.conv1(x))
        activated = F.relu(self.conv2(hidden))
        return activated

In [69]:
# Instantiate the Model class defined in the previous cell.
model = Model()

### 實踐 forward propagation 
* 為什麼不應該直接call model.forward : https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690

In [70]:
# Random input tensor of shape (1, 1, 124, 124).
input_ = torch.randn(1,1,124,124)
# Call the module object itself (rather than model.forward) to run the forward pass.
output = model(input_)

### 查看 model 底下的 modules

#### .modules

* model.modules() 會遞迴地列出所有的 modules (包含 model 本身)

In [71]:
# modules() yields the model itself first, then every nested sub-module.
for module in model.modules():
    print(module)

Model(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))
)
Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))


#### .children

* model.children 只列出第一層的子 modules

In [72]:
# children() yields only the direct sub-modules (here conv1 and conv2).
for module in model.children():
    print(module)

Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))


### 查看 model 內的 parameters (torch.nn.parameter.Parameter)

#### .named_parameters
* named_parameters會列出每個nn.Module底下parameters 的名字,數值
* 同時可以查看 requires_grad是否開啟(for backpropagation)

In [73]:
# Print each parameter's qualified name and whether autograd tracks it.
for name, param in model.named_parameters():
    print(name,param.requires_grad)
    # To change the flag, assign param.requires_grad = True/False here.

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


#### .parameters
* 不會印出名字

In [79]:
# parameters() yields the same Parameter objects as named_parameters(), without names.
for param in model.parameters():
    print(type(param),param.shape, param.requires_grad)

<class 'torch.nn.parameter.Parameter'> torch.Size([20, 1, 5, 5]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20, 20, 5, 5]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20]) True


#### 計算模型可訓練參數總量

In [80]:
# Count the trainable parameters of `model`.
# numel() returns a plain Python int for every tensor shape, including 0-dim
# parameters (np.prod of an empty size would be the float 1.0).
# A list (instead of a one-shot filter iterator) keeps `model_parameters` reusable.
model_parameters = [p for p in model.parameters() if p.requires_grad]
params = sum(p.numel() for p in model_parameters)
print('總共參數量：', params)

總共參數量： 10540


### Backpropagation

In [81]:
# Fresh forward pass; its result is used for the backward() call further below.
input_ = torch.randn(1,1,124,124)
output = model(input_)

#### 確認 requires_grad為 True (default 就是 True)

In [82]:
# Confirm that every parameter currently has requires_grad enabled.
for name, param in model.named_parameters():
    print(name,param.requires_grad)

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


#### 此時還沒做backpropagation，parameters沒有gradient value

In [18]:
# No backward pass has run yet, so .grad is still None.
print(model.conv1.weight.grad)

None


#### 執行backward，完成後就能看到每個parameters底下的gradient value

In [19]:
# Reduce the output to a scalar, then backpropagate to fill the parameters' .grad.
output.sum().backward()

In [20]:
# After backward(), .grad holds the gradient tensor for conv1.weight.
print(model.conv1.weight.grad)

tensor([[[[ 1.6210e+02,  5.1225e+02, -2.1730e+02, -4.5927e+01,  7.6519e+02],
          [-8.3639e+02,  1.4350e+03, -4.9346e+02,  1.0653e+03, -9.7021e+02],
          [-1.4194e+03,  7.7866e+02,  1.3832e+03, -1.1798e+03,  2.2985e+02],
          [-1.0582e+03, -7.7341e+02,  2.5269e+02,  5.5109e+02,  1.4927e+02],
          [ 3.2955e+02,  1.3545e+02,  1.7758e+01,  2.4372e+02, -1.0246e+03]]],


        [[[-2.7771e+02,  2.8576e+02,  2.7087e+02, -2.3264e+02,  2.6199e+02],
          [ 3.0646e+02,  2.8792e+02,  2.1235e+02, -5.2904e+02,  1.9815e+02],
          [-4.4810e+02,  2.1456e+02,  1.7090e+02, -1.6141e+02, -2.8268e+02],
          [ 1.2011e+02, -4.2804e+02,  8.5065e+01,  3.2301e+02, -2.2653e+02],
          [ 4.1278e+02,  1.6778e+01, -2.2901e+02, -8.4452e+01, -1.4430e+02]]],


        [[[ 1.6949e+01, -5.0699e+02, -3.0705e+02, -2.8467e+02, -4.0673e+02],
          [-1.4856e+01,  3.6041e+02, -1.6105e+02,  2.8540e+02, -2.9854e+02],
          [-1.8193e+02, -7.3799e+02, -3.7101e+02, -2.9654e+02, -7.70

#### 當我們把 parameters 的 requires_grad關閉時，就無法成功的完成backward
* 什麼時候會關閉 requires_grad？ prediction (inference) 的階段
* 設定 requires_grad = True 是為了之後要做 backpropagation。在計算每個 parameter 的 gradient 時，我們必須在 forward propagation 時保留額外的訊息(根據 chain rule)，這會導致記憶體使用量上升與計算速度下降。然而只有在 training 階段才需要做 backpropagation；在 prediction (inference) 的階段，我們可以設定 requires_grad = False 來提升速度與降低記憶體使用量

In [21]:
# Turn off gradient tracking for every parameter of the model.
for param in model.parameters():
    param.requires_grad = False

In [22]:
# Forward pass with the current requires_grad settings of the parameters.
input_ = torch.randn(1,1,124,124)
output = model(input_)

In [23]:
# backward() is expected to fail here because `output` was computed without
# gradient tracking. Catch the error and print it so the demonstration is
# still visible and "Restart & Run All" can continue past this cell.
try:
    output.sum().backward()
except RuntimeError as err:
    print("RuntimeError:", err)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

#### with torch.no_grad()
* 在 with 區塊內的運算不會被 autograd 追蹤，運算結果的 requires_grad 都會是 False (參數本身的 requires_grad 設定不會被改變)

In [38]:
# Re-enable gradient tracking on the parameters first, so that torch.no_grad()
# alone is responsible for the failure demonstrated below.
for param in model.parameters():
    param.requires_grad = True

with torch.no_grad():
    # Operations inside this block are not recorded by autograd.
    input_ = torch.randn(1,1,124,124)
    output = model(input_)
    try:
        output.sum().backward()
    except RuntimeError as err:
        # Expected failure: print it instead of aborting "Restart & Run All".
        print("RuntimeError:", err)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

### 讓我們自行搭建一個 nn.Module 並試算gradient

In [39]:
class Model(nn.Module):
    """Toy module with two scalar learnable parameters, for checking gradients by hand.

    Writing the forward input as w, forward computes w * x**2 + w * y + w,
    where x and y are the parameters (initialised to 2.4 and 4.3).
    """

    def __init__(self):
        super().__init__()
        self.x = nn.Parameter(torch.tensor(2.4, dtype=torch.float32))
        self.y = nn.Parameter(torch.tensor(4.3, dtype=torch.float32))

    def forward(self, x):
        """Return x * self.x**2 + x * self.y + x for the input `x`."""
        # The argument `x` is the input w; self.x and self.y are the parameters.
        quadratic = x * self.x ** 2
        linear = x * self.y
        return quadratic + linear + x

In [40]:
model = Model()
input_ = torch.tensor(1.3, dtype=torch.float32)
output = model(input_)
output.backward()

# With input w = 1.3: d(output)/d(self.x) = 2 * w * self.x = 2 * 1.3 * 2.4 = 6.24
grad_x = model.x.grad
# d(output)/d(self.y) = w = 1.3
grad_y = model.y.grad

print('self.x 的 gradient : {}'.format(grad_x))
print('self.y 的 gradient : {}'.format(grad_y))

self.x 的 gradient : 6.240000247955322
self.y 的 gradient : 1.2999999523162842


## Sequential
* nn.Module 的容器

In [88]:
# Conv -> BatchNorm -> LeakyReLU block. The convolution is built without a bias term;
# the BatchNorm2d that follows it has its own weight and bias parameters.
layer = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=20, kernel_size=3, stride=1, padding=1, bias=False),
    nn.BatchNorm2d(num_features=20),
    nn.LeakyReLU(inplace=True),
)

In [42]:
# Parameters inside an unnamed Sequential are prefixed with the sub-module's index.
for name, param in layer.named_parameters():
    print(name,param.requires_grad)
    # To change the flag, assign param.requires_grad = True/False here.

0.weight True
1.weight True
1.bias True


In [43]:
# Random input of shape (1, 3, 124, 124), passed through the Sequential container.
input_ = torch.randn(1, 3, 124, 124)
output = layer(input_)

#### OrderedDict+Sequential, 讓我們替每一個module命名

In [44]:
from collections import OrderedDict

In [45]:
# Build the name -> module mapping step by step; insertion order is the execution order.
named_layers = OrderedDict()
named_layers['conv1'] = nn.Conv2d(1, 20, 5)
named_layers['relu1'] = nn.ReLU()
named_layers['conv2'] = nn.Conv2d(20, 64, 5)
named_layers['relu2'] = nn.ReLU()

# Passing an OrderedDict gives every sub-module its key as its name instead of an index.
layer = nn.Sequential(named_layers)


In [46]:
# modules() yields the Sequential container itself first, then each named sub-module.
for module in layer.modules():
    print(module)

Sequential(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (conv2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
)
Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
ReLU()
Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
ReLU()


In [47]:
# Parameter names now use the keys given in the OrderedDict (conv1, conv2).
for name, param in layer.named_parameters():
    print(name,param.requires_grad)
    # To change the flag, assign param.requires_grad = True/False here.

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


In [48]:
# Each unpadded 5x5 convolution trims 4 pixels per spatial dimension: 124 -> 120 -> 116.
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


#### append 新的 module到 sequential上

In [49]:
# Collect the sub-modules in an ordinary Python list first
# (torch.nn is already imported as `nn` in the notebook's first cell).
modules = []
modules.append(nn.Conv2d(1,20,5))
modules.append(nn.ReLU())
modules.append(nn.Conv2d(20,64,5))
modules.append(nn.ReLU())

# Unpack the list so that each module becomes one positional argument of Sequential.
layer = nn.Sequential(*modules)

In [50]:
# Bare expression: the notebook displays the container's repr.
layer

Sequential(
  (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (3): ReLU()
)

In [51]:
# Run the Sequential on a random (1, 1, 124, 124) input and show the output shape.
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 另一種方式

In [52]:
# Start from an empty container and register each sub-module under an explicit name.
layer = nn.Sequential()
for layer_name, sublayer in (
    ("conv1", nn.Conv2d(1, 20, 5)),
    ("relu1", nn.ReLU()),
    ("conv2", nn.Conv2d(20, 64, 5)),
    ("relu2", nn.ReLU()),
):
    layer.add_module(layer_name, sublayer)

In [53]:
# Bare expression: the notebook displays the container's repr with the given names.
layer

Sequential(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (conv2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
)

In [54]:
# Run the named Sequential on a random (1, 1, 124, 124) input and show the output shape.
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


## ModuleList
* 操作就像是python list, 但其內的module, parameters是可以被追蹤的

In [55]:
# Fill an empty ModuleList one module at a time, just like a Python list.
layer = nn.ModuleList()
for sublayer in (nn.Conv2d(1, 20, 5), nn.ReLU(), nn.Conv2d(20, 64, 5), nn.ReLU()):
    layer.append(sublayer)
# Bare expression so the notebook displays the resulting ModuleList.
layer

ModuleList(
  (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (3): ReLU()
)

In [56]:
# A ModuleList has no forward() of its own, so chain its modules by hand.
input_ = torch.randn(1, 1, 124, 124)
# Seed the chain with the input so no special case is needed for the first module
# (and `_`, which IPython uses for the last cell output, is left untouched).
output = input_
for module in layer:
    output = module(output)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 可以追蹤是什麼意思？ nn.Module有辦法去獲取ModuleList裡面的資訊

In [57]:
class Model(nn.Module):
    """Conv -> ReLU -> Conv -> ReLU stack whose layers are held in an nn.ModuleList."""

    def __init__(self):
        super().__init__()
        # Keeping the layers in a ModuleList registers them as sub-modules of this model.
        self.layer = nn.ModuleList([
            nn.Conv2d(1, 20, 5),
            nn.ReLU(),
            nn.Conv2d(20, 64, 5),
            nn.ReLU(),
        ])

    def forward(self, x):
        """Feed `x` through every module of self.layer in order and return the result."""
        out = x
        for stage in self.layer:
            out = stage(out)
        return out

In [58]:
# Instantiate the Model class defined in the previous cell.
model = Model()

In [59]:
# Bare expression: the notebook displays the model's repr, including its sub-modules.
model

Model(
  (layer): ModuleList(
    (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
    (3): ReLU()
  )
)

In [60]:
# Run the model on a random (1, 1, 124, 124) input and show the output shape.
input_ = torch.randn(1, 1, 124, 124)
output = model(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 如果是一般的 python list：裡面的 module 不會被註冊到 nn.Module 底下，因此印出 model 時看不到任何子 module

In [62]:
class Model(nn.Module):
    """Counter-example: the layers are stored in a plain Python list.

    nn.Module does not register modules held in an ordinary list, so they are
    missing from repr(model), model.children() and model.parameters().
    The forward pass itself still works.
    """

    def __init__(self):
        super(Model, self).__init__()
        # Deliberately a plain list (not nn.ModuleList) to show what gets lost.
        self.layer = []
        self.layer.append(nn.Conv2d(1,20,5))
        self.layer.append(nn.ReLU())
        self.layer.append(nn.Conv2d(20,64,5))
        self.layer.append(nn.ReLU())

    def forward(self, x):
        """Feed `x` through every module in self.layer in order and return the result."""
        for module in self.layer:
            x = module(x)
        # Return the computed tensor; a bare `return` would hand back None.
        return x

In [63]:
# Instantiate the Model class defined in the previous cell.
model = Model()

In [64]:
# Bare expression: the notebook displays the model's repr.
model

Model()