## PyTorch: モデルのパラメータ初期化についてのノートブック

In [418]:
import torch
import torch.nn as nn

import numpy as np
import matplotlib.pyplot as plt

from torchinfo import summary

### サンプルモデルの定義

In [419]:
class SampleModel(nn.Module):
    """Small feed-forward network: Linear(4 -> 2), ReLU, Linear(2 -> 1)."""

    def __init__(self):
        super().__init__()

        # Layers are created in forward order; attribute names become the
        # prefixes of the state_dict keys (l1.*, l2.*).
        self.l1 = nn.Linear(4, 2)
        self.f1 = nn.ReLU()
        self.l2 = nn.Linear(2, 1)

    def forward(self, x):
        """Apply l1, the ReLU activation and l2 to ``x`` and return the result."""
        hidden = self.f1(self.l1(x))
        return self.l2(hidden)

In [420]:
# Instantiate the sample network; no explicit initialisation is applied here.
model = SampleModel()

* 全結合層 (nn.Linear) のパラメータ数
    * 重み (weight): 入力数 × 出力数
    * バイアス (bias): 出力数
    * 例: `nn.Linear(4, 2)` は 4×2 + 2 = 10、`nn.Linear(2, 1)` は 2×1 + 1 = 3


In [421]:
# Show the per-layer parameter counts reported by torchinfo.
summary(model)

Layer (type:depth-idx)                   Param #
SampleModel                              --
├─Linear: 1-1                            10
├─ReLU: 1-2                              --
├─Linear: 1-3                            3
Total params: 13
Trainable params: 13
Non-trainable params: 0

In [422]:
# Print every parameter of the model together with its Python type.
for params in model.parameters():
    print(params, type(params))

Parameter containing:
tensor([[-0.1290,  0.2064, -0.2678, -0.1653],
        [-0.2526, -0.3821,  0.4273, -0.3591]], requires_grad=True) <class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([ 0.0835, -0.2762], requires_grad=True) <class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([[-0.5207,  0.6373]], requires_grad=True) <class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2507], requires_grad=True) <class 'torch.nn.parameter.Parameter'>


## パラメータを学習させたくない場合


In [423]:
class LinearRegression(nn.Module):
    """Single-feature linear model ``y = w * x + b`` built on nn.Linear(1, 1)."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(1, 1)

    def forward(self, x):
        """Return the output of the linear layer applied to ``x``."""
        return self.layer(x)

def loss_fn(outputs, targets):
    """Return the mean squared error between ``outputs`` and ``targets``."""
    # Functional form of nn.MSELoss() with its default (mean) reduction.
    return nn.functional.mse_loss(outputs, targets)

In [424]:
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook also
# runs on machines without one (a hard-coded "cuda:0" fails there).
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = LinearRegression()
# Plain SGD over all parameters of the model.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [425]:
# No seed is set, so the initial values differ on every run --
# Keep the initial weight and bias as Python floats so they can be compared
# with the values after training.
a_before = model.state_dict()["layer.weight"].item()
b_before = model.state_dict()["layer.bias"].item()
model.state_dict()

OrderedDict([('layer.weight', tensor([[-0.0902]])),
             ('layer.bias', tensor([0.0966]))])

### `requires_grad = False` でパラメータを freeze できる

In [426]:
# Freeze the linear layer's weight (the first entry of model.parameters())
# so that no gradient is computed for it.
model.layer.weight.requires_grad = False

### 自分で指定した初期値を使いたい

In [427]:
# # This does not work: both lines only modify a temporary list / dict,
# # not the tensor stored inside the model --
# list(model.parameters())[0] = 2
# model.state_dict()["layer.weight"] = 3.33333

In [428]:
# Use torch.nn.init to overwrite the weight in place with samples drawn
# from a normal distribution (mean=0.0, std=0.02).
# The values can still be replaced after the parameter has been frozen --
nn.init.normal_(model.layer.weight, mean=0.0, std=0.02)

Parameter containing:
tensor([[0.0093]])

In [429]:
# Inspect the current weight and bias.
model.state_dict()

OrderedDict([('layer.weight', tensor([[0.0093]])),
             ('layer.bias', tensor([0.0966]))])

In [430]:
# Move the model to the target device before feeding it device-resident batches.
model.to(device)

# Synthetic data for y = a*x + b, with x drawn uniformly from [-1, 1).
n = 1000
x = torch.rand(n)*2 -1
a, b = 2.0, -10.0
y = a*x + b

# Add small Gaussian noise to both the inputs and the targets.
x = x + torch.randn(n)*0.02
y = y + a*torch.randn(n)*0.02

x = x.to(device)
y = y.to(device)

bs = 10        # mini-batch size
niter = 1000   # number of SGD steps
losses = []    # per-iteration loss values, stored as Python floats

for iiter in range(niter):

    # Draw bs distinct sample indices and reshape the batch to (bs, 1),
    # the layout expected by nn.Linear(1, 1).
    r = np.random.choice(n, bs, replace=False)
    bx = x[r].reshape(-1, 1)
    by = y[r].reshape(-1, 1)

    y_ = model(bx)
    # loss_fn is declared as (outputs, targets): predictions go first.
    loss = loss_fn(y_, by)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # .item() converts the 0-dim loss tensor to a float, so the history does
    # not keep device tensors (and their autograd metadata) alive.
    loss_value = loss.item()
    if iiter%100 == 0:
        print(f"iiter : {iiter}, loss = {loss_value:.6f}")
    losses.append(loss_value)

iiter : 0, loss = 108.496643
iiter : 100, loss = 4.338649
iiter : 200, loss = 2.257839
iiter : 300, loss = 1.533721
iiter : 400, loss = 1.879045
iiter : 500, loss = 1.950571
iiter : 600, loss = 1.232354
iiter : 700, loss = 1.624249
iiter : 800, loss = 1.659364
iiter : 900, loss = 1.304621


In [431]:
# Read the weight and bias back as Python floats after the training loop.
a_after = model.state_dict()["layer.weight"].item()
b_after = model.state_dict()["layer.bias"].item()
model.state_dict()

OrderedDict([('layer.weight', tensor([[0.0093]], device='cuda:0')),
             ('layer.bias', tensor([-9.9659], device='cuda:0'))])

In [432]:
# Print each parameter's value before and after training (3 decimal places).
print(f"a_ ... {a_before:.3f} -->> {a_after:.3f}")
print(f"b_ ... {b_before:.3f} -->> {b_after:.3f}")

a_ ... -0.090 -->> 0.009
b_ ... 0.097 -->> -9.966


### 多層になった場合、狙ったパラメータをどうやって取り出すか

In [433]:
class TwoLayerModel(nn.Module):
    """Two stacked linear layers, Linear(4 -> 2) then Linear(2 -> 1), with no activation."""

    def __init__(self):
        super().__init__()

        # Attribute names become the prefixes of the state_dict keys (l1.*, l2.*).
        self.l1 = nn.Linear(4, 2)
        self.l2 = nn.Linear(2, 1)

    def forward(self, x):
        """Feed ``x`` through l1 and then l2 and return the result."""
        return self.l2(self.l1(x))

In [434]:
# Rebind `model` to a fresh TwoLayerModel instance.
model = TwoLayerModel()

In [435]:
# Show the per-layer parameter counts reported by torchinfo.
summary(model)

Layer (type:depth-idx)                   Param #
TwoLayerModel                            --
├─Linear: 1-1                            10
├─Linear: 1-2                            3
Total params: 13
Trainable params: 13
Non-trainable params: 0

In [436]:
# The keys come back as a flat (one-dimensional) sequence of dotted names --
model.state_dict().keys()

odict_keys(['l1.weight', 'l1.bias', 'l2.weight', 'l2.bias'])

In [437]:
# Display every parameter tensor keyed by its dotted name.
model.state_dict()

OrderedDict([('l1.weight',
              tensor([[-0.3663, -0.3768, -0.0243, -0.2024],
                      [ 0.3312,  0.4486,  0.2241, -0.1547]])),
             ('l1.bias', tensor([ 0.1602, -0.3296])),
             ('l2.weight', tensor([[ 0.4001, -0.3678]])),
             ('l2.bias', tensor([-0.3451]))])

In [439]:
# model.parameters() yields the parameter tensors themselves, without names.
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[-0.3663, -0.3768, -0.0243, -0.2024],
        [ 0.3312,  0.4486,  0.2241, -0.1547]], requires_grad=True)
Parameter containing:
tensor([ 0.1602, -0.3296], requires_grad=True)
Parameter containing:
tensor([[ 0.4001, -0.3678]], requires_grad=True)
Parameter containing:
tensor([-0.3451], requires_grad=True)


In [440]:
from transformers import AutoModel
# Name of the pretrained checkpoint passed to AutoModel.from_pretrained.
model_name = "cl-tohoku/bert-base-japanese"

In [441]:
# Load the pretrained model for the checkpoint named by model_name.
m1 = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [450]:
# Shape of the dummy input torchinfo feeds through the model
# (presumably (batch, sequence length) — see the output shapes in the summary).
input_size = (1, 32)
# NOTE(review): two dtypes are listed for a single input_size — confirm how
# torchinfo pairs them and whether the second entry is used at all.
dtypes = [torch.int, torch.long]

# depth=7 sets how many nesting levels of submodules are listed.
summary(m1, input_size=input_size, dtypes=dtypes, depth=7)

Layer (type:depth-idx)                             Output Shape              Param #
BertModel                                          [1, 768]                  --
├─BertEmbeddings: 1-1                              [1, 32, 768]              --
│    └─Embedding: 2-1                              [1, 32, 768]              24,576,000
│    └─Embedding: 2-2                              [1, 32, 768]              1,536
│    └─Embedding: 2-3                              [1, 32, 768]              393,216
│    └─LayerNorm: 2-4                              [1, 32, 768]              1,536
│    └─Dropout: 2-5                                [1, 32, 768]              --
├─BertEncoder: 1-2                                 [1, 32, 768]              --
│    └─ModuleList: 2-6                             --                        --
│    │    └─BertLayer: 3-1                         [1, 32, 768]              2,361,600
│    │    │    └─BertAttention: 4-1                [1, 32, 768]              --
│    │   

In [446]:
# Number of entries in the model's state dict.
len(m1.state_dict().keys())

200

In [447]:
# List every key of the model's state dict.
m1.state_dict().keys()

odict_keys(['embeddings.position_ids', 'embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.laye

### そもそもの疑問: BERT の pooler 層は本当に通っていないのか？