## pytorch, モデルのパラメータを初期化することについてのnotebook

In [722]:
import torch
import torch.nn as nn

import numpy as np
import matplotlib.pyplot as plt

from torchinfo import summary

### サンプルモデルの定義

In [723]:
class SampleModel(nn.Module):
    """Small example network: Linear(4, 2) -> ReLU -> Linear(2, 1)."""

    def __init__(self):
        super().__init__()

        # Registration order (l1, f1, l2) is kept so parameter ordering is unchanged.
        self.l1 = nn.Linear(4, 2)
        self.f1 = nn.ReLU()
        self.l2 = nn.Linear(2, 1)

    def forward(self, x):
        """Apply ``l1``, the ReLU ``f1`` and ``l2`` to ``x`` in that order."""
        hidden = self.f1(self.l1(x))
        return self.l2(hidden)

In [724]:
model = SampleModel()

* 全結合層(nn.Linear)のパラメータ数は
    * 重み(係数): 入力数×出力数
    * バイアス: 出力数
    * 例: nn.Linear(4, 2) は 4×2 + 2 = 10、nn.Linear(2, 1) は 2×1 + 1 = 3 (下のsummaryの出力と一致する)


In [725]:
summary(model)

Layer (type:depth-idx)                   Param #
SampleModel                              --
├─Linear: 1-1                            10
├─ReLU: 1-2                              --
├─Linear: 1-3                            3
Total params: 13
Trainable params: 13
Non-trainable params: 0

In [726]:
for params in model.parameters():
    print(params, type(params))

Parameter containing:
tensor([[ 0.1994,  0.1458, -0.1405,  0.3567],
        [ 0.0696,  0.4327,  0.3161,  0.3661]], requires_grad=True) <class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.1077, 0.0555], requires_grad=True) <class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([[-0.4441, -0.2937]], requires_grad=True) <class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.1949], requires_grad=True) <class 'torch.nn.parameter.Parameter'>


## パラメータを学習させたくない場合


In [727]:
class LinearRegression(nn.Module):
    """Single-feature linear model backed by one ``nn.Linear(1, 1)`` named ``layer``."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(1, 1)

    def forward(self, x):
        """Return ``self.layer(x)``."""
        return self.layer(x)

def loss_fn(outputs, targets):
    """Return the mean squared error between ``outputs`` and ``targets``."""
    # Functional form of nn.MSELoss() with its default mean reduction.
    return nn.functional.mse_loss(outputs, targets)

In [728]:
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the later `.to(device)` calls also work on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = LinearRegression()
# Plain SGD over every parameter of the model.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [729]:
# No RNG seed is set, so these initial values differ on every fresh run --
# Keep the initial weight/bias as Python floats for the before/after printout later on.
a_before = model.state_dict()["layer.weight"].item()
b_before = model.state_dict()["layer.bias"].item()
model.state_dict()

OrderedDict([('layer.weight', tensor([[0.6596]])),
             ('layer.bias', tensor([0.4982]))])

### これでfreezeできる

In [730]:
#list(model.parameters())[0].requires_grad = False

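### これはfreezeできない, なんでだ？
* `model.state_dict()` が返すのは `nn.parameter.Parameter` そのものではなく、別の `torch.Tensor` オブジェクト(下のセルの `type()` の結果を参照)
* そのため、このtensorの `requires_grad` を書き換えてもモデル側の `Parameter` には反映されない
* freezeするには `model.parameters()` / `model.named_parameters()` から取り出した `Parameter` に対して `requires_grad = False` を設定する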

In [731]:
model.state_dict()["layer.weight"].requires_grad = False

In [732]:
type(list(model.parameters())[0])

torch.nn.parameter.Parameter

In [733]:
type(model.state_dict()["layer.weight"])

torch.Tensor

### 自分で指定した初期値を使いたい

In [734]:
# # これは無理 --
# list(model.parameters())[0] = 2
# model.state_dict()["layer.weight"] = 3.33333

In [735]:
# Parameter values have to be overwritten in place through torch.nn.init.
# Per the original note, this also works after the parameter has been frozen --
nn.init.normal_(list(model.parameters())[0], mean=0.0, std=0.02)

Parameter containing:
tensor([[0.0150]], requires_grad=True)

In [736]:
# Going through state_dict() works as well.
# The returned tensor shares memory with the parameter, so an in-place init reaches the model.
nn.init.zeros_(model.state_dict()["layer.weight"])

tensor([[0.]])

In [737]:
model.state_dict()

OrderedDict([('layer.weight', tensor([[0.]])),
             ('layer.bias', tensor([0.4982]))])

In [738]:
model.to(device)

# Synthetic regression data: y = a*x + b with x uniform in [-1, 1),
# then small Gaussian noise added to both x and y.
n = 1000
x = torch.rand(n)*2 -1
a, b = 2.0, -10.0
y = a*x + b

x = x + torch.randn(n)*0.02
y = y + a*torch.randn(n)*0.02

x = x.to(device)
y = y.to(device)

bs = 10        # mini-batch size
niter = 1000   # number of SGD steps
losses = []    # per-step loss values, stored as plain Python floats

ta_ = model.state_dict()["layer.weight"].item()
tb_ = model.state_dict()["layer.bias"].item()
print(f"start from  :  a_ = {ta_:.3f}, b_ = {tb_:.3f}")

for iiter in range(niter):

    # Draw a mini-batch of distinct sample indices and shape it as (bs, 1).
    r = np.random.choice(n, bs, replace=False)
    bx = x[r].reshape(-1, 1)
    by = y[r].reshape(-1, 1)

    y_ = model(bx)
    # loss_fn is declared as (outputs, targets): pass the prediction first.
    loss = loss_fn(y_, by)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Convert to a float once so the stored history does not keep the
    # autograd graph (or device memory) alive and can be plotted directly.
    loss_value = loss.item()

    ta_ = model.state_dict()["layer.weight"].item()
    tb_ = model.state_dict()["layer.bias"].item()

    if iiter%100 == 0:
        print(f"iiter : {iiter}, loss = {loss_value:.6f} / a_ = {ta_:.3f}, b_ = {tb_:.3f}")
    losses.append(loss_value)

start from  :  a_ = 0.000, b_ = 0.498
iiter : 0, loss = 102.121338 / a_ = -0.032, b_ = 0.297
iiter : 100, loss = 2.736832 / a_ = 0.870, b_ = -8.645
iiter : 200, loss = 0.106869 / a_ = 1.388, b_ = -9.816
iiter : 300, loss = 0.033522 / a_ = 1.682, b_ = -9.974
iiter : 400, loss = 0.014352 / a_ = 1.837, b_ = -9.997
iiter : 500, loss = 0.006179 / a_ = 1.916, b_ = -9.999
iiter : 600, loss = 0.005357 / a_ = 1.956, b_ = -10.001
iiter : 700, loss = 0.002963 / a_ = 1.975, b_ = -10.004
iiter : 800, loss = 0.004255 / a_ = 1.985, b_ = -10.001
iiter : 900, loss = 0.004037 / a_ = 1.991, b_ = -10.001


In [739]:
a_after = model.state_dict()["layer.weight"].item()
b_after = model.state_dict()["layer.bias"].item()
model.state_dict()

OrderedDict([('layer.weight', tensor([[1.9940]], device='cuda:0')),
             ('layer.bias', tensor([-10.0002], device='cuda:0'))])

In [740]:
print(f"a_ ... {a_before:.3f} -->> {a_after:.3f}")
print(f"b_ ... {b_before:.3f} -->> {b_after:.3f}")

a_ ... 0.660 -->> 1.994
b_ ... 0.498 -->> -10.000


### 多層になった場合、狙ったところをどうやって引っ張るか --

In [741]:
class TwoLayerModel(nn.Module):
    """Two stacked linear layers with no activation: Linear(4, 2) -> Linear(2, 1)."""

    def __init__(self):
        super().__init__()

        # The attribute names become the state_dict key prefixes ("l1.*", "l2.*").
        self.l1 = nn.Linear(4, 2)
        self.l2 = nn.Linear(2, 1)

    def forward(self, x):
        """Feed ``x`` through ``l1`` and then ``l2``."""
        return self.l2(self.l1(x))

In [742]:
model = TwoLayerModel()

In [743]:
summary(model)

Layer (type:depth-idx)                   Param #
TwoLayerModel                            --
├─Linear: 1-1                            10
├─Linear: 1-2                            3
Total params: 13
Trainable params: 13
Non-trainable params: 0

In [744]:
# keysは一次元配置になる --
model.state_dict().keys()

odict_keys(['l1.weight', 'l1.bias', 'l2.weight', 'l2.bias'])

In [745]:
model.state_dict()

OrderedDict([('l1.weight',
              tensor([[ 0.4195, -0.3565,  0.2914,  0.4754],
                      [ 0.2004, -0.1163,  0.4769,  0.2522]])),
             ('l1.bias', tensor([-0.4128, -0.2074])),
             ('l2.weight', tensor([[0.5616, 0.0457]])),
             ('l2.bias', tensor([0.6327]))])

In [746]:
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.4195, -0.3565,  0.2914,  0.4754],
        [ 0.2004, -0.1163,  0.4769,  0.2522]], requires_grad=True)
Parameter containing:
tensor([-0.4128, -0.2074], requires_grad=True)
Parameter containing:
tensor([[0.5616, 0.0457]], requires_grad=True)
Parameter containing:
tensor([0.6327], requires_grad=True)


In [747]:
from transformers import AutoModel
model_name = "cl-tohoku/bert-base-japanese"

In [748]:
m1 = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [749]:
input_size = (1, 32)
dtypes = [torch.int, torch.long]

summary(m1, input_size=input_size, dtypes=dtypes, depth=3)

Layer (type:depth-idx)                             Output Shape              Param #
BertModel                                          [1, 768]                  --
├─BertEmbeddings: 1-1                              [1, 32, 768]              --
│    └─Embedding: 2-1                              [1, 32, 768]              24,576,000
│    └─Embedding: 2-2                              [1, 32, 768]              1,536
│    └─Embedding: 2-3                              [1, 32, 768]              393,216
│    └─LayerNorm: 2-4                              [1, 32, 768]              1,536
│    └─Dropout: 2-5                                [1, 32, 768]              --
├─BertEncoder: 1-2                                 [1, 32, 768]              --
│    └─ModuleList: 2-6                             --                        --
│    │    └─BertLayer: 3-1                         [1, 32, 768]              7,087,872
│    │    └─BertLayer: 3-32                        --                        (recursive)


In [750]:
len(m1.state_dict().keys())

200

In [751]:
m1.state_dict().keys()

odict_keys(['embeddings.position_ids', 'embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.laye

### そもそもの疑問として、output_hidden_statesはbert-poolerを本当に通ってないのか？という観点がある
* 結論: 思っていたとおりの挙動だった
    * pooler.dense.weightとpooler.dense.biasが影響するのはbase_output["pooler_output"]のみ
    * last_hidden_state -> (pooler) -> pooler_output の順に計算されるので、poolerのパラメータを変えてもlast_hidden_stateは変わらない

In [752]:
from transformers import T5Tokenizer

model_name = "rinna/japanese-roberta-base"
input_size = (32, 128)
dtypes = [torch.int, torch.long]

# NOTE(review): the rinna model card reportedly asks for a manual "[CLS]" prefix
# and explicit position_ids -- confirm before relying on these outputs.
tokenizer = T5Tokenizer.from_pretrained(model_name)
token = tokenizer.encode_plus("例えば君がいるだけで心が強くなれるよ")

# Build int64 tensors directly instead of going through a float32 torch.Tensor
# and casting afterwards; unsqueeze(0) adds the batch dimension.
input_ids = torch.tensor(token["input_ids"], dtype=torch.long).unsqueeze(0)
attention_mask = torch.tensor(token["attention_mask"], dtype=torch.long).unsqueeze(0)

In [753]:
m2 = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at rinna/japanese-roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at rinna/japanese-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to b

In [754]:
m2_base_output = m2(input_ids=input_ids, attention_mask=attention_mask)

In [755]:
m3 = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at rinna/japanese-roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at rinna/japanese-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to b

In [756]:
# Zero out the parameters that appear to produce pooler_output (the pooler's dense weight and bias) --
nn.init.zeros_(m3.state_dict()["pooler.dense.weight"])
nn.init.zeros_(m3.state_dict()["pooler.dense.bias"])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [757]:
m3_base_output = m3(input_ids=input_ids, attention_mask=attention_mask)

* まず、m3のpooler_outputは全部ゼロであってほしい

In [758]:
# ok --
m3_base_output["pooler_output"].sum()

tensor(0., grad_fn=<SumBackward0>)

* m2のpooler_outputは当然値を持つ

In [759]:
# ok --
m2_base_output["pooler_output"].sum()

tensor(5.0400, grad_fn=<SumBackward0>)

* m3もlast_hidden_stateは値を持つ --

In [760]:
m3_base_output["last_hidden_state"]

tensor([[[-0.1488,  0.0563, -0.2001,  ..., -0.2476, -0.1276, -0.0199],
         [-0.1376,  0.0470, -0.1485,  ..., -0.2500, -0.1300,  0.0111],
         [-0.1621,  0.0432, -0.2342,  ..., -0.2326, -0.1819, -0.0077],
         ...,
         [-0.1684,  0.0294, -0.3133,  ..., -0.1929, -0.0901, -0.0889],
         [-0.1868,  0.0360, -0.2739,  ..., -0.2799, -0.0704, -0.0660],
         [-0.0763,  0.0404, -0.2715,  ..., -0.2516, -0.0645, -0.0086]]],
       grad_fn=<NativeLayerNormBackward0>)

* で、かつm2_base_outputと同じ値を持っている

In [761]:
m2_base_output["last_hidden_state"]

tensor([[[-0.1488,  0.0563, -0.2001,  ..., -0.2476, -0.1276, -0.0199],
         [-0.1376,  0.0470, -0.1485,  ..., -0.2500, -0.1300,  0.0111],
         [-0.1621,  0.0432, -0.2342,  ..., -0.2326, -0.1819, -0.0077],
         ...,
         [-0.1684,  0.0294, -0.3133,  ..., -0.1929, -0.0901, -0.0889],
         [-0.1868,  0.0360, -0.2739,  ..., -0.2799, -0.0704, -0.0660],
         [-0.0763,  0.0404, -0.2715,  ..., -0.2516, -0.0645, -0.0086]]],
       grad_fn=<NativeLayerNormBackward0>)

### では、BERTの狙ったレイヤーだけ初期化する方法はどうする？ --

In [762]:
m4 = AutoModel.from_pretrained("cl-tohoku/bert-large-japanese")

Some weights of the model checkpoint at cl-tohoku/bert-large-japanese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [763]:
summary(m4, depth=3)

Layer (type:depth-idx)                             Param #
BertModel                                          --
├─BertEmbeddings: 1-1                              --
│    └─Embedding: 2-1                              33,554,432
│    └─Embedding: 2-2                              524,288
│    └─Embedding: 2-3                              2,048
│    └─LayerNorm: 2-4                              2,048
│    └─Dropout: 2-5                                --
├─BertEncoder: 1-2                                 --
│    └─ModuleList: 2-6                             --
│    │    └─BertLayer: 3-1                         12,596,224
│    │    └─BertLayer: 3-2                         12,596,224
│    │    └─BertLayer: 3-3                         12,596,224
│    │    └─BertLayer: 3-4                         12,596,224
│    │    └─BertLayer: 3-5                         12,596,224
│    │    └─BertLayer: 3-6                         12,596,224
│    │    └─BertLayer: 3-7                         12,596,224
│ 

In [764]:
[param for param in list(m4.state_dict().keys()) if "23" in param]

['encoder.layer.23.attention.self.query.weight',
 'encoder.layer.23.attention.self.query.bias',
 'encoder.layer.23.attention.self.key.weight',
 'encoder.layer.23.attention.self.key.bias',
 'encoder.layer.23.attention.self.value.weight',
 'encoder.layer.23.attention.self.value.bias',
 'encoder.layer.23.attention.output.dense.weight',
 'encoder.layer.23.attention.output.dense.bias',
 'encoder.layer.23.attention.output.LayerNorm.weight',
 'encoder.layer.23.attention.output.LayerNorm.bias',
 'encoder.layer.23.intermediate.dense.weight',
 'encoder.layer.23.intermediate.dense.bias',
 'encoder.layer.23.output.dense.weight',
 'encoder.layer.23.output.dense.bias',
 'encoder.layer.23.output.LayerNorm.weight',
 'encoder.layer.23.output.LayerNorm.bias']

### BERTでやるとわかりにくいので、Linearでやる --

In [812]:
class QuadLinearModel(nn.Module):
    """Chain of four ``nn.Linear(1, 1)`` layers named ``l0`` .. ``l3``."""

    def __init__(self):
        super().__init__()
        self.l0 = nn.Linear(1, 1)
        self.l1 = nn.Linear(1, 1)
        self.l2 = nn.Linear(1, 1)
        self.l3 = nn.Linear(1, 1)

    def forward(self, x):
        """Pass ``x`` through ``l0``, ``l1``, ``l2`` and ``l3`` in order."""
        out = x
        for layer in (self.l0, self.l1, self.l2, self.l3):
            out = layer(out)
        return out

In [813]:
model = QuadLinearModel()
summary(model)

Layer (type:depth-idx)                   Param #
QuadLinearModel                          --
├─Linear: 1-1                            2
├─Linear: 1-2                            2
├─Linear: 1-3                            2
├─Linear: 1-4                            2
Total params: 8
Trainable params: 8
Non-trainable params: 0

In [814]:
model.state_dict()

OrderedDict([('l0.weight', tensor([[0.4555]])),
             ('l0.bias', tensor([0.5718])),
             ('l1.weight', tensor([[0.7746]])),
             ('l1.bias', tensor([0.4392])),
             ('l2.weight', tensor([[0.3896]])),
             ('l2.bias', tensor([-0.1883])),
             ('l3.weight', tensor([[-0.8489]])),
             ('l3.bias', tensor([-0.7753]))])

In [833]:
def torch_init_params_by_name(model, name, mean=0.0, std=0.02):
    """Re-initialize, in place, every parameter whose name contains ``name``.

    Matching is a plain substring test against the dotted names yielded by
    ``model.named_parameters()``. ``name="23"`` therefore selects everything
    under ``encoder.layer.23``, but it would equally select any other
    parameter whose name happens to contain ``23``; pass a more specific
    fragment when that matters.

    Parameters owned by a normalization layer are reset to the identity
    transform (weight = 1, bias = 0) rather than drawn at random, because a
    scale sampled around zero would suppress that layer's output. All other
    matched parameters are drawn from a normal distribution.

    Args:
        model: ``nn.Module`` whose parameters are modified.
        name: substring looked up in each parameter name.
        mean: mean of the normal distribution (default 0.0).
        std: standard deviation of the normal distribution (default 0.02).

    Returns:
        None. One line is printed per re-initialized parameter.
    """
    norm_types = (
        nn.LayerNorm,
        nn.GroupNorm,
        nn.BatchNorm1d,
        nn.BatchNorm2d,
        nn.BatchNorm3d,
    )
    # Identify parameters by object identity so the lookup does not depend on
    # how the owning normalization module happens to be named.
    norm_param_ids = {
        id(param)
        for module in model.modules()
        if isinstance(module, norm_types)
        for param in module.parameters(recurse=False)
    }

    for param_name, param in model.named_parameters():
        if name not in param_name:
            continue
        print(f"... {param_name} initialized ... ")
        if id(param) in norm_param_ids:
            # Normalization layers expose exactly "weight" (scale) and "bias" (shift).
            if param_name.endswith("bias"):
                nn.init.zeros_(param)
            else:
                nn.init.ones_(param)
        else:
            nn.init.normal_(param, mean=mean, std=std)

In [837]:
def torch_freeze_params_by_name(model, name):
    """Freeze (``requires_grad=False``) every parameter whose name contains ``name``.

    Matching is a substring test against the names from
    ``model.named_parameters()``; one line is printed per frozen parameter.
    """
    matched = [
        (param_name, param)
        for param_name, param in model.named_parameters()
        if name in param_name
    ]
    for param_name, param in matched:
        print(f"... {param_name} freezed ... ")
        param.requires_grad_(False)

In [823]:
torch_freeze_params_by_name(model, name="l1")

<class 'torch.nn.parameter.Parameter'>
<class 'torch.nn.parameter.Parameter'>


In [824]:
model.state_dict()

OrderedDict([('l0.weight', tensor([[0.4555]])),
             ('l0.bias', tensor([0.5718])),
             ('l1.weight', tensor([[0.7746]])),
             ('l1.bias', tensor([0.4392])),
             ('l2.weight', tensor([[0.3896]])),
             ('l2.bias', tensor([-0.1883])),
             ('l3.weight', tensor([[-0.8489]])),
             ('l3.bias', tensor([-0.7753]))])

In [825]:
summary(model)

Layer (type:depth-idx)                   Param #
QuadLinearModel                          --
├─Linear: 1-1                            2
├─Linear: 1-2                            (2)
├─Linear: 1-3                            2
├─Linear: 1-4                            2
Total params: 8
Trainable params: 6
Non-trainable params: 2

### BERT, roBERTaに対して関数を適用してみる --

In [827]:
model = AutoModel.from_pretrained("nlp-waseda/roberta-large-japanese-seq512")

Some weights of the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [835]:
torch_init_params_by_name(model, name="23")

... encoder.layer.23.attention.self.query.weight initialized ... 
... encoder.layer.23.attention.self.query.bias initialized ... 
... encoder.layer.23.attention.self.key.weight initialized ... 
... encoder.layer.23.attention.self.key.bias initialized ... 
... encoder.layer.23.attention.self.value.weight initialized ... 
... encoder.layer.23.attention.self.value.bias initialized ... 
... encoder.layer.23.attention.output.dense.weight initialized ... 
... encoder.layer.23.attention.output.dense.bias initialized ... 
... encoder.layer.23.attention.output.LayerNorm.weight initialized ... 
... encoder.layer.23.attention.output.LayerNorm.bias initialized ... 
... encoder.layer.23.intermediate.dense.weight initialized ... 
... encoder.layer.23.intermediate.dense.bias initialized ... 
... encoder.layer.23.output.dense.weight initialized ... 
... encoder.layer.23.output.dense.bias initialized ... 
... encoder.layer.23.output.LayerNorm.weight initialized ... 
... encoder.layer.23.output.LayerNor

In [838]:
torch_freeze_params_by_name(model, name="23")

... encoder.layer.23.attention.self.query.weight freezed ... 
... encoder.layer.23.attention.self.query.bias freezed ... 
... encoder.layer.23.attention.self.key.weight freezed ... 
... encoder.layer.23.attention.self.key.bias freezed ... 
... encoder.layer.23.attention.self.value.weight freezed ... 
... encoder.layer.23.attention.self.value.bias freezed ... 
... encoder.layer.23.attention.output.dense.weight freezed ... 
... encoder.layer.23.attention.output.dense.bias freezed ... 
... encoder.layer.23.attention.output.LayerNorm.weight freezed ... 
... encoder.layer.23.attention.output.LayerNorm.bias freezed ... 
... encoder.layer.23.intermediate.dense.weight freezed ... 
... encoder.layer.23.intermediate.dense.bias freezed ... 
... encoder.layer.23.output.dense.weight freezed ... 
... encoder.layer.23.output.dense.bias freezed ... 
... encoder.layer.23.output.LayerNorm.weight freezed ... 
... encoder.layer.23.output.LayerNorm.bias freezed ... 


In [839]:
summary(model)

Layer (type:depth-idx)                                  Param #
RobertaModel                                            --
├─RobertaEmbeddings: 1-1                                --
│    └─Embedding: 2-1                                   32,768,000
│    └─Embedding: 2-2                                   526,336
│    └─Embedding: 2-3                                   2,048
│    └─LayerNorm: 2-4                                   2,048
│    └─Dropout: 2-5                                     --
├─RobertaEncoder: 1-2                                   --
│    └─ModuleList: 2-6                                  --
│    │    └─RobertaLayer: 3-1                           12,596,224
│    │    └─RobertaLayer: 3-2                           12,596,224
│    │    └─RobertaLayer: 3-3                           12,596,224
│    │    └─RobertaLayer: 3-4                           12,596,224
│    │    └─RobertaLayer: 3-5                           12,596,224
│    │    └─RobertaLayer: 3-6                      

In [840]:
model.state_dict()["encoder.layer.23.output.dense.weight"]

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])