In [22]:
import torch.nn as nn
import torch
from transformers import AutoModel

In [23]:
class Bert_base(torch.nn.Module):
    """Baseline classifier: a pretrained BERT-style encoder plus one linear layer.

    The original author notes that the training loop is driven by the
    transformers library.

    Args:
        pretrained_model: Encoder module. It must expose ``config.hidden_size``
            and, when called, return an object with a ``pooler_output``
            attribute.
        num_class: Number of output classes of the linear head.
        dropout_ratio: Dropout probability applied before the linear head.
    """

    def __init__(self, pretrained_model, num_class, dropout_ratio=0.2):
        super().__init__()
        # Registration order is kept (encoder first, then head) so parameter
        # names and their ordering stay the same.
        self.pretrained = pretrained_model
        self.hidden_size = pretrained_model.config.hidden_size
        self.fc = nn.Linear(self.hidden_size, num_class)
        self.dropout = nn.Dropout(p=dropout_ratio)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the encoder and return softmax-normalized class scores.

        All three arguments are forwarded unchanged, by keyword, to the
        wrapped encoder.

        Returns:
            Tensor of class scores normalized with softmax over dim 1.
        """
        encoded = self.pretrained(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Per the original note, pooler_output has shape
        # [batch_size, self.hidden_size].
        # NOTE(review): presumably the encoder's pooled sentence-level
        # representation -- confirm against the encoder's documentation.
        pooled = encoded.pooler_output
        logits = self.fc(self.dropout(pooled))
        # Note: this returns normalized scores, not raw logits.
        return logits.softmax(dim=1)

In [24]:
# Load the 'bert-base-uncased' checkpoint as the encoder and wrap it in
# Bert_base with a 2-class linear head.
pretrained = AutoModel.from_pretrained('bert-base-uncased')
model = Bert_base(pretrained, 2)
# Print every parameter name of the wrapped model, one per line, to inspect
# how the names are prefixed (e.g. "pretrained." for encoder weights).
for n, p in model.named_parameters():
    print(n)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


pretrained.embeddings.word_embeddings.weight
pretrained.embeddings.position_embeddings.weight
pretrained.embeddings.token_type_embeddings.weight
pretrained.embeddings.LayerNorm.weight
pretrained.embeddings.LayerNorm.bias
pretrained.encoder.layer.0.attention.self.query.weight
pretrained.encoder.layer.0.attention.self.query.bias
pretrained.encoder.layer.0.attention.self.key.weight
pretrained.encoder.layer.0.attention.self.key.bias
pretrained.encoder.layer.0.attention.self.value.weight
pretrained.encoder.layer.0.attention.self.value.bias
pretrained.encoder.layer.0.attention.output.dense.weight
pretrained.encoder.layer.0.attention.output.dense.bias
pretrained.encoder.layer.0.attention.output.LayerNorm.weight
pretrained.encoder.layer.0.attention.output.LayerNorm.bias
pretrained.encoder.layer.0.intermediate.dense.weight
pretrained.encoder.layer.0.intermediate.dense.bias
pretrained.encoder.layer.0.output.dense.weight
pretrained.encoder.layer.0.output.dense.bias
pretrained.encoder.layer.0.outp

In [25]:
def get_parameter_names(model, forbidden_layer_types):
    """Return the names of the model parameters that are not inside a forbidden layer.

    Args:
        model: Module whose parameter names are collected recursively.
        forbidden_layer_types: Iterable of module classes. Any child that is
            an instance of one of them is skipped together with its whole
            subtree. A one-shot iterable (e.g. a generator) is accepted.

    Returns:
        List of dotted parameter names: first those found under each
        non-forbidden child (in ``named_children()`` order), then the names
        registered directly on ``model`` itself.
    """
    # Normalize once: isinstance needs a tuple, and the same tuple is passed
    # down the recursion so a one-shot iterable is not exhausted after its
    # first use.
    forbidden = tuple(forbidden_layer_types)

    names = []
    for child_name, child in model.named_children():
        # Decide before recursing: a forbidden child contributes nothing, so
        # there is no point in walking its subtree.
        if isinstance(child, forbidden):
            continue
        names.extend(
            f"{child_name}.{param_name}"
            for param_name in get_parameter_names(child, forbidden)
        )
    # Add model specific parameters (defined with nn.Parameter) since they are not in any child.
    names.extend(model._parameters.keys())
    return names


def get_parameters(opt_model, weight_decay=0.0):
    """Split a model's parameters into weight-decay and no-weight-decay groups.

    A parameter is put in the decay group when its name is returned by
    ``get_parameter_names(opt_model, [nn.LayerNorm])`` (i.e. it is not inside
    an ``nn.LayerNorm``) and the name does not contain ``"bias"``. Every other
    parameter goes to the no-decay group.

    Args:
        opt_model: Module whose parameters are grouped.
        weight_decay: Weight decay coefficient assigned to the decay group.

    Returns:
        Tuple ``(optimizer_grouped_parameters, decay_parameters)``:
        a two-element list of dicts with ``"params"`` and ``"weight_decay"``
        keys (decay group first, then the group with ``weight_decay`` 0.0),
        and the list of parameter names placed in the decay group.
    """
    decay_parameters = [
        name
        for name in get_parameter_names(opt_model, [nn.LayerNorm])
        if "bias" not in name
    ]
    # Set lookup keeps the per-parameter membership test constant-time; the
    # list itself is still returned to the caller unchanged.
    decay_lookup = set(decay_parameters)

    # Single pass over the parameters; each group keeps named_parameters() order.
    decay_params = []
    no_decay_params = []
    for name, param in opt_model.named_parameters():
        if name in decay_lookup:
            decay_params.append(param)
        else:
            no_decay_params.append(param)

    optimizer_grouped_parameters = [
        {
            "params": decay_params,
            "weight_decay": weight_decay,  # weight decay coefficient
        },
        {
            "params": no_decay_params,
            "weight_decay": 0.0,
        },
    ]
    return optimizer_grouped_parameters, decay_parameters

In [26]:
# Take the second element of the returned tuple (the names of the parameters
# that receive weight decay) by index rather than unpacking into `_`, so
# IPython's `_` (previous cell output) variable is not overwritten.
dp = get_parameters(model)[1]
dp

['pretrained.embeddings.word_embeddings.weight',
 'pretrained.embeddings.position_embeddings.weight',
 'pretrained.embeddings.token_type_embeddings.weight',
 'pretrained.encoder.layer.0.attention.self.query.weight',
 'pretrained.encoder.layer.0.attention.self.key.weight',
 'pretrained.encoder.layer.0.attention.self.value.weight',
 'pretrained.encoder.layer.0.attention.output.dense.weight',
 'pretrained.encoder.layer.0.intermediate.dense.weight',
 'pretrained.encoder.layer.0.output.dense.weight',
 'pretrained.encoder.layer.1.attention.self.query.weight',
 'pretrained.encoder.layer.1.attention.self.key.weight',
 'pretrained.encoder.layer.1.attention.self.value.weight',
 'pretrained.encoder.layer.1.attention.output.dense.weight',
 'pretrained.encoder.layer.1.intermediate.dense.weight',
 'pretrained.encoder.layer.1.output.dense.weight',
 'pretrained.encoder.layer.2.attention.self.query.weight',
 'pretrained.encoder.layer.2.attention.self.key.weight',
 'pretrained.encoder.layer.2.attention.