In [1]:
from transformers import AutoModel, AutoTokenizer

In [2]:
'''
主要预训练模型:
'microsoft/deberta-v3-base'
'microsoft/deberta-v3-large'
'''
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')  # 需安装sentencepiece(pip install sentencepiece)
print(tokenizer.model_input_names)
print(tokenizer.all_special_tokens)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['input_ids', 'token_type_ids', 'attention_mask']
['[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]']


In [3]:
# 保存分词器(包括新添加的tokens)
tokenizer.save_pretrained("deberta-v3-base-tokenizer")

('deberta-v3-base-tokenizer\\tokenizer_config.json',
 'deberta-v3-base-tokenizer\\special_tokens_map.json',
 'deberta-v3-base-tokenizer\\spm.model',
 'deberta-v3-base-tokenizer\\added_tokens.json')

In [4]:
# 从本地加载
AutoTokenizer.from_pretrained('deberta-v3-base-tokenizer/')

PreTrainedTokenizer(name_or_path='deberta-v3-base-tokenizer/', vocab_size=128000, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
model = AutoModel.from_pretrained("microsoft/deberta-v3-base")
model

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0): DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermediate): Deb

In [6]:
model.config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.17.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

In [7]:
for name, para in model.named_parameters():
    print(name)

embeddings.word_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query_proj.weight
encoder.layer.0.attention.self.query_proj.bias
encoder.layer.0.attention.self.key_proj.weight
encoder.layer.0.attention.self.key_proj.bias
encoder.layer.0.attention.self.value_proj.weight
encoder.layer.0.attention.self.value_proj.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query_proj.weight
encoder.layer.1.attention.self.query_proj.bias
encoder.layer.1.attention.self.key_proj.weight
encoder.layer.1.attention.self.key_proj.bias
encoder.layer.1.attent

In [8]:
# 长文本
text = "Replace me by any text you'd like."
text = text * 500

encoded_input = tokenizer(text, return_tensors='pt')
print(encoded_input)
print(encoded_input.keys())
print(encoded_input['input_ids'].shape)  # [1, 5002]

{'input_ids': tensor([[    1, 19574,   351,  ...,   334,   260,     2]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([1, 5002])


In [9]:
result = model(**encoded_input)
print(result.last_hidden_state)

tensor([[[ 0.0239,  0.1704, -0.0316,  ..., -0.1247,  0.0810,  0.0192],
         [ 0.4834,  0.1078,  0.0750,  ...,  0.8623,  0.4400, -0.2494],
         [ 0.7394,  0.0288,  0.0800,  ...,  0.7303,  0.3650, -0.0103],
         ...,
         [ 0.6291,  0.7503,  0.4901,  ...,  0.0150, -0.2836,  0.4169],
         [-0.0024,  0.7096,  0.5085,  ...,  0.9686, -0.4418, -0.1343],
         [ 0.0770,  0.1935, -0.0307,  ..., -0.1302,  0.0821,  0.0261]]],
       grad_fn=<NativeLayerNormBackward0>)


In [10]:
result.last_hidden_state.shape  # [1, 5002, 768]

torch.Size([1, 5002, 768])