In [56]:
from transformers import BertTokenizer, AutoTokenizer, AutoModel

In [57]:
# BertTokenizer: the WordPiece-based BERT tokenizer (it can also be instantiated via `__init__`).
# `from_pretrained` is a classmethod that instantiates a
# [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer  # display the tokenizer

BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [58]:
# AutoTokenizer is a generic tokenizer class: `AutoTokenizer.from_pretrained` instantiates one of the
# library's concrete tokenizer classes. It cannot be instantiated directly using `__init__()` (throws an error).
# Same checkpoint as the BertTokenizer cell above, but not the same class: the repr printed below
# reports BertTokenizerFast with is_fast=True, whereas the earlier cell produced BertTokenizer (is_fast=False).
tokenizer_auto = AutoTokenizer.from_pretrained('bert-base-chinese')
tokenizer_auto

BertTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [59]:
text = '月光的[UNK][PAD][CLS]新希望[EOS]<eop>'

# Baseline BEFORE any new tokens are added, one result per line:
#   1. ids of the probe tokens, 2. tokenization of `text`, 3. vocabulary size.
print(
    tokenizer.convert_tokens_to_ids(['月光', '希望', '[EOS]', '<eop>']),
    tokenizer.tokenize(text),
    len(tokenizer.get_vocab()),
    sep='\n',
)

[100, 100, 100, 100]
['月', '光', '的', '[UNK]', '[PAD]', '[CLS]', '新', '希', '望', '[', '[UNK]', ']', '<', 'e', '##op', '>']
21128


In [60]:
# Register a list of new (ordinary, non-special) tokens with the tokenizer.
tokenizer.add_tokens(['月光', '希望'])

# `add_special_tokens` takes a dictionary of special tokens (eos, pad, cls, etc.), adds them to the
# encoder and links them to the tokenizer's class attributes. Special tokens that are NOT yet in the
# vocabulary are appended to it (indexed starting from the last index of the current vocabulary).
#
# Keys must come from the list of predefined special attributes:
#   [`bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
#    `additional_special_tokens`].
print(tokenizer.eos_token, tokenizer.eos_token_id)  # state before registering an EOS token
tokenizer.add_special_tokens(
    {
        'eos_token': '[EOS]',
        # Additional special tokens used by the tokenizer.
        'additional_special_tokens': ["<eop>", "<eod>"],
    }
)
print(tokenizer.eos_token, tokenizer.eos_token_id)  # state after registering it
# Vocabulary size and the special-token map, one per line.
print(len(tokenizer.get_vocab()), tokenizer.special_tokens_map, sep='\n')
print(tokenizer.additional_special_tokens, tokenizer.additional_special_tokens_ids)

None None
[EOS] 21130
21133
{'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['<eop>', '<eod>']}
['<eop>', '<eod>'] [21131, 21132]


In [61]:
# Encoding results AFTER the new tokens were added (same three probes as the baseline cell),
# printed one result per line.
print(
    tokenizer.convert_tokens_to_ids(['月光', '希望', '[EOS]', '<eop>']),
    tokenizer.tokenize(text),
    len(tokenizer.get_vocab()),
    sep='\n',
)

[21128, 21129, 21130, 21131]
['月光', '的', '[UNK]', '[PAD]', '[CLS]', '新', '希望', '[EOS]', '<eop>']
21133


In [62]:
# Save the tokenizer (including the newly added tokens) to a local directory;
# the returned tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained(save_directory="../extra_dataset/save_tokenizer/")

('../extra_dataset/save_tokenizer/tokenizer_config.json',
 '../extra_dataset/save_tokenizer/special_tokens_map.json',
 '../extra_dataset/save_tokenizer/vocab.txt',
 '../extra_dataset/save_tokenizer/added_tokens.json')

In [63]:
# Reload the tokenizer from the local directory it was just saved to.
tokenizer1 = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="../extra_dataset/save_tokenizer/",
)
tokenizer1

BertTokenizer(name_or_path='../extra_dataset/save_tokenizer/', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['<eop>', '<eod>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	21128: AddedToken("月光", rstrip=False, lstrip=

In [64]:
# Run the same three probes on the reloaded tokenizer to confirm the added tokens survived
# the save/load round trip (ids, tokenization of `text`, vocabulary size; one per line).
print(
    tokenizer1.convert_tokens_to_ids(['月光', '希望', '[EOS]', '<eop>']),
    tokenizer1.tokenize(text),
    len(tokenizer1.get_vocab()),
    sep='\n',
)

[21128, 21129, 21130, 21131]
['月光', '的', '[UNK]', '[PAD]', '[CLS]', '新', '希望', '[EOS]', '<eop>']
21133


In [65]:
# Load the pretrained model for the same checkpoint the tokenizer was created from.
model = AutoModel.from_pretrained(pretrained_model_name_or_path='bert-base-chinese')



In [66]:
# Expected failure, kept as a demonstration: `text` now tokenizes to ids >= 21128
# (the tokens added above), while the model's word-embedding table still has only
# 21128 rows, so the embedding lookup goes out of range.
# The error is caught and printed so that "Restart Kernel -> Run All" can continue
# past this cell instead of stopping here.
try:
    model(**tokenizer(text, return_tensors='pt'))
except IndexError as err:
    print(f'{type(err).__name__}: {err}')

IndexError: index out of range in self

In [67]:
# Inspect the model's embedding sub-module; the repr shows the size of each embedding table.
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(21128, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [68]:
# The word-embedding matrix is 21128 x 768, but the tokenizer now holds 21133 tokens.
model.get_input_embeddings()

Embedding(21128, 768, padding_idx=0)

In [69]:
# Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
# Passing len(tokenizer) makes the embedding table cover the ids of the tokens added earlier.
model.resize_token_embeddings(len(tokenizer))

Embedding(21133, 768, padding_idx=0)

In [70]:
# Same forward pass that failed before the resize; with the enlarged embedding table it now completes.
model(**tokenizer(text, return_tensors='pt'))

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5964,  0.7013,  0.0856,  ...,  0.6440, -0.2722, -0.0668],
         [-0.2686,  0.4007, -0.0717,  ..., -0.0795, -0.4526, -0.3661],
         [-0.4009,  0.5033,  0.4209,  ..., -0.1184, -0.0193, -0.2150],
         ...,
         [-0.6121,  0.4952, -0.2831,  ...,  0.5085, -0.2028, -0.3519],
         [-0.3102,  0.6188,  0.1467,  ..., -0.0947, -0.4982, -0.2391],
         [-0.5063,  0.7890,  0.2055,  ...,  0.5374, -0.7213,  0.1184]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.9988,  0.9998,  0.9968,  0.9804,  0.7700,  0.3574, -0.8768, -0.9041,
          0.9960, -0.9994,  1.0000,  0.9996, -0.8934, -0.9461,  0.9999, -0.9994,
         -0.7651,  0.9940,  0.9975,  0.0776,  0.9980, -1.0000, -0.9552, -0.7782,
         -0.7619,  0.9963,  0.9376, -0.8758, -0.9999,  0.9988,  0.9848,  0.9996,
          0.9788, -0.9999, -0.9993,  0.6910,  0.2432,  0.9935, -0.6462, -0.8489,
         -0.9785, -0.6373, -0.41