In [240]:
from transformers import BertTokenizer, AutoTokenizer, AutoModel

In [241]:
# BertTokenizer(可通过`__init__`实例化): Construct a BERT tokenizer. Based on WordPiece.
# from_pretrained(类方法): Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined tokenizer.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
)
tokenizer  # 分词器

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [242]:
print(len(tokenizer))
print(len(tokenizer.get_vocab()))  # 与上等价

21128
21128


In [243]:
# This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library
#   when created with the [`AutoTokenizer.from_pretrained`] class method.
#   This class cannot be instantiated directly using `__init__()` (throws an error).
# 与上等价
tokenizer_auto = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-chinese')
tokenizer_auto

PreTrainedTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [244]:
list_of_token = [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102]

# Convert a list of lists of token ids into a list of strings by calling decode.
print(tokenizer.decode(list_of_token))

# skip_special_tokens:Whether or not to remove special tokens in the decoding.
# If these tokens are already part of the vocabulary, it just let the Tokenizer know about them. If they don’t exist, the Tokenizer creates them, giving them a new id.
# These special tokens will never be processed by the model (ie won’t be split into multiple tokens), and they can be removed from the output when decoding.
print(tokenizer.decode(list_of_token, skip_special_tokens=True))

[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP]
选 择 珠 江 花 园 的 原 因 就 是 方 便 。


In [245]:
# Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary.
print(tokenizer.convert_tokens_to_ids(['月光', '希望', '[EOS', '<eop>']))

# Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens.
print(tokenizer.convert_ids_to_tokens([21128, 21128, 21130, 21131]))

[100, 100, 100, 100]
['[UNK]', '[UNK]', '[UNK]', '[UNK]']


In [246]:
text = '月光的[UNK][PAD][CLS]新希望[EOS]<eop>'
result = tokenizer(text=text, add_special_tokens=False)

# 未添加新tokens前的编码效果
print(result)

print(tokenizer.tokenize(text))
print(tokenizer.decode(result['input_ids']))
print(tokenizer.convert_ids_to_tokens(result['input_ids']))  # 只是ids到tokens的转换

{'input_ids': [3299, 1045, 4638, 100, 0, 101, 3173, 2361, 3307, 138, 100, 140, 133, 147, 9133, 135], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['月', '光', '的', '[UNK]', '[PAD]', '[CLS]', '新', '希', '望', '[', '[UNK]', ']', '<', 'e', '##op', '>']
月 光 的 [UNK] [PAD] [CLS] 新 希 望 [ [UNK] ] < eop >
['月', '光', '的', '[UNK]', '[PAD]', '[CLS]', '新', '希', '望', '[', '[UNK]', ']', '<', 'e', '##op', '>']


In [247]:
# Add a list of new tokens to the tokenizer class.
tokenizer.add_tokens(new_tokens=['月光', '希望'])

#  Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes.
#  If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary).

# Keys should be in the list of predefined special attributes:
#   [`bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].
tokenizer.add_special_tokens(special_tokens_dict={'eos_token': '[EOS]',
                                                  # Additional special tokens used by the tokenizer.
                                                  'additional_special_tokens': ["<eop>", "<eod>"]})
print(len(tokenizer.get_vocab()))
print(tokenizer.additional_special_tokens)
print(tokenizer.additional_special_tokens_ids)

21133
['<eop>', '<eod>']
[21131, 21132]


In [248]:
result1 = tokenizer(text=text, add_special_tokens=False)

# 添加新tokens后的编码效果
print(result1)

print(tokenizer.tokenize(text))
print(tokenizer.decode(result1['input_ids']))  # 分词效果达到预期
print(tokenizer.convert_ids_to_tokens(result['input_ids']))

{'input_ids': [21128, 4638, 100, 0, 101, 3173, 21129, 21130, 21131], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
['月光', '的', '[UNK]', '[PAD]', '[CLS]', '新', '希望', '[EOS]', '<eop>']
月光 的 [UNK] [PAD] [CLS] 新 希望 [EOS] <eop>
['月', '光', '的', '[UNK]', '[PAD]', '[CLS]', '新', '希', '望', '[', '[UNK]', ']', '<', 'e', '##op', '>']


In [249]:
print(len(tokenizer.get_vocab()))

21133


In [250]:
# 保存分词器(包括新添加的tokens)
tokenizer.save_pretrained("../extra_dataset/save_tokenizer/")

('save_tokenizer/tokenizer_config.json',
 'save_tokenizer/special_tokens_map.json',
 'save_tokenizer/vocab.txt',
 'save_tokenizer/added_tokens.json')

In [251]:
# 从本地重新加载
tokenizer1 = BertTokenizer.from_pretrained("../extra_dataset/save_tokenizer/")
tokenizer1

PreTrainedTokenizer(name_or_path='save_tokenizer/', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['<eop>', '<eod>']})

In [252]:
result2 = tokenizer1(text=text, add_special_tokens=False)

print(result2)
print(tokenizer1.tokenize(text))
print(tokenizer1.decode(result2['input_ids']))  # 分词效果与上等价

{'input_ids': [21128, 4638, 100, 0, 101, 3173, 21129, 21130, 21131], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
['月光', '的', '[UNK]', '[PAD]', '[CLS]', '新', '希望', '[EOS]', '<eop>']
月光 的 [UNK] [PAD] [CLS] 新 希望 [EOS] <eop>


In [253]:
model = AutoModel.from_pretrained('bert-base-chinese')

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [254]:
model(**tokenizer(text, return_tensors='pt'))  # 报错

IndexError: index out of range in self

In [255]:
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(21128, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [256]:
# word_embedding维度为:21128 * 768(but此时tokenizer大小为:21133)
model.get_input_embeddings()

Embedding(21128, 768, padding_idx=0)

In [257]:
# Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
model.resize_token_embeddings(len(tokenizer))

Embedding(21133, 768)

In [258]:
model(**tokenizer(text, return_tensors='pt'))


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5206,  0.5423,  0.2691,  ...,  0.8091, -0.4797, -0.0561],
         [-0.3456,  0.3025, -0.1301,  ..., -0.0090, -0.1680, -0.5165],
         [-0.3690,  0.3482,  0.5202,  ..., -0.0231,  0.2632, -0.2211],
         ...,
         [-0.4824,  0.6202,  0.4320,  ...,  0.4577,  0.1774, -0.5592],
         [-0.3892,  0.6652,  0.1083,  ...,  0.2178, -0.0246, -0.4248],
         [-0.3988,  0.4876,  0.2308,  ...,  0.7583, -0.6936,  0.0380]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.9999,  0.9977,  1.0000,  0.9492,  0.6740, -0.2831, -0.7346, -0.8150,
          0.9780, -1.0000,  1.0000,  1.0000, -0.9065, -0.8526,  0.9999, -0.9999,
         -0.2017,  0.8878,  0.9975, -0.2366,  0.9831, -1.0000, -0.1171, -0.9902,
         -0.9017,  0.9994,  0.9341, -0.8814, -0.9968,  1.0000,  0.9962,  1.0000,
          0.9858, -0.9944, -1.0000,  0.0295,  0.6967,  0.9986,  0.4683, -0.4691,
         -0.9862, -0.5577, -0.85