In [31]:
from transformers import AutoTokenizer

In [32]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

In [33]:
# 词汇量
print(tokenizer.vocab_size)

# model’s maximum context size
print(tokenizer.model_max_length)

# 模型前向传递(forward)中需要输入的字段名称
print(tokenizer.model_input_names)

21128
512
['input_ids', 'token_type_ids', 'attention_mask']


In [34]:
# 特殊符号
print(tokenizer.bos_token, tokenizer.bos_token_id)
print(tokenizer.eos_token, tokenizer.eos_token_id)
print(tokenizer.unk_token, tokenizer.unk_token_id)
print(tokenizer.sep_token, tokenizer.sep_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.cls_token, tokenizer.cls_token_id)
print(tokenizer.mask_token, tokenizer.mask_token_id)
print(tokenizer.all_special_tokens, tokenizer.all_special_ids)
print(tokenizer.additional_special_tokens, tokenizer.additional_special_tokens_ids)
print(tokenizer.special_tokens_map)

None None
None None
[UNK] 100
[SEP] 102
[PAD] 0
[CLS] 101
[MASK] 103
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'] [100, 102, 0, 101, 103]
[] []
{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


In [35]:
# 词典
tokenizer_dict = tokenizer.get_vocab()

print(tokenizer_dict)
print(type(tokenizer_dict))  # <class 'dict'>
print(len(tokenizer_dict))
print('love' in tokenizer_dict)
print('china' in tokenizer_dict)

{'蘆': 5978, 'nov': 9698, '##鹭': 20972, '1927': 9620, '融': 6084, '##卉': 14344, 'xd': 8444, '##tment': 12986, '荒': 5774, '##頼': 20594, '例': 891, '##㗎': 13727, '生': 4495, 'る': 580, '郜': 6949, '015': 13220, '鱈': 7818, '##绞': 18376, 'ta': 8346, '拳': 2891, '曾': 3295, '滥': 4010, '##勐': 14295, '##拮': 15945, 'i3': 12224, '##470': 12531, '挖': 2905, '##団': 14788, 'x5': 10871, '♥': 491, '艱': 5681, '##韋': 20557, '2001': 8285, 'd2': 11089, '淙': 3906, 'ァ': 588, '鯛': 7808, '##hur': 13190, '屐': 2243, '▌♥': 9601, '1867': 13042, '##iner': 12045, '##奬': 15012, '研': 4777, 't1': 10585, '鏢': 7129, '##垒': 14856, '辜': 6790, '囊': 1718, '##虑': 19048, '淋': 3900, '##这': 19878, 'society': 11573, '父': 4266, '##穆': 18003, '##壶': 14958, '##love': 12564, '飘': 7603, '##our': 9832, '##ᆯ': 11596, '蒐': 5883, '鸵': 7893, '##凉': 14174, 'ecu': 9526, '鈀': 7041, '##笔': 18068, 'f16': 12799, '##瀬': 17172, '嫖': 2069, '監': 4675, '疣': 4551, '##颅': 20622, '##榖': 16583, '##軽': 19788, '龍': 7983, '##鑒': 20199, '苓': 5725, 'star': 9012, '#

In [36]:
text = 'Concluding Statement - a concluding statement that restates the claims'
text1 = 'checkpoint of a model that you expect to be exactly'
s = tokenizer([text, text1], return_offsets_mapping=True)
s

{'input_ids': [[101, 100, 100, 118, 143, 11485, 10753, 9849, 8221, 10223, 8631, 9231, 8847, 10949, 9420, 8174, 12847, 8982, 8884, 102], [101, 9233, 11112, 8205, 143, 9264, 9231, 8357, 9577, 8619, 8722, 8228, 8815, 9577, 12183, 8436, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 10), (11, 20), (21, 22), (23, 24), (25, 28), (28, 30), (30, 32), (32, 35), (36, 41), (41, 45), (46, 50), (51, 53), (53, 56), (56, 59), (60, 63), (64, 66), (66, 68), (68, 70), (0, 0)], [(0, 0), (0, 5), (5, 10), (11, 13), (14, 15), (16, 21), (22, 26), (27, 30), (31, 33), (33, 35), (35, 37), (38, 40), (41, 43), (44, 46), (46, 49), (49, 51), (0, 0)]]}

In [37]:
# Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to integer indices) at a given batch index (only works for the output of a fast tokenizer).
s.tokens()  # 默认batch_index=0

['[CLS]',
 '[UNK]',
 '[UNK]',
 '-',
 'a',
 'con',
 '##cl',
 '##ud',
 '##ing',
 'state',
 '##ment',
 'that',
 're',
 '##sta',
 '##tes',
 'the',
 'cl',
 '##ai',
 '##ms',
 '[SEP]']

In [38]:
tokenizer.tokenize(text)

['[UNK]',
 '[UNK]',
 '-',
 'a',
 'con',
 '##cl',
 '##ud',
 '##ing',
 'state',
 '##ment',
 'that',
 're',
 '##sta',
 '##tes',
 'the',
 'cl',
 '##ai',
 '##ms']

In [39]:
s.tokens(batch_index=1)

['[CLS]',
 'check',
 '##point',
 'of',
 'a',
 'model',
 'that',
 'you',
 'ex',
 '##pe',
 '##ct',
 'to',
 'be',
 'ex',
 '##act',
 '##ly',
 '[SEP]']

In [40]:
# Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
s.word_ids(batch_index=1)

[None, 0, 0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 8, 9, 9, 9, None]

In [41]:
q = s.word_ids()  # 默认batch_index=0
q

[None, 0, 1, 2, 3, 4, 4, 4, 4, 5, 5, 6, 7, 7, 7, 8, 9, 9, 9, None]

In [42]:
print(s['offset_mapping'][0])
print(len(q), len(s['offset_mapping'][0]))

[(0, 0), (0, 10), (11, 20), (21, 22), (23, 24), (25, 28), (28, 30), (30, 32), (32, 35), (36, 41), (41, 45), (46, 50), (51, 53), (53, 56), (56, 59), (60, 63), (64, 66), (66, 68), (68, 70), (0, 0)]
20 20


In [43]:
list_of_token = [101, 16228, 4861, 1011, 1037, 16228, 4861, 2008, 2717, 8520, 1996, 4447, 102]

# Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces.
print(tokenizer.decode(list_of_token))
print(tokenizer.decode(list_of_token, 
                       # Whether or not to remove special tokens in the decoding.defaults to False
                       skip_special_tokens=True))

[CLS]断 祕 僖 兀断 祕 姪 慨 west 姑 瑜 [SEP]
##断 祕 僖 兀断 祕 姪 慨 west 姑 瑜


In [44]:
# 批量解码
print(tokenizer.batch_decode(s['input_ids']))
print(tokenizer.batch_decode(s['input_ids'], skip_special_tokens=True))

['[CLS] [UNK] [UNK] - a concluding statement that restates the claims [SEP]', '[CLS] checkpoint of a model that you expect to be exactly [SEP]']
['- a concluding statement that restates the claims', 'checkpoint of a model that you expect to be exactly']


In [45]:
# Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary.
print(tokenizer.convert_tokens_to_ids(['i', 'love', '[CLS]', '<eop>']))

# Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens.
print(tokenizer.convert_ids_to_tokens([100, 101, 16228, 4861, 1011]))

[151, 8451, 101, 100]
['[UNK]', '[CLS]', '##断', '祕', '僖']
