In [152]:
from transformers import AutoTokenizer

In [153]:
# Load the tokenizer published under the 'bert-base-uncased' checkpoint name.
# NOTE(review): the first call presumably downloads the tokenizer files and
# caches them locally — confirm before relying on this cell offline.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [154]:
# Core tokenizer properties, printed one per line in this order:
#   1. vocabulary size
#   2. the model's maximum context size
#   3. names of the input fields the model's forward pass expects
print(
    tokenizer.vocab_size,
    tokenizer.model_max_length,
    tokenizer.model_input_names,
    sep='\n',
)

# Special tokens, again one per line: their ids, their string forms, and the
# mapping from role name (e.g. 'unk_token') to the token string.
print(
    tokenizer.all_special_ids,
    tokenizer.all_special_tokens,
    tokenizer.special_tokens_map,
    sep='\n',
)

30522
512
['input_ids', 'token_type_ids', 'attention_mask']
[100, 102, 0, 101, 103]
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


In [155]:
# Vocabulary as a plain dict keyed by token string.
tokenizer_dict = tokenizer.get_vocab()

# Preview only a handful of entries: printing the whole mapping would dump
# every vocabulary entry (30522, per the length printed below) into the output.
print(dict(list(tokenizer_dict.items())[:10]))
print(type(tokenizer_dict))  # <class 'dict'>
print(len(tokenizer_dict))

# Membership tests run against the dict keys, i.e. the token strings.
print('love' in tokenizer_dict)
print('china' in tokenizer_dict)

<class 'dict'>
30522
True
True


In [156]:
# Two sample sentences that are encoded together as a single batch.
text = 'Concluding Statement - a concluding statement that restates the claims'
text1 = 'checkpoint of a model that you expect to be exactly'
# return_offsets_mapping=True adds an 'offset_mapping' entry to the result:
# one (start, end) character span per token. In the output below, the first
# and last position of each sequence (the [CLS]/[SEP] ids 101/102) show (0, 0).
s = tokenizer([text, text1], return_offsets_mapping=True)
# Bare expression so the notebook displays the full encoding.
s

{'input_ids': [[101, 16228, 4861, 1011, 1037, 16228, 4861, 2008, 2717, 8520, 1996, 4447, 102], [101, 26520, 1997, 1037, 2944, 2008, 2017, 5987, 2000, 2022, 3599, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 10), (11, 20), (21, 22), (23, 24), (25, 35), (36, 45), (46, 50), (51, 55), (55, 59), (60, 63), (64, 70), (0, 0)], [(0, 0), (0, 10), (11, 13), (14, 15), (16, 21), (22, 26), (27, 30), (31, 37), (38, 40), (41, 43), (44, 51), (0, 0)]]}

In [157]:
# Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to integer indices) at a given batch index (only works for the output of a fast tokenizer).
# The displayed list includes the '[CLS]' and '[SEP]' special tokens.
s.tokens()  # batch_index defaults to 0, i.e. the first sentence

['[CLS]',
 'concluding',
 'statement',
 '-',
 'a',
 'concluding',
 'statement',
 'that',
 'rest',
 '##ates',
 'the',
 'claims',
 '[SEP]']

In [158]:
# Split the raw string into tokens directly. Unlike s.tokens(), the displayed
# result carries no '[CLS]' / '[SEP]' entries.
tokenizer.tokenize(text)

['concluding',
 'statement',
 '-',
 'a',
 'concluding',
 'statement',
 'that',
 'rest',
 '##ates',
 'the',
 'claims']

In [159]:
# Tokens of the second sentence in the batch (text1).
s.tokens(batch_index=1)

['[CLS]',
 'checkpoint',
 'of',
 'a',
 'model',
 'that',
 'you',
 'expect',
 'to',
 'be',
 'exactly',
 '[SEP]']

In [160]:
# Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
# In the displayed result the first and last positions (the special tokens) are None.
s.word_ids(batch_index=1)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, None]

In [161]:
# Word ids for the first sentence. Tokens split from the same word share an
# id: in the output below, 'rest' and '##ates' both map to word 7.
q = s.word_ids()  # batch_index defaults to 0
q

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, None]

In [162]:
# Character offsets for every token of the first sentence.
print(s['offset_mapping'][0])
# Word ids and offsets are both per-token lists, so their lengths match
# (13 and 13 in the recorded output).
print(len(q), len(s['offset_mapping'][0]))

[(0, 0), (0, 10), (11, 20), (21, 22), (23, 24), (25, 35), (36, 45), (46, 50), (51, 55), (55, 59), (60, 63), (64, 70), (0, 0)]
13 13


In [163]:
# Token ids of the first sample sentence, including [CLS] (101) and [SEP] (102).
list_of_token = [101, 16228, 4861, 1011, 1037, 16228, 4861, 2008, 2717, 8520, 1996, 4447, 102]

# decode() turns a sequence of ids back into a string, using the tokenizer and
# vocabulary, with options to remove special tokens and clean up tokenization
# spaces. Printed twice, one result per line:
#   1. default call — special tokens are kept (skip_special_tokens defaults to False)
#   2. skip_special_tokens=True — special tokens are removed from the text
print(
    tokenizer.decode(list_of_token),
    tokenizer.decode(list_of_token, skip_special_tokens=True),
    sep='\n',
)

[CLS] concluding statement - a concluding statement that restates the claims [SEP]
concluding statement - a concluding statement that restates the claims


In [164]:
# 批量解码
print(tokenizer.batch_decode(s['input_ids']))
print(tokenizer.batch_decode(s['input_ids'], skip_special_tokens=True))

['[CLS] concluding statement - a concluding statement that restates the claims [SEP]', '[CLS] checkpoint of a model that you expect to be exactly [SEP]']
['concluding statement - a concluding statement that restates the claims', 'checkpoint of a model that you expect to be exactly']


In [165]:
# Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary.
# '<eop>' comes back as id 100 in the output below — the same id that the
# second call converts to '[UNK]'.
print(tokenizer.convert_tokens_to_ids(['i', 'love', '[CLS]', '<eop>']))

# Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens.
print(tokenizer.convert_ids_to_tokens([100, 101, 16228, 4861, 1011]))

[1045, 2293, 101, 100]
['[UNK]', '[CLS]', 'concluding', 'statement', '-']
