In [159]:
from transformers import AutoTokenizer

In [160]:
# Load the tokenizer associated with the 'bert-base-uncased' checkpoint.
# NOTE(review): presumably fetches/caches the tokenizer files on first use — confirm for offline runs.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [161]:
# Print the tokenizer's descriptive attributes, one per line.
print(
    tokenizer.vocab_size,          # size of the vocabulary
    tokenizer.model_max_length,    # model's maximum context size
    tokenizer.model_input_names,   # field names the model's forward pass expects as input
    tokenizer.all_special_ids,     # special tokens, as ids
    tokenizer.all_special_tokens,  # special tokens, as strings
    sep='\n',
)

30522
512
['input_ids', 'token_type_ids', 'attention_mask']
[100, 102, 0, 101, 103]
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [162]:
# Vocabulary: a dict keyed by token string (values are the token ids).
tokenizer_dict = tokenizer.get_vocab()

# Show only a small sample of entries; printing the whole mapping would emit
# every one of its entries and bury the results that follow.
print(list(tokenizer_dict.items())[:5])
print(type(tokenizer_dict))  # <class 'dict'>
print(len(tokenizer_dict))
# Membership tests check whether a whole token exists among the dict's keys.
print('love' in tokenizer_dict)
print('china' in tokenizer_dict)

<class 'dict'>
30522
True
True


In [163]:
# Two sample sentences that are encoded together as a single batch.
text = 'Concluding Statement - a concluding statement that restates the claims'
text1 = 'checkpoint of a model that you expect to be exactly'
# return_offsets_mapping=True adds an 'offset_mapping' entry to the result:
# one (start, end) pair per token, locating it in the original string.
s = tokenizer([text, text1], return_offsets_mapping=True)
s

{'input_ids': [[101, 16228, 4861, 1011, 1037, 16228, 4861, 2008, 2717, 8520, 1996, 4447, 102], [101, 26520, 1997, 1037, 2944, 2008, 2017, 5987, 2000, 2022, 3599, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 10), (11, 20), (21, 22), (23, 24), (25, 35), (36, 45), (46, 50), (51, 55), (55, 59), (60, 63), (64, 70), (0, 0)], [(0, 0), (0, 10), (11, 13), (14, 15), (16, 21), (22, 26), (27, 30), (31, 37), (38, 40), (41, 43), (44, 51), (0, 0)]]}

In [164]:
# Tokens for one item of the batch: the sub-parts of the input string after
# word/subword splitting and before conversion to integer indices.
# Only works for the output of a fast tokenizer.
s.tokens(batch_index=0)  # first sentence of the batch, spelled out explicitly

['[CLS]',
 'concluding',
 'statement',
 '-',
 'a',
 'concluding',
 'statement',
 'that',
 'rest',
 '##ates',
 'the',
 'claims',
 '[SEP]']

In [165]:
# Tokenize the raw string directly; unlike s.tokens(), the result carries no
# [CLS]/[SEP] markers.
tokenizer.tokenize(text)

['concluding',
 'statement',
 '-',
 'a',
 'concluding',
 'statement',
 'that',
 'rest',
 '##ates',
 'the',
 'claims']

In [166]:
s.tokens(batch_index=1)  # special tokens have been added

['[CLS]',
 'checkpoint',
 'of',
 'a',
 'model',
 'that',
 'you',
 'expect',
 'to',
 'be',
 'exactly',
 '[SEP]']

In [167]:
# Token span covering word 1 of batch item 0.
# NOTE(review): the span's end looks exclusive (start=2, end=3 for a single token) — confirm.
s.word_to_tokens(0, 1)

TokenSpan(start=2, end=3)

In [168]:
# For a fast tokenizer: a list mapping each token to the index of the word it
# came from in the initial sentence. Special tokens map to None.
s.word_ids(batch_index=1)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, None]

In [169]:
# A word split into several sub-tokens repeats its word id
# ('rest' and '##ates' both map to word 7).
q = s.word_ids()  # batch_index defaults to 0
q

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, None]

In [170]:
# Offsets of the first sentence: one (start, end) pair per token, so this list
# lines up element-for-element with the word-id list in `q`.
first_offsets = s['offset_mapping'][0]
print(first_offsets)
print(len(q), len(first_offsets))

[(0, 0), (0, 10), (11, 20), (21, 22), (23, 24), (25, 35), (36, 45), (46, 50), (51, 55), (55, 59), (60, 63), (64, 70), (0, 0)]
13 13
