In [1]:
from transformers import AutoTokenizer, Qwen2TokenizerFast

In [2]:
# This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when created with the AutoTokenizer.from_pretrained() class method.
# This class cannot be instantiated directly using __init__() (throws an error).
# tokenizer_llm = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # 加载预训练模型分词器(类方法)
tokenizer_llm = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # 与上等价
tokenizer_llm.init_kwargs

{'vocab_file': 'C:\\Users\\duanm\\.cache\\huggingface\\hub\\models--Qwen--Qwen2.5-7B-Instruct\\snapshots\\a09a35458c702b33eeacc393d103063234e8bc28\\vocab.json',
 'merges_file': 'C:\\Users\\duanm\\.cache\\huggingface\\hub\\models--Qwen--Qwen2.5-7B-Instruct\\snapshots\\a09a35458c702b33eeacc393d103063234e8bc28\\merges.txt',
 'unk_token': None,
 'bos_token': None,
 'eos_token': AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'pad_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'add_bos_token': False,
 'add_prefix_space': False,
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>'],
 'chat_template': '{%- if tools %}\n    {{- \'<|im_star

In [3]:
# 词汇量
print(tokenizer_llm.vocab_size)

# model’s maximum context size
print(tokenizer_llm.model_max_length)

# 模型前向传递(forward)中需要输入的字段名称
print(tokenizer_llm.model_input_names)

151643
131072
['input_ids', 'attention_mask']


In [4]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
tokenizer.init_kwargs

{'do_lower_case': False,
 'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]',
 'tokenize_chinese_chars': True,
 'strip_accents': None,
 'model_max_length': 512,
 'name_or_path': 'bert-base-chinese'}

In [5]:
print(tokenizer.vocab_size)
print(tokenizer.model_max_length)
print(tokenizer.model_input_names)

21128
512
['input_ids', 'token_type_ids', 'attention_mask']


In [6]:
# 特殊符号
print(tokenizer.bos_token, tokenizer.bos_token_id)
print(tokenizer.eos_token, tokenizer.eos_token_id)
print(tokenizer.unk_token, tokenizer.unk_token_id)
print(tokenizer.sep_token, tokenizer.sep_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.cls_token, tokenizer.cls_token_id)
print(tokenizer.mask_token, tokenizer.mask_token_id)
print(tokenizer.all_special_tokens, tokenizer.all_special_ids)
print(tokenizer.additional_special_tokens, tokenizer.additional_special_tokens_ids)
print(tokenizer.special_tokens_map)

None None
None None
[UNK] 100
[SEP] 102
[PAD] 0
[CLS] 101
[MASK] 103
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'] [100, 102, 0, 101, 103]
[] []
{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


In [7]:
# 词典
tokenizer_dict = tokenizer.get_vocab()

print(tokenizer_dict)
print(type(tokenizer_dict))  # <class 'dict'>
print(len(tokenizer_dict))
print('love' in tokenizer_dict)
print('china' in tokenizer_dict)

{'晝': 3242, '羈': 5398, '铝': 7199, '箔': 5048, '##攞': 16167, '娃': 2015, '##囹': 14799, '##萊': 18901, '##预': 20621, '骼': 7761, '##畴': 17590, '♣': 490, '＊': 8022, '呂': 1436, '##堵': 14900, '枰': 3370, '##浔': 16907, '琲': 4431, 'xz': 12999, '媽': 2061, '歴': 3643, 'plc': 10015, '酩': 6990, '蕁': 5930, '罪': 5389, '266': 9674, '##ャ': 13695, '##抜': 15895, 'h1': 12333, '##壓': 14943, '##扛': 15863, '##隈': 20442, '##函': 14198, '310': 9643, 'ganji': 9055, 'video': 9539, 'jan': 9213, '354': 11848, '##凍': 14177, 'visa': 8958, '##ft': 9002, '##熄': 17276, '##97': 9410, '荼': 5795, '搭': 3022, '総': 5217, '视': 6228, 'rick': 13253, 'mount': 12881, '##cing': 11540, '##☕': 13620, '徐': 2528, '账': 6572, '鏈': 7122, '拈': 2860, '##离': 17952, '払': 2804, '氹': 3720, 'tcp': 9901, '1b': 12217, '##喷': 14670, 'a7': 11226, '屿': 2257, '##伐': 13884, 'al': 9266, '006': 12526, '##剋': 14239, '##『': 13656, '夜': 1915, '苷': 5740, '赃': 6597, 'apk': 8543, '鈎': 7044, '下': 678, '課': 6307, '##ube': 10957, '##糟': 18193, '钏': 7155, '##sion': 92