In [None]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from tokenizers import processors

In [2]:
# Baseline: load the checkpoint's stock tokenizer, requesting no BOS/EOS insertion,
# and show the tokens produced for a short sentence.
# The path is a raw string so the Windows backslashes are kept literally instead of
# being parsed as (invalid) escape sequences such as "\h" and "\Q".
tokenizer_m0 = AutoTokenizer.from_pretrained(
    r"E:\huggingface_models\Qwen2.5-0.5B-Instruct",
    add_bos_token=False,
    add_eos_token=False,
)
print(tokenizer_m0("i love you!").tokens())

['i', 'Ġlove', 'Ġyou', '!']


In [None]:
class CustomTokenizer(PreTrainedTokenizerFast):  # must inherit from PreTrainedTokenizerFast
    """Custom fast tokenizer that can wrap each sequence in BOS/EOS tokens.

    The wrapping is implemented by installing a ``TemplateProcessing``
    post-processor on the backend tokenizer, built from the ``add_bos_token`` /
    ``add_eos_token`` flags.

    Args:
        bos_token: Token used as beginning-of-sequence marker.
        add_bos_token: If True, prepend ``bos_token`` to every sequence.
        add_eos_token: If True, append ``eos_token`` to every sequence.
        **kwargs: Forwarded unchanged to ``PreTrainedTokenizerFast.__init__``.
    """

    def __init__(
        self,
        bos_token="<|im_start|>",
        add_bos_token=True,
        add_eos_token=False,
        **kwargs
    ):
        super().__init__(
            bos_token=bos_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            **kwargs,
        )
        # Keep the flags on the instance; update_post_processor() reads them.
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()

    def update_post_processor(self):
        """Rebuild the backend post-processor from the current BOS/EOS flags.

        Raises:
            ValueError: If a token is requested (``add_bos_token`` /
                ``add_eos_token`` is True) but the corresponding special
                token is not set on the tokenizer.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self._add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        # Use the private flag set in __init__, consistent with the BOS check above.
        if eos is None and self._add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        # Each template fragment is emitted only when its own flag is set, so the
        # BOS prefix depends on add_bos_token and the EOS suffix on add_eos_token.
        # The ":0" / ":1" suffixes are the type ids of the first / second sequence.
        bos_first = f"{bos}:0 " if self._add_bos_token else ""
        eos_first = f" {eos}:0" if self._add_eos_token else ""
        bos_second = f" {bos}:1" if self._add_bos_token else ""
        eos_second = f" {eos}:1" if self._add_eos_token else ""

        single = f"{bos_first}$A:0{eos_first}"
        pair = f"{single}{bos_second} $B:1{eos_second}"

        # Only tokens that actually appear in the templates are registered.
        special_tokens = []
        if self._add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self._add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

In [4]:
# Load the same checkpoint through CustomTokenizer with BOS and EOS insertion enabled
# and show the resulting tokens. transformers warns here that the checkpoint's
# tokenizer class ('Qwen2Tokenizer') differs from the calling class.
# The path is a raw string so the Windows backslashes are kept literally instead of
# being parsed as (invalid) escape sequences such as "\h" and "\Q".
tokenizer_m1 = CustomTokenizer.from_pretrained(
    r"E:\huggingface_models\Qwen2.5-0.5B-Instruct",
    add_bos_token=True,
    add_eos_token=True,
    bos_token="<|im_start|>",
)
print(tokenizer_m1("i love you!").tokens())

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. 
The class this function is called from is 'CustomTokenizer'.


['<|im_start|>', 'i', 'Ġlove', 'Ġyou', '!', '<|im_end|>']
