In [None]:
# Tokenizer

In [None]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import os
from logging import getLogger
from typing import List

from sentencepiece import SentencePieceProcessor


logger = getLogger()


class Tokenizer:
    """Thin wrapper around a SentencePiece model for text <-> token-ID conversion.

    Attributes set by ``__init__``:
        sp_model: the loaded ``SentencePieceProcessor``.
        n_words: vocabulary size reported by the model.
        bos_id: ID of the beginning-of-sequence token.
        eos_id: ID of the end-of-sequence token.
        pad_id: ID of the padding token, as reported by the model.
    """

    def __init__(self, model_path: str):
        """Load the SentencePiece model stored at ``model_path``.

        Args:
            model_path: path to a SentencePiece model file.

        Raises:
            AssertionError: if ``model_path`` is not an existing file, or if
                the model's ``vocab_size()`` and ``get_piece_size()`` disagree.
        """
        # Fail early, echoing the offending path, when it is not a regular file.
        # NOTE: these are plain asserts and are skipped under ``python -O``.
        assert os.path.isfile(model_path), model_path
        # Load the tokenizer from the SentencePiece model file.
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        logger.info(f"Reloaded SentencePiece model from {model_path}")

        # Cache the vocabulary size and the special-token IDs
        # (BOS = beginning of sequence, EOS = end of sequence, PAD = padding).
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
        # Sanity check: both size accessors must agree; a mismatch may indicate
        # that something went wrong while loading the model.
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Convert text into a list of token IDs.

        Args:
            s: the text to encode; must be a ``str`` (subclasses are accepted).
            bos: when true, prepend the beginning-of-sequence ID.
            eos: when true, append the end-of-sequence ID.

        Returns:
            The token IDs produced by the SentencePiece model, optionally
            wrapped with the BOS and/or EOS IDs.

        Raises:
            AssertionError: if ``s`` is not a string.
        """
        # isinstance (rather than an exact ``type(s) is str`` comparison) so
        # that str subclasses are not rejected.
        assert isinstance(s, str)
        # Let the SentencePiece model turn the string into token IDs.
        t = self.sp_model.encode(s)
        if bos:
            # Prepend the beginning-of-sequence marker.
            t = [self.bos_id] + t
        if eos:
            # Append the end-of-sequence marker.
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Convert a list of token IDs back into text.

        Args:
            t: token IDs to decode.

        Returns:
            The text produced by the SentencePiece model for ``t``.
        """
        return self.sp_model.decode(t)
