分词就是将句子、段落、文章这种长文本，分解为以字词为单位的数据结构，方便后续的处理分析工作。

In [None]:
import re  # 基于正则表达式进行分词


class Tokenizer:
    """Regex-based word-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace; every
    resulting piece is looked up in the vocabulary, and pieces that are not
    in the vocabulary map to the id of ``unk_token``.  Occurrences of
    ``endof_token`` in the input are always emitted as their own token, even
    when they are not separated from the surrounding words by whitespace.

    Args:
        vocab: Mapping from token string to integer id.  It must contain
            both ``unk_token`` and ``endof_token``.
        unk_token: Token whose id stands in for out-of-vocabulary pieces.
        endof_token: Marker that separates independent texts.

    Raises:
        ValueError: If ``vocab`` lacks ``unk_token`` or ``endof_token``.
    """

    # The capture group keeps the delimiters in the split result, so each
    # punctuation mark becomes a token of its own.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that `" ".join` places in front of punctuation when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?_!"()\'])')

    def __init__(self, vocab, unk_token="<|unk|>", endof_token="<|endoftext|>"):
        if vocab.get(unk_token) is None:
            raise ValueError(f"Vocabulary must contain unk_token '{unk_token}'")
        if vocab.get(endof_token) is None:
            raise ValueError(f"Vocabulary must contain endof_token '{endof_token}'")

        self._unk_token = unk_token
        self._endof_token = endof_token
        self._str_to_int = vocab
        self._int_to_str = {v: k for k, v in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            One integer id per token; unknown tokens use the ``unk_token`` id.
        """
        # Cut the text at the end-of-text marker first.  The marker contains
        # none of the delimiter characters, so without this step a marker
        # glued to a neighbouring word would stay attached to it and the
        # whole chunk would be encoded as a single unknown token.
        if self._endof_token:
            segments = text.split(self._endof_token)
        else:
            # str.split rejects an empty separator; nothing to cut on.
            segments = [text]

        pieces = []
        for position, segment in enumerate(segments):
            if position:
                # Re-insert the marker that separated this segment from the
                # previous one.
                pieces.append(self._endof_token)
            for piece in self._SPLIT_PATTERN.split(segment):
                piece = piece.strip()
                if piece:
                    pieces.append(piece)

        unk_id = self._str_to_int[self._unk_token]
        return [self._str_to_int.get(piece, unk_id) for piece in pieces]

    def decode(self, tokens):
        """Convert token ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation marks is removed.

        Args:
            tokens: Iterable of integer token ids.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self._int_to_str[token] for token in tokens)
        return self._SPACE_BEFORE_PUNCT.sub(r"\1", text)

我们使用 `the-verdict.txt` 这个文本文件来构建词表，并用该词表实例化一个分词器。

In [None]:
# Load the raw text that the vocabulary is built from.
with open("asserts/the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total length of the text:", len(raw_text))

# Split on punctuation, double dashes and whitespace, keeping the punctuation
# as separate tokens and dropping pieces that are empty after stripping.
preprocessed = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
    if piece
]
print("Total length of the preprocessed text:", len(preprocessed))

# Add the two special tokens, then number the unique tokens in sorted order.
preprocessed += ["<|unk|>", "<|endoftext|>"]
vocab = {token: index for index, token in enumerate(sorted({*preprocessed}))}
print("Total length of the vocabulary:", len(vocab))

In [None]:
# Round-trip two sample texts, joined by the end-of-text marker, through the
# vocabulary-based tokenizer.
tokenizer = Tokenizer(vocab)

text1 = "Hello, do you like tea"
text2 = (
    '"It\'s the last he painted, you know, "\n'
    "Mrs. Gisburn said with pardonable pride."
)

ids = tokenizer.encode(f"{text1}<|endoftext|>{text2}")
print(ids)

text = tokenizer.decode(ids)
print(text)

接下来我们使用 tiktoken 库提供的 GPT-2 tokenizer 来进行分词。

In [None]:
import tiktoken

# Round-trip the same two sample texts through the "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

text1 = "Hello, do you like tea"
text2 = (
    '"It\'s the last he painted, you know, "\n'
    "Mrs. Gisburn said with pardonable pride."
)

# The end-of-text marker has to be listed in `allowed_special` for this call.
joined = f"{text1}<|endoftext|>{text2}"
idx = tokenizer.encode(joined, allowed_special={"<|endoftext|>"})
print(idx)

text = tokenizer.decode(idx)
print(text)