In [20]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

## 从头训练分词器

### Models

Models are the core algorithms used to actually tokenize, and therefore, they are the only mandatory component of a Tokenizer.

| Name      | Description                                                  |
| --------- | ------------------------------------------------------------ |
| WordLevel | This is the “classic” tokenization algorithm. It lets you simply map words to IDs without anything fancy. This has the advantage of being really simple to use and understand, but it requires extremely large vocabularies for good coverage. Using this `Model` requires the use of a `PreTokenizer`. No choice will be made by this model directly, it simply maps input tokens to IDs. |
| BPE       | One of the most popular subword tokenization algorithms. The Byte-Pair-Encoding works by starting with characters, while merging those that are the most frequently seen together, thus creating new tokens. It then works iteratively to build new tokens out of the most frequent pairs it sees in a corpus. BPE is able to build words it has never seen by using multiple subword tokens, and thus requires smaller vocabularies, with fewer chances of having “unk” (unknown) tokens. |
| WordPiece | This is a subword tokenization algorithm quite similar to BPE, used mainly by Google in models like BERT. It uses a greedy algorithm that tries to build long words first, splitting into multiple tokens when entire words don’t exist in the vocabulary. This is different from BPE, which starts from characters and builds bigger tokens where possible. It uses the famous `##` prefix to identify tokens that are part of a word (i.e. not starting a word). |
| Unigram   | Unigram is also a subword tokenization algorithm, and works by trying to identify the best set of subword tokens to maximize the probability for a given sentence. This is different from BPE in the way that this is not deterministic based on a set of rules applied sequentially. Instead Unigram will be able to compute multiple ways of tokenizing, while choosing the most probable one. |

In [21]:
# Core model: Byte-Pair Encoding, with "[UNK]" configured as the unknown token.
model_BPE = BPE(unk_token="[UNK]")
model_BPE

<tokenizers.models.BPE at 0x138aa2b9630>

In [22]:
# A Tokenizer works as a pipeline: raw text goes in, an Encoding comes out.
# The model passed here is the core algorithm the pipeline uses.
tokenizer = Tokenizer(model=model_BPE)
tokenizer

<tokenizers.Tokenizer at 0x138b3b8fad0>

### Pre-tokenizers

The PreTokenizer takes care of splitting the input according to a set of rules. This pre-processing lets you ensure that the underlying Model does not build tokens across multiple “splits”. For example if you don’t want to have whitespaces inside a token, then you can have a PreTokenizer that splits on these whitespaces.

In [23]:
# Attach the pre-tokenizer that splits the input before the model sees it.
# Whitespace simply splits using the regex `\w+|[^\w\s]+`
# (a run of word characters, or a run of characters that are neither word characters nor whitespace).
tokenizer.pre_tokenizer = Whitespace()

In [24]:
# Configure the trainer used to train the BPE model.
trainer = BpeTrainer(
    vocab_size=30000,  # size of the final vocabulary, including all tokens and alphabet (default: 30000)
    min_frequency=0,  # minimum frequency a pair must have in order to be merged (default: 0)
    # Special tokens the model should know of (default: []).
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
trainer

<tokenizers.trainers.BpeTrainer at 0x138b0a468b0>

In [25]:
# Relative paths of the three raw WikiText-103 splits used as the training corpus.
files = [
    f"wikitext-103-raw/wiki.{split}.raw"
    for split in ("test", "train", "valid")
]
files

['wikitext-103-raw/wiki.test.raw',
 'wikitext-103-raw/wiki.train.raw',
 'wikitext-103-raw/wiki.valid.raw']

In [26]:
# Train the Tokenizer on the listed corpus files, using the trainer configured earlier.
tokenizer.train(files=files, trainer=trainer)
tokenizer

<tokenizers.Tokenizer at 0x138b3b8fad0>

In [27]:
# Map a special token to its ID, then map an ID back to its token (one result per line).
print(tokenizer.token_to_id("[SEP]"), tokenizer.id_to_token(2), sep="\n")

2
[SEP]


In [28]:
# Encode a single raw sentence. The input has not been split into words,
# hence is_pretokenized=False; encode() can also take already pre-tokenized sequences.
output = tokenizer.encode(sequence="Hello, y'all! How are you 😁 ?", is_pretokenized=False)
output

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [29]:
# Print the encoding's fields, one per line:
# tokens, ids, the generated type IDs, and the attention mask.
print(
    output.tokens,
    output.ids,
    output.type_ids,
    output.attention_mask,
    sep="\n",
)

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']
[27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [30]:
# Encode a sequence together with its pair. Both inputs are already split
# into words, so is_pretokenized=True.
output_pair = tokenizer.encode(
    sequence=["A", "pre", "tokenized", "sequence"],
    pair=["And", "its", "pair"],
    is_pretokenized=True,
)
print(
    output_pair.tokens,
    output_pair.ids,
    output_pair.type_ids,
    output_pair.attention_mask,
    sep="\n",
)

['A', 'pre', 'to', 'ken', 'ized', 'sequence', 'And', 'its', 'pair']
[37, 5262, 5030, 7359, 5759, 8606, 6187, 5181, 6703]
[0, 0, 0, 0, 0, 0, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1]


### Post-Processors

After the whole pipeline, we sometimes want to insert some special tokens before feeding a tokenized string into a model, for example “[CLS] My horse is amazing [SEP]”. The PostProcessor is the component doing just that.

In [31]:
# TemplateProcessing adds special tokens to each input according to a template.
# This follows the BERT convention: [CLS] opens the input and [SEP] closes the
# first sequence and, when present, the pair sequence.
# In a template, $A is the first sequence and $B the second. A ":1" suffix sets
# the type ID of that piece to 1; pieces without a suffix keep the default type
# ID 0 (which is why there is no "$A:0").
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",  # template for a single sequence
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",  # template when both sequences are given
    # The special tokens used in the templates, each paired with its vocabulary ID.
    special_tokens=[
        (token, tokenizer.token_to_id(token)) for token in ("[CLS]", "[SEP]")
    ],
)

In [32]:
# Encode the same raw (not pre-tokenized) sentence again, now that a
# post-processor is attached to the tokenizer.
output_temp = tokenizer.encode(sequence="Hello, y'all! How are you 😁 ?", is_pretokenized=False)

print(
    output_temp.tokens,
    output_temp.ids,
    output_temp.type_ids,
    output_temp.attention_mask,
    sep="\n",
)

['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?', '[SEP]']
[1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [33]:
# Encode the pre-tokenized sequence pair again, now that a post-processor
# is attached to the tokenizer.
output_pair_temp = tokenizer.encode(
    sequence=["A", "pre", "tokenized", "sequence"],
    pair=["And", "its", "pair"],
    is_pretokenized=True,  # the inputs are already split into words
)

print(
    output_pair_temp.tokens,
    output_pair_temp.ids,
    output_pair_temp.type_ids,
    output_pair_temp.attention_mask,
    sep="\n",
)

['[CLS]', 'A', 'pre', 'to', 'ken', 'ized', 'sequence', '[SEP]', 'And', 'its', 'pair', '[SEP]']
[1, 37, 5262, 5030, 7359, 5759, 8606, 2, 6187, 5181, 6703, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [34]:
# Encode two sentences in a single call and show the resulting list of encodings.
output_batch = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
print(output_batch, end="\n\n")

# First sentence (followed by a blank line)
print(
    output_batch[0].tokens,
    output_batch[0].ids,
    output_batch[0].type_ids,
    output_batch[0].attention_mask,
    sep="\n",
    end="\n\n",
)

# Second sentence
print(
    output_batch[1].tokens,
    output_batch[1].ids,
    output_batch[1].type_ids,
    output_batch[1].attention_mask,
    sep="\n",
)

[Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', '[SEP]']
[1, 27253, 16, 93, 11, 5097, 5, 2]
[0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1]

['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]']
[1, 7961, 5112, 6218, 0, 35, 2]
[0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1]


In [35]:
# Turn on padding so that batch-encoded sequences come out at equal length.
# The pad ID is looked up from the vocabulary rather than hard-coded, so it
# stays correct if the trainer's special-token list (and thus the ID
# assignment) ever changes.
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]",
    pad_type_id=0,  # type ID given to the padding positions
)

In [36]:
# Encode the same two sentences again, this time with padding enabled.
output_batch_padding = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
print(output_batch_padding, end="\n\n")

# First sentence (followed by a blank line)
print(
    output_batch_padding[0].tokens,
    output_batch_padding[0].ids,
    output_batch_padding[0].type_ids,
    output_batch_padding[0].attention_mask,
    sep="\n",
    end="\n\n",
)

# Second sentence: its encoding has the same length as the first one,
# filled up with '[PAD]'. The attention mask is 0 on the padded part.
print(
    output_batch_padding[1].tokens,
    output_batch_padding[1].ids,
    output_batch_padding[1].type_ids,
    output_batch_padding[1].attention_mask,
    sep="\n",
)

[Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', '[SEP]']
[1, 27253, 16, 93, 11, 5097, 5, 2]
[0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1]

['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]']
[1, 7961, 5112, 6218, 0, 35, 2, 3]
[0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 0]


In [37]:
# Save the trained Tokenizer to the file at the given path so it can be reloaded later.
tokenizer.save("tokenizer-wiki.json")

In [38]:
# Instantiate a new Tokenizer from the file written by tokenizer.save(),
# then display the loaded object.
tokenizer_load = Tokenizer.from_file("tokenizer-wiki.json")
tokenizer_load

<tokenizers.Tokenizer at 0x138ac6614e0>