In [1]:
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import Whitespace

In [2]:
# Create an untrained BERT WordPiece tokenizer using the library defaults.
# The repr displayed by this cell shows the resulting configuration: an empty
# vocabulary (vocabulary_size=0), lowercase=True, and the standard BERT special
# tokens ([UNK], [SEP], [CLS], [PAD], [MASK]).
tokenizer = BertWordPieceTokenizer()
tokenizer

Tokenizer(vocabulary_size=0, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [3]:
# Replace the tokenizer's pre-tokenizer with `Whitespace`, which runs before the
# WordPiece model and decides how raw text is cut into word-level pieces.
# NOTE(review): BertWordPieceTokenizer presumably installs its own BERT-specific
# pre-tokenizer by default; this assignment overrides it — confirm that is intended.
tokenizer.pre_tokenizer = Whitespace()

In [4]:
# Build the list of raw WikiText-103 files (one per split) used for training.
path_template = "../extra_dataset/wikitext-103-raw/wiki.{split}.raw"
files = [path_template.format(split=name) for name in ("test", "train", "valid")]
files

['../extra_dataset/wikitext-103-raw/wiki.test.raw',
 '../extra_dataset/wikitext-103-raw/wiki.train.raw',
 '../extra_dataset/wikitext-103-raw/wiki.valid.raw']

In [5]:
# Training options, gathered in one place so the call itself stays short.
train_options = {
    # The size of the final vocabulary, including all tokens and alphabet.
    "vocab_size": 30000,  # default: 30000
    # The minimum frequency a pair should have in order to be merged.
    "min_frequency": 2,  # default: 2
    # A list of special tokens the model should know of.
    # default: ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    # The outputs further down show [UNK]=0, [CLS]=1, [SEP]=2, i.e. ids follow
    # the order given here rather than the default order.
    "special_tokens": ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
}

# Learn the WordPiece vocabulary from the raw text files listed in `files`.
tokenizer.train(files=files, **train_options)

In [6]:
# Round-trip the [SEP] special token through the trained vocabulary:
# token -> id, then id -> token. The id is looked up rather than hard-coded so
# the check stays valid if the `special_tokens` order used for training changes.
sep_id = tokenizer.token_to_id("[SEP]")
print(sep_id)
print(tokenizer.id_to_token(sep_id))

2
[SEP]


In [7]:
# Encode a sentence pair before any post-processor is attached, so the result
# contains only the tokens of the two inputs (no [CLS]/[SEP] are added yet).
first_sentence = "Hello, y'all! How are you 😁 ?"
second_sentence = "And, its pair"
output = tokenizer.encode(sequence=first_sentence, pair=second_sentence)
output

Encoding(num_tokens=15, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [8]:
# Show the encoding one field per line: token strings, vocabulary ids,
# generated type ids (0 = first sequence, 1 = pair) and the attention mask.
print(
    output.tokens,
    output.ids,
    output.type_ids,
    output.attention_mask,
    sep="\n",
)

['hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '[UNK]', '?', 'and', ',', 'its', 'pair']
[21784, 16, 67, 11, 1747, 5, 1935, 1691, 2413, 0, 35, 1549, 16, 1741, 4785]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
# Attach a BERT-style post-processor that wraps encodings in [CLS]/[SEP].
# In the templates, $A and $B stand for the first and second input sequence;
# the ":1" suffix assigns type id 1 to that piece (pieces without a suffix get 0).
# Each special token used in a template must be listed together with its id.
template_special_tokens = [
    (token, tokenizer.token_to_id(token)) for token in ("[CLS]", "[SEP]")
]
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=template_special_tokens,
)

In [10]:
# Encode the same sentence pair again, now that the template post-processor is
# attached; the encoding grows from 15 to 18 tokens because of [CLS] and two [SEP].
pair_inputs = {
    "sequence": "Hello, y'all! How are you 😁 ?",
    "pair": "And, its pair",
}
output_temp = tokenizer.encode(**pair_inputs)
output_temp

Encoding(num_tokens=18, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [11]:
# Show the post-processed encoding one field per line: token strings,
# vocabulary ids, generated type ids and the attention mask.
print(
    output_temp.tokens,
    output_temp.ids,
    output_temp.type_ids,
    output_temp.attention_mask,
    sep="\n",
)

['[CLS]', 'hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '[UNK]', '?', '[SEP]', 'and', ',', 'its', 'pair', '[SEP]']
[1, 21784, 16, 67, 11, 1747, 5, 1935, 1691, 2413, 0, 35, 2, 1549, 16, 1741, 4785, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
