In [1]:
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.processors import TemplateProcessing

In [3]:
# Tokenizer built on a word-level model, with "<unk>" configured as the unknown token.
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
# Whitespace pre-tokenizer: splits the input using the regex \w+|[^\w\s]+
tokenizer.pre_tokenizer = Whitespace()

# Train the vocabulary from the pre-training text file, registering the special tokens.
special_tokens = ["<unk>", "<s>", "</s>", "<pad>", "<mask>"]
train_files = ['../../data/csv_to_trainTxt/trainTxt_pretrain_model.txt']
trainer = WordLevelTrainer(special_tokens=special_tokens)
tokenizer.train(train_files, trainer)

# Post-processing template: a single sequence becomes "<s> A </s>"; in a pair,
# the second sequence and its closing "</s>" are tagged with type id 1.
template_special_tokens = [(tok, tokenizer.token_to_id(tok)) for tok in ("<s>", "</s>")]
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> </s> $B:1 </s>:1",
    special_tokens=template_special_tokens,
)

# Show the id assigned to the padding token.
pad_id = tokenizer.token_to_id('<pad>')
print(pad_id)

# Pad with "<pad>" (type id 0) and truncate with max_length=512.
tokenizer.enable_padding(pad_id=pad_id, pad_token="<pad>", pad_type_id=0)
tokenizer.enable_truncation(max_length=512)

3


In [4]:
# Sanity check: encode one raw (not pre-tokenized) sample and inspect the result.
sample_text = "5399 3117 1070 4321 4568 2621 5466 3772 4516 2990 3618 2456"
output_temp = tokenizer.encode(sequence=sample_text, is_pretokenized=False)

# Print the tokens, their ids, the type ids and the attention mask, one per line.
for field_name in ("tokens", "ids", "type_ids", "attention_mask"):
    print(getattr(output_temp, field_name))

['<s>', '5399', '3117', '1070', '4321', '4568', '2621', '5466', '3772', '4516', '2990', '3618', '2456', '</s>']
[1, 1575, 582, 156, 430, 537, 299, 918, 125, 54, 583, 109, 355, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [5]:
# Write the trained, configured tokenizer to a JSON file in the working directory.
tokenizer_path = "tokenizer.json"
tokenizer.save(tokenizer_path)