This notebook shows how to train your own WordPiece tokenizer.

In [1]:
import time
from glob import glob
from datasets import load_dataset
from transformers import AutoTokenizer, BertTokenizerFast
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer



In [2]:
# Collect the training corpus files. glob() returns paths in arbitrary,
# filesystem-dependent order, so sort them to keep the input list reproducible.
files_list = sorted(glob('openwebtext_new/*'))

# Fail fast: an empty list means the corpus directory is missing or empty,
# and training would otherwise run on no data at all.
if not files_list:
    raise FileNotFoundError("No training files matched 'openwebtext_new/*'")

# set up WordPiece

In [3]:
# Build a tokenizer around a WordPiece model; no vocabulary is passed here,
# it is learned later by the trainer. The keyword must be spelled `unk_token`
# so that the unknown-token setting is actually applied to the model.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [4]:
# BERT-style text normalization with lowercasing switched on.
bert_normalizer = normalizers.BertNormalizer(lowercase=True)
tokenizer.normalizer = bert_normalizer

In [5]:
# Use the BERT pre-tokenizer to split raw text before the WordPiece model runs.
bert_pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.pre_tokenizer = bert_pre_tokenizer

In [6]:
# Special tokens handed to the trainer alongside the learned vocabulary.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]

# WordPiece trainer targeting a 30,000-entry vocabulary.
trainer = trainers.WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=30000,
)

# start training

In [7]:
# Train the tokenizer on the corpus files and report the elapsed time.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during a long training run.
start = time.perf_counter()
tokenizer.train(files_list, trainer=trainer)
end = time.perf_counter()
print(f'Time to train: {end - start:0.2f}s')

Time to train: 1610.65s


In [8]:
# Look up the vocabulary ids of the [CLS] and [SEP] tokens in the trained
# tokenizer; they are needed to configure the post-processor.
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(marker) for marker in ("[CLS]", "[SEP]")
)
print(cls_token_id, sep_token_id)

2 3


In [9]:
# Wrap encoded sequences in BERT-style markers:
#   single sentence -> [CLS] A [SEP]
#   sentence pair   -> [CLS] A [SEP] B [SEP]
# The number after each colon is the type id assigned to that piece.
# The templates are plain string literals: they contain no placeholders,
# so no f-string prefix is needed.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [10]:
# WordPiece decoder configured with the "##" prefix.
wordpiece_decoder = decoders.WordPiece(prefix="##")
tokenizer.decoder = wordpiece_decoder

In [11]:
# Wrap the trained tokenizer in a transformers BertTokenizerFast.
# BertTokenizerFast is already imported in the notebook's import cell,
# so it is not re-imported here.
new_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [12]:
# Save the wrapped tokenizer to a directory; the call is left as the cell's
# last expression so its return value is displayed as the cell output.
output_dir = "openwebtext_tokenizer_new"
new_tokenizer.save_pretrained(output_dir)

('/project/Meetup_20230717/openwebtext_tokenizer_new/tokenizer_config.json',
 '/project/Meetup_20230717/openwebtext_tokenizer_new/special_tokens_map.json',
 '/project/Meetup_20230717/openwebtext_tokenizer_new/vocab.txt',
 '/project/Meetup_20230717/openwebtext_tokenizer_new/added_tokens.json',
 '/project/Meetup_20230717/openwebtext_tokenizer_new/tokenizer.json')