In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Training split of the raw WikiText-2 corpus; its "text" column feeds the tokenizer trainer.
dataset = load_dataset(
    "wikitext",
    name="wikitext-2-raw-v1",
    split="train",
)


def get_training_corpus(batch_size=1000, data=None):
    """Yield the corpus as successive batches of raw text.

    Args:
        batch_size: Number of rows per yielded batch (the final batch may be
            shorter). Must be a positive integer.
        data: Object supporting ``len()`` and slice indexing whose slices
            expose a ``"text"`` entry. Defaults to the notebook-level
            ``dataset``.

    Yields:
        The ``"text"`` entry of each consecutive ``batch_size``-row slice.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration
            starts, since this is a generator).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    # Resolve the default lazily so the notebook-level `dataset` is only
    # needed when no explicit source is supplied.
    source = dataset if data is None else data
    for start in range(0, len(source), batch_size):
        yield source[start : start + batch_size]["text"]

## BERT

### Building a WordPiece tokenizer from scratch

In [3]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Wrap an empty (untrained) WordPiece model, with "[UNK]" as its unknown token.
tokenizer = Tokenizer(
    models.WordPiece(unk_token="[UNK]"),
)
# Bare last expression: the notebook displays the tokenizer's repr.
tokenizer

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=WordPiece(unk_token="[UNK]", continuing_subword_prefix="##", max_input_chars_per_word=100, vocab={}))

In [4]:
# Two ways to set up a BERT-style normalizer. They behave alike on the sample below,
# but not identically in general: BertNormalizer also runs its text clean-up step
# (`clean_text=True` by default), which the hand-built Sequence does not.

# tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# Accented sample: accents are stripped and case is folded, giving "hello how are u?".
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


We can see that the results of normalizers.Sequence() and BertNormalizer differ slightly in some cases (compare the next two cells),  
but we don't want to deal with this complication here.

In [5]:
# BertNormalizer on a sample containing two stacked combining tildes (U+0303),
# a run of spaces, a tab and a zero-width space (U+200B).
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
print(tokenizer.normalizer.normalize_str("N\u0303\u0303    Hello\tWorld\u200b"))

n    hello world


In [6]:
# Same sample through the hand-built NFD -> Lowercase -> StripAccents pipeline.
# This is the normalizer left in place for the rest of the notebook.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# The sample holds two stacked combining tildes (U+0303), a run of spaces,
# a tab and a zero-width space (U+200B).
print(tokenizer.normalizer.normalize_str("N\u0303\u0303    Hello\tWorld\u200b"))

n    hello	worldâ€‹


In [7]:
# A comparable pre-tokenizer can be assembled explicitly (kept for reference, not run):
# pre_tokenizer = pre_tokenizers.Sequence(
#    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
# )

# Split on whitespace and punctuation, then show the pieces with their character offsets.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.pre_tokenizer.pre_tokenize_str(
    "Let's test my pre-tokenizer."
)

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [8]:
# Every special token we intend to use must be listed explicitly: none of them
# appears in the training corpus, so the trainer won't add them to the vocabulary
# on its own.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]
trainer = trainers.WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=25000,
)
# Bare last expression: display the trainer configuration.
trainer

WordPieceTrainer(WordPieceTrainer(bpe_trainer=BpeTrainer(min_frequency=0, vocab_size=25000, show_progress=True, special_tokens=[AddedToken(content="[UNK]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True), AddedToken(content="[PAD]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True), AddedToken(content="[CLS]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True), AddedToken(content="[SEP]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True), AddedToken(content="[MASK]", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True)], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix="##", end_of_word_suffix=None, max_token_length=None, words={})))

In [9]:
# Fit the WordPiece vocabulary on the batched text produced by the corpus generator.
tokenizer.train_from_iterator(
    get_training_corpus(),
    trainer=trainer,
)

In [15]:
# Sanity-check the trained tokenizer on a short sentence.
encoding = tokenizer.encode("Let's test this tokenizer.")
# Print the Encoding summary, then its tokens, each on its own line.
print(encoding, encoding.tokens, sep="\n")

Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']


In [17]:
# Look up the ids the trained vocabulary assigned to the BERT delimiter tokens.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

# token_to_id returns None for a token missing from the vocabulary; fail here
# with a clear message rather than inside TemplateProcessing.
if cls_token_id is None or sep_token_id is None:
    raise ValueError(
        "[CLS] and [SEP] must be in the vocabulary; "
        "train the tokenizer with them as special tokens first"
    )

# Template syntax: `$A` / `$B` stand for the first / second sequence and `:N` is the
# token type id. Plain strings suffice because the templates have no placeholders.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

2 3


#### post-processing

Add the special tokens ([CLS], [SEP], ...) around each encoded sequence.

In [None]:
# Resolve both delimiter tokens to their vocabulary ids in one pass.
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("[CLS]", "[SEP]")
)

In [None]:
# The special tokens' ids must be passed along so the post-processor can map the
# "[CLS]" / "[SEP]" entries of the templates to vocabulary ids.
# token_to_id returns None for a token missing from the vocabulary; fail here
# with a clear message rather than inside TemplateProcessing.
if cls_token_id is None or sep_token_id is None:
    raise ValueError(
        "[CLS] and [SEP] must be in the vocabulary; "
        "train the tokenizer with them as special tokens first"
    )

# Template syntax: `$A` / `$B` stand for the first / second sequence and `:N` is the
# token type id. Plain strings suffice because the templates have no placeholders.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)
# Bare last expression: display the configured post-processor.
tokenizer.post_processor

TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0), Sequence(id=B, type_id=1), SpecialToken(id="[SEP]", type_id=1)], special_tokens={"[CLS]":SpecialToken(id="[CLS]", ids=[2], tokens=["[CLS]"]), "[SEP]":SpecialToken(id="[SEP]", ids=[3], tokens=["[SEP]"])})

In [20]:
# Encode the same sentence again; with the post-processor attached, the tokens
# are now wrapped in [CLS] ... [SEP].
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


### Wrap the trained tokenizer for transformer training

To use this tokenizer in 🤗 Transformers, we have to wrap it in a PreTrainedTokenizerFast. We can either use the generic class or, if our tokenizer corresponds to an existing model, use that class (here, BertTokenizerFast). If you apply this lesson to build a brand new tokenizer, you will have to use the first option.

In [None]:
# A brand-new tokenizer has no model-specific wrapper, so use the generic fast class.
from transformers import PreTrainedTokenizerFast

# The generic class has to be told about every special token up front.
wrapped_tokenizer = PreTrainedTokenizerFast(
    # Alternatively, load from a saved file instead of the in-memory object:
    # tokenizer_file="tokenizer.json",
    tokenizer_object=tokenizer,
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    pad_token="[PAD]",
    unk_token="[UNK]",
)

ModuleNotFoundError: No module named 'transformers'

In [None]:
# When using a model-specific tokenizer class (like BertTokenizerFast), we only need
# to specify the special tokens that differ from that class's defaults (here, none).
from transformers import BertTokenizerFast

wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

- We can then use this tokenizer like any other 🤗 Transformers tokenizer.
- We can save it with the save_pretrained() method, or upload it to the Hub with the push_to_hub() method.