In [2]:
from tokenizers import Tokenizer

# Load a previously serialized tokenizer from its JSON file; the rest of the
# notebook customises this object's pipeline components.
tokenizer = Tokenizer.from_file(
    "./tokenizer-wiki.json"
)

In [7]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents


# Chain two normalization steps: NFD Unicode decomposition followed by
# accent stripping. The steps run in the order listed.
normalizer = normalizers.Sequence(
    [
        NFD(),
        StripAccents(),
    ]
)

# Attach the normalizer to the tokenizer loaded earlier in the notebook.
tokenizer.normalizer = normalizer

# Try the normalizer on its own; as the last expression in the cell, the
# normalized string is what gets displayed.
normalizer.normalize_str("Héllò hôw are ü?")

In [8]:
from tokenizers.pre_tokenizers import Whitespace
# Stand-alone Whitespace pre-tokenizer, kept in a notebook-level name because
# later cells refer to `pre_tokenizer`.
pre_tokenizer = Whitespace()

# Each result is a (piece, (start, end)) pair, where the offsets index into
# the input string.
pre_tokenizer.pre_tokenize_str(
    "Hello! How are you? I'm fine, thank you."
)

[('Hello', (0, 5)),
 ('!', (5, 6)),
 ('How', (7, 10)),
 ('are', (11, 14)),
 ('you', (15, 18)),
 ('?', (18, 19)),
 ('I', (20, 21)),
 ("'", (21, 22)),
 ('m', (22, 23)),
 ('fine', (24, 28)),
 (',', (28, 29)),
 ('thank', (30, 35)),
 ('you', (36, 39)),
 ('.', (39, 40))]

In [10]:
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Digits

# Build the combined pre-tokenizer first: split on whitespace/punctuation,
# then split every digit into its own piece.
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])

# Attach it only after it has been built. Assigning before the line above
# would attach whatever `pre_tokenizer` was left over from an earlier cell
# (the plain Whitespace one on a fresh kernel), making the cell's effect
# depend on how many times it has been run.
tokenizer.pre_tokenizer = pre_tokenizer

# Last expression of the cell, so the pre-tokenized pieces are displayed.
pre_tokenizer.pre_tokenize_str("Call 911!")

[('Call', (0, 4)), ('9', (5, 6)), ('1', (6, 7)), ('1', (7, 8)), ('!', (8, 9))]

In [14]:
from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents

# Look the special-token ids up in the loaded vocabulary instead of
# hard-coding them, so the template cannot silently disagree with the vocab.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")
# token_to_id returns None for tokens that are not in the vocabulary.
assert cls_id is not None and sep_id is not None, "tokenizer vocabulary lacks [CLS]/[SEP]"

# BERT-style template: wrap a single sentence as [CLS] A [SEP]; for a pair,
# the second sentence and its trailing [SEP] get type id 1 (the ":1" suffix).
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)


from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer

# --- Model -----------------------------------------------------------------
# Start from an untrained WordPiece model; unknown pieces map to "[UNK]".
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

# --- Pipeline components ---------------------------------------------------
# Normalization: Unicode-decompose, lowercase, then strip accents.
bert_tokenizer.normalizer = normalizers.Sequence(
    [NFD(), Lowercase(), StripAccents()]
)
bert_tokenizer.pre_tokenizer = Whitespace()

# The vocabulary is still empty here, so the special-token ids are written
# out by hand. They match the positions of "[CLS]" and "[SEP]" in the
# trainer's `special_tokens` list below (indices 1 and 2).
bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

# --- Training --------------------------------------------------------------
files = [
    f"../wikitext-103-raw/wiki.{split}.raw"
    for split in ["test", "train", "valid"]
]
trainer = WordPieceTrainer(
    vocab_size=30522,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
bert_tokenizer.train(files, trainer)

# Persist the trained tokenizer as a single JSON file.
bert_tokenizer.save("./bert-wiki.json")






Encoding and decoding

In [21]:
from tokenizers import decoders

# Encode a sample sentence and show the resulting token ids.
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.ids)

# Decode the ids that were just produced rather than a pasted literal list:
# a hard-coded copy goes stale as soon as the tokenizer file, its pipeline,
# or the sample sentence changes.
tokenizer.decode(output.ids)

[1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]


"Hello , y ' all ! How are you ?"

In [22]:
# Run the full BERT-style pipeline on a sample sentence and show the tokens.
output = bert_tokenizer.encode(
    "Welcome to the 🤗 Tokenizers library."
)
print(output.tokens)

# Map the ids back to text; special tokens are skipped (the default, spelled
# out here).
bert_tokenizer.decode(output.ids, skip_special_tokens=True)

['[CLS]', 'welcome', 'to', 'the', '[UNK]', 'tok', '##eni', '##zer', '##s', 'library', '.', '[SEP]']


'welcome to the tokenizers library.'

In [23]:
# Attach a WordPiece decoder to the tokenizer, then decode the ids from the
# previous cell's `output` again.
bert_tokenizer.decoder = decoders.WordPiece()

# Special tokens are skipped (the default, spelled out here).
bert_tokenizer.decode(output.ids, skip_special_tokens=True)

'welcome to the tokenizers library.'