In [16]:
from tokenizers import Tokenizer, pre_tokenizers, models

In [17]:
# Build the tokenizer around a BPE model, passing '[UNK]' as its unk_token.
bpe_model = models.BPE(unk_token='[UNK]')
tokenizer = Tokenizer(model=bpe_model)

# Sample sentence handed to each pre-tokenizer in this notebook.
# Keep the text exactly as written (including the spelling "occurence"):
# the (start, end) offsets in the recorded outputs depend on it.
string = 'This pre-tokenizer splits tokens on spaces, and also on punctuation. Each occurence of a punctuation character will be treated separately.'

### Whitespace()

In [18]:
# Whitespace splits using the regex \w+|[^\w\s]+ : runs of word characters
# and runs of non-word, non-space characters come out as separate pieces.
whitespace_pre = pre_tokenizers.Whitespace()
tokenizer.pre_tokenizer = whitespace_pre

# Result: a list of (piece, (start, end)) pairs with offsets into `string`.
tokenizer.pre_tokenizer.pre_tokenize_str(string)

[('This', (0, 4)),
 ('pre', (5, 8)),
 ('-', (8, 9)),
 ('tokenizer', (9, 18)),
 ('splits', (19, 25)),
 ('tokens', (26, 32)),
 ('on', (33, 35)),
 ('spaces', (36, 42)),
 (',', (42, 43)),
 ('and', (44, 47)),
 ('also', (48, 52)),
 ('on', (53, 55)),
 ('punctuation', (56, 67)),
 ('.', (67, 68)),
 ('Each', (69, 73)),
 ('occurence', (74, 83)),
 ('of', (84, 86)),
 ('a', (87, 88)),
 ('punctuation', (89, 100)),
 ('character', (101, 110)),
 ('will', (111, 115)),
 ('be', (116, 118)),
 ('treated', (119, 126)),
 ('separately', (127, 137)),
 ('.', (137, 138))]

### BertPreTokenizer()

In [19]:
# BertPreTokenizer splits on spaces and also on punctuation; each occurrence
# of a punctuation character is treated as its own piece.
bert_pre = pre_tokenizers.BertPreTokenizer()
tokenizer.pre_tokenizer = bert_pre

# Result: a list of (piece, (start, end)) pairs with offsets into `string`.
tokenizer.pre_tokenizer.pre_tokenize_str(string)

[('This', (0, 4)),
 ('pre', (5, 8)),
 ('-', (8, 9)),
 ('tokenizer', (9, 18)),
 ('splits', (19, 25)),
 ('tokens', (26, 32)),
 ('on', (33, 35)),
 ('spaces', (36, 42)),
 (',', (42, 43)),
 ('and', (44, 47)),
 ('also', (48, 52)),
 ('on', (53, 55)),
 ('punctuation', (56, 67)),
 ('.', (67, 68)),
 ('Each', (69, 73)),
 ('occurence', (74, 83)),
 ('of', (84, 86)),
 ('a', (87, 88)),
 ('punctuation', (89, 100)),
 ('character', (101, 110)),
 ('will', (111, 115)),
 ('be', (116, 118)),
 ('treated', (119, 126)),
 ('separately', (127, 137)),
 ('.', (137, 138))]

### Punctuation()

In [20]:
# Punctuation isolates every punctuation character as an individual piece.
# Whitespace is not a split point here, so the text between punctuation marks
# stays together (spaces included), as the output below shows.
punctuation_pre = pre_tokenizers.Punctuation()
tokenizer.pre_tokenizer = punctuation_pre

# Result: a list of (piece, (start, end)) pairs with offsets into `string`.
tokenizer.pre_tokenizer.pre_tokenize_str(string)

[('This pre', (0, 8)),
 ('-', (8, 9)),
 ('tokenizer splits tokens on spaces', (9, 42)),
 (',', (42, 43)),
 (' and also on punctuation', (43, 67)),
 ('.', (67, 68)),
 (' Each occurence of a punctuation character will be treated separately',
  (68, 137)),
 ('.', (137, 138))]

### WhitespaceSplit()

In [21]:
# WhitespaceSplit only splits on whitespace, much like str.split();
# punctuation stays attached to the neighbouring word (e.g. 'spaces,').
whitespace_split_pre = pre_tokenizers.WhitespaceSplit()
tokenizer.pre_tokenizer = whitespace_split_pre

# Result: a list of (piece, (start, end)) pairs with offsets into `string`.
tokenizer.pre_tokenizer.pre_tokenize_str(string)

[('This', (0, 4)),
 ('pre-tokenizer', (5, 18)),
 ('splits', (19, 25)),
 ('tokens', (26, 32)),
 ('on', (33, 35)),
 ('spaces,', (36, 43)),
 ('and', (44, 47)),
 ('also', (48, 52)),
 ('on', (53, 55)),
 ('punctuation.', (56, 68)),
 ('Each', (69, 73)),
 ('occurence', (74, 83)),
 ('of', (84, 86)),
 ('a', (87, 88)),
 ('punctuation', (89, 100)),
 ('character', (101, 110)),
 ('will', (111, 115)),
 ('be', (116, 118)),
 ('treated', (119, 126)),
 ('separately.', (127, 138))]

### ByteLevel()

In [22]:
# ByteLevel replaces every byte of the input with a corresponding visible
# character and also splits the text into words. In the output below a space
# shows up as 'Ġ' at the start of the following piece.
# NOTE(review): the first piece is 'ĠThis' with offsets (0, 4) although the
# sentence has no leading space - presumably the default add_prefix_space
# setting; confirm against the ByteLevel documentation.
byte_level_pre = pre_tokenizers.ByteLevel()
tokenizer.pre_tokenizer = byte_level_pre

# Result: a list of (piece, (start, end)) pairs with offsets into `string`.
tokenizer.pre_tokenizer.pre_tokenize_str(string)

[('ĠThis', (0, 4)),
 ('Ġpre', (4, 8)),
 ('-', (8, 9)),
 ('tokenizer', (9, 18)),
 ('Ġsplits', (18, 25)),
 ('Ġtokens', (25, 32)),
 ('Ġon', (32, 35)),
 ('Ġspaces', (35, 42)),
 (',', (42, 43)),
 ('Ġand', (43, 47)),
 ('Ġalso', (47, 52)),
 ('Ġon', (52, 55)),
 ('Ġpunctuation', (55, 67)),
 ('.', (67, 68)),
 ('ĠEach', (68, 73)),
 ('Ġoccurence', (73, 83)),
 ('Ġof', (83, 86)),
 ('Ġa', (86, 88)),
 ('Ġpunctuation', (88, 100)),
 ('Ġcharacter', (100, 110)),
 ('Ġwill', (110, 115)),
 ('Ġbe', (115, 118)),
 ('Ġtreated', (118, 126)),
 ('Ġseparately', (126, 137)),
 ('.', (137, 138))]

In [23]:
# ByteLevel.alphabet() returns the alphabet used by this pre-tokenizer.
# Because ByteLevel operates on bytes, every byte value is encoded as one
# unique visible character, so the alphabet holds 256 distinct characters.
# NOTE(review): the listing shown below is not in sorted order; wrap the call
# in sorted() if a stable order matters - confirm whether alphabet() gives
# any ordering guarantee.
byte_level_alphabet = pre_tokenizers.ByteLevel.alphabet()
byte_level_alphabet

['d',
 '~',
 'ğ',
 'À',
 'Ā',
 'Q',
 '=',
 'è',
 'û',
 'Ĥ',
 'B',
 '*',
 'V',
 'Õ',
 '$',
 '±',
 '7',
 'µ',
 'ě',
 'z',
 '¶',
 'l',
 'D',
 '¤',
 'þ',
 'Ĳ',
 'Ä',
 'Ń',
 'Ì',
 'î',
 'ç',
 '#',
 '6',
 'Ñ',
 '®',
 'ª',
 'G',
 'ú',
 'M',
 '³',
 '8',
 'W',
 '¹',
 '¸',
 '0',
 'ü',
 'I',
 '.',
 'ĕ',
 ':',
 'S',
 'g',
 'E',
 ',',
 'ĵ',
 'Þ',
 's',
 'Ă',
 'ô',
 '¼',
 'N',
 'É',
 '^',
 'ï',
 '@',
 'c',
 'å',
 'Ĵ',
 ')',
 '©',
 'ã',
 'æ',
 'q',
 '|',
 'w',
 'é',
 '>',
 'ñ',
 'ß',
 'Ĺ',
 'ħ',
 'F',
 'P',
 'T',
 'Ę',
 'Ô',
 '¿',
 'ķ',
 'í',
 'ð',
 'Ğ',
 'ī',
 '&',
 'ę',
 'Y',
 'Đ',
 '<',
 '«',
 'C',
 '£',
 'đ',
 '2',
 'f',
 '\\',
 'Ĉ',
 'ý',
 'U',
 'Ħ',
 '%',
 '¾',
 'Ó',
 '?',
 'Ļ',
 ';',
 '¯',
 'Î',
 '¦',
 '3',
 'Ú',
 'Ð',
 'Ċ',
 'Ö',
 'È',
 'ĺ',
 '4',
 'Č',
 'i',
 'A',
 'ı',
 'Ŀ',
 'ĸ',
 '_',
 'Ò',
 'Ï',
 '}',
 '+',
 '»',
 '²',
 '[',
 't',
 'ĭ',
 'Ù',
 '×',
 'ė',
 'â',
 'Ľ',
 'Ą',
 '¡',
 '§',
 'ą',
 'J',
 'ĳ',
 'Ĭ',
 'ó',
 'Â',
 'á',
 'k',
 'H',
 'Ø',
 '`',
 'y',
 'h',
 'Ď',
 'Ġ',
 'Ĕ',
 'ľ',
 'Z