In [1]:
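# Requires the `tokenizers` package. To install it into this kernel's
# environment, uncomment the next line:
# %pip install tokenizers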

#### Tokenizers operate on a "vocabulary."
#### This defines the set of tokens that may be used to represent bodies of text.

In [2]:
# Toy vocabulary: the base alphabet followed by a handful of longer n-grams.
vocab = [
    *" abc",                   # alphabet: space, a, b, c
    "aa", "ab", "ac", "aab",   # other n-grams
]

#### You can build a tokenizer directly from a vocabulary.

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Wrap a BPE model constructed with no arguments; the tokens are supplied
# separately through add_tokens below.
tokenizer = Tokenizer(BPE())
# Last expression in the cell, so its return value is displayed
# (8 in the recorded output, which equals len(vocab)).
tokenizer.add_tokens(vocab)

8

#### This tokenizer encodes text greedily, using the longest vocabulary token that matches at each position.

In [4]:
# Every character here (including each space) is its own vocabulary entry,
# and no longer token matches, so the result is one token per character.
tokenizer.encode("a b c").tokens

['a', ' ', 'b', ' ', 'c']

In [5]:
# In the recorded output "aaa" splits into "aa" + "a", "aab" is matched whole,
# and "aac" splits into "aa" + "c".
tokenizer.encode("aaa aab aac").tokens

['aa', 'a', ' ', 'aab', ' ', 'aa', 'c']

#### Tokenization can always fall back on the base alphabet, because every single character is itself in the vocabulary.

In [6]:
# None of the multi-character tokens in `vocab` occur in this string (they all
# start with "a"), so each character is encoded with its single-letter token.
tokenizer.encode("bccbbcc").tokens

['b', 'c', 'c', 'b', 'b', 'c', 'c']

#### Define a tokenizer that uses *only* the alphabet.

In [7]:
# Base alphabet: a space followed by the letters a, b and c.
alphabet = list(" abc")
# Same construction as `tokenizer`, but only the single characters are registered.
tokenizer_alphabet = Tokenizer(BPE())
# Displays add_tokens' return value (4 in the recorded output, one per alphabet entry).
tokenizer_alphabet.add_tokens(alphabet)

4

#### Define a tokenizer whose vocabulary contains the alphabet plus every bigram over it.

In [8]:
# Unigrams first, then every ordered pair of alphabet characters
# (4 + 4 * 4 = 20 entries), in the same order as spelling them out by hand.
vocab_bigrams = [
    *" abc",
    *(first + second for first in " abc" for second in " abc"),
]
# Same construction as the other tokenizers, registering the unigram + bigram vocabulary.
tokenizer_bigrams = Tokenizer(BPE())
# Displays add_tokens' return value (20 in the recorded output: 4 unigrams + 16 bigrams).
tokenizer_bigrams.add_tokens(vocab_bigrams)

20

#### Fertility is a common measure of tokenizer compression:
#### it is the ratio of the number of output tokens to the length of the original input (here, in characters).
#### (Smaller values mean more compression and are generally better.)

In [9]:
def fertility(tokenizer, corpus):
    """Return the number of tokens produced per unit of input length.

    Args:
        tokenizer: Object whose ``encode(corpus)`` result exposes a ``tokens``
            sequence.
        corpus: The text to encode; ``len(corpus)`` is the denominator, so for
            a ``str`` the ratio is tokens per character.

    Returns:
        ``len(tokens) / len(corpus)`` as a float. An empty corpus returns
        ``0.0`` instead of raising ``ZeroDivisionError`` on the undefined
        0/0 ratio.
    """
    # Guard first: nothing to encode, and the division below would be 0/0.
    if not corpus:
        return 0.0
    encoded_tokens = tokenizer.encode(corpus).tokens
    return len(encoded_tokens) / len(corpus)

#### Generate a random input corpus to test fertility.

In [10]:
import random

# Draw the 1000 characters from a dedicated, seeded generator so the corpus
# (and every fertility figure computed from it) is the same on each
# Restart & Run All, without touching the global `random` state.
# NOTE: fertility outputs recorded from an earlier unseeded run may differ
# slightly until the notebook is re-executed.
toks = random.Random(0).choices(alphabet, k=1000)
corpus = "".join(toks)

In [11]:
# The raw alphabet tokenizer has the highest fertility
# (it is the least compressed): its vocabulary holds only single characters,
# so the recorded ratio is exactly 1.0.
fertility(tokenizer_alphabet, corpus)

1.0

In [12]:
# Our original tokenizer has a moderate compression rate: it can only merge
# the few n-grams in `vocab`, so the exact value depends on the sampled corpus.
fertility(tokenizer, corpus)

0.844

In [13]:
# The bigram tokenizer has the lowest fertility
# (it is the most compressed): every pair of alphabet characters is a token,
# which is why the recorded ratio is 0.5.
fertility(tokenizer_bigrams, corpus)

0.5