# Simple Byte-Level Tokenizer


In [26]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers, processors
from transformers import PreTrainedTokenizerFast


In [None]:
add_prefix_space = True  # Note that we will add a prefix_space to the pre_tokenizer
PAD_TOKEN = "<|padding|>"
EOS_TOKEN = "<|endoftext|>"

# Define the tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence([normalizers.NFD()])
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space, use_regex=True)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)  # type: ignore
tokenizer.decoder = decoders.ByteLevel()  # type: ignore

# "Train", i.e., add the properties that we need
trainer = trainers.BpeTrainer(special_tokens=[PAD_TOKEN, EOS_TOKEN], initial_alphabet=pre_tokenizers.ByteLevel.alphabet())
tokenizer.train_from_iterator([], trainer=trainer)

# Load the tokenizer as a transformers-compatible tokenizer
wrapped_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, pad_token=PAD_TOKEN, unk_token=None, bos_token=None, eos_token=EOS_TOKEN, add_prefix_space=add_prefix_space)
wrapped_tokenizer.push_to_hub("InfoTokenizers/Byte-Level-Tokenizer", create_pr=1)






CommitInfo(commit_url='https://huggingface.co/InfoTokenizers/Byte-Level-Tokenizer/commit/1a6c859d97210bb52a99a3aef8cfd7539b60ac63', commit_message='Upload tokenizer', commit_description='', oid='1a6c859d97210bb52a99a3aef8cfd7539b60ac63', pr_url='https://huggingface.co/InfoTokenizers/Byte-Level-Tokenizer/discussions/1', repo_url=RepoUrl('https://huggingface.co/InfoTokenizers/Byte-Level-Tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='InfoTokenizers/Byte-Level-Tokenizer'), pr_revision='refs/pr/1', pr_num=1)

In [25]:
example = "Hello, my name is Zébulon."
t = tokenizer.encode(example)
print(t.tokens)
print(t.ids)
print([ tokenizer.id_to_token(id) for id in t.ids ])
print(tokenizer.decode(t.ids, skip_special_tokens=False).replace(" ", "-"))

['Ġ', 'H', 'e', 'l', 'l', 'o', ',', 'Ġ', 'm', 'y', 'Ġ', 'n', 'a', 'm', 'e', 'Ġ', 'i', 's', 'Ġ', 'Z', 'e', 'Ì', 'ģ', 'b', 'u', 'l', 'o', 'n', '.']
[222, 41, 70, 77, 77, 80, 13, 222, 78, 90, 222, 79, 66, 78, 70, 222, 74, 84, 222, 59, 70, 138, 225, 67, 86, 77, 80, 79, 15]
['Ġ', 'H', 'e', 'l', 'l', 'o', ',', 'Ġ', 'm', 'y', 'Ġ', 'n', 'a', 'm', 'e', 'Ġ', 'i', 's', 'Ġ', 'Z', 'e', 'Ì', 'ģ', 'b', 'u', 'l', 'o', 'n', '.']
-Hello,-my-name-is-Zébulon.


In [6]:
def get_byte_tokenizer():
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.Strip()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=True)
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(special_tokens=["<|padding|>", "<|endoftext|>"], vocab_size=258, initial_alphabet=pre_tokenizers.ByteLevel.alphabet())
    tokenizer.train_from_iterator([], trainer=trainer, length=0)

    vocab = pre_tokenizers.ByteLevel.alphabet()
    vocab.append('<|padding|>')
    vocab.append('<|endoftext|>')
    # Add special bpe space symbol
    tokenizer.add_tokens(vocab)

    return tokenizer

In [7]:
example = "Hello, my name is Zébulon."
#example = "I can 嗎 feel ₉the magic, can you?"
tokenizer = get_byte_tokenizer()
t = tokenizer.encode(example)
print(t.tokens)
print(t.ids)
print([ tokenizer.id_to_token(id) for id in t.ids ])
print(tokenizer.decode(t.ids))




['H', 'e', 'l', 'l', 'o', ',', 'Ġ', 'm', 'y', 'Ġ', 'n', 'a', 'm', 'e', 'Ġ', 'i', 's', 'Ġ', 'Z', 'é', 'b', 'u', 'l', 'o', 'n', '.']
[41, 70, 77, 77, 80, 13, 222, 78, 90, 222, 79, 66, 78, 70, 222, 74, 84, 222, 59, 167, 67, 86, 77, 80, 79, 15]
['H', 'e', 'l', 'l', 'o', ',', 'Ġ', 'm', 'y', 'Ġ', 'n', 'a', 'm', 'e', 'Ġ', 'i', 's', 'Ġ', 'Z', 'é', 'b', 'u', 'l', 'o', 'n', '.']
Hello, my name is Z�bulon.


In [5]:
from transformers import PreTrainedTokenizerFast
wrapped_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, pad_token='<|padding|>', unk_token=None, bos_token=None, eos_token='<|endoftext|>', add_prefix_space=True)
wrapped_tokenizer.push_to_hub("InfoTokenizers/Byte-Level-Tokenizer")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


CommitInfo(commit_url='https://huggingface.co/InfoTokenizers/Byte-Level-Tokenizer/commit/d0c52a16e43cf4542fb4207b561fe8e88f0a3aab', commit_message='Upload tokenizer', commit_description='', oid='d0c52a16e43cf4542fb4207b561fe8e88f0a3aab', pr_url=None, repo_url=RepoUrl('https://huggingface.co/InfoTokenizers/Byte-Level-Tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='InfoTokenizers/Byte-Level-Tokenizer'), pr_revision=None, pr_num=None)