# This is an illustration of how to use huggingface tokenizer

In [1]:
# Please read https://huggingface.co/docs/tokenizers/en/index

In [84]:
!pip install tokenizers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


In [114]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers import BertWordPieceTokenizer

In [4]:
# Load the dataset
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

# Display some samples
print(dataset['train'][0])

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

{'text': ''}


In [13]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [23]:
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=30000, show_progress=True)

In [24]:
tokenizer.pre_tokenizer = Whitespace()

In [41]:
dataset_iterator = [dataset['train'][idx]['text'] for idx in range(len(dataset['train']))]

In [42]:
dataset_iterator[4]

" The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries , along with Valkyria Chronicles II director Takeshi Ozawa . A large team of writers handled the script . The game 's opening theme was sung by May 'n . \n"

In [43]:
tokenizer.train_from_iterator(dataset_iterator, trainer)






In [44]:
tokenizer.save("data/tokenizer-wiki.json")

In [45]:
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

In [65]:
tokenizer.get_vocab()

{'ibles': 24657,
 'definitely': 15485,
 'impacts': 21414,
 'hier': 16834,
 'Shakes': 12295,
 'conveyed': 27064,
 'subtle': 17363,
 'Loren': 22589,
 'Parry': 28379,
 'Robin': 15453,
 'prud': 29956,
 'Form': 10338,
 'application': 13014,
 'publishing': 13494,
 'repairs': 12630,
 'Brandon': 17863,
 'squad': 11764,
 'Lana': 22402,
 'Sha': 17300,
 'opus': 23521,
 'resup': 26235,
 'bud': 8856,
 'hospitalized': 29741,
 'coloration': 23160,
 'folding': 26011,
 'Publishing': 17101,
 '1895': 14362,
 'crocod': 29335,
 'NA': 6846,
 'Suther': 24905,
 'completion': 11153,
 'dens': 11894,
 'thinks': 18334,
 'awi': 20289,
 'cancellation': 19627,
 'artifacts': 21144,
 'ritz': 20495,
 'Die': 9799,
 'ﮮ': 4911,
 'Earth': 7936,
 'lore': 18499,
 'overshadowed': 26583,
 'reek': 6955,
 'ὡ': 1645,
 'eni': 29721,
 'Broad': 9501,
 'ideology': 21389,
 '鄭': 4365,
 'sympathy': 22414,
 'hills': 14311,
 'ref': 5939,
 'Temple': 11220,
 'wickets': 10997,
 'horst': 23390,
 'Lab': 15763,
 'Gaz': 17108,
 'Southampton': 17

# Using the tokenizer

In [46]:
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")

In [47]:
print(output.tokens)

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']


In [48]:
print(output.ids)

[27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35]


In [49]:
print(output.offsets[9])

(26, 27)


In [50]:
sentence = "Hello, y'all! How are you 😁 ?"
sentence[26:27]

'😁'

# Post-processing

In [57]:
tokenizer.token_to_id("[MASK]")

4

In [67]:
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

In [70]:
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)

['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?', '[SEP]']


In [71]:
output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
print(output.tokens)

['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', '[SEP]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]']


In [72]:
print(output.type_ids)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]


In [73]:
# encode a batch
output = tokenizer.encode_batch(
    [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
)

In [79]:
tokenizer.enable_padding(pad_id=3, pad_token="[PAD]", length=10)

In [83]:
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
print(output[0].tokens)

['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [82]:
print(output[0].attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Using a pretrained tokenizer

In [85]:
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

In [87]:
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
print(output[0].tokens)
print(output[1].tokens)

['[CLS]', 'hello', ',', 'y', "'", 'all', '!', '[SEP]']
['[CLS]', 'how', 'are', 'you', '[UNK]', '?', '[SEP]']


In [92]:
tokenizer.enable_padding(pad_id=3, pad_token="[PAD]", length=10)

In [93]:
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
print(output[0].tokens)
print(output[1].tokens)

['[CLS]', 'hello', ',', 'y', "'", 'all', '!', '[SEP]', '[PAD]', '[PAD]']
['[CLS]', 'how', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]']


# Tokenization pipeline

In [94]:
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

In [126]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Digits
from tokenizers import decoders
from tokenizers.decoders import Metaspace, WordPiece

In [96]:
normalizer = normalizers.Sequence([NFD(), StripAccents()])

In [97]:
normalizer.normalize_str("Héllò hôw are ü?")

'Hello how are u?'

In [98]:
tokenizer.normalizer = normalizer

In [100]:
pre_tokenizer = Whitespace()
pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.")

[('Hello', (0, 5)),
 ('!', (5, 6)),
 ('How', (7, 10)),
 ('are', (11, 14)),
 ('you', (15, 18)),
 ('?', (18, 19)),
 ('I', (20, 21)),
 ("'", (21, 22)),
 ('m', (22, 23)),
 ('fine', (24, 28)),
 (',', (28, 29)),
 ('thank', (30, 35)),
 ('you', (36, 39)),
 ('.', (39, 40))]

In [102]:
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
pre_tokenizer.pre_tokenize_str("Call 911!")

[('Call', (0, 4)), ('9', (5, 6)), ('1', (6, 7)), ('1', (7, 8)), ('!', (8, 9))]

In [103]:
tokenizer.pre_tokenizer = pre_tokenizer

In [104]:
tokenizer.train_from_iterator(dataset_iterator, trainer)






In [105]:
tokenizer.get_vocab()

{'Infinity': 27830,
 '『': 1368,
 'ʠ': 255,
 'Engineer': 17601,
 'host': 5876,
 'hort': 28089,
 'commissioner': 20605,
 '怖': 2305,
 'invited': 9618,
 'Everglades': 20680,
 '沼': 2758,
 'Pixar': 21882,
 'this': 4430,
 'ogo': 20542,
 'inson': 8073,
 'Juven': 26746,
 'tenor': 21156,
 '栖': 2585,
 'parody': 12655,
 'Rico': 12028,
 '仙': 1568,
 'hover': 27691,
 '姬': 2043,
 'Companion': 25900,
 'Manhatt': 11345,
 'Professional': 15503,
 'organizers': 27637,
 'Wendy': 25415,
 'Nash': 11438,
 'confidence': 13117,
 'itat': 23235,
 'injun': 25916,
 'behind': 5957,
 'foraging': 21752,
 'cocaine': 26210,
 'bowl': 14194,
 'Gif': 28169,
 'inviting': 27900,
 '小': 2129,
 'Northwest': 15116,
 'recept': 15557,
 'orange': 13789,
 'consecrated': 24447,
 'paleonto': 23816,
 'mia': 28481,
 'rail': 6115,
 'Today': 9714,
 'temperate': 20047,
 '•': 1160,
 '還': 3752,
 'erc': 8783,
 'ɮ': 213,
 '虫': 3483,
 'includes': 6971,
 '芦': 3371,
 '輿': 3688,
 '囁': 1912,
 'Start': 13782,
 '鯖': 4028,
 'ﻴ': 4112,
 'Clinton': 10785

In [106]:
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)
print(output.ids)

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']
[25755, 16, 93, 11, 4258, 5, 7043, 4272, 5345, 0, 35]


In [107]:
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

# BERT tokenizer from scratch

In [109]:
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

In [111]:
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

In [112]:
bert_tokenizer.pre_tokenizer = Whitespace()

In [113]:
bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

In [115]:
trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [119]:
bert_dataset_iterator = [dataset['train'][idx]['text'] for idx in range(len(dataset['train']))] + \
[dataset['validation'][idx]['text'] for idx in range(len(dataset['validation']))] + \
[dataset['test'][idx]['text'] for idx in range(len(dataset['test']))]

In [120]:
bert_tokenizer.train_from_iterator(bert_dataset_iterator, trainer)
bert_tokenizer.save("data/bert-wiki.json")






In [121]:
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.ids)

[1, 25755, 16, 93, 11, 4258, 5, 7043, 4272, 5345, 0, 35, 2]


In [122]:
tokenizer.decode([1, 25755, 16, 93, 11, 4258, 5, 7043, 4272, 5345, 0, 35, 2])

"Hello , y ' all ! How are you ?"

In [130]:
output = bert_tokenizer.encode("Welcome to the 🤗     Tokenizers library .  ")
print(output.tokens)
bert_tokenizer.decode(output.ids)

['[CLS]', 'welcome', 'to', 'the', '[UNK]', 'tok', '##eni', '##zer', '##s', 'library', '.', '[SEP]']


'welcome to the tokenizers library.'

In [131]:
bert_tokenizer.decoder = decoders.Sequence([WordPiece(), Metaspace()])
bert_tokenizer.decode(output.ids)

'welcome to the tokenizers library.'