# L2: Role of the Tokenizers

<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# One-sentence toy corpus; every tokenizer in this notebook is trained on it.
training_data = ["walker walked a long walk"]

## BPE - Byte-Pair Encoding

In [3]:
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace

# Trainer configured for a 14-entry vocabulary.
bpe_trainer = BpeTrainer(vocab_size=14)

# Tokenizer wrapping an untrained BPE model, with the Whitespace
# pre-tokenizer applied to the input first.
bpe_tokenizer = Tokenizer(BPE())
bpe_tokenizer.pre_tokenizer = Whitespace()

<p style="background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px"> 💻 &nbsp; <b>Access <code>requirements.txt</code> and <code>helper.py</code> files:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>. For more help, please see the <em>"Appendix - Tips and Help"</em> Lesson.</p>

In [4]:
# Train the BPE tokenizer on training_data with the settings held by bpe_trainer.
bpe_tokenizer.train_from_iterator(training_data, bpe_trainer)






In [5]:
# Display the learned vocabulary as a token -> id mapping.
bpe_tokenizer.get_vocab()

{'a': 0,
 'n': 6,
 'd': 1,
 'o': 7,
 'r': 8,
 'w': 9,
 'al': 10,
 'wal': 11,
 'walk': 12,
 'walke': 13,
 'l': 5,
 'e': 2,
 'g': 3,
 'k': 4}

In [6]:
# Encode the training sentence itself and show the resulting tokens.
bpe_tokenizer.encode("walker walked a long walk").tokens

['walke', 'r', 'walke', 'd', 'a', 'l', 'o', 'n', 'g', 'walk']

In [7]:
# "wlk" is not a word in training_data; show the ids of the pieces it is split into.
bpe_tokenizer.encode("wlk").ids

[9, 5, 4]

In [8]:
# Same input as the previous cell, shown as token strings instead of ids.
bpe_tokenizer.encode("wlk").tokens

['w', 'l', 'k']

In [9]:
# 's' and 'h' never occur in training_data; the recorded output below has no
# token for them (no unknown token was configured for this tokenizer).
bpe_tokenizer.encode("she walked").tokens

['e', 'walke', 'd']

## WordPiece

In [10]:
from real_wordpiece.trainer import RealWordPieceTrainer
from tokenizers.models import WordPiece

# Trainer from the real_wordpiece package, configured for a 27-entry vocabulary.
real_wordpiece_trainer = RealWordPieceTrainer(vocab_size=27)

# Tokenizer wrapping an untrained WordPiece model, with the Whitespace
# pre-tokenizer applied to the input first.
real_wordpiece_tokenizer = Tokenizer(WordPiece())
real_wordpiece_tokenizer.pre_tokenizer = Whitespace()

In [11]:
# Train the tokenizer on training_data, then display its vocabulary
# (token -> id) as the cell output.
real_wordpiece_trainer.train_tokenizer(training_data, real_wordpiece_tokenizer)
real_wordpiece_tokenizer.get_vocab()

{'n': 17,
 '##d': 8,
 'long': 21,
 '##e': 4,
 '##k': 3,
 'l': 6,
 '##o': 9,
 'e': 13,
 '##r': 7,
 'r': 14,
 'w': 0,
 'o': 16,
 '##g': 11,
 '##l': 2,
 'lo': 19,
 '##er': 22,
 'a': 5,
 '##a': 1,
 '##ed': 23,
 'walk': 26,
 'wa': 24,
 '##n': 10,
 '##lk': 25,
 'k': 12,
 'g': 18,
 '##ng': 20,
 'd': 15}

In [12]:
# Encode the training sentence itself and show the resulting tokens.
real_wordpiece_tokenizer.encode("walker walked a long walk").tokens

['walk', '##er', 'walk', '##ed', 'a', 'long', 'walk']

In [13]:
# "wlk" is not a word in training_data; the recorded output below splits it
# into 'w' followed by the '##'-prefixed piece '##lk'.
real_wordpiece_tokenizer.encode("wlk").tokens

['w', '##lk']

**Unknown Characters:**
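The following line will produce an error because the input contains characters (`s` and `h`) that never appear in the training data, and this tokenizer's vocabulary has no unknown token to fall back on. Uncomment the line and run it to see the error.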

In [14]:
# Kept commented out: per the note above, running this line raises an error.
# "she" contains 's' and 'h', which are absent from training_data.
#real_wordpiece_tokenizer.encode("she walked").tokens

## HuggingFace WordPiece and special tokens

In [15]:
from tokenizers.trainers import WordPieceTrainer

# Unknown-token string; also reused by the Unigram section further down.
unk_token = "[UNK]"

# Trainer configured for a 28-entry vocabulary, with the unknown token
# registered as a special token.
wordpiece_trainer = WordPieceTrainer(vocab_size=28, special_tokens=[unk_token])

# The model is given the same unknown token; the tokenizer wraps that model
# and applies the Whitespace pre-tokenizer to the input first.
wordpiece_model = WordPiece(unk_token=unk_token)
wordpiece_tokenizer = Tokenizer(wordpiece_model)
wordpiece_tokenizer.pre_tokenizer = Whitespace()

In [16]:
# Train on training_data with wordpiece_trainer, then display the learned
# vocabulary (token -> id) as the cell output.
wordpiece_tokenizer.train_from_iterator(training_data, wordpiece_trainer)
wordpiece_tokenizer.get_vocab()






{'##r': 19,
 'r': 9,
 '##k': 13,
 'd': 2,
 'o': 8,
 'n': 7,
 'w': 10,
 'wa': 20,
 '##a': 11,
 'walke': 23,
 'walker': 27,
 '##g': 18,
 'l': 6,
 '##e': 14,
 '##lk': 21,
 '##o': 16,
 '##d': 15,
 '[UNK]': 0,
 'e': 3,
 'walk': 22,
 'lo': 24,
 'walked': 26,
 'a': 1,
 '##n': 17,
 'g': 4,
 'k': 5,
 '##l': 12,
 '##ng': 25}

In [17]:
# Encode the training sentence itself and show the resulting tokens.
wordpiece_tokenizer.encode("walker walked a long walk").tokens

['walker', 'walked', 'a', 'lo', '##ng', 'walk']

In [18]:
# "wlk" is not a word in training_data; show the pieces it is split into.
wordpiece_tokenizer.encode("wlk").tokens

['w', '##lk']

In [19]:
# 's' and 'h' never occur in training_data; the recorded output below shows
# the whole word "she" replaced by the "[UNK]" token.
wordpiece_tokenizer.encode("she walked").tokens

['[UNK]', 'walked']

## Unigram

In [20]:
from tokenizers.trainers import UnigramTrainer
from tokenizers.models import Unigram

# Trainer configured for a 14-entry vocabulary. unk_token is the "[UNK]"
# string defined in the HuggingFace WordPiece section; it is passed both as
# the trainer's unknown token and as a special token.
unigram_trainer = UnigramTrainer(
    unk_token=unk_token,
    special_tokens=[unk_token],
    vocab_size=14,
)

# Tokenizer wrapping an untrained Unigram model, with the Whitespace
# pre-tokenizer applied to the input first.
unigram_tokenizer = Tokenizer(Unigram())
unigram_tokenizer.pre_tokenizer = Whitespace()

# Train on training_data, then display the learned vocabulary (token -> id).
unigram_tokenizer.train_from_iterator(training_data, unigram_trainer)
unigram_tokenizer.get_vocab()





{'e': 2,
 'g': 8,
 'w': 9,
 'o': 11,
 'r': 12,
 'd': 10,
 'walke': 1,
 'k': 3,
 'walk': 4,
 '[UNK]': 0,
 'a': 6,
 'l': 5,
 'n': 7}

In [21]:
# Encode the training sentence itself and show the resulting tokens.
unigram_tokenizer.encode("walker walked a long walk").tokens

['walke', 'r', 'walke', 'd', 'a', 'l', 'o', 'n', 'g', 'walk']

In [22]:
# "wlk" is not a word in training_data; show the pieces it is split into.
unigram_tokenizer.encode("wlk").tokens

['w', 'l', 'k']

In [23]:
# 's' and 'h' never occur in training_data; the recorded output below groups
# them into a single 'sh' piece.
unigram_tokenizer.encode("she walked").tokens

['sh', 'e', 'walke', 'd']

In [24]:
# Ids for the same input; the recorded output below starts with 0, which is
# the id of "[UNK]" in the vocabulary shown earlier.
unigram_tokenizer.encode("she walked").ids

[0, 2, 1, 10]