# Build Tokenizer

In this notebook, we will train a byte-pair-encoding (BPE) subword tokenizer on our data and save it for later use.


In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Untrained BPE model that uses '<unk>' as its unknown token; the vocabulary
# and merges are learned later by the training cell.
bpe_model = BPE(unk_token='<unk>')
tokenizer = Tokenizer(bpe_model)

In [2]:
from tokenizers.trainers import BpeTrainer

# Reserve the four special tokens and cap the vocabulary at 4,096 entries.
trainer = BpeTrainer(
    vocab_size=4_096,
    special_tokens=['<unk>', '<sep>', '<pad>', '<mask>'],
)

In [3]:
from tokenizers.pre_tokenizers import Whitespace
# Attach the library's Whitespace pre-tokenizer so text is split into pieces
# before the BPE model runs; this is set before the training cell below.
tokenizer.pre_tokenizer = Whitespace()

In [4]:
# Train the tokenizer on the sentence corpus using the trainer configured above.
tokenizer.train(
    files=['../data/sentences.txt'],
    trainer=trainer,
)






In [5]:
# Size of the trained vocabulary; the trainer was configured with vocab_size=4_096.
tokenizer.get_vocab_size()

4096

In [6]:
# Encode a sample sentence (no post-processor is attached yet) and display
# the resulting Encoding.
sample_text = 'السلام عليكم و رحمة الله ...'
output = tokenizer.encode(sample_text)
output

Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [7]:
# Token strings produced for the sample sentence.
output.tokens

['السلام', 'علي', 'كم', 'و', 'رح', 'مة', 'الله', '..', '.']

In [8]:
# Vocabulary ids of the encoded sample sentence.
output.ids

[3808, 1823, 1786, 477, 3669, 1695, 1943, 3580, 17]

In [9]:
from tokenizers.processors import TemplateProcessing

# Resolve the separator id from the trained vocabulary, then wrap every
# encoded sequence (single or pair) in '<sep>' markers via a template.
sep_id = tokenizer.token_to_id('<sep>')
sep_template = TemplateProcessing(
    single='<sep> $A <sep>',
    pair='<sep> $A <sep> $B:1 <sep>:1',
    special_tokens=[('<sep>', sep_id)],
)
tokenizer.post_processor = sep_template

In [10]:
# Re-encode the same sample sentence now that the template post-processor
# is attached, and display the resulting Encoding.
sample_text = 'السلام عليكم و رحمة الله ...'
output = tokenizer.encode(sample_text)
output

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [11]:
# Token strings after post-processing; the template wraps the sequence in '<sep>'.
output.tokens

['<sep>', 'السلام', 'علي', 'كم', 'و', 'رح', 'مة', 'الله', '..', '.', '<sep>']

In [12]:
# Ids of the post-processed encoding (the '<sep>' id appears at both ends).
output.ids

[1, 3808, 1823, 1786, 477, 3669, 1695, 1943, 3580, 17, 1]

In [13]:
# Look the padding id up from the trained vocabulary instead of hard-coding it,
# so padding stays correct if the special-token list or its order ever changes.
pad_id = tokenizer.token_to_id('<pad>')
if pad_id is None:
    raise ValueError("'<pad>' is missing from the tokenizer vocabulary")
tokenizer.enable_padding(pad_id=pad_id, pad_token='<pad>')

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [14]:
from pathlib import Path

# Create the output directory first so the save does not fail (and discard the
# trained tokenizer) when '../models' does not exist yet.
tokenizer_path = '../models/nano-gpt-tokenizer.json'
Path(tokenizer_path).parent.mkdir(parents=True, exist_ok=True)
tokenizer.save(tokenizer_path)