# Training a BPE tokenizer on the BabyLM data

In [1]:
import pandas as pd

from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from transformers import GPT2TokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [15]:
# Load the 'English' configuration, 'train' split, of the phonemized BabyLM corpus.
dataset = load_dataset('transformersegmentation/BabyLM-phonemized', 'English', split='train')
# Drop rows whose 'processed_gloss' is None; the trainer below expects strings.
dataset = dataset.filter(lambda x: x['processed_gloss'] is not None)

# Register "UNK" as the model's unknown token. It is already reserved in the
# vocabulary (see special_tokens below) and declared as unk_token on the
# wrapped tokenizer, but without this argument the BPE model never emits it
# and symbols missing from the learned vocabulary are silently dropped.
tokenizer = Tokenizer(models.BPE(unk_token="UNK"))

# Normalisation: decompose (NFD), lowercase, trim surrounding whitespace, then
# remove the combining accents that NFD split off.
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.NFD(),
        normalizers.Lowercase(),
        normalizers.Strip(),
        normalizers.StripAccents(),
    ]
)
# Byte-level pre-tokenisation; a leading space is added so the first word is
# treated like every other word (tokens show up with the 'Ġ' prefix).
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)


def _gloss_batches(batch_size=10_000):
    """Yield the 'processed_gloss' column in slices of `batch_size` rows.

    Feeding the trainer in batches avoids handing it the entire column as a
    single in-memory object.
    """
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]['processed_gloss']


trainer = trainers.BpeTrainer(vocab_size=8192, special_tokens=["UTT_BOUNDARY", "PAD", "UNK"])
# `length` is only used for progress reporting.
tokenizer.train_from_iterator(_gloss_batches(), trainer=trainer, length=len(dataset))

tokenizer.decoder = decoders.ByteLevel()
# Prefix every sequence with UTT_BOUNDARY; for pairs, also separate the two
# segments with it and give the second segment type id 1.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

Filter: 100%|██████████| 10144265/10144265 [01:57<00:00, 86645.95 examples/s]







In [16]:
# Index the row first, then the column: this reads a single record rather than
# potentially materialising the whole 'processed_gloss' column to pick one item.
example = dataset[300]['processed_gloss']
encoding = tokenizer.encode(example)
print(f'Example: {example}')
print(encoding.tokens)

Example: he just goes down !
['UTT_BOUNDARY', 'Ġhe', 'Ġjust', 'Ġgoes', 'Ġdown', 'Ġ!']


In [17]:
# Expose the trained tokenizer through the transformers fast-tokenizer API.
# UTT_BOUNDARY is used as both the BOS and the EOS token.
special_token_roles = {
    'pad_token': 'PAD',
    'unk_token': 'UNK',
    'bos_token': 'UTT_BOUNDARY',
    'eos_token': 'UTT_BOUNDARY',
}
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    add_prefix_space=True,
    **special_token_roles,
)

# Upload to the Hugging Face Hub. Kept as the last expression so the cell
# displays the returned commit info.
hub_repo = "transformersegmentation/BabyLM-BPE-gloss-tokenizer"
wrapped_tokenizer.push_to_hub(hub_repo)

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-BPE-gloss-tokenizer/commit/f349e4af4a6d0a524a76c1a1e42d2a5e2ea2b020', commit_message='Upload tokenizer', commit_description='', oid='f349e4af4a6d0a524a76c1a1e42d2a5e2ea2b020', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
# Encode the sample utterance to a fixed length of 20 ids: shorter inputs are
# padded up to that length, longer ones are truncated.
fixed_length_options = dict(padding='max_length', max_length=20, truncation=True)
tokenized = wrapped_tokenizer(example, add_special_tokens=True, **fixed_length_options)
tokenized

{'input_ids': [0, 222, 392, 1570, 574, 830, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [19]:
# Map the padded ids back to their token strings to inspect the encoding.
padded_ids = tokenized['input_ids']
wrapped_tokenizer.convert_ids_to_tokens(padded_ids)

['UTT_BOUNDARY',
 'Ġhe',
 'Ġjust',
 'Ġgoes',
 'Ġdown',
 'Ġ!',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD']

In [20]:
# Sanity check on a hand-written sentence, with no padding requested.
probe_sentence = 'this is a test .'
wrapped_tokenizer(probe_sentence)

{'input_ids': [0, 285, 239, 181, 1833, 272], 'attention_mask': [1, 1, 1, 1, 1, 1]}