# Training Tokenizers for the BabyLM dataset

We create eight tokenizers:
* A BPE tokenizer for orthographic text (keeps spaces)
* A BPE tokenizer for orthographic text (removes spaces)
* A character-based tokenizer for orthographic text (keeps spaces)
* A character-based tokenizer for orthographic text (removes spaces)
* A BPE tokenizer for phonemes (keeps spaces)
* A BPE tokenizer for phonemes (removes spaces)
* A character-based tokenizer for phonemes (keeps spaces)
* A character-based tokenizer for phonemes (removes spaces)

In [3]:
import pandas as pd

from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from transformers import GPT2TokenizerFast

# Load the 'train' split of the 'strict' configuration of the phonemized BabyLM dataset.
dataset = load_dataset('BabyLM-phonemized', 'strict', split='train')

In [4]:
def show_example(example, tokenizer):
    """Print how `tokenizer` encodes and decodes a single example.

    Args:
        example: The text to tokenize.
        tokenizer: A callable tokenizer whose result exposes "input_ids" and
            which provides `convert_ids_to_tokens` and `decode`.

    Prints the original text, the ids, the tokens and the decoded string,
    followed by a blank line. Returns nothing.
    """
    ids = tokenizer(example)["input_ids"]
    print(f"Original: {example}")
    print(f"Ids: {ids}")

    tokens = tokenizer.convert_ids_to_tokens(ids)
    print(f"Tokens: {tokens}")

    decoded = tokenizer.decode(ids)
    # `end` supplies the trailing blank line that separates examples.
    print(f"Decoded: {decoded}", end="\n\n")

## BPE tokenizer for orthographic text

In [4]:
# Byte-level BPE tokenizer for the orthographic text (spaces are kept).
tokenizer = Tokenizer(models.BPE())

# Normalization steps, applied in this order before pre-tokenization.
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.Strip(),
    normalizers.StripAccents(),
])
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

# Train on the `text` column with an 8192-entry vocabulary.
trainer = trainers.BpeTrainer(
    vocab_size=8192,
    special_tokens=["UTT_BOUNDARY", "PAD", "UNK"],
)
tokenizer.train_from_iterator(dataset['text'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()

# Every encoded sequence is prefixed with UTT_BOUNDARY; its id is only
# available after training, hence the lookup here.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

# Wrap the raw tokenizer so it can be used and published through transformers.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    pad_token='PAD',
    unk_token='UNK',
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    add_prefix_space=True,
)






In [10]:
# Spot-check the tokenizer on one short utterance from the `text` column.
show_example(dataset['text'][6], wrapped_tokenizer)

Original: who's that?
Ids: [0, 398, 237, 238, 33]
Tokens: ['UTT_BOUNDARY', 'Ġwho', "'s", 'Ġthat', '?']
Decoded: UTT_BOUNDARY who's that?



In [5]:
# Publish the orthographic BPE tokenizer to the Hub repository named below.
wrapped_tokenizer.push_to_hub("transformersegmentation/BabyLM-BPE-ortho-tokenizer")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-BPE-ortho-tokenizer/commit/ae0bd4d3959981c5a6b621ac2539d0271f311ef9', commit_message='Upload tokenizer', commit_description='', oid='ae0bd4d3959981c5a6b621ac2539d0271f311ef9', pr_url=None, pr_revision=None, pr_num=None)

## BPE tokenizer for orthographic text without spaces

In [6]:
# Byte-level BPE tokenizer for the orthographic text with all spaces removed,
# so merges are free to span what used to be word boundaries.
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(
        [normalizers.Replace(" ", ""),   # drop word-separating spaces
         normalizers.Replace("\t", ""),  # ...and tabs
         normalizers.NFD(),
         normalizers.Lowercase(),
         normalizers.Strip(),
         normalizers.StripAccents(),
        ]
    )

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)
trainer = trainers.BpeTrainer(vocab_size=8192, special_tokens=["UTT_BOUNDARY", "PAD", "UNK"])
# Pass the column directly, as the other BPE cells do; the previous
# `list(...)[:]` made two extra full copies of the corpus for no benefit.
tokenizer.train_from_iterator(dataset['text'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()
# Every encoded sequence is prefixed with UTT_BOUNDARY; its id is only known
# after training, hence the lookup here.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

# NOTE(review): add_prefix_space=True here disagrees with the pre-tokenizer's
# add_prefix_space=False above -- confirm which one is intended for a
# spaceless tokenizer. Left unchanged to keep the published artifact as is.
wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='PAD', unk_token='UNK', bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', add_prefix_space=True)






In [7]:
# Spot-check on the first utterance: the tokens should contain no spaces.
show_example(dataset['text'][0], wrapped_tokenizer)

Original: do you want to look at that it says look?
Ids: [0, 5102, 1418, 2609, 2456, 416, 33]
Tokens: ['UTT_BOUNDARY', 'doyouwantto', 'lookat', 'thatit', 'says', 'look', '?']
Decoded: UTT_BOUNDARYdoyouwanttolookatthatitsayslook?



In [8]:
# Publish the spaceless orthographic BPE tokenizer to the Hub repository named below.
wrapped_tokenizer.push_to_hub("transformersegmentation/BabyLM-BPE-ortho-tokenizer-spaceless")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-BPE-ortho-tokenizer-spaceless/commit/dda79ed9ab5c2da9969623686600ab8fd2465b23', commit_message='Upload tokenizer', commit_description='', oid='dda79ed9ab5c2da9969623686600ab8fd2465b23', pr_url=None, pr_revision=None, pr_num=None)

## Character-level tokenizer for orthographic text with/without word boundaries

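Since it's just a character-level model, the trainer can simply filter out the word-boundary tokens, so strictly speaking we do not need a separate tokenizer for the setting without word boundaries (a spaceless variant is nevertheless built and pushed below). We build the vocabulary from the `character_split_utterance` column.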

In [36]:
from collections import Counter

# A token must occur strictly more than this many times to enter a vocabulary.
MIN_COUNT = 100


def build_vocabulary(lines, min_count=None):
    """Build a token -> id vocabulary from whitespace-separated lines.

    The four reserved tokens always occupy ids 0-3. Every other token that
    occurs strictly more than `min_count` times is appended in order of first
    appearance. The vocabulary and its size are printed before returning.

    Args:
        lines: Iterable of strings whose tokens are separated by whitespace.
        min_count: Frequency threshold (exclusive). Defaults to the
            module-level `MIN_COUNT`, read at call time.

    Returns:
        dict mapping each token to its integer id.
    """
    if min_count is None:
        min_count = MIN_COUNT

    vocab = {'UNK' : 0, 'PAD' : 1, 'WORD_BOUNDARY' : 2, 'UTT_BOUNDARY' : 3}

    # Counter keeps first-seen order, so ids are assigned deterministically.
    token_counts = Counter(token for line in lines for token in line.split())

    # Keep only sufficiently frequent tokens; reserved tokens keep their fixed ids.
    for token, count in token_counts.items():
        if count > min_count and token not in vocab:
            vocab[token] = len(vocab)

    print('Vocab: ', vocab)
    print('Vocab size: ', len(vocab))
    return vocab

In [37]:
# Normalization shared by vocabulary construction and by the tokenizer itself.
normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.Strip(),
    normalizers.StripAccents(),
    # Lowercasing also hits the boundary marker; restore its canonical spelling.
    normalizers.Replace("word_boundary", "WORD_BOUNDARY"),
])

# Normalize every line first so the vocabulary is built from normalized text.
vocab = build_vocabulary(
    [normalizer.normalize_str(line) for line in dataset['character_split_utterance']]
)

# Word-level model over the pre-split characters; unseen tokens map to UNK.
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
tokenizer.normalizer = normalizer
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY", "WORD_BOUNDARY"])

# Prefix every encoded sequence with UTT_BOUNDARY.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    pad_token='PAD',
    unk_token='UNK',
)

Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'd': 4, 'o': 5, 'y': 6, 'u': 7, 'w': 8, 'a': 9, 'n': 10, 't': 11, 'l': 12, 'k': 13, 'h': 14, 'i': 15, 's': 16, '?': 17, 'e': 18, '.': 19, 'r': 20, "'": 21, 'f': 22, 'c': 23, 'g': 24, 'p': 25, 'b': 26, 'm': 27, 'v': 28, 'j': 29, '!': 30, 'x': 31, 'q': 32, 'z': 33, '-': 34, '&': 35, ',': 36, '/': 37, '1': 38, '9': 39, '5': 40, '0': 41, ';': 42, '‘': 43, '’': 44, '—': 45, ':': 46, '+': 47, '8': 48, '3': 49, '7': 50, '4': 51, '6': 52, '2': 53, '=': 54, ')': 55, '(': 56, '_': 57, '*': 58, '£': 59, '–': 60, '#': 61, '`': 62, '"': 63, 'æ': 64, ']': 65, '|': 66, '$': 67, '“': 68, '”': 69, '[': 70, 'œ': 71, '{': 72, '}': 73, '…': 74, '°': 75, '§': 76, '>': 77, '·': 78, '¢': 79, '%': 80, '^': 81, '½': 82, '¶': 83, '×': 84, '¼': 85, '¾': 86, 'φ': 87, '<': 88, '´': 89, '¯': 90, '¦': 91, '\x86': 92, '«': 93, '¬': 94, '©': 95, '\x93': 96, '●': 97, '¹': 98, '⁄': 99, '²': 100, '₂': 101, '│': 102, 'δ': 103, 'α': 104, 'ο': 105, 'υ': 106

In [38]:
# Spot-check on the first pre-split utterance (characters and WORD_BOUNDARY separated by spaces).
show_example(dataset['character_split_utterance'][0], wrapped_tokenizer)

Original: d o WORD_BOUNDARY y o u WORD_BOUNDARY w a n t WORD_BOUNDARY t o WORD_BOUNDARY l o o k WORD_BOUNDARY a t WORD_BOUNDARY t h a t WORD_BOUNDARY i t WORD_BOUNDARY s a y s WORD_BOUNDARY l o o k ? WORD_BOUNDARY
Ids: [3, 4, 5, 2, 6, 5, 7, 2, 8, 9, 10, 11, 2, 11, 5, 2, 12, 5, 5, 13, 2, 9, 11, 2, 11, 14, 9, 11, 2, 15, 11, 2, 16, 9, 6, 16, 2, 12, 5, 5, 13, 17, 2]
Tokens: ['UTT_BOUNDARY', 'd', 'o', 'WORD_BOUNDARY', 'y', 'o', 'u', 'WORD_BOUNDARY', 'w', 'a', 'n', 't', 'WORD_BOUNDARY', 't', 'o', 'WORD_BOUNDARY', 'l', 'o', 'o', 'k', 'WORD_BOUNDARY', 'a', 't', 'WORD_BOUNDARY', 't', 'h', 'a', 't', 'WORD_BOUNDARY', 'i', 't', 'WORD_BOUNDARY', 's', 'a', 'y', 's', 'WORD_BOUNDARY', 'l', 'o', 'o', 'k', '?', 'WORD_BOUNDARY']
Decoded: UTT_BOUNDARY d o WORD_BOUNDARY y o u WORD_BOUNDARY w a n t WORD_BOUNDARY t o WORD_BOUNDARY l o o k WORD_BOUNDARY a t WORD_BOUNDARY t h a t WORD_BOUNDARY i t WORD_BOUNDARY s a y s WORD_BOUNDARY l o o k? WORD_BOUNDARY



In [39]:
# Same vocabulary, but the word-boundary id is exposed as the single character
# 'W' so that raw text can be tokenized one character at a time.
vocab2 = dict(vocab)
vocab2['W'] = vocab2.pop('WORD_BOUNDARY')

tokenizer = Tokenizer(models.WordLevel(vocab=vocab2, unk_token='UNK'))
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.Strip(),
    normalizers.StripAccents(),
    # Runs after Lowercase(), so the only 'W' characters left are boundaries.
    normalizers.Replace(" ", "W"),
])
# An empty pattern with "isolated" behavior splits the text into single characters.
tokenizer.pre_tokenizer = pre_tokenizers.Split("", behavior="isolated")
tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY", "W"])

# Prefix every encoded sequence with UTT_BOUNDARY.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    pad_token='PAD',
    unk_token='UNK',
)

In [40]:
# Spot-check on raw text: each space should come out as the 'W' boundary token.
show_example(dataset['text'][0], wrapped_tokenizer)

Original: do you want to look at that it says look?
Ids: [3, 4, 5, 2, 6, 5, 7, 2, 8, 9, 10, 11, 2, 11, 5, 2, 12, 5, 5, 13, 2, 9, 11, 2, 11, 14, 9, 11, 2, 15, 11, 2, 16, 9, 6, 16, 2, 12, 5, 5, 13, 17]
Tokens: ['UTT_BOUNDARY', 'd', 'o', 'W', 'y', 'o', 'u', 'W', 'w', 'a', 'n', 't', 'W', 't', 'o', 'W', 'l', 'o', 'o', 'k', 'W', 'a', 't', 'W', 't', 'h', 'a', 't', 'W', 'i', 't', 'W', 's', 'a', 'y', 's', 'W', 'l', 'o', 'o', 'k', '?']
Decoded: UTT_BOUNDARY d o W y o u W w a n t W t o W l o o k W a t W t h a t W i t W s a y s W l o o k?



In [41]:
# Publish the character-level tokenizer (with word boundaries) to the Hub repository named below.
wrapped_tokenizer.push_to_hub("transformersegmentation/BabyLM-char-tokenizer")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-char-tokenizer/commit/c98c9640822dce4314f7a84759a217091ac28658', commit_message='Upload tokenizer', commit_description='', oid='c98c9640822dce4314f7a84759a217091ac28658', pr_url=None, pr_revision=None, pr_num=None)

In [42]:
# Character-level tokenizer without word boundaries. `vocab2` is reused, so
# character ids match the with-boundary tokenizer.
tokenizer = Tokenizer(models.WordLevel(vocab=vocab2, unk_token='UNK'))
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.Strip(),
    normalizers.StripAccents(),
    # Deleting the spaces removes the word boundaries.
    normalizers.Replace(" ", ""),
])
# An empty pattern with "isolated" behavior splits the text into single characters.
tokenizer.pre_tokenizer = pre_tokenizers.Split("", behavior="isolated")
tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY"])

# Prefix every encoded sequence with UTT_BOUNDARY.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    pad_token='PAD',
    unk_token='UNK',
)

In [43]:
# Spot-check on raw text: no boundary tokens should appear between characters.
show_example(dataset['text'][0], wrapped_tokenizer)

Original: do you want to look at that it says look?
Ids: [3, 4, 5, 6, 5, 7, 8, 9, 10, 11, 11, 5, 12, 5, 5, 13, 9, 11, 11, 14, 9, 11, 15, 11, 16, 9, 6, 16, 12, 5, 5, 13, 17]
Tokens: ['UTT_BOUNDARY', 'd', 'o', 'y', 'o', 'u', 'w', 'a', 'n', 't', 't', 'o', 'l', 'o', 'o', 'k', 'a', 't', 't', 'h', 'a', 't', 'i', 't', 's', 'a', 'y', 's', 'l', 'o', 'o', 'k', '?']
Decoded: UTT_BOUNDARY d o y o u w a n t t o l o o k a t t h a t i t s a y s l o o k?



In [44]:
# Publish the spaceless character-level tokenizer to the Hub repository named below.
wrapped_tokenizer.push_to_hub("transformersegmentation/BabyLM-char-tokenizer-spaceless")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-char-tokenizer-spaceless/commit/0c523f1f927b6fa433ee4fc557852ff7ed42f29f', commit_message='Upload tokenizer', commit_description='', oid='0c523f1f927b6fa433ee4fc557852ff7ed42f29f', pr_url=None, pr_revision=None, pr_num=None)

## BPE tokenizer for phonemes

The phoneme data is space-separated by phoneme, with "WORD_BOUNDARY" separating words. We can use the normalizer to turn this back into word-like units for comparison with BPE on orthographic text.

In [15]:
# Byte-level BPE tokenizer for the phoneme stream, keeping word boundaries.
tokenizer = Tokenizer(models.BPE())

# Rebuild word-like units: first glue the space-separated phonemes together,
# then turn each WORD_BOUNDARY marker into a real space.
tokenizer.normalizer = normalizers.Sequence([
    normalizers.Replace(" ", ""),
    normalizers.Replace("WORD_BOUNDARY", " "),
    normalizers.Strip(),
])
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

# Train on the `phonemized_utterance` column with an 8192-entry vocabulary.
trainer = trainers.BpeTrainer(
    vocab_size=8192,
    special_tokens=["UTT_BOUNDARY", "PAD", "UNK"],
)
tokenizer.train_from_iterator(dataset['phonemized_utterance'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()

# Prefix every encoded sequence with UTT_BOUNDARY (id known only after training).
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    pad_token='PAD',
    unk_token='UNK',
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    add_prefix_space=True,
)






In [16]:
# Spot-check on the first phonemized utterance: tokens should be word-like phoneme groups.
show_example(dataset['phonemized_utterance'][0], wrapped_tokenizer)

Original: d uː WORD_BOUNDARY j uː WORD_BOUNDARY w ɔ n t WORD_BOUNDARY t ə WORD_BOUNDARY l ʊ k WORD_BOUNDARY æ t WORD_BOUNDARY ð ʌ t ɪ t WORD_BOUNDARY s ɛ z WORD_BOUNDARY l ʊ k WORD_BOUNDARY
Ids: [0, 199, 111, 341, 119, 406, 215, 1616, 1137, 406]
Tokens: ['UTT_BOUNDARY', 'ĠduËĲ', 'ĠjuËĲ', 'ĠwÉĶnt', 'ĠtÉĻ', 'ĠlÊĬk', 'ĠÃ¦t', 'ĠÃ°ÊĮtÉªt', 'ĠsÉĽz', 'ĠlÊĬk']
Decoded: UTT_BOUNDARY duː juː wɔnt tə lʊk æt ðʌtɪt sɛz lʊk



In [17]:
# Publish the phoneme BPE tokenizer to the Hub repository named below.
wrapped_tokenizer.push_to_hub("transformersegmentation/BabyLM-BPE-phoneme-tokenizer")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-BPE-phoneme-tokenizer/commit/29eb69a8ad35547189fb583dc055dacdbe9b3fb7', commit_message='Upload tokenizer', commit_description='', oid='29eb69a8ad35547189fb583dc055dacdbe9b3fb7', pr_url=None, pr_revision=None, pr_num=None)

## BPE tokenizer for phonemes without spaces

Similar to the spaceless BPE tokenizer for orthographic text; the only differences are the normalizer and the fact that we train on phonemes.

In [27]:
# Byte-level BPE tokenizer for the phoneme stream with word boundaries removed,
# so merges may span what used to be word boundaries.
tokenizer = Tokenizer(models.BPE())

# Glue the space-separated phonemes together and delete the boundary markers.
# NOTE: a disabled experiment that remapped IPA phonemes to single ASCII
# characters via a list of `normalizers.Replace` calls used to sit here as
# commented-out code; recover it from version control if it is needed again.
tokenizer.normalizer = normalizers.Sequence([
    normalizers.Replace(" ", ""),
    normalizers.Replace("WORD_BOUNDARY", ""),
    normalizers.Strip(),
])
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)

# Train on the `phonemized_utterance` column with an 8192-entry vocabulary.
trainer = trainers.BpeTrainer(
    vocab_size=8192,
    special_tokens=["UTT_BOUNDARY", "PAD", "UNK"],
)
tokenizer.train_from_iterator(dataset['phonemized_utterance'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()

# Prefix every encoded sequence with UTT_BOUNDARY (id known only after training).
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

# NOTE(review): add_prefix_space=True here disagrees with the pre-tokenizer's
# add_prefix_space=False above -- confirm which one is intended.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    pad_token='PAD',
    unk_token='UNK',
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    add_prefix_space=True,
)






In [28]:
# Spot-check on the first phonemized utterance: tokens may now span word boundaries.
show_example(dataset['phonemized_utterance'][0], wrapped_tokenizer)

Original: d uː WORD_BOUNDARY j uː WORD_BOUNDARY w ɔ n t WORD_BOUNDARY t ə WORD_BOUNDARY l ʊ k WORD_BOUNDARY æ t WORD_BOUNDARY ð ʌ t ɪ t WORD_BOUNDARY s ɛ z WORD_BOUNDARY l ʊ k WORD_BOUNDARY
Ids: [0, 3207, 3199, 101, 2556, 1175, 419]
Tokens: ['UTT_BOUNDARY', 'duËĲjuËĲwÉĶnt', 'tÉĻlÊĬk', 'Ã¦t', 'Ã°ÊĮtÉªt', 'sÉĽz', 'lÊĬk']
Decoded: UTT_BOUNDARYduːjuːwɔnttəlʊkætðʌtɪtsɛzlʊk



In [29]:
# Publish the spaceless phoneme BPE tokenizer to the Hub repository named below.
wrapped_tokenizer.push_to_hub("transformersegmentation/BabyLM-BPE-phoneme-tokenizer-spaceless")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-BPE-phoneme-tokenizer-spaceless/commit/4608f0eb4e14a16b6bfa2896e97bcdd2428348fb', commit_message='Upload tokenizer', commit_description='', oid='4608f0eb4e14a16b6bfa2896e97bcdd2428348fb', pr_url=None, pr_revision=None, pr_num=None)

In [30]:
# Play a ding sound by printing the ASCII bell character -- presumably to
# signal that the cells above have finished running.
print("\a")





## Phoneme tokenizer with spaces

Character-based tokenizer that just uses the phonemes.

In [17]:
from collections import Counter

# Phoneme inventory used to validate the tokens found in the corpus.
phoible = pd.read_csv('../../data/phoible.csv')
phoible_phonemes = phoible.Phoneme.unique()
# Set copy for O(1) membership tests; testing `in` against the array returned
# by `unique()` scans the whole inventory for every token.
phoible_phoneme_set = set(phoible_phonemes)

vocab = {'UNK' : 0, 'PAD' : 1, 'WORD_BOUNDARY' : 2, 'UTT_BOUNDARY' : 3}

# Count every whitespace-separated token; Counter keeps first-seen order, so
# ids are assigned in the same order as before.
token_counts = Counter(
    token
    for line in dataset['phonemized_utterance']
    for token in line.split()
)

# A token enters the vocab only if it occurs more than MIN_COUNT times AND is
# listed in phoible. Frequent tokens missing from phoible are collected in
# `unk_tokens` for reporting and stay out of the vocab.
unk_tokens = []
for token, count in token_counts.items():
    if count <= MIN_COUNT or token in vocab:
        continue
    if token in phoible_phoneme_set:
        vocab[token] = len(vocab)
    else:
        unk_tokens.append(token)

print('Tokens not found in phoible: ', {token: token_counts[token] for token in unk_tokens})
print('Vocab: ', vocab)
print('Vocab size: ', len(vocab))

  phoible = pd.read_csv('../../data/phoible.csv')


Tokens not found in phoible:  {'1': 690, 'kh': 17638, 'ʌʌ': 308}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'd': 4, 'uː': 5, 'j': 6, 'w': 7, 'ɔ': 8, 'n': 9, 't': 10, 'ə': 11, 'l': 12, 'ʊ': 13, 'k': 14, 'æ': 15, 'ð': 16, 'ʌ': 17, 'ɪ': 18, 's': 19, 'ɛ': 20, 'z': 21, 'iː': 22, 'ɹ': 23, 'f': 24, 'eɪ': 25, 'ɡ': 26, 'ɑ': 27, 'h': 28, 'p': 29, 'b': 30, 'i': 31, 't̠ʃ': 32, 'aɪ': 33, 'θ': 34, 'ŋ': 35, 'm': 36, 'ɔɪ': 37, 'oʊ': 38, 'aʊ': 39, 'v': 40, 'ɜː': 41, 'd̠ʒ': 42, 'ʃ': 43, 'iə': 44, 'ʒ': 45, 'ɑ̃': 46, 'r': 47, 'nʲ': 48, 'x': 49, 'ɬ': 50, 'ç': 51, 'e': 52, 'o': 53, 'ɛː': 54, 'ɪː': 55, 'u': 56, 'q': 57, 'tɕ': 58, 'tʰ': 59, 'ɯ': 60, 'r̩': 61, 'əʊ': 62, 'a': 63, 'ɒ': 64, 'eə': 65}
Vocab size:  66


In [18]:
# Phoneme-level tokenizer: each whitespace-separated phoneme is one token and
# WORD_BOUNDARY is kept as a token of its own.
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
tokenizer.normalizer = normalizers.Sequence([normalizers.Strip()])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.add_special_tokens(["UNK", "PAD", "WORD_BOUNDARY", "UTT_BOUNDARY"])

# Prefix every encoded sequence with UTT_BOUNDARY.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    pad_token='PAD',
    unk_token='UNK',
)


In [19]:
# Spot-check on the first phonemized utterance: one token per phoneme, boundaries kept.
show_example(dataset['phonemized_utterance'][0], wrapped_tokenizer)

Original: d uː WORD_BOUNDARY j uː WORD_BOUNDARY w ɔ n t WORD_BOUNDARY t ə WORD_BOUNDARY l ʊ k WORD_BOUNDARY æ t WORD_BOUNDARY ð ʌ t ɪ t WORD_BOUNDARY s ɛ z WORD_BOUNDARY l ʊ k WORD_BOUNDARY
Ids: [3, 4, 5, 2, 6, 5, 2, 7, 8, 9, 10, 2, 10, 11, 2, 12, 13, 14, 2, 15, 10, 2, 16, 17, 10, 18, 10, 2, 19, 20, 21, 2, 12, 13, 14, 2]
Tokens: ['UTT_BOUNDARY', 'd', 'uː', 'WORD_BOUNDARY', 'j', 'uː', 'WORD_BOUNDARY', 'w', 'ɔ', 'n', 't', 'WORD_BOUNDARY', 't', 'ə', 'WORD_BOUNDARY', 'l', 'ʊ', 'k', 'WORD_BOUNDARY', 'æ', 't', 'WORD_BOUNDARY', 'ð', 'ʌ', 't', 'ɪ', 't', 'WORD_BOUNDARY', 's', 'ɛ', 'z', 'WORD_BOUNDARY', 'l', 'ʊ', 'k', 'WORD_BOUNDARY']
Decoded: UTT_BOUNDARY d uː WORD_BOUNDARY j uː WORD_BOUNDARY w ɔ n t WORD_BOUNDARY t ə WORD_BOUNDARY l ʊ k WORD_BOUNDARY æ t WORD_BOUNDARY ð ʌ t ɪ t WORD_BOUNDARY s ɛ z WORD_BOUNDARY l ʊ k WORD_BOUNDARY



In [20]:
# Publish the phoneme-level tokenizer (with word boundaries) to the Hub repository named below.
wrapped_tokenizer.push_to_hub("transformersegmentation/BabyLM-phoneme-tokenizer")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-phoneme-tokenizer/commit/05288fcf78055b38245bfe8d807060d4f01568b5', commit_message='Upload tokenizer', commit_description='', oid='05288fcf78055b38245bfe8d807060d4f01568b5', pr_url=None, pr_revision=None, pr_num=None)

## Phoneme tokenizer without spaces

Character-based tokenizer that uses the phonemes and removes word boundaries.

In [33]:
# Phoneme-level tokenizer without word boundaries; it reuses the same vocab,
# so phoneme ids match the with-boundary tokenizer.
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
tokenizer.normalizer = normalizers.Sequence([
    # NOTE(review): the pattern includes a leading space, so a WORD_BOUNDARY at
    # the very start of the input would not be removed -- confirm that inputs
    # never begin with one.
    normalizers.Replace(' WORD_BOUNDARY', ''),
    normalizers.Strip(),
])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY"])

# Prefix every encoded sequence with UTT_BOUNDARY.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    pad_token='PAD',
    unk_token='UNK',
)


In [34]:
# Spot-check on the first phonemized utterance: no WORD_BOUNDARY tokens should remain.
show_example(dataset['phonemized_utterance'][0], wrapped_tokenizer)

Original: d uː WORD_BOUNDARY j uː WORD_BOUNDARY w ɔ n t WORD_BOUNDARY t ə WORD_BOUNDARY l ʊ k WORD_BOUNDARY æ t WORD_BOUNDARY ð ʌ t ɪ t WORD_BOUNDARY s ɛ z WORD_BOUNDARY l ʊ k WORD_BOUNDARY
Ids: [3, 4, 5, 6, 5, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 10, 16, 17, 10, 18, 10, 19, 20, 21, 12, 13, 14]
Tokens: ['UTT_BOUNDARY', 'd', 'uː', 'j', 'uː', 'w', 'ɔ', 'n', 't', 't', 'ə', 'l', 'ʊ', 'k', 'æ', 't', 'ð', 'ʌ', 't', 'ɪ', 't', 's', 'ɛ', 'z', 'l', 'ʊ', 'k']
Decoded: UTT_BOUNDARY d uː j uː w ɔ n t t ə l ʊ k æ t ð ʌ t ɪ t s ɛ z l ʊ k



In [35]:
# Publish the spaceless phoneme-level tokenizer to the Hub repository named below.
wrapped_tokenizer.push_to_hub("transformersegmentation/BabyLM-phoneme-tokenizer-spaceless")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BabyLM-phoneme-tokenizer-spaceless/commit/d0f7972bc48b28fa2daa74bf92c4083a9d3acabd', commit_message='Upload tokenizer', commit_description='', oid='d0f7972bc48b28fa2daa74bf92c4083a9d3acabd', pr_url=None, pr_revision=None, pr_num=None)

# Compare all tokenizers

In [7]:
from transformers import AutoTokenizer

# Hub repositories to compare: two `babylm/*` checkpoints followed by the
# eight tokenizers pushed above. Each repository is listed exactly once; the
# orthographic BPE entry used to appear twice, so it was loaded and printed twice.
tokenizers = [
    "babylm/babyllama-100m-2024",
    "babylm/ltgbert-100m-2024",
    "transformersegmentation/BabyLM-BPE-ortho-tokenizer",
    "transformersegmentation/BabyLM-BPE-ortho-tokenizer-spaceless",
    "transformersegmentation/BabyLM-char-tokenizer",
    "transformersegmentation/BabyLM-char-tokenizer-spaceless",
    "transformersegmentation/BabyLM-BPE-phoneme-tokenizer",
    "transformersegmentation/BabyLM-BPE-phoneme-tokenizer-spaceless",
    "transformersegmentation/BabyLM-phoneme-tokenizer",
    "transformersegmentation/BabyLM-phoneme-tokenizer-spaceless",
]

text_example = "what a conundrum !"
phoneme_example = "w ʌ t WORD_BOUNDARY ʌ WORD_BOUNDARY k ə n ʌ n d ɹ ə m WORD_BOUNDARY"

# The loop variable is deliberately not called `tokenizer`, so it does not
# overwrite the notebook-level `tokenizer` object with a string.
for tokenizer_name in tokenizers:
    print(tokenizer_name)
    t = AutoTokenizer.from_pretrained(tokenizer_name)
    print(len(t.get_vocab()))
    # Phoneme tokenizers get the phonemized example; all others get plain text.
    show_example(phoneme_example if 'phoneme' in tokenizer_name else text_example, t)

transformersegmentation/BabyLM-BPE-ortho-tokenizer
8192
Original: what a conundrum !
Ids: [0, 285, 181, 354, 1727, 58, 355, 2862]
Tokens: ['UTT_BOUNDARY', 'Ġwhat', 'Ġa', 'Ġcon', 'und', 'r', 'um', 'Ġ!']
Decoded: UTT_BOUNDARY what a conundrum!

transformersegmentation/BabyLM-BPE-ortho-tokenizer-spaceless
8192
Original: what a conundrum !
Ids: [0, 264, 1981, 237, 6822, 3]
Tokens: ['UTT_BOUNDARY', 'what', 'acon', 'un', 'drum', '!']
Decoded: UTT_BOUNDARYwhataconundrum!

transformersegmentation/BabyLM-char-tokenizer
231
Original: what a conundrum !
Ids: [3, 8, 14, 9, 11, 2, 9, 2, 23, 5, 10, 7, 10, 4, 20, 7, 27, 2, 30]
Tokens: ['UTT_BOUNDARY', 'w', 'h', 'a', 't', 'W', 'a', 'W', 'c', 'o', 'n', 'u', 'n', 'd', 'r', 'u', 'm', 'W', '!']
Decoded: UTT_BOUNDARY w h a t W a W c o n u n d r u m W!

transformersegmentation/BabyLM-char-tokenizer-spaceless
231
Original: what a conundrum !
Ids: [3, 8, 14, 9, 11, 9, 23, 5, 10, 7, 10, 4, 20, 7, 27, 30]
Tokens: ['UTT_BOUNDARY', 'w', 'h', 'a', 't', 'a', 'c', '