# Training Tokenizers for the BabyLM dataset

We create eight tokenizers:
* A BPE tokenizer for orthographic text (keeps spaces)
* A BPE tokenizer for orthographic text (removes spaces)
* A character-based tokenizer for orthographic text (keeps spaces)
* A character-based tokenizer for orthographic text (removes spaces)
* A BPE tokenizer for phonemes (keeps spaces)
* A BPE tokenizer for phonemes (removes spaces)
* A character-based tokenizer for phonemes (keeps spaces)
* A character-based tokenizer for phonemes (removes spaces)

In [7]:
import pandas as pd

from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from transformers import GPT2TokenizerFast

dataset = load_dataset('BabyLM-phonemized', 'strict_small', split='train')

In [8]:
def show_example(example, tokenizer):
    tokenized = tokenizer(example)["input_ids"]
    print(f"Original: {example}")
    print(f"Ids: {tokenized}")
    print(f"Tokens: {tokenizer.convert_ids_to_tokens(tokenized)}")
    print(f"Decoded: {tokenizer.decode(tokenized)}")
    print()

## BPE tokenizer for orthographic text

In [9]:
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(),
         normalizers.Lowercase(),
         normalizers.Strip(),
         normalizers.StripAccents(),
        ]
    )

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
trainer = trainers.BpeTrainer(vocab_size=16000, special_tokens=["UTT_BOUNDARY", "PAD", "UNK"])
tokenizer.train_from_iterator(dataset['text'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='PAD', unk_token='UNK', bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', add_prefix_space=True)






In [10]:
show_example(dataset['text'][6], wrapped_tokenizer)

Original: That's the top number.
Ids: [0, 203, 202, 156, 1234, 1283, 16]
Tokens: ['UTT_BOUNDARY', 'Ġthat', "'s", 'Ġthe', 'Ġtop', 'Ġnumber', '.']
Decoded: UTT_BOUNDARY that's the top number.



In [11]:
wrapped_tokenizer.push_to_hub("phonemetransformers/BABYLM-TOKENIZER-BPE-TXT")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/BABYLM-TOKENIZER-BPE-TXT/commit/437a1d54a94d357800c1bb9a8ca79ae0673801a8', commit_message='Upload tokenizer', commit_description='', oid='437a1d54a94d357800c1bb9a8ca79ae0673801a8', pr_url=None, pr_revision=None, pr_num=None)

## BPE tokenizer for orthographic text without spaces

In [12]:
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(
        [normalizers.Replace(" ", ""),
         normalizers.Replace("\t", ""),
         normalizers.NFD(),
         normalizers.Lowercase(),
         normalizers.Strip(),
         normalizers.StripAccents(),
        ]
    )

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)
trainer = trainers.BpeTrainer(vocab_size=16000, special_tokens=["UTT_BOUNDARY", "PAD", "UNK"])
tokenizer.train_from_iterator(list(dataset['text'])[:], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='PAD', unk_token='UNK', bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', add_prefix_space=True)






In [13]:
show_example(dataset['text'][0], wrapped_tokenizer)

Original: Yeah.
Ids: [0, 279, 16]
Tokens: ['UTT_BOUNDARY', 'yeah', '.']
Decoded: UTT_BOUNDARYyeah.



In [14]:
wrapped_tokenizer.push_to_hub("phonemetransformers/BABYLM-TOKENIZER-BPE-TXT-SPACELESS")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/BABYLM-TOKENIZER-BPE-TXT-SPACELESS/commit/98fc164d70bdaf507002444892e2df7ed1176a69', commit_message='Upload tokenizer', commit_description='', oid='98fc164d70bdaf507002444892e2df7ed1176a69', pr_url=None, pr_revision=None, pr_num=None)

## Character-level tokenizer for orthographic text with/without word boundaries

Since it's just a character-level model, the trainer can simply filter out the word boundary tokens so we do not need a separate tokenizer for no word boundaries. We train on the `character_split_utterance` column.

In [15]:
MIN_COUNT = 10

def build_vocabulary(lines):

    vocab = {'UNK' : 0, 'PAD' : 1, 'W' : 2, 'UTT_BOUNDARY' : 3}
    token_counts = {}
    for line in lines:
        tokens = line.strip()
        for token in tokens:
            if token not in token_counts:
                token_counts[token] = 0
            token_counts[token] += 1
        
    # Add tokens to vocab if they are not in phoible and have a count greater than MIN_COUNT
    for token, count in token_counts.items():
        if count > MIN_COUNT and token not in vocab:
            vocab[token] = len(vocab)

    print('Vocab: ', vocab)
    print('Vocab size: ', len(vocab))
    return vocab

In [16]:
normalizer = normalizers.Sequence(
        [normalizers.NFD(),
         normalizers.Lowercase(),
         normalizers.Strip(),
         normalizers.StripAccents(),
         normalizers.Replace(" ", "W"),
        ]
    )

vocab2 = build_vocabulary([normalizer.normalize_str(line) for line in dataset['text']])

tokenizer = Tokenizer(models.WordLevel(vocab=vocab2, unk_token='UNK'))
tokenizer.normalizer = normalizer
tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY", "W"])
tokenizer.pre_tokenizer = pre_tokenizers.Split("", behavior="isolated")
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', pad_token='PAD', unk_token='UNK')

Vocab:  {'UNK': 0, 'PAD': 1, 'W': 2, 'UTT_BOUNDARY': 3, 'y': 4, 'e': 5, 'a': 6, 'h': 7, '.': 8, 'c': 9, 'o': 10, 'm': 11, 'p': 12, 'u': 13, 'n': 14, 'd': 15, "'": 16, 's': 17, 't': 18, 'i': 19, 'g': 20, 'l': 21, 'k': 22, 'x': 23, ',': 24, 'r': 25, 'w': 26, 'v': 27, 'f': 28, 'b': 29, 'j': 30, '?': 31, '-': 32, 'q': 33, ';': 34, '2': 35, '‘': 36, '’': 37, '!': 38, '/': 39, '1': 40, ':': 41, 'z': 42, '3': 43, '6': 44, '9': 45, '&': 46, '4': 47, '5': 48, '0': 49, '=': 50, '8': 51, '7': 52, '£': 53, '(': 54, ')': 55, '—': 56, '*': 57, ']': 58, '[': 59, '"': 60, '_': 61, '%': 62, '“': 63, '”': 64, '+': 65, '$': 66, '^': 67, '#': 68, 'æ': 69, 'ʌ': 70, 'ɩ': 71, 'ə': 72, '↫': 73, '|': 74, '°': 75, 'ø': 76, '~': 77, '⁄': 78, '`': 79, '�': 80, '′': 81, '@': 82, '}': 83, '{': 84, '―': 85, '–': 86, '·': 87, '♪': 88, '¡': 89, '÷': 90, '\\': 91, '¶': 92, 'ð': 93, '¿': 94, '\xad': 95, '♫': 96, '\u200b': 97, 'œ': 98, 'ł': 99, '¦': 100, '×': 101, '\x99': 102, 'ß': 103, 'ˈ': 104, 'ı': 105, 'đ': 106, '−':

In [17]:
show_example(dataset['text'][0], wrapped_tokenizer)

Original: Yeah.
Ids: [3, 4, 5, 6, 7, 8]
Tokens: ['UTT_BOUNDARY', 'y', 'e', 'a', 'h', '.']
Decoded: UTT_BOUNDARY y e a h.



In [18]:
wrapped_tokenizer.push_to_hub("phonemetransformers/BABYLM-TOKENIZER-CHAR-TXT")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/BABYLM-TOKENIZER-CHAR-TXT/commit/0b0c0eb6931749bcb04d516a1e6b128c6e38c895', commit_message='Upload tokenizer', commit_description='', oid='0b0c0eb6931749bcb04d516a1e6b128c6e38c895', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
tokenizer = Tokenizer(models.WordLevel(vocab=vocab2, unk_token='UNK'))
tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(),
         normalizers.Lowercase(),
         normalizers.Strip(),
         normalizers.StripAccents(),
         normalizers.Replace(" ", ""), # Remove word boundaries
        ]
    )
tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY"])
tokenizer.pre_tokenizer = pre_tokenizers.Split("", behavior="isolated")
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', pad_token='PAD', unk_token='UNK')

In [20]:
show_example(dataset['text'][0], wrapped_tokenizer)

Original: Yeah.
Ids: [3, 4, 5, 6, 7, 8]
Tokens: ['UTT_BOUNDARY', 'y', 'e', 'a', 'h', '.']
Decoded: UTT_BOUNDARY y e a h.



In [21]:
wrapped_tokenizer.push_to_hub("phonemetransformers/BABYLM-TOKENIZER-CHAR-TXT-SPACELESS")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/BABYLM-TOKENIZER-CHAR-TXT-SPACELESS/commit/3b789f09076ef9d28b7281fd3ccde838f11d5181', commit_message='Upload tokenizer', commit_description='', oid='3b789f09076ef9d28b7281fd3ccde838f11d5181', pr_url=None, pr_revision=None, pr_num=None)

## BPE tokenizer for phonemes

The phoneme data is space-separated by phoneme, with "WORD_BOUNDARY" separating words. We can use the normalizer to turn this back into word-like units for comparison with BPE on orthographic text.

In [22]:
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(
        [normalizers.Replace(" ", ""),
         normalizers.Replace("WORD_BOUNDARY", " "),
         normalizers.Strip(),
        ]
    )

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
trainer = trainers.BpeTrainer(vocab_size=16000, special_tokens=["UTT_BOUNDARY", "PAD", "UNK"])
tokenizer.train_from_iterator(dataset['phonemized_utterance'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='PAD', unk_token='UNK', bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', add_prefix_space=True)






In [23]:
show_example(dataset['phonemized_utterance'][0], wrapped_tokenizer)

Original: j ɛ h WORD_BOUNDARY
Ids: [0, 241]
Tokens: ['UTT_BOUNDARY', 'ĠjÉĽh']
Decoded: UTT_BOUNDARY jɛh



In [24]:
wrapped_tokenizer.push_to_hub("phonemetransformers/BABYLM-TOKENIZER-BPE-PHON")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/BABYLM-TOKENIZER-BPE-PHON/commit/787b3eeb590a21fc8fff642b8cd7ee5a83ed46f9', commit_message='Upload tokenizer', commit_description='', oid='787b3eeb590a21fc8fff642b8cd7ee5a83ed46f9', pr_url=None, pr_revision=None, pr_num=None)

## BPE tokenizer for phonemes without spaces

Similar to the BPE for orthographic text. The only difference is the normalizer and the fact we train on phonemes.

In [25]:
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(
        [normalizers.Replace(" ", ""),
         normalizers.Replace("WORD_BOUNDARY", ""), # Remove word boundaries
         normalizers.Strip(),
        ]
    )

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)
trainer = trainers.BpeTrainer(vocab_size=16000, special_tokens=["UTT_BOUNDARY", "PAD", "UNK"])
tokenizer.train_from_iterator(dataset['phonemized_utterance'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='PAD', unk_token='UNK', bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', add_prefix_space=True)






In [26]:
show_example(dataset['phonemized_utterance'][0], wrapped_tokenizer)

Original: j ɛ h WORD_BOUNDARY
Ids: [0, 215]
Tokens: ['UTT_BOUNDARY', 'jÉĽh']
Decoded: UTT_BOUNDARYjɛh



In [27]:
wrapped_tokenizer.push_to_hub("phonemetransformers/BABYLM-TOKENIZER-BPE-PHON-SPACELESS")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/BABYLM-TOKENIZER-BPE-PHON-SPACELESS/commit/a114579b0a004b50c7e5caf0a9da855abec8d404', commit_message='Upload tokenizer', commit_description='', oid='a114579b0a004b50c7e5caf0a9da855abec8d404', pr_url=None, pr_revision=None, pr_num=None)

## Phoneme tokenizer with spaces

Character-based tokenizer that just uses the phonemes.

In [28]:
phoible = pd.read_csv('../../data/phoible.csv')
phoible_phonemes = phoible.Phoneme.unique()

vocab = {'UNK' : 0, 'PAD' : 1, 'WORD_BOUNDARY' : 2, 'UTT_BOUNDARY' : 3}
unk_tokens = []
token_counts = {}
for line in dataset['phonemized_utterance']:
    tokens = line.strip().split()
    for token in tokens:
        if token not in token_counts:
            token_counts[token] = 0
        token_counts[token] += 1
    
# Add tokens to vocab if they are not in phoible and have a count greater than MIN_COUNT
for token, count in token_counts.items():
    if count > MIN_COUNT and token not in vocab:
        if token not in phoible_phonemes:
            unk_tokens.append(token)
        else:
            vocab[token] = len(vocab)

print('Tokens not found in phoible: ', {token: token_counts[token] for token in unk_tokens})
print('Vocab: ', vocab)
print('Vocab size: ', len(vocab))

  phoible = pd.read_csv('../../data/phoible.csv')


Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'j': 4, 'ɛ': 5, 'h': 6, 'k': 7, 'ɑ': 8, 'm': 9, 'p': 10, 'aʊ': 11, 'n': 12, 'd': 13, 'z': 14, 'θ': 15, 'ɪ': 16, 'ŋ': 17, 'l': 18, 'aɪ': 19, 's': 20, 'ɜː': 21, 't': 22, 'w': 23, 'v': 24, 'ð': 25, 'æ': 26, 'ɔ': 27, 'ɹ': 28, 'ʌ': 29, 'f': 30, 'ə': 31, 'b': 32, 'iː': 33, 'eɪ': 34, 'oʊ': 35, 'd̠ʒ': 36, 'i': 37, 'uː': 38, 'iə': 39, 'ʊ': 40, 'ɡ': 41, 't̠ʃ': 42, 'ɔɪ': 43, 'ʃ': 44, 'ʒ': 45, 'r': 46, 'x': 47, 'ɬ': 48, 'ɑ̃': 49, 'nʲ': 50}
Vocab size:  51


In [29]:
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
tokenizer.normalizer = normalizers.Sequence([normalizers.Strip()]) 
tokenizer.add_special_tokens(["UNK", "PAD", "WORD_BOUNDARY", "UTT_BOUNDARY"])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', pad_token='PAD', unk_token='UNK')


In [30]:
show_example(dataset['phonemized_utterance'][0], wrapped_tokenizer)

Original: j ɛ h WORD_BOUNDARY
Ids: [3, 4, 5, 6, 2]
Tokens: ['UTT_BOUNDARY', 'j', 'ɛ', 'h', 'WORD_BOUNDARY']
Decoded: UTT_BOUNDARY j ɛ h WORD_BOUNDARY



In [31]:
wrapped_tokenizer.push_to_hub("phonemetransformers/BABYLM-TOKENIZER-CHAR-PHON")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/BABYLM-TOKENIZER-CHAR-PHON/commit/bc882f744292abeadc86c2eeee020abdc9984da9', commit_message='Upload tokenizer', commit_description='', oid='bc882f744292abeadc86c2eeee020abdc9984da9', pr_url=None, pr_revision=None, pr_num=None)

## Phoneme tokenizer without spaces

Character-based tokenizer that uses the phonemes and removes word boundaries.

In [32]:
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(' WORD_BOUNDARY', ''), normalizers.Strip()]) 
tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY"])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', pad_token='PAD', unk_token='UNK')


In [33]:
show_example(dataset['phonemized_utterance'][0], wrapped_tokenizer)

Original: j ɛ h WORD_BOUNDARY
Ids: [3, 4, 5, 6]
Tokens: ['UTT_BOUNDARY', 'j', 'ɛ', 'h']
Decoded: UTT_BOUNDARY j ɛ h



In [34]:
wrapped_tokenizer.push_to_hub("phonemetransformers/BABYLM-TOKENIZER-CHAR-PHON-SPACELESS")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/BABYLM-TOKENIZER-CHAR-PHON-SPACELESS/commit/6b55fbf836c911362a2cee94e10c6dd66dc85ea9', commit_message='Upload tokenizer', commit_description='', oid='6b55fbf836c911362a2cee94e10c6dd66dc85ea9', pr_url=None, pr_revision=None, pr_num=None)

# Compare all tokenizers

In [35]:
tokenizers = [
    "babylm/babyllama-100m-2024",
    "babylm/ltgbert-100m-2024",
    "phonemetransformers/BABYLM-TOKENIZER-BPE-TXT",
    "phonemetransformers/BABYLM-TOKENIZER-BPE-TXT-SPACELESS",
    "phonemetransformers/BABYLM-TOKENIZER-CHAR-TXT",
    "phonemetransformers/BABYLM-TOKENIZER-CHAR-TXT-SPACELESS",
    "phonemetransformers/BABYLM-TOKENIZER-BPE-PHON",
    "phonemetransformers/BABYLM-TOKENIZER-BPE-PHON-SPACELESS",
    "phonemetransformers/BABYLM-TOKENIZER-CHAR-PHON",
    "phonemetransformers/BABYLM-TOKENIZER-CHAR-PHON-SPACELESS",
]

from transformers import AutoTokenizer

#text_example = "what a conundrum !"
text_example = "what a conundrum !"
phoneme_example = "w ʌ t WORD_BOUNDARY ʌ WORD_BOUNDARY k ə n ʌ n d ɹ ə m WORD_BOUNDARY"

for tokenizer in tokenizers:
    print(tokenizer)
    t = AutoTokenizer.from_pretrained(tokenizer)    
    print(len(t.get_vocab()))
    vocab = t.get_vocab()
    vocab = sorted(vocab.items(), key=lambda x: x[1])
    print(vocab)
    show_example(phoneme_example if 'phoneme' in tokenizer else text_example, t)

babylm/babyllama-100m-2024
16001
Original: what a conundrum !
Ids: [3101, 192, 488, 1045, 11892, 2220]
Tokens: ['what', 'Ġa', 'Ġcon', 'und', 'rum', 'Ġ!']
Decoded: what a conundrum!

babylm/ltgbert-100m-2024
16000
Original: what a conundrum !
Ids: [897, 711, 1014, 1655, 89, 869, 2632]
Tokens: ['▁what', '▁a', '▁con', 'und', 'r', 'um', '▁!']
Decoded: what a conundrum!

phonemetransformers/BABYLM-TOKENIZER-BPE-TXT
16000
Original: w ʌ t WORD_BOUNDARY ʌ WORD_BOUNDARY k ə n ʌ n d ɹ ə m WORD_BOUNDARY
Ids: [0, 154, 114, 102, 128, 150, 1491, 37, 13580, 614, 114, 102, 128, 1491, 37, 13580, 614, 227, 114, 101, 141, 183, 114, 102, 128, 183, 171, 114, 101, 90, 114, 101, 141, 166, 1491, 37, 13580, 614]
Tokens: ['UTT_BOUNDARY', 'Ġw', 'Ġ', 'Ê', 'Į', 'Ġt', 'Ġword', '_', 'bound', 'ary', 'Ġ', 'Ê', 'Į', 'Ġword', '_', 'bound', 'ary', 'Ġk', 'Ġ', 'É', 'Ļ', 'Ġn', 'Ġ', 'Ê', 'Į', 'Ġn', 'Ġd', 'Ġ', 'É', '¹', 'Ġ', 'É', 'Ļ', 'Ġm', 'Ġword', '_', 'bound', 'ary']
Decoded: UTT_BOUNDARY w ʌ t word_boundary ʌ word_boundar