# Training the CHILDES Tokenizer

Using the phonemes in our CHILDES dataset, we train a tokenizer that just splits according to whitespace.

In [8]:
import pandas as pd

from datasets import load_dataset, get_dataset_config_names
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from transformers import GPT2TokenizerFast

In [9]:
phoible = pd.read_csv('../../../data/phoible.csv')
phoible_phonemes = phoible.Phoneme.unique()

  phoible = pd.read_csv('../../../data/phoible.csv')


In [19]:
MIN_COUNT = 10

def build_vocabulary(datasets, column='phonemized_utterance', allow_non_phoible=False):

    vocab = {'UNK' : 0, 'PAD' : 1, 'WORD_BOUNDARY' : 2, 'UTT_BOUNDARY' : 3}
    unk_tokens = []
    token_counts = {}
    for dataset in datasets:
        for line in dataset[column]:
            tokens = line.strip().split()
            for token in tokens:
                if token not in token_counts:
                    token_counts[token] = 0
                token_counts[token] += 1
        
    # Add tokens to vocab if they are not in phoible and have a count greater than MIN_COUNT
    for token, count in token_counts.items():
        if count > MIN_COUNT and token not in vocab:
            if token not in phoible_phonemes and not allow_non_phoible:
                unk_tokens.append(token)
            else:
                vocab[token] = len(vocab)

    print('Tokens not found in phoible: ', {token: token_counts[token] for token in unk_tokens})
    print('Vocab: ', vocab)
    print('Vocab size: ', len(vocab))
    return vocab

def build_phoneme_tokenizer(vocab):

    tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
    # tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(' WORD_BOUNDARY', ''), normalizers.Strip()]) 
    tokenizer.normalizer = normalizers.Sequence([normalizers.Strip()]) 
    tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY", "WORD_BOUNDARY"])
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    tokenizer.post_processor = processors.TemplateProcessing(
        single="UTT_BOUNDARY $A",
        pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
        special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
    )

    wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', pad_token='PAD', unk_token='UNK')
    return wrapped_tokenizer

# Training a Tokenizer for each language in CHILDES

We create a unique tokenizer for each language, to keep the vocabulary size appropriate for each language. For most languages we remove any tokens not found in Phoible. We do not do this for Mandarin or Cantonese as for these languages we merge the tone marker and preceding vowel into one phoneme, whereas Phoible treats tone markers as independent symbols.

In [25]:
languages = get_dataset_config_names('phonemetransformers/CHILDES')
print('Languages:', languages)
datasets = [load_dataset('phonemetransformers/CHILDES', language, split='train') for language in languages]

Downloading readme: 100%|██████████| 10.2k/10.2k [00:00<00:00, 40.7MB/s]


Languages: ['English', 'EnglishUK', 'French', 'German', 'Spanish', 'Dutch', 'Mandarin', 'Japanese', 'Cantonese', 'Estonian', 'Croatian', 'Danish', 'Basque', 'Hungarian', 'Turkish', 'Farsi', 'Icelandic', 'Indonesian', 'Irish', 'Welsh', 'Korean', 'Swedish', 'Norwegian', 'Quechua', 'Catalan', 'Italian', 'PortuguesePt', 'PortugueseBr', 'Romanian', 'Serbian', 'Polish']


Downloading data: 100%|██████████| 871M/871M [00:28<00:00, 30.7MB/s] 
Generating train split: 2564614 examples [00:12, 211448.74 examples/s]
Downloading data: 100%|██████████| 661M/661M [00:21<00:00, 30.6MB/s] 
Generating train split: 2043115 examples [00:09, 212605.69 examples/s]
Downloading data: 100%|██████████| 271M/271M [00:08<00:00, 31.7MB/s] 
Generating train split: 721121 examples [00:03, 190377.05 examples/s]
Downloading data: 100%|██████████| 558M/558M [00:16<00:00, 34.5MB/s] 
Generating train split: 1525559 examples [00:07, 200322.05 examples/s]
Downloading data: 100%|██████████| 196M/196M [00:05<00:00, 34.5MB/s] 
Generating train split: 533308 examples [00:02, 191385.68 examples/s]
Downloading data: 100%|██████████| 131M/131M [00:05<00:00, 25.3MB/s] 
Generating train split: 403472 examples [00:01, 230634.64 examples/s]
Downloading data: 100%|██████████| 214M/214M [00:07<00:00, 27.3MB/s] 
Generating train split: 530342 examples [00:03, 157772.36 examples/s]
Downloading data:

In [26]:
for language, dataset in zip(languages, datasets):
    print(f'\nTraining tokenizer for {language}...')
    allow_non_phoible = language in ['Mandarin', 'Cantonese'] # For Mandarin and Cantonese, allow non-phoible tokens since we merge tone with vowels
    vocab = build_vocabulary([dataset], allow_non_phoible=allow_non_phoible)
    tokenizer = build_phoneme_tokenizer(vocab)
    tokenizer.push_to_hub(f"phonemetransformers/CHILDES-{language}-phoneme-tokenizer")
    print(f'Tokenizer for {language} pushed to the hub.')

print(f'\nTrainking tokenizer for all languages...')
vocab = build_vocabulary(datasets)
tokenizer = build_phoneme_tokenizer(vocab)
tokenizer.push_to_hub("phonemetransformers/CHILDES-phoneme-tokenizer")
print('Done.')



Training tokenizer for English...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'd̠ʒ': 4, 'ʌ': 5, 's': 6, 't': 7, 'l': 8, 'aɪ': 9, 'k': 10, 'j': 11, 'ʊ': 12, 'ɹ': 13, 'b': 14, 'æ': 15, 'h': 16, 'oʊ': 17, 'm': 18, 'iː': 19, 'ð': 20, 'ɛ': 21, 'z': 22, 'f': 23, 'eɪ': 24, 'w': 25, 'ɪ': 26, 'ɡ': 27, 'ɑ': 28, 'ə': 29, 'p': 30, 'uː': 31, 'i': 32, 'θ': 33, 'ŋ': 34, 'ɔ': 35, 'ɔɪ': 36, 'n': 37, 'd': 38, 'aʊ': 39, 'v': 40, 'ɜː': 41, 't̠ʃ': 42, 'ʃ': 43, 'iə': 44, 'ʒ': 45, 'x': 46}
Vocab size:  47


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for English pushed to the hub.

Training tokenizer for EnglishUK...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'ð': 4, 'æ': 5, 'tʰ': 6, 'ɡ': 7, 'ʊ': 8, 'd': 9, 'ɑː': 10, 'l': 11, 'ɪ': 12, 'n': 13, 'eɪ': 14, 't̠ʃ': 15, 'w': 16, 'ɒ': 17, 'ʌ': 18, 'z': 19, 'm': 20, 'iː': 21, 'aɪ': 22, 'h': 23, 'e': 24, 'kʰ': 25, 's': 26, 'ə': 27, 'ɔː': 28, 'ɹ': 29, 'i': 30, 'əʊ': 31, 'uː': 32, 'j': 33, 'ɪə': 34, 'ɔɪ': 35, 'v': 36, 'f': 37, 'ɜː': 38, 'b': 39, 'pʰ': 40, 'd̠ʒ': 41, 'ɐ': 42, 'eə': 43, 'ʃ': 44, 'θ': 45, 'ŋ': 46, 'aʊ': 47, 'ʊə': 48, 'n̩': 49, 'ʒ': 50}
Vocab size:  51


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for EnglishUK pushed to the hub.

Training tokenizer for French...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'm': 4, 'a': 5, 'ɑ̃': 6, 'd': 7, 'ɔ': 8, 'n': 9, 'b': 10, 'ʁ': 11, 'ə': 12, 'ɡ': 13, 'ʒ': 14, 'i': 15, 'v': 16, 't': 17, 'k': 18, 'o': 19, 'ɛ̃': 20, 'w': 21, 'y': 22, 'j': 23, 'e': 24, 'ɔ̃': 25, 'p': 26, 'ɛ': 27, 'f': 28, 's': 29, 'z': 30, 'l': 31, 'u': 32, 'ʃ': 33, 'œ': 34, 'ø': 35, 'ɲ': 36, 't̠ʃ': 37, 'd̠ʒ': 38}
Vocab size:  39


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for French pushed to the hub.

Training tokenizer for German...
Tokens not found in phoible:  {'WORD_BOUNDaRY': 5825166}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'aː': 4, 'oː': 5, 'a': 6, 'b': 7, 'x': 8, 'v': 9, 'øː': 10, 'n': 11, 'ɛː': 12, 'f': 13, 'l': 14, 'iː': 15, 'yː': 16, 'j': 17, 'uː': 18, 'h': 19, 'ʊ': 20, 'm': 21, 'ɔ': 22, 'ɪ': 23, 'eː': 24, 'ə': 25, 'd̺': 26, 't̺ʰ': 27, 'ɛ': 28, 'ŋ': 29, 'ç': 30, 'œ': 31, 'kʰ': 32, 'ʀ': 33, 'ɡ': 34, 'pʰ': 35, 'ʏ': 36, 's': 37, 'z': 38, 'ts': 39, 'ʃ': 40, 'ɐ': 41, 'pf': 42, 't̠ʃ': 43, 'd̠ʒ': 44}
Vocab size:  45


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for German pushed to the hub.

Training tokenizer for Spanish...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'a': 4, 'i': 5, 'ɾ': 6, 'e̞': 7, 'n': 8, 'k': 9, 'ɲ': 10, 'o̞': 11, 'm': 12, 's': 13, 'u': 14, 'p': 15, 'd': 16, 'l': 17, 't': 18, 'β': 19, 'ɡ': 20, 'w': 21, 'ʝ': 22, 'f': 23, 'x': 24, 'j': 25, 'r': 26, 't̠ʃ': 27, 'ʃ': 28, 'tl': 29, 'ts': 30}
Vocab size:  31


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Spanish pushed to the hub.

Training tokenizer for Dutch...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'z': 4, 'oː': 5, 'j': 6, 'ãː': 7, 'ɦ': 8, 'ɾ': 9, 'd': 10, 'i': 11, 'ɛ': 12, 'p': 13, 'ɪ': 14, 'k': 15, 'ɑ': 16, 'l': 17, 'ɛː': 18, 'n': 19, 's': 20, 'v': 21, 'ə': 22, 'ɛi': 23, 'ʋ': 24, 't': 25, 'm': 26, 'ɣ': 27, 'ʏ': 28, 'ɔ': 29, 'x': 30, 'u': 31, 'f': 32, 'ŋ': 33, 'øː': 34, 'b': 35, 'ɔː': 36, 'ʌu': 37, 'y': 38, 'œy': 39, 'tʲ': 40, 'w': 41, 'ʃ': 42, 't̠ʃ': 43, 'ɲ': 44, 'ʒ': 45, 'iː': 46, 'ɡ': 47, 'd̠ʒ': 48, 'ã': 49}
Vocab size:  50


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Dutch pushed to the hub.

Training tokenizer for Mandarin...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'a˧˥': 4, 'u˧˥': 5, 'a˥': 6, 'au': 7, 'n': 8, 'a˥˩': 9, 'ʃ̺': 10, 'ɻ̩˥˩': 11, 'ə˧˥': 12, 'm': 13, 'ɤ': 14, 'p': 15, 'j': 16, 'e˧˥': 17, 'kʰ': 18, 'k': 19, 'ɤ˥˩': 20, 'w': 21, 'o˥': 22, 't̠ʃ̺ʰ': 23, 'ə˥': 24, 'ŋ': 25, 't': 26, 'ʊ˥': 27, 'ɕ': 28, 'i': 29, 'a': 30, 'l': 31, 'au˧˩˧': 32, 'x': 33, 'u˧˩˧': 34, 'i˥': 35, 'ei˧˩˧': 36, 'pʰ': 37, 'i˧˥': 38, 'ai˧˥': 39, 'ou˧˩˧': 40, 'ɤ˧˥': 41, 'o˧˩˧': 42, 'tɕ': 43, 'au˥˩': 44, 'ts': 45, 'ə˧˩˧': 46, 'ɤ˥': 47, 'ei˧˥': 48, 'ʊ˧˥': 49, 'i˧˩˧': 50, 't̠ʃ̺': 51, 'ɻ̩˧˩˧': 52, 'ei˥˩': 53, 's': 54, 'u˥˩': 55, 'ɹ̪̩': 56, 'ai˥': 57, 'u˥': 58, 'tɕʰ': 59, 'a˧˩˧': 60, 'ai˥˩': 61, 'ɛ˥˩': 62, 'f': 63, 'i˥˩': 64, 'y˥˩': 65, 'au˧˥': 66, 'ɻ': 67, 'ou˥˩': 68, 'e˥': 69, 'tʰ': 70, 'ɹ̪̩˥˩': 71, 'ɛ˧˥': 72, 'au˥': 73, 'ou˧˥': 74, 'e˧˩˧': 75, 'ɛ˥': 76, 'ɻ̩˥': 77, 'ɥ': 78, 'ɹ̪̩˧˩˧': 79, 'ai˧˩˧': 80, 'o

No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Mandarin pushed to the hub.

Training tokenizer for Japanese...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'kʲ': 4, 'aː': 5, 'o': 6, 'ts': 7, 'ɯ': 8, 'k': 9, 'a': 10, 'i': 11, 'w': 12, 'd̠ʒ': 13, 't': 14, 'e': 15, 'n': 16, 'ʃ': 17, 'd': 18, 'b': 19, 's': 20, 'm': 21, 'h': 22, 'ɾ': 23, 't̠ʃ': 24, 'ɯː': 25, 'p': 26, 'j': 27, 'ɡʲ': 28, 'ɸ': 29, 'ɡ': 30, 'oː': 31, 'ɲ': 32, 'z': 33, 'eː': 34, 'pʲ': 35, 'ɾʲ': 36, 'ç': 37, 'bʲ': 38, 'mʲ': 39}
Vocab size:  40


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Japanese pushed to the hub.

Training tokenizer for Cantonese...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'j': 4, 'ɐ˥': 5, 't': 6, 'k': 7, 'ɐu˧˥': 8, 'i˨': 9, 'n': 10, 'i˧˩̰': 11, 'y˨': 12, 's': 13, 'ɐ˨': 14, 'p': 15, 'ts': 16, 'ɐu˥': 17, 'ɪ̞˧˥': 18, 'ŋ': 19, 'ɵ˧': 20, 'a̞˧': 21, 'l': 22, 'ʊ̟˥': 23, 'a̞˧˩̰': 24, 'ɛ˥': 25, 'ei˩˧': 26, 'w': 27, 'a̞˨': 28, 'ɐi˧˥': 29, 'a̞˧˥': 30, 'm̩˧˥': 31, 'm': 32, 'ou˥': 33, 'ei˥': 34, 'i˧': 35, 'ɔ̽˧˥': 36, 'tʰ': 37, 'i˥': 38, 'f': 39, 'aːĭ˧': 40, 'h': 41, 'ɵy˧': 42, 'a̞˥': 43, 'ei˧˩̰': 44, 'ou˨': 45, 'ɔ̽˧': 46, 'ɐi˧˩̰': 47, 'u˧': 48, 'ɔːĭ˥': 49, 'ɐu˨': 50, 'ei˧˥': 51, 'ɐi˨': 52, 'ʊ̟˧˩̰': 53, 'ʊ̟˨': 54, 'a̞˩˧': 55, 'ou˧˥': 56, 'aːĭ˧˥': 57, 'ɔ̽˨': 58, 'ɛ˩˧': 59, 'ɪ̞˨': 60, 'iːŭ˧': 61, 'ɛ˧˩̰': 62, 'm̩˧˩̰': 63, 'ɵ˧˥': 64, 'ei˧': 65, 'ɐu˧˩̰': 66, 'm̩˧': 67, 'ɐ˧˥': 68, 'ɐu˩˧': 69, 'ɐi˥': 70, 'ɔ̽˥': 71, 'ɔ̽˧˩̰': 72, 'ɔːĭ˧': 73, 'ou˩˧': 74, 'm̩˥': 75, 'ɐ˧': 76, 'tsʰ': 77, 'ɛ˧˥': 78

No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Cantonese pushed to the hub.

Training tokenizer for Estonian...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'n': 4, 'o': 5, 't': 6, 'ʃ': 7, 'a': 8, 'uː': 9, 'm': 10, 'u': 11, 'tʲ': 12, 'i': 13, 's': 14, 'eː': 15, 'd': 16, 'iː': 17, 'k': 18, 'ɡ': 19, 'ɑ': 20, 'ɤ': 21, 'ʊ': 22, 'sʲ': 23, 'j': 24, 'aː': 25, 'h': 26, 'v': 27, 'æi': 28, 'kː': 29, 'e': 30, 'ɪ': 31, 'tː': 32, 'r': 33, 'ɛ': 34, 'mː': 35, 'p': 36, 'sː': 37, 'æ': 38, 'l': 39, 'pː': 40, 'yː': 41, 'æː': 42, 'b': 43, 'ɔ': 44, 'ɤː': 45, 'lː': 46, 'ø': 47, 'øː': 48, 'ŋ': 49, 'y': 50, 'oː': 51, 'rː': 52, 'ɲ': 53, 'nː': 54, 'w': 55, 'tʲː': 56, 'øɪ̯': 57, 'f': 58, 'dʲ': 59, 'sʲː': 60, 't̠ʃ': 61, 'ʃː': 62, 'ʒ': 63, 'z': 64, 'fː': 65, 'dː': 66, 'yi': 67}
Vocab size:  68


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Estonian pushed to the hub.

Training tokenizer for Croatian...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'e': 4, 'a': 5, 'u': 6, 'x': 7, 'k': 8, 't̪': 9, 'n': 10, 'o': 11, 'd̪': 12, 'i': 13, 'r': 14, 'm': 15, 'ʃ': 16, 'p': 17, 's': 18, 'ʋ': 19, 'j': 20, 't̠ʃ': 21, 'l': 22, 'ɡ': 23, 'ʒ': 24, 'b': 25, 't̪s': 26, 'z': 27, 'd̠ʒ': 28, 'ʎ': 29, 'f': 30, 'ɲ': 31, 'y': 32, 'q': 33, 'w': 34}
Vocab size:  35


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Croatian pushed to the hub.

Training tokenizer for Danish...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'n': 4, 'oˤ': 5, 't': 6, 'y': 7, 'ə': 8, 'ð': 9, 'ʁ': 10, 'ɑˤː': 11, 's': 12, 'k': 13, 'i': 14, 'b': 15, 'eˤ': 16, 't̠ʃ': 17, 'a': 18, 'l': 19, 'd': 20, 'ɡ': 21, 'f': 22, 'e': 23, 'ɛ': 24, 'r': 25, 'ɔ': 26, 'w': 27, 'ɔˤ': 28, 'm': 29, 'uˤ': 30, 'j': 31, 'ɑ': 32, 'u': 33, 'ɒ': 34, 'iˤ': 35, 'ʋ': 36, 'h': 37, 'œ': 38, 'p': 39, 'ɕ': 40, 'o': 41, 'ŋ': 42, 'ɒː': 43, 'aˤ': 44, 'ɜ': 45, 'œː': 46, 'eː': 47, 'aː': 48, 'd̠ʒ': 49, 'uː': 50, 'ɔː': 51, 'oː': 52, 'iː': 53, 'yː': 54}
Vocab size:  55


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Danish pushed to the hub.

Training tokenizer for Basque...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'b': 4, 'ai̯': 5, 'e': 6, 's̪̻': 7, 'ɟ': 8, 'ei̯': 9, 't̺s̺': 10, 'i': 11, 'oi̯': 12, 'a': 13, 'ɾ': 14, 'k': 15, 't̠ʃ': 16, 's̺': 17, 'l': 18, 'p': 19, 'o': 20, 'r': 21, 't̪': 22, 'u': 23, 'n': 24, 'm': 25, 'ð': 26, 't̪̻s̪̻': 27, 'β': 28, 'ʎ': 29, 'ɡ': 30, 'ɣ': 31, 'au̯': 32, 'c': 33, 'j': 34, 'd̪': 35, 'ʃ': 36, 'ɲ': 37, 'f': 38, 'eu̯': 39, 'θ': 40, 'x': 41}
Vocab size:  42


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Basque pushed to the hub.

Training tokenizer for Hungarian...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'i': 4, 'd̪': 5, 'ɛ': 6, 'b': 7, 'aː': 8, 't̠ʃ': 9, 'm': 10, 'l̪': 11, 's̻': 12, 'z̻': 13, 'ɡ': 14, 'k': 15, 'o': 16, 'ɑ': 17, 't̪ː': 18, 'j': 19, 'ø': 20, 'n̪': 21, 'ɲ': 22, 'u': 23, 't̻s̻': 24, 'y': 25, 'r̪': 26, 'h': 27, 'oː': 28, 'v': 29, 'd̠ʒ': 30, 't̪': 31, 'eː': 32, 'ʃ': 33, 'ɟʝ': 34, 's̻ː': 35, 'p': 36, 'øː': 37, 'mː': 38, 'z̻ː': 39, 'l̪ː': 40, 'f': 41, 'ɟʝː': 42, 'uː': 43, 'n̪ː': 44, 'iː': 45, 'ɲː': 46, 'ʃː': 47, 'r̪ː': 48, 'kː': 49, 'ŋ': 50, 't̠ʃː': 51, 'jː': 52, 'bː': 53, 'cç': 54, 't̻s̻ː': 55, 'd̪ː': 56, 'ɡː': 57, 'pː': 58, 'ʒ': 59, 'vː': 60, 'cçː': 61, 'fː': 62, 'hː': 63, 'yː': 64}
Vocab size:  65


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Hungarian pushed to the hub.

Training tokenizer for Turkish...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'a': 4, 'm': 5, 'h': 6, 'e': 7, 'ɾ': 8, 'k': 9, 'lʲ': 10, 'iː': 11, 'b': 12, 'f': 13, 'l̪ˠ': 14, 'n̪': 15, 'ɯ': 16, 'j': 17, 'o': 18, 'z̪': 19, 's̪': 20, 'v': 21, 'd̪': 22, 'i': 23, 'p': 24, 'ɟ': 25, 'œ': 26, 'y': 27, 'eː': 28, 'd̠ʒ': 29, 'ʃ': 30, 'u': 31, 'ɡ': 32, 't̪': 33, 't̠ʃ': 34, 'aː': 35, 'pː': 36, 'ʒ': 37, 'uː': 38, 'c': 39, 'w': 40}
Vocab size:  41


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Turkish pushed to the hub.

Training tokenizer for Farsi...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'm': 4, 'a̟': 5, 'b': 6, 's': 7, 'e': 8, 'r': 9, 'j': 10, 'h': 11, 't̠ʃ': 12, 'kʰ': 13, 'd̪': 14, 'n̪': 15, 'z': 16, 'ʃ': 17, 'ɡ': 18, 'i': 19, 'u': 20, 'o': 21, 'f': 22, 't̪ʰ': 23, 'ɑ': 24, 'd̠ʒ': 25, 'v': 26, 'pʰ': 27, 'l': 28, 'w': 29, 'ɢ': 30}
Vocab size:  31


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Farsi pushed to the hub.

Training tokenizer for Icelandic...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'pʰ': 4, 'iː': 5, 'i': 6, 'aː': 7, 'r̥': 8, 'ɪ': 9, 'ɛ': 10, 't̪ʰ': 11, 's̺': 12, 'j': 13, 'ä': 14, 'k': 15, 'ʋ': 16, 'ɛː': 17, 'r': 18, 'ei̯': 19, 'θ̻': 20, 'l': 21, 'n̪': 22, 't̪': 23, 'ɬ': 24, 'uː': 25, 'ð̺̞': 26, 'ɡ': 27, 'c': 28, 'h': 29, 'ɔ': 30, 'n̪̥': 31, 'äu̯': 32, 'ŋ̥': 33, 'ʏ': 34, 'm': 35, 'f': 36, 'ɔː': 37, 'x': 38, 'cʰ': 39, 'ou̯': 40, 'p': 41, 'ŋ': 42, 'øɪ̯': 43, 'äi̯': 44, 'ɰ': 45, 'ʏː': 46, 'u': 47, 'ɪː': 48, 'œ': 49, 'ç': 50, 'ə': 51, 'œː': 52, 'ɲ': 53, 'm̥': 54, 'ɔi̯': 55, 'z': 56, 'ɲ̥': 57}
Vocab size:  58


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for Icelandic pushed to the hub.

Training tokenizer for Indonesian...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 's': 4, 'i': 5, 'n': 6, 'm': 7, 'a': 8, 'j': 9, 'u': 10, 'k': 11, 'o': 12, 'h': 13, 'l': 14, 't': 15, 'w': 16, 'd̠ʒ': 17, 'ŋ': 18, 'ə': 19, 'd': 20, 'p': 21, 'ɡ': 22, 'b': 23, 'r': 24, 'ɲ': 25, 't̠ʃ': 26, 'f': 27, 'z': 28, 'ʃ': 29, 'x': 30}
Vocab size:  31
Tokenizer for Indonesian pushed to the hub.

Training tokenizer for Irish...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'kʰ': 4, 'a': 5, 'ɾ̪ʲ': 6, 'd̪ˠ': 7, 'eː': 8, 'ʃ': 9, 'ɪ': 10, 'n̪ˠ': 11, 'ə': 12, 'w': 13, 'l̪ˠ': 14, 'ɛ̝': 15, 'ɡ': 16, 'ɾ̪ˠ': 17, 'mˠ': 18, 'x': 19, 'iː': 20, 'sˠ': 21, 'bˠ': 22, 'pˠʰ': 23, 't̪ʲʰ': 24, 'ɔ̝': 25, 'cʰ': 26, 't̪ˠʰ': 27, 'h': 28, 'vˠ': 29, 'ʊ': 30, 'j': 31, 'oː': 32, 'ɑː': 33, 'fˠ': 34, 'd̠ʒ': 35, 'l̪ʲ': 36, 'iːə': 37, 'uːe': 38, 'uː': 39, 'n̪ʲ': 40, 'd̪ʲ': 41, 'ɐ'

# BPE Tokenizers for CHILDES

In [13]:
dataset = load_dataset('phonemetransformers/CHILDES', 'English', split='train')
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(),
         normalizers.Lowercase(),
         normalizers.Strip(),
         normalizers.StripAccents(),
        ]
    )
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

trainer = trainers.BpeTrainer(vocab_size=8192, special_tokens=["UTT_BOUNDARY", "PAD", "UNK"])
tokenizer.train_from_iterator(dataset['processed_gloss'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)






In [14]:
example = dataset['processed_gloss'][300]
encoding = tokenizer.encode(example)
print(f'Example: {example}')
print(encoding.tokens)

Example: is that what you saw?
['UTT_BOUNDARY', 'Ġis', 'Ġthat', 'Ġwhat', 'Ġyou', 'Ġsaw', '?']


In [15]:
wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='PAD', unk_token='UNK', bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', add_prefix_space=True)
wrapped_tokenizer.push_to_hub("phonemetransformers/CHILDES-English-BPE-gloss-tokenizer")

CommitInfo(commit_url='https://huggingface.co/phonemetransformers/CHILDES-English-BPE-gloss-tokenizer/commit/dc70201e9f3dc609aea522ae4df6cc435f07a55e', commit_message='Upload tokenizer', commit_description='', oid='dc70201e9f3dc609aea522ae4df6cc435f07a55e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/phonemetransformers/CHILDES-English-BPE-gloss-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='phonemetransformers/CHILDES-English-BPE-gloss-tokenizer'), pr_revision=None, pr_num=None)

In [16]:
tokenized = wrapped_tokenizer(example, padding='max_length', max_length=20, truncation=True, add_special_tokens=True)
tokenized

{'input_ids': [0, 115, 92, 95, 67, 781, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [17]:
wrapped_tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

['UTT_BOUNDARY',
 'Ġis',
 'Ġthat',
 'Ġwhat',
 'Ġyou',
 'Ġsaw',
 '?',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD']

In [18]:
wrapped_tokenizer('this is a test .')

{'input_ids': [0, 124, 115, 61, 3630, 45, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}