# Training the CHILDES Tokenizer

Using the phonemes in our CHILDES dataset, we train a tokenizer that just splits according to whitespace.

In [1]:
from collections import Counter

import pandas as pd

from datasets import load_dataset, get_dataset_config_names
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from transformers import GPT2TokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Read phoible.csv and keep the distinct values of its `Phoneme` column;
# `build_vocabulary` checks candidate tokens against this array.
phoible_csv_path = '../../../data/phoible.csv'
phoible = pd.read_csv(phoible_csv_path)
phoible_phonemes = phoible['Phoneme'].unique()

  phoible = pd.read_csv('../../../data/phoible.csv')


In [3]:
MIN_COUNT = 10

def build_vocabulary(datasets, column='phonemized_utterance', allow_non_phoible=False,
                     min_count=None, phoneme_inventory=None):
    """Build a token -> id vocabulary from whitespace-separated tokens.

    Every line of ``dataset[column]`` (for each dataset) is stripped and split
    on whitespace, and the resulting tokens are counted across all datasets.
    A token is added to the vocabulary when it occurs strictly more than
    ``min_count`` times and, unless ``allow_non_phoible`` is set, is a member
    of the phoneme inventory. Ids are assigned in order of first appearance,
    after the four reserved tokens.

    Parameters
    ----------
    datasets : iterable
        Each item is indexed with ``column`` and must yield an iterable of strings.
    column : str
        Name of the column holding the whitespace-separated tokens.
    allow_non_phoible : bool
        If True, frequent tokens are kept even when they are not in the inventory
        (the inventory is not consulted at all in that case).
    min_count : int, optional
        Frequency threshold; defaults to the module-level ``MIN_COUNT``.
    phoneme_inventory : iterable of str, optional
        Accepted tokens; defaults to the notebook-level ``phoible_phonemes``.

    Returns
    -------
    dict
        Mapping from token to integer id, starting with
        ``UNK``=0, ``PAD``=1, ``WORD_BOUNDARY``=2, ``UTT_BOUNDARY``=3.
    """
    if min_count is None:
        min_count = MIN_COUNT

    # Counter preserves first-seen order, so ids below follow order of first appearance.
    token_counts = Counter()
    for dataset in datasets:
        for line in dataset[column]:
            token_counts.update(line.strip().split())

    if allow_non_phoible:
        inventory = None
    else:
        # A set gives O(1) membership tests instead of scanning the array once per token.
        inventory = set(phoible_phonemes if phoneme_inventory is None else phoneme_inventory)

    vocab = {'UNK' : 0, 'PAD' : 1, 'WORD_BOUNDARY' : 2, 'UTT_BOUNDARY' : 3}
    unk_tokens = []

    # Keep tokens seen more than min_count times; frequent tokens that are missing
    # from the inventory are only reported, not added (unless allow_non_phoible).
    for token, count in token_counts.items():
        if count <= min_count or token in vocab:
            continue
        if inventory is not None and token not in inventory:
            unk_tokens.append(token)
        else:
            vocab[token] = len(vocab)

    print('Tokens not found in phoible: ', {token: token_counts[token] for token in unk_tokens})
    print('Vocab: ', vocab)
    print('Vocab size: ', len(vocab))
    return vocab

def build_phoneme_tokenizer(vocab):
    """Wrap a phoneme vocabulary in a word-level tokenizer that splits on whitespace.

    Parameters
    ----------
    vocab : dict
        Mapping from token to id. It must contain 'UNK', 'PAD' and 'UTT_BOUNDARY',
        which are referenced by name below.

    Returns
    -------
    GPT2TokenizerFast
        A wrapper around the word-level tokenizer that uses 'UTT_BOUNDARY' as both
        BOS and EOS token, 'PAD' for padding and 'UNK' for unknown tokens. Every
        encoded sequence is prefixed with 'UTT_BOUNDARY' by the post-processor.
    """
    tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
    tokenizer.normalizer = normalizers.Sequence([normalizers.Strip()])
    tokenizer.add_special_tokens(["UNK", "PAD"])

    # WhitespaceSplit behaves like str.split(), which is how the vocabulary is built.
    # pre_tokenizers.Whitespace() would additionally split at word/non-word character
    # boundaries, breaking apart any phoneme that contains a non-word symbol.
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    tokenizer.post_processor = processors.TemplateProcessing(
        single="UTT_BOUNDARY $A",
        pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
        special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
    )

    wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, bos_token='UTT_BOUNDARY', eos_token='UTT_BOUNDARY', pad_token='PAD', unk_token='UNK')
    return wrapped_tokenizer

# Training a Tokenizer for each language in CHILDES

We create a separate tokenizer for each language, so that each vocabulary contains only the phonemes of that language. Tokens that occur `MIN_COUNT` times or fewer are discarded. For most languages we also remove any tokens not found in Phoible. We do not do this for Mandarin or Cantonese because, for these languages, we merge the tone marker and the preceding vowel into a single phoneme, whereas Phoible treats tone markers as independent symbols.

In [4]:
# One dataset configuration per language; load the 'train' split of each,
# in the same order as `languages`.
childes_repo = 'transformersegmentation/CHILDES'
languages = get_dataset_config_names(childes_repo)
print('Languages:', languages)
datasets = [
    load_dataset(childes_repo, config_name, split='train')
    for config_name in languages
]

Languages: ['English', 'EnglishUK', 'French', 'German', 'Spanish', 'Dutch', 'Mandarin', 'Japanese', 'Cantonese', 'Estonian', 'Croatian', 'Danish', 'Basque', 'Hungarian', 'Turkish', 'Farsi', 'Icelandic', 'Indonesian', 'Irish', 'Welsh', 'Korean', 'Swedish', 'Norwegian', 'Quechua', 'Catalan', 'Italian', 'PortuguesePt', 'PortugueseBr', 'Romanian']


In [5]:
# For Mandarin and Cantonese, allow non-phoible tokens since we merge tone with vowels.
TONE_MERGED_LANGUAGES = ('Mandarin', 'Cantonese')

# Build, then publish, one phoneme tokenizer per language.
for language, dataset in zip(languages, datasets):
    print(f'\nTraining tokenizer for {language}...')
    allow_non_phoible = language in TONE_MERGED_LANGUAGES
    vocab = build_vocabulary([dataset], allow_non_phoible=allow_non_phoible)
    tokenizer = build_phoneme_tokenizer(vocab)
    tokenizer.push_to_hub(f"transformersegmentation/CHILDES-{language}-phoneme-tokenizer")
    print(f'Tokenizer for {language} pushed to the hub.')

# Build one tokenizer over the pooled data of every language.
# NOTE(review): this call uses the default allow_non_phoible=False, so the merged
# tone+vowel tokens of Mandarin and Cantonese are presumably excluded from the
# combined vocabulary -- confirm this is intended.
print('\nTraining tokenizer for all languages...')
vocab = build_vocabulary(datasets)
tokenizer = build_phoneme_tokenizer(vocab)
tokenizer.push_to_hub("transformersegmentation/CHILDES-phoneme-tokenizer")
print('Done.')



Trainking tokenizer for English...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 's': 4, 'iː': 5, 'ð': 6, 'ɛ': 7, 'ɹ': 8, 'z': 9, 'ʌ': 10, 'f': 11, 'eɪ': 12, 'w': 13, 'ɪ': 14, 'ɡ': 15, 'l': 16, 'æ': 17, 'ɑ': 18, 'h': 19, 'ə': 20, 'ʊ': 21, 'k': 22, 'p': 23, 'uː': 24, 'b': 25, 'i': 26, 't': 27, 'aɪ': 28, 'θ': 29, 'ŋ': 30, 'j': 31, 'ɔ': 32, 'm': 33, 'ɔɪ': 34, 'n': 35, 'd': 36, 'oʊ': 37, 'aʊ': 38, 'v': 39, 'ɜː': 40, 't̠ʃ': 41, 'd̠ʒ': 42, 'ʃ': 43, 'iə': 44, 'ʒ': 45, 'ɑ̃': 46, 'r': 47, 'x': 48}
Vocab size:  49
Tokenizer for English pushed to the hub.

Trainking tokenizer for EnglishUK...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'eɪ': 4, 't̠ʃ': 5, 'w': 6, 'ɒ': 7, 't': 8, 'd': 9, 'ʌ': 10, 'z': 11, 'ð': 12, 'a': 13, 'm': 14, 'iː': 15, 'n': 16, 'ɛ': 17, 'k': 18, 's': 19, 'ɪ': 20, 'ɡ': 21, 'ʊ': 22, 'ɑː': 23, 'ɔː': 24, 'l': 25, 'ə': 26, 'ɹ': 27, 'i': 28, 'əʊ': 29, 'uː': 30, 'j': 31, 

# BPE Tokenizers for CHILDES

In [4]:
# Train a byte-level BPE tokenizer on the `processed_gloss` column of the English split.
dataset = load_dataset('transformersegmentation/CHILDES', 'English', split='train')

# Normalisation applied before pre-tokenisation, in this order.
normalization_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.Strip(),
    normalizers.StripAccents(),
]

tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(normalization_steps)
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

# Special tokens are listed first so that they are part of the trained vocabulary.
bpe_special_tokens = ["UTT_BOUNDARY", "PAD", "UNK"]
trainer = trainers.BpeTrainer(vocab_size=8192, special_tokens=bpe_special_tokens)
tokenizer.train_from_iterator(dataset['processed_gloss'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()

# Prefix every encoded sequence with the utterance-boundary token.
utt_boundary_id = tokenizer.token_to_id("UTT_BOUNDARY")
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", utt_boundary_id)],
)

Downloading readme: 100%|██████████| 11.1k/11.1k [00:00<00:00, 6.42MB/s]
Downloading data: 100%|██████████| 597M/597M [00:28<00:00, 20.9MB/s] 
Downloading data: 100%|██████████| 3.62M/3.62M [00:00<00:00, 7.24MB/s]
Generating train split: 1635797 examples [00:09, 179180.63 examples/s]
Generating valid split: 10000 examples [00:00, 158621.60 examples/s]







In [5]:
# Encode a single utterance to inspect the tokens the trained tokenizer produces.
# Index the row first and the column second: this fetches one row instead of
# materialising the whole `processed_gloss` column just to read one element.
example = dataset[300]['processed_gloss']
encoding = tokenizer.encode(example)
print(f'Example: {example}')
print(encoding.tokens)

Example: we're gonna get some clothes for thomas.
['UTT_BOUNDARY', 'Ġwe', "'re", 'Ġgonna', 'Ġget', 'Ġsome', 'Ġclothes', 'Ġfor', 'Ġthomas', '.']


In [7]:
# Wrap the trained tokenizer for use with `transformers`, then upload it to the hub.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    unk_token='UNK',
    pad_token='PAD',
    add_prefix_space=True,
)
wrapped_tokenizer.push_to_hub("transformersegmentation/CHILDES-English-BPE-gloss-tokenizer")

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/CHILDES-English-BPE-gloss-tokenizer/commit/90f971556e7648c3c03e704268f1a281c47ef676', commit_message='Upload tokenizer', commit_description='', oid='90f971556e7648c3c03e704268f1a281c47ef676', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# Tokenize the example, padding (or truncating) the result to exactly 20 ids.
tokenized = wrapped_tokenizer(
    example,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=20,
)
tokenized

{'input_ids': [0, 90, 152, 188, 166, 197, 1217, 190, 1863, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [9]:
# Map the padded ids back to their token strings to check the boundary and padding tokens.
wrapped_tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

['UTT_BOUNDARY',
 'Ġwe',
 "'re",
 'Ġgonna',
 'Ġget',
 'Ġsome',
 'Ġclothes',
 'Ġfor',
 'Ġthomas',
 '.',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD']

In [10]:
# Call the wrapped tokenizer on a short sentence with default settings (no padding requested).
wrapped_tokenizer('this is a test .')

{'input_ids': [0, 115, 91, 45, 3501, 37, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}