# Training the CHILDES Tokenizer

Using the phonemes in our CHILDES dataset, we train a word-level tokenizer that simply splits on whitespace, so every space-separated phoneme in a transcription becomes one token.

In [1]:
import re
from collections import Counter

import pandas as pd
from datasets import load_dataset, get_dataset_config_names
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from transformers import GPT2TokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the Phoible inventory and keep the distinct phoneme symbols; these are
# used later to decide which corpus tokens are legitimate phonemes.
# NOTE(review): the recorded run emitted a pandas warning on this read
# (presumably a DtypeWarning about mixed-type columns - confirm).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what that warning recommends.
phoible = pd.read_csv('../data/phoible.csv', low_memory=False)
phoible_phonemes = phoible.Phoneme.unique()

  phoible = pd.read_csv('../data/phoible.csv')


In [8]:
# Default frequency threshold: a token must occur MORE than this many times to be kept.
MIN_COUNT = 10
# One or more stress-like marks: primary stress (ˈ), secondary stress (ˌ), apostrophe or hyphen.
STRESS_RE = re.compile(r"[ˈˌ'-]+")

def build_vocabulary(datasets, column='ipa_transcription', allow_non_phoible=False, allow_stressed_tokens=False,
                     phonemes=None, min_count=None):
    """Build a token -> id vocabulary from whitespace-separated transcriptions.

    Every whitespace-separated token in ``dataset[column]`` is counted across all
    ``datasets``. A token that occurs more than ``min_count`` times is added to the
    vocabulary (ids assigned in first-seen order, after the four reserved tokens)
    when at least one of the following holds:

    * ``allow_non_phoible`` is true,
    * the token is one of ``phonemes``,
    * ``allow_stressed_tokens`` is true and the token contains a ``STRESS_RE`` match.

    Frequent tokens that satisfy none of these are reported as "not found in
    phoible" and left out.

    Args:
        datasets: iterable of objects where ``dataset[column]`` yields one
            transcription string per utterance.
        column: name of the column holding the transcriptions.
        allow_non_phoible: keep frequent tokens even if they are not in ``phonemes``.
        allow_stressed_tokens: keep frequent tokens that carry a stress mark even
            if the token as written is not in ``phonemes``.
        phonemes: iterable of accepted phoneme strings. Defaults to the
            notebook-level ``phoible_phonemes``.
        min_count: frequency threshold (exclusive). Defaults to ``MIN_COUNT``.

    Returns:
        dict mapping token -> integer id, starting with
        ``UNK``=0, ``PAD``=1, ``WORD_BOUNDARY``=2, ``UTT_BOUNDARY``=3.
    """
    if phonemes is None:
        phonemes = phoible_phonemes
    # A set gives O(1) membership; `in` on a NumPy array scans the whole array every time.
    known_phonemes = set(phonemes)
    if min_count is None:
        min_count = MIN_COUNT

    token_counts = Counter()
    for dataset in datasets:
        for line in dataset[column]:
            if not line:
                # Skip missing (None) or empty transcriptions instead of crashing on them.
                continue
            token_counts.update(line.split())

    vocab = {'UNK' : 0, 'PAD' : 1, 'WORD_BOUNDARY' : 2, 'UTT_BOUNDARY' : 3}
    unk_tokens = []
    # Keep frequent tokens that are known phonemes (or explicitly allowed); collect the rest for reporting.
    for token, count in token_counts.items():
        if count <= min_count or token in vocab:
            continue
        accepted = (
            allow_non_phoible
            or token in known_phonemes
            or (allow_stressed_tokens and STRESS_RE.search(token) is not None)
        )
        if accepted:
            vocab[token] = len(vocab)
        else:
            unk_tokens.append(token)

    print('Tokens not found in phoible: ', {token: token_counts[token] for token in unk_tokens})
    print('Vocab: ', vocab)
    print('Vocab size: ', len(vocab))
    return vocab

def build_phoneme_tokenizer(vocab, add_stress_replacer=False):
    """Wrap a phoneme vocabulary in a whitespace-splitting word-level tokenizer.

    Args:
        vocab: dict mapping token -> id; it is expected to contain the tokens
            'UNK', 'PAD', 'UTT_BOUNDARY' and 'WORD_BOUNDARY'.
        add_stress_replacer: when true, every vocabulary entry that contains a
            ``STRESS_RE`` match is rewritten as a single leading "ˈ" followed by
            the entry with all stress marks removed. A ``Replace`` normalizer is
            registered for each entry that changes, and ids are reassigned in
            iteration order so that entries collapsing onto the same form share
            one id.

    Returns:
        A ``GPT2TokenizerFast`` around the built tokenizer, using 'UTT_BOUNDARY'
        as both bos and eos token, 'PAD' as pad token and 'UNK' as unk token.
    """
    normalizer_steps = []
    if add_stress_replacer:
        # Collapse every kind of stress marking onto one primary stress marker.
        collapsed_vocab = {}
        for original in vocab:
            canonical = original
            if STRESS_RE.search(original):
                canonical = "ˈ" + STRESS_RE.sub('', original)
                if canonical != original:
                    normalizer_steps.append(normalizers.Replace(original, canonical))
            # First occurrence of a canonical form claims the next free id.
            collapsed_vocab.setdefault(canonical, len(collapsed_vocab))
        vocab = collapsed_vocab
        print('Using only primary stress markers...')
        print('New vocab: ', vocab)
        print('New vocab size: ', len(vocab))
    # Stripping always runs last, after any stress replacements.
    normalizer_steps.append(normalizers.Strip())

    tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
    tokenizer.normalizer = normalizers.Sequence(normalizer_steps)
    tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY", "WORD_BOUNDARY"])
    # WhitespaceSplit behaves like str.split(), so a phoneme is never broken apart.
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
    # Each sequence is prefixed with the utterance-boundary token.
    utt_boundary_id = tokenizer.token_to_id("UTT_BOUNDARY")
    tokenizer.post_processor = processors.TemplateProcessing(
        single="UTT_BOUNDARY $A",
        pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
        special_tokens=[("UTT_BOUNDARY", utt_boundary_id)],
    )

    return GPT2TokenizerFast(
        tokenizer_object=tokenizer,
        bos_token='UTT_BOUNDARY',
        eos_token='UTT_BOUNDARY',
        pad_token='PAD',
        unk_token='UNK',
    )


# Training a Tokenizer for each language in CHILDES

We create a separate tokenizer for each language, so that each vocabulary contains only the phonemes of that language. For most languages we discard any token that is not found in Phoible. We make an exception for Mandarin and Cantonese: in these languages we merge each tone marker with the preceding vowel into a single phoneme, whereas Phoible treats tone markers as independent symbols, so the merged phonemes would never be found in Phoible.

In [6]:
# Discover every language configuration of IPA-CHILDES and load its train split,
# keyed by language name.
languages = get_dataset_config_names('phonemetransformers/IPA-CHILDES')
print('Languages:', languages)
datasets = {}
for language in languages:
    datasets[language] = load_dataset('phonemetransformers/IPA-CHILDES', language, split='train')

Languages: ['EnglishNA', 'EnglishUK', 'French', 'German', 'Spanish', 'Dutch', 'Mandarin', 'Japanese', 'Cantonese', 'Estonian', 'Croatian', 'Danish', 'Basque', 'Hungarian', 'Turkish', 'Farsi', 'Icelandic', 'Indonesian', 'Irish', 'Welsh', 'Korean', 'Swedish', 'Norwegian', 'Quechua', 'Catalan', 'Italian', 'PortuguesePt', 'PortugueseBr', 'Romanian', 'Serbian', 'Polish']


Downloading data: 100%|██████████| 856M/856M [01:03<00:00, 13.5MB/s] 
Generating train split: 2564614 examples [00:13, 183649.41 examples/s]
Downloading data: 100%|██████████| 649M/649M [00:55<00:00, 11.7MB/s] 
Generating train split: 2043115 examples [00:09, 206074.50 examples/s]
Downloading data: 100%|██████████| 267M/267M [00:29<00:00, 8.90MB/s] 
Generating train split: 721121 examples [00:04, 153748.52 examples/s]
Downloading data: 100%|██████████| 544M/544M [00:52<00:00, 10.4MB/s] 
Generating train split: 1525559 examples [00:08, 172756.08 examples/s]
Downloading data: 100%|██████████| 191M/191M [00:18<00:00, 10.5MB/s] 
Generating train split: 533308 examples [00:02, 179323.77 examples/s]
Downloading data: 100%|██████████| 130M/130M [00:12<00:00, 10.3MB/s] 
Generating train split: 403472 examples [00:02, 200529.49 examples/s]
Downloading data: 100%|██████████| 209M/209M [00:19<00:00, 10.6MB/s] 
Generating train split: 530022 examples [00:03, 140328.05 examples/s]
Downloading data:

In [10]:
# Build one phoneme tokenizer per language and save it to a local directory.
for language, dataset in datasets.items():
    print(f'\nTraining tokenizer for {language}...')
    # Mandarin and Cantonese phonemes carry a merged tone marker, so they are kept
    # even when they are not listed in Phoible.
    allow_non_phoible = language in ('Mandarin', 'Cantonese')
    vocab = build_vocabulary(
        [dataset],
        allow_non_phoible=allow_non_phoible,
        allow_stressed_tokens=True,
    )
    tokenizer = build_phoneme_tokenizer(vocab, add_stress_replacer=True)
    # Saved locally only; nothing is pushed to the Hub here.
    tokenizer.save_pretrained(f'ipa-childes-tokenizers/{language}')
    print(f'Tokenizer for {language} saved.')

# Disabled alternative: one tokenizer trained on all languages and pushed to the Hub.
# print(f'\nTraining tokenizer for all languages...')
# vocab = build_vocabulary(datasets.values())
# tokenizer = build_phoneme_tokenizer(vocab)
# tokenizer.push_to_hub("phonemetransformers/CHILDES-phoneme-tokenizer")
# print('Done.')



Training tokenizer for EnglishNA...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'd̠ʒ': 4, 'ʌ': 5, 's': 6, 't': 7, 'l': 8, 'aɪ': 9, 'k': 10, 'j': 11, 'ʊ': 12, 'ɹ': 13, 'b': 14, 'æ': 15, 'h': 16, 'oʊ': 17, 'm': 18, 'iː': 19, 'ð': 20, 'ɛ': 21, 'z': 22, 'f': 23, 'eɪ': 24, 'w': 25, 'ɪ': 26, 'ɡ': 27, 'ɑ': 28, 'ə': 29, 'p': 30, 'uː': 31, 'i': 32, 'θ': 33, 'ŋ': 34, 'ɔ': 35, 'ɔɪ': 36, 'n': 37, 'd': 38, 'aʊ': 39, 'v': 40, 'ɜː': 41, 't̠ʃ': 42, 'ʃ': 43, 'iə': 44, 'ʒ': 45, 'x': 46}
Vocab size:  47
Using only primary stress markers...
New vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'd̠ʒ': 4, 'ʌ': 5, 's': 6, 't': 7, 'l': 8, 'aɪ': 9, 'k': 10, 'j': 11, 'ʊ': 12, 'ɹ': 13, 'b': 14, 'æ': 15, 'h': 16, 'oʊ': 17, 'm': 18, 'iː': 19, 'ð': 20, 'ɛ': 21, 'z': 22, 'f': 23, 'eɪ': 24, 'w': 25, 'ɪ': 26, 'ɡ': 27, 'ɑ': 28, 'ə': 29, 'p': 30, 'uː': 31, 'i': 32, 'θ': 33, 'ŋ': 34, 'ɔ': 35, 'ɔɪ': 36, 'n': 37, 'd': 38, 'aʊ': 39, 'v': 40, 'ɜː': 4

In [11]:
from transformers import AutoTokenizer

def check_tokenizer(tokenizer, verbose=False):
    """Check that every vocabulary entry encodes to exactly its own id.

    Background: the ``Whitespace`` pre-tokenizer does not treat tone symbols as
    part of a word, so for the Cantonese and Mandarin tokenizers it split
    phonemes like 'a˥' in two and produced two UNK tokens. Using the
    ``WhitespaceSplit`` pre-tokenizer, which works like ``str.split()``, fixes
    this; this function guards against that kind of regression.

    Args:
        tokenizer: object exposing ``vocab`` (mapping token -> id) and
            ``encode(text)`` (returning a list of ids).
        verbose: when true, print one line per vocabulary entry that does not
            round-trip.

    Returns:
        True if every entry encodes to ``[its id]`` once the first id of the
        encoding is dropped, False otherwise.
    """
    failures = []
    for token, token_id in tokenizer.vocab.items():
        # Drop the first id: the tokenizers built in this notebook prefix every
        # sequence with UTT_BOUNDARY.
        encoded = tokenizer.encode(token)[1:]
        if encoded != [token_id]:
            failures.append((token, encoded))
    if verbose:
        for token, encoded in failures:
            print(f'Tokenizer failed to encode "{token}", gave {encoded}')
    return not failures

# Reload each saved tokenizer from disk and confirm its whole vocabulary round-trips.
for language in datasets:
    saved_tokenizer = AutoTokenizer.from_pretrained(f'ipa-childes-tokenizers/{language}')
    print(f'{language} tokenizer is ok: {check_tokenizer(saved_tokenizer)}')

EnglishNA tokenizer is ok: True
EnglishUK tokenizer is ok: True
French tokenizer is ok: True
German tokenizer is ok: True
Spanish tokenizer is ok: True
Dutch tokenizer is ok: True
Mandarin tokenizer is ok: True
Japanese tokenizer is ok: True
Cantonese tokenizer is ok: True
Estonian tokenizer is ok: True
Croatian tokenizer is ok: True
Danish tokenizer is ok: True
Basque tokenizer is ok: True
Hungarian tokenizer is ok: True
Turkish tokenizer is ok: True
Farsi tokenizer is ok: True
Icelandic tokenizer is ok: True
Indonesian tokenizer is ok: True
Irish tokenizer is ok: True
Welsh tokenizer is ok: True
Korean tokenizer is ok: True
Swedish tokenizer is ok: True
Norwegian tokenizer is ok: True
Quechua tokenizer is ok: True
Catalan tokenizer is ok: True
Italian tokenizer is ok: True
PortuguesePt tokenizer is ok: True
PortugueseBr tokenizer is ok: True
Romanian tokenizer is ok: True
Serbian tokenizer is ok: True
Polish tokenizer is ok: True


# BPE Tokenizers for CHILDES

In [7]:
# Train a byte-level BPE tokenizer on the orthographic (gloss) English utterances.
# The config names returned by get_dataset_config_names for IPA-CHILDES contain
# 'EnglishNA' and 'EnglishUK' but no plain 'English', so loading 'English' fails
# on a fresh run.
# NOTE(review): 'EnglishNA' is assumed to be the intended variety, and the config
# is assumed to still expose a 'processed_gloss' column - confirm both.
dataset = load_dataset('phonemetransformers/IPA-CHILDES', 'EnglishNA', split='train')
tokenizer = Tokenizer(models.BPE())
# Decompose characters (NFD), lowercase, trim surrounding whitespace, then drop accents.
tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(),
         normalizers.Lowercase(),
         normalizers.Strip(),
         normalizers.StripAccents(),
        ]
    )
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

# Special tokens are listed first, so they receive ids 0, 1 and 2 in this order.
trainer = trainers.BpeTrainer(vocab_size=8192, special_tokens=["UTT_BOUNDARY", "PAD", "UNK"])
tokenizer.train_from_iterator(dataset['processed_gloss'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()
# Prefix every sequence with the utterance-boundary token.
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", tokenizer.token_to_id("UTT_BOUNDARY"))],
)






In [8]:
# Sanity check: encode one utterance and inspect the BPE tokens it produces.
# Index the row first and the column second: dataset['processed_gloss'][300]
# reads the entire column just to pick a single element.
example = dataset[300]['processed_gloss']
encoding = tokenizer.encode(example)
print(f'Example: {example}')
print(encoding.tokens)

Example: is that what you saw?
['UTT_BOUNDARY', 'Ġis', 'Ġthat', 'Ġwhat', 'Ġyou', 'Ġsaw', '?']


In [9]:
# Wrap the trained tokenizer in a transformers fast tokenizer, declaring which
# tokens play the bos/eos/pad/unk roles, then upload it to the Hugging Face Hub.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    pad_token='PAD',
    unk_token='UNK',
    add_prefix_space=True,
)
wrapped_tokenizer.push_to_hub("phonemetransformers/CHILDES-English-BPE-gloss-tokenizer")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/phonemetransformers/CHILDES-English-BPE-gloss-tokenizer/commit/dc70201e9f3dc609aea522ae4df6cc435f07a55e', commit_message='Upload tokenizer', commit_description='', oid='dc70201e9f3dc609aea522ae4df6cc435f07a55e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/phonemetransformers/CHILDES-English-BPE-gloss-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='phonemetransformers/CHILDES-English-BPE-gloss-tokenizer'), pr_revision=None, pr_num=None)

In [10]:
# Encode the example again through the wrapped tokenizer, padding or truncating
# to exactly 20 ids, and display the result.
tokenized = wrapped_tokenizer(
    example,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=20,
)
tokenized

{'input_ids': [0, 115, 92, 95, 67, 781, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [11]:
# Map the padded ids back to their token strings to check the boundary and PAD tokens by eye.
wrapped_tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

['UTT_BOUNDARY',
 'Ġis',
 'Ġthat',
 'Ġwhat',
 'Ġyou',
 'Ġsaw',
 '?',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD']

In [12]:
# Encode a hand-written sentence with the default call arguments (no padding or truncation requested).
wrapped_tokenizer('this is a test .')

{'input_ids': [0, 124, 115, 61, 3630, 45, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}