# Training the CHILDES Tokenizer

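Using the phonemes in the English and French portions of our CHILDES dataset, we build a word-level tokenizer that just splits on whitespace. Its vocabulary is not learned by a trainer: a phoneme is kept if it occurs more than 10 times in a language's training split.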

In [54]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from transformers import GPT2TokenizerFast

In [55]:
# Load the training split of every language configuration, keeping both the
# per-language datasets and one flat list of all their text lines.
languages = ['english', 'french']
train_datasets = []
all_lines = []
for lang in languages:
    lang_train = load_dataset('transformersegmentation/CHILDES', lang, split='train')
    train_datasets.append(lang_train)
    all_lines.extend(lang_train['text'])

Found cached dataset childes (/Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/english/1.0.0/c1a2022b0fe6c73568543b5d30ee329425ac03b8b9f3d320d1fcc49917af66f1)
Found cached dataset childes (/Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/french/1.0.0/c1a2022b0fe6c73568543b5d30ee329425ac03b8b9f3d320d1fcc49917af66f1)


In [56]:
# Build vocabulary
# The reserved symbols take ids 0-5; every other token is appended afterwards
# in the order it is first accepted, so ids depend on dataset order.
vocab = {'UNK' : 0, 'PAD' : 1, 'BOS' : 2, 'EOS' : 3, 'WORD_BOUNDARY' : 4, 'UTT_BOUNDARY' : 5}
# Tokens that are never admitted to the vocabulary, however frequent.
BAD_TOKENS = ['(', ')', 'fr', 'en', '(en)', '(fr)']
# A token must occur strictly more than this many times to be kept.
MIN_COUNT = 10
for dataset in train_datasets:
    # Counts are restarted for each dataset, so the threshold is applied
    # to each language separately rather than to the pooled corpus.
    token_counts = {}
    for line in dataset['text']:
        for token in line.strip().split():
            token_counts[token] = token_counts.get(token, 0) + 1
    frequent_tokens = [tok for tok, seen in token_counts.items() if seen > MIN_COUNT]
    for tok in frequent_tokens:
        if tok in vocab or tok in BAD_TOKENS:
            continue
        vocab[tok] = len(vocab)
vocab
    

{'UNK': 0,
 'PAD': 1,
 'BOS': 2,
 'EOS': 3,
 'WORD_BOUNDARY': 4,
 'UTT_BOUNDARY': 5,
 'dʒ': 6,
 'ʌ': 7,
 's': 8,
 't': 9,
 'l': 10,
 'aɪ': 11,
 'k': 12,
 'j': 13,
 'ʊɹ': 14,
 'b': 15,
 'ʊ': 16,
 'æ': 17,
 'h': 18,
 'oʊ': 19,
 'm': 20,
 'd': 21,
 'uː': 22,
 'w': 23,
 'ɑː': 24,
 'n': 25,
 'ə': 26,
 'ð': 27,
 'ɐ': 28,
 'ɾ': 29,
 'ɪ': 30,
 'ɛ': 31,
 'z': 32,
 'iː': 33,
 'ɛɹ': 34,
 'f': 35,
 'eɪ': 36,
 'ɡ': 37,
 'ᵻ': 38,
 'p': 39,
 'i': 40,
 'əl': 41,
 'tʃ': 42,
 'θ': 43,
 'ŋ': 44,
 'oːɹ': 45,
 'ɹ': 46,
 'ɔɪ': 47,
 'ɔː': 48,
 'aʊ': 49,
 'ɪɹ': 50,
 'v': 51,
 'ɜː': 52,
 'ɚ': 53,
 'ɑːɹ': 54,
 'ɔːɹ': 55,
 'ɔ': 56,
 'ʃ': 57,
 'æː': 58,
 'aɪɚ': 59,
 'iə': 60,
 'ʔ': 61,
 'n̩': 62,
 'oː': 63,
 'aɪə': 64,
 'ʒ': 65,
 'aɪʊɹ': 66,
 'r': 67,
 'ɫ': 68,
 'aɪʊ': 69,
 'y': 70,
 'ɛ̃': 71,
 'a': 72,
 'ʁ': 73,
 'e': 74,
 'ɔ̃': 75,
 'ɑ̃': 76,
 'u': 77,
 'o': 78,
 'ø': 79,
 'œ̃': 80,
 'œ': 81,
 'ɛː': 82,
 'yː': 83,
 'aː': 84,
 'əʊ': 85,
 'ɲ': 86,
 'œː': 87,
 'jː': 88}

In [57]:
# Wrap the hand-built vocabulary in a word-level tokenizer; any token that is
# not a key of `vocab` is mapped to UNK.
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
# Replace each newline with an explicit UTT_BOUNDARY token (the leading space
# keeps it separate from the preceding token), then strip surrounding whitespace.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.Replace("\n", " UTT_BOUNDARY"), normalizers.Strip()]
)
tokenizer.add_special_tokens(["UNK", "PAD", "BOS", "EOS"])
# The vocabulary above was counted with str.split(), so split on whitespace only.
# pre_tokenizers.Whitespace() uses the regex \w+|[^\w\s]+ and would additionally
# break tokens apart at punctuation, which disagrees with how the counts were made.
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [58]:
# Expose the tokenizer through the transformers fast-tokenizer API.
# The special tokens are passed at construction, rather than letting
# GPT2TokenizerFast start from its own defaults and overwriting them afterwards,
# so the wrapper is configured with this vocabulary's markers from the start.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="BOS",
    eos_token="EOS",
    pad_token="PAD",
    unk_token="UNK",
)

In [59]:
# Upload the wrapped CHILDES tokenizer to the Hugging Face Hub; the call's
# return value is left as the last expression so it is shown as the cell output.
wrapped_tokenizer.push_to_hub(repo_id='transformersegmentation/CHILDES-tokenizer')

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/CHILDES-tokenizer/commit/6a4e30117f0b4451678f90b2a9c8c5c67859dd14', commit_message='Upload tokenizer', commit_description='', oid='6a4e30117f0b4451678f90b2a9c8c5c67859dd14', pr_url=None, pr_revision=None, pr_num=None)

# Training the BR Tokenizer

In [64]:
# Training split of the 'br' configuration of the CHILDES dataset repository.
dataset = load_dataset(path='transformersegmentation/CHILDES', name='br', split='train')

Found cached dataset childes (/Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/br/1.0.0/c1a2022b0fe6c73568543b5d30ee329425ac03b8b9f3d320d1fcc49917af66f1)


In [65]:
# Word-level tokenizer for BR whose vocabulary is learned by a trainer instead
# of being supplied up front. Unknown tokens map to UNK and newlines are
# rewritten to " UTT_BOUNDARY" before surrounding whitespace is stripped.
# NOTE(review): UTT_BOUNDARY is not among the trainer's special tokens, so
# presumably it only enters the vocabulary if the training text yields it often
# enough - verify against the trained vocabulary.
# NOTE(review): pre_tokenizers.Whitespace() also splits at punctuation, not
# only at whitespace - confirm that is intended for this corpus.
boundary_marker = normalizers.Replace("\n", " UTT_BOUNDARY")
trainer = trainers.WordLevelTrainer(
    min_frequency=20,
    special_tokens=["UNK", "PAD", "BOS", "EOS"],
)
tokenizer = Tokenizer(models.WordLevel(unk_token="UNK"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.normalizer = normalizers.Sequence([boundary_marker, normalizers.Strip()])
tokenizer.train_from_iterator(dataset['text'], trainer=trainer)

In [66]:
# Expose the BR tokenizer through the transformers fast-tokenizer API.
# The special tokens are passed at construction, rather than letting
# GPT2TokenizerFast start from its own defaults and overwriting them afterwards,
# so the wrapper is configured with this vocabulary's markers from the start.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="BOS",
    eos_token="EOS",
    pad_token="PAD",
    unk_token="UNK",
)

In [67]:
# Upload the wrapped BR tokenizer to the Hugging Face Hub; the call's return
# value is left as the last expression so it is shown as the cell output.
wrapped_tokenizer.push_to_hub(repo_id='transformersegmentation/BR-tokenizer')

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BR-Tokenizer/commit/989424ca42e299a1583fc4681e55d46dc4a853e5', commit_message='Upload tokenizer', commit_description='', oid='989424ca42e299a1583fc4681e55d46dc4a853e5', pr_url=None, pr_revision=None, pr_num=None)