# Training the CHILDES Tokenizer

Using the phonemes in our CHILDES dataset, we build a word-level tokenizer that splits on whitespace only, so that each whitespace-separated phoneme symbol becomes one token.

In [1]:
from collections import Counter

import pandas as pd

from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from transformers import GPT2TokenizerFast

In [2]:
# CHILDES configurations to load; one train split is fetched per language,
# in this order.
languages = ['English', 'French', 'German']
train_datasets = [
    load_dataset('transformersegmentation/CHILDES', lang, split='train')
    for lang in languages
]

# Pool the utterances of every language into a single flat list.
all_lines = []
for dataset in train_datasets:
    all_lines.extend(dataset['text'])

Found cached dataset childes (/Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/English/1.0.0/095e19727e2d33f7808ec4d5c95d086a19ab190ee0ae9ded0d0f7532fa5652c8)


Downloading and preparing dataset childes/French to /Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/French/1.0.0/095e19727e2d33f7808ec4d5c95d086a19ab190ee0ae9ded0d0f7532fa5652c8...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.88M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/536k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/556k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset childes downloaded and prepared to /Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/French/1.0.0/095e19727e2d33f7808ec4d5c95d086a19ab190ee0ae9ded0d0f7532fa5652c8. Subsequent calls will reuse this data.


Found cached dataset childes (/Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/German/1.0.0/095e19727e2d33f7808ec4d5c95d086a19ab190ee0ae9ded0d0f7532fa5652c8)


In [3]:
# Load the PHOIBLE inventory table; its `Phoneme` column is the reference list
# of valid phoneme symbols used when building the vocabulary.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The saved output of this cell shows a warning
# raised on the read_csv line (presumably pandas' mixed-dtype DtypeWarning —
# TODO confirm); reading the file in one pass avoids that class of warning.
phoible = pd.read_csv('../data/phoible.csv', low_memory=False)

# Distinct phoneme symbols, in order of first appearance in the table.
phonemes = phoible['Phoneme'].unique()

  phoible = pd.read_csv('../data/phoible.csv')


In [10]:
# Build the vocabulary language by language, so that when we add more languages
# the IDs already assigned to earlier languages don't change.
vocab = {'UNK' : 0, 'PAD' : 1, 'BOS' : 2, 'EOS' : 3, 'WORD_BOUNDARY' : 4, 'UTT_BOUNDARY' : 5}
# Tokens that pass the frequency threshold but are not PHOIBLE phonemes.
# They are reported below and deliberately left out of the vocabulary.
unk_tokens = []
# NOTE: the threshold is exclusive. A token must occur strictly MORE than
# MIN_COUNT times within a single language to be considered. The comparison is
# kept exactly as it was so that existing token IDs are not disturbed.
MIN_COUNT = 10
# Hash-based membership test instead of scanning the phoneme array per token.
phoneme_set = set(phonemes)
for dataset in train_datasets:
    # Counter preserves first-occurrence order, so new IDs are handed out in
    # the order tokens first appear in this language's data.
    token_counts = Counter(
        token for line in dataset['text'] for token in line.split()
    )
    for token, count in token_counts.items():
        if count <= MIN_COUNT or token in vocab:
            continue
        if token in phoneme_set:
            vocab[token] = len(vocab)
        elif token not in unk_tokens:
            # Record each unknown symbol once, even if several languages use it.
            unk_tokens.append(token)
print('Tokens not found in phoible: ', unk_tokens)
print('Vocab: ', vocab)
    

Tokens not found in phoible:  ['ᵻ', 'ɔø', 'ʊɐ']
Vocab:  {'UNK': 0, 'PAD': 1, 'BOS': 2, 'EOS': 3, 'WORD_BOUNDARY': 4, 'UTT_BOUNDARY': 5, 'd̠ʒ': 6, 'ʌ': 7, 's': 8, 't': 9, 'l': 10, 'aɪ': 11, 'k': 12, 'j': 13, 'ʊ': 14, 'ɹ': 15, 'b': 16, 'æ': 17, 'h': 18, 'oʊ': 19, 'm': 20, 'iː': 21, 'ð': 22, 'ɛ': 23, 'z': 24, 'ɐ': 25, 'f': 26, 'eɪ': 27, 'w': 28, 'ɪ': 29, 'ɡ': 30, 'ɑː': 31, 'p': 32, 'uː': 33, 'i': 34, 'ɾ': 35, 'ə': 36, 't̠ʃ': 37, 'd': 38, 'θ': 39, 'ŋ': 40, 'oː': 41, 'ɔɪ': 42, 'ɔː': 43, 'n': 44, 'aʊ': 45, 'v': 46, 'ɜː': 47, 'ɚ': 48, 'ɔ': 49, 'ʃ': 50, 'æː': 51, 'ʔ': 52, 'n̩': 53, 'ʒ': 54, 'r': 55, 'ɫ': 56, 'y': 57, 'ɛ̃': 58, 'a': 59, 'ʁ': 60, 'e': 61, 'ɔ̃': 62, 'ɑ̃': 63, 'u': 64, 'o': 65, 'ø': 66, 'œ̃': 67, 'œ': 68, 'ɛː': 69, 'yː': 70, 'aː': 71, 'ɲ': 72, 'œː': 73, 'əʊ': 74, 'ts': 75, 'eː': 76, 'ç': 77, 'x': 78, 'ɛɪ': 79, 'ɜ': 80, 'ɑ': 81, 'ʏ': 82, 'pf': 83, 'øː': 84}


In [7]:
# Word-level model over the fixed phoneme vocabulary built above; any token
# that is not in `vocab` is mapped to UNK.
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))

# Replace each newline with an explicit UTT_BOUNDARY token (the leading space
# keeps it separate from the preceding phoneme), then strip outer whitespace.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.Replace("\n", " UTT_BOUNDARY"), normalizers.Strip()]
)
tokenizer.add_special_tokens(["UNK", "PAD", "BOS", "EOS"])

# Split on whitespace ONLY. `pre_tokenizers.Whitespace()` uses the pattern
# `\w+|[^\w\s]+`, which additionally breaks a token wherever word characters
# meet punctuation/symbol characters; `WhitespaceSplit()` keeps every
# whitespace-delimited phoneme symbol intact.
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [8]:
# Wrap the `tokenizers` object so it can be used through the transformers API.
# The special tokens are passed to the constructor rather than assigned
# afterwards, so the wrapper is never initialised with GPT-2's default
# `<|endoftext|>` token for the bos/eos/unk roles.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="BOS",
    eos_token="EOS",
    pad_token="PAD",
    unk_token="UNK",
)

In [9]:
# Publish the wrapped CHILDES tokenizer to the Hugging Face Hub.
childes_tokenizer_repo = 'transformersegmentation/CHILDES-tokenizer'
wrapped_tokenizer.push_to_hub(childes_tokenizer_repo)

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/CHILDES-tokenizer/commit/eb227a47a632a826509b8d6d2d1065b91affbca6', commit_message='Upload tokenizer', commit_description='', oid='eb227a47a632a826509b8d6d2d1065b91affbca6', pr_url=None, pr_revision=None, pr_num=None)

# Training the BR Tokenizer

In [64]:
# Train split of the 'br' configuration of the CHILDES dataset repository.
dataset = load_dataset(
    'transformersegmentation/CHILDES',
    'br',
    split='train',
)

Found cached dataset childes (/Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/br/1.0.0/c1a2022b0fe6c73568543b5d30ee329425ac03b8b9f3d320d1fcc49917af66f1)


In [65]:
# Word-level model whose vocabulary is learned from the BR training text;
# tokens that do not make it into the vocabulary are mapped to UNK.
tokenizer = Tokenizer(models.WordLevel(unk_token="UNK"))

# Replace each newline with an explicit UTT_BOUNDARY token (the leading space
# keeps it separate from the preceding symbol), then strip outer whitespace.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.Replace("\n", " UTT_BOUNDARY"), normalizers.Strip()]
)

# Split on whitespace ONLY. `pre_tokenizers.Whitespace()` uses the pattern
# `\w+|[^\w\s]+`, which additionally breaks a token wherever word characters
# meet punctuation/symbol characters; `WhitespaceSplit()` keeps every
# whitespace-delimited symbol intact, both when training and when encoding.
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

# The four special tokens are listed first in the trainer; other symbols are
# kept only if they meet the trainer's min_frequency of 20.
trainer = trainers.WordLevelTrainer(special_tokens=["UNK", "PAD", "BOS", "EOS"], min_frequency=20)
tokenizer.train_from_iterator(dataset['text'], trainer=trainer)

In [66]:
# Wrap the trained BR tokenizer so it can be used through the transformers API.
# The special tokens are passed to the constructor rather than assigned
# afterwards, so the wrapper is never initialised with GPT-2's default
# `<|endoftext|>` token for the bos/eos/unk roles.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="BOS",
    eos_token="EOS",
    pad_token="PAD",
    unk_token="UNK",
)

In [67]:
# Publish the wrapped BR tokenizer to the Hugging Face Hub.
br_tokenizer_repo = 'transformersegmentation/BR-tokenizer'
wrapped_tokenizer.push_to_hub(br_tokenizer_repo)

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BR-Tokenizer/commit/989424ca42e299a1583fc4681e55d46dc4a853e5', commit_message='Upload tokenizer', commit_description='', oid='989424ca42e299a1583fc4681e55d46dc4a853e5', pr_url=None, pr_revision=None, pr_num=None)