# Training the CHILDES Tokenizer

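Using the phonemes in our CHILDES dataset, we build a word-level tokenizer that simply splits on whitespace. Its vocabulary is made up of the phonemes that are listed in PHOIBLE and occur more than `MIN_COUNT` times in the data; frequent tokens that are not found in PHOIBLE are printed so they can be inspected.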

In [None]:
from collections import Counter

import pandas as pd
from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from transformers import GPT2TokenizerFast

In [None]:
# Load the PHOIBLE table and keep the distinct values of its Phoneme column.
phoible = pd.read_csv('../data/phoible.csv')
phoible_phonemes = phoible['Phoneme'].unique()

In [None]:
MIN_COUNT = 10  # A token must occur strictly more than this many times to be kept.


def build_vocabulary(datasets, column='text', phonemes=None, min_count=None):
    """Build a token-to-id vocabulary from whitespace-separated text.

    Each dataset is counted on its own. A token is added to the vocabulary
    when its count in that dataset is greater than ``min_count`` and it is one
    of ``phonemes``. Frequent tokens that are not in ``phonemes`` are left out
    of the vocabulary and reported on stdout instead.

    Args:
        datasets: Iterable of mappings; ``dataset[column]`` must yield the
            lines of text to count.
        column: Name of the column holding the text.
        phonemes: Collection of accepted phoneme symbols. Defaults to the
            module-level ``phoible_phonemes``.
        min_count: Frequency threshold (exclusive). Defaults to ``MIN_COUNT``.

    Returns:
        Dict mapping each token to its integer id. Ids 0-5 are reserved for
        ``UNK``, ``PAD``, ``BOS``, ``EOS``, ``WORD_BOUNDARY`` and
        ``UTT_BOUNDARY``; further ids are assigned in order of first
        appearance.
    """
    # Resolve the defaults at call time so later changes to the globals are honoured.
    if phonemes is None:
        phonemes = phoible_phonemes
    if min_count is None:
        min_count = MIN_COUNT
    # Build the lookup once: set membership avoids scanning the inventory per token.
    known_phonemes = set(phonemes)

    vocab = {'UNK': 0, 'PAD': 1, 'BOS': 2, 'EOS': 3, 'WORD_BOUNDARY': 4, 'UTT_BOUNDARY': 5}
    # Counts are stored with the rejected tokens so the report below does not
    # depend on whichever dataset happened to be processed last.
    unk_tokens = Counter()
    for dataset in datasets:
        token_counts = Counter()
        for line in dataset[column]:
            token_counts.update(line.split())

        # Keep frequent tokens that are in the phoneme inventory; record the rest.
        for token, count in token_counts.items():
            if count <= min_count or token in vocab:
                continue
            if token in known_phonemes:
                vocab[token] = len(vocab)
            else:
                unk_tokens[token] += count

    print('Tokens not found in phoible: ', dict(unk_tokens))
    print('Vocab: ', vocab)
    return vocab

def build_phoneme_tokenizer(vocab):
    """Create a ``GPT2TokenizerFast`` backed by a word-level model over ``vocab``.

    The underlying tokenizer rewrites each newline as `` UTT_BOUNDARY``,
    strips the text, and uses the ``Whitespace`` pre-tokenizer. Tokens missing
    from ``vocab`` map to ``UNK``.

    Args:
        vocab: Dict mapping token strings to integer ids.

    Returns:
        The wrapped tokenizer with its BOS, EOS, PAD and UNK tokens set.
    """
    # Newlines become utterance boundaries before surrounding whitespace is stripped.
    newline_to_boundary = normalizers.Replace("\n", " UTT_BOUNDARY")

    word_level = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
    word_level.normalizer = normalizers.Sequence(
        [newline_to_boundary, normalizers.Strip()]
    )
    word_level.add_special_tokens(["UNK", "PAD", "BOS", "EOS"])
    word_level.pre_tokenizer = pre_tokenizers.Whitespace()

    wrapped = GPT2TokenizerFast(tokenizer_object=word_level)
    special_token_attrs = (
        ("bos_token", "BOS"),
        ("eos_token", "EOS"),
        ("pad_token", "PAD"),
        ("unk_token", "UNK"),
    )
    for attr_name, token in special_token_attrs:
        setattr(wrapped, attr_name, token)

    return wrapped

In [None]:
# Build the vocabulary from the English CHILDES training split, wrap it in a
# tokenizer, and upload the result to the Hub.
datasets = [
    load_dataset(path='transformersegmentation/CHILDES', name='English', split='train'),
]
vocab = build_vocabulary(datasets)
tokenizer = build_phoneme_tokenizer(vocab)
tokenizer.push_to_hub('transformersegmentation/CHILDES-English-tokenizer')

# Training the BR Tokenizer

In [64]:
# Training split of the 'br' configuration of the CHILDES dataset.
dataset = load_dataset(
    path='transformersegmentation/CHILDES',
    name='br',
    split='train',
)

Found cached dataset childes (/Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes/br/1.0.0/c1a2022b0fe6c73568543b5d30ee329425ac03b8b9f3d320d1fcc49917af66f1)


In [65]:
# Word-level model whose unknown token is UNK.
tokenizer = Tokenizer(models.WordLevel(unk_token="UNK"))
# Rewrite every newline as an utterance boundary, then strip the text.
tokenizer.normalizer = normalizers.Sequence([
    normalizers.Replace("\n", " UTT_BOUNDARY"),
    normalizers.Strip(),
])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# Learn the vocabulary from the dataset's text column.
trainer = trainers.WordLevelTrainer(
    special_tokens=["UNK", "PAD", "BOS", "EOS"],
    min_frequency=20,
)
tokenizer.train_from_iterator(dataset['text'], trainer=trainer)

In [66]:
# Wrap the trained tokenizer so it can be used through the transformers API,
# and register the special tokens on the wrapper.
wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)
wrapped_tokenizer.unk_token = "UNK"
wrapped_tokenizer.pad_token = "PAD"
wrapped_tokenizer.bos_token = "BOS"
wrapped_tokenizer.eos_token = "EOS"

In [67]:
# Upload the wrapped BR tokenizer to the Hugging Face Hub.
wrapped_tokenizer.push_to_hub('transformersegmentation/BR-tokenizer')

CommitInfo(commit_url='https://huggingface.co/transformersegmentation/BR-Tokenizer/commit/989424ca42e299a1583fc4681e55d46dc4a853e5', commit_message='Upload tokenizer', commit_description='', oid='989424ca42e299a1583fc4681e55d46dc4a853e5', pr_url=None, pr_revision=None, pr_num=None)