# Phonemize the BabyLM Dataset

We produce a Hugging Face dataset that contains the BabyLM dataset (with some light cleaning applied) together with a phonemized version of each line. We begin by loading the original dataset.

In [1]:
import pandas as pd

# Corpus files present in each training split. The dev split uses the same
# base names with a '.dev' suffix instead of '.train'.
filenames = [
    'bnc_spoken.train',
    'childes.train',
    'gutenberg.train',
    'open_subtitles.train',
    'simple_wiki.train',
    'switchboard.train'
]

# One directory per split; the second path component is used as the split name.
directories = [
    'BabyLM-original/train_10M/',
    'BabyLM-original/train_100M/',
    'BabyLM-original/dev/'
]

# split name -> {corpus filename -> DataFrame with a single 'text' column}
pds = {'train_10M': {}, 'train_100M': {}, 'dev': {}}

for directory in directories:
    split_name = directory.split('/')[1]
    for filename in filenames:
        if 'dev' in directory:
            filename = filename.replace('.train', '.dev')
        # Decode explicitly as UTF-8 so the loaded text does not depend on the
        # platform's default encoding.
        with open(directory + filename, 'r', encoding='utf-8') as f:
            # readlines() keeps each line's trailing newline.
            lines = f.readlines()
        pds[split_name][filename] = pd.DataFrame({'text': lines})

## Cleaning

We apply some light cleaning to the data.

In [2]:
from cleaning import *

# Maps a corpus name (the part of a filename before the first '.') to the
# cleanup function applied to that corpus. The functions are presumably
# provided by the `cleaning` star import above.
# Only the six corpora listed in `filenames` are loaded in this notebook, so the
# 'cbt', 'children_stories', 'qed' and 'wikipedia' entries are not exercised here.
CLEANUP_FUNCTIONS = {
    'childes': cleanup_aochildes,
    'bnc_spoken': cleanup_bnc_spoken,
    'cbt': cleanup_cbt,
    'children_stories': cleanup_children_stories,
    'gutenberg': cleanup_gutenberg,
    'open_subtitles': cleanup_open_subtitles,
    'qed': cleanup_qed,
    'simple_wiki': cleanup_simple_wikipedia,
    'switchboard': cleanup_switchboard,
    'wikipedia': cleanup_wikipedia,
}

def cleanup(df, filename):
    """Run the corpus-specific cleanup over a frame of raw lines.

    The values of the 'text' column are stripped and joined with newlines into
    one text, which is passed to the function registered in CLEANUP_FUNCTIONS
    under the part of `filename` before the first '.'. The cleaned text is
    split back into lines and blank lines are discarded.

    Prints a one-line size summary and returns a new DataFrame with a single
    'text' column holding the surviving lines.
    """
    lines = []
    for raw in df['text'].tolist():
        lines.append(raw.strip())

    corpus = filename.split('.')[0]
    cleaned_text = CLEANUP_FUNCTIONS[corpus]('\n'.join(lines))

    # Keep only lines that still contain something other than whitespace.
    new_lines = [candidate for candidate in cleaned_text.split('\n') if candidate.strip()]

    print(f"🧹 Cleaned '{filename}' (size {len(lines)} -> {len(new_lines)})")
    return pd.DataFrame({'text': new_lines})

# Replace every raw frame in `pds` with its cleaned version, split by split.
# The loop variable is `split_name` rather than `dir` so the builtin `dir()`
# is not shadowed in the notebook's global namespace.
for split_name in pds:
    print(f"🧼 Cleaning '{split_name}'")
    for filename in pds[split_name]:
        pds[split_name][filename] = cleanup(pds[split_name][filename], filename)


🧼 Cleaning 'train_10M'
🧹 Cleaned 'bnc_spoken.train' (size 90000 -> 89794)
🧹 Cleaned 'childes.train' (size 580000 -> 579129)
🧹 Cleaned 'gutenberg.train' (size 66014 -> 65963)
🧹 Cleaned 'open_subtitles.train' (size 360000 -> 359552)
🧹 Cleaned 'simple_wiki.train' (size 65000 -> 40432)
🧹 Cleaned 'switchboard.train' (size 18000 -> 18000)
🧼 Cleaning 'train_100M'
🧹 Cleaned 'bnc_spoken.train' (size 818961 -> 817564)
🧹 Cleaned 'childes.train' (size 5790000 -> 5780103)
🧹 Cleaned 'gutenberg.train' (size 676014 -> 675451)
🧹 Cleaned 'open_subtitles.train' (size 3495000 -> 3490637)
🧹 Cleaned 'simple_wiki.train' (size 646969 -> 414892)
🧹 Cleaned 'switchboard.train' (size 161740 -> 161740)
🧼 Cleaning 'dev'
🧹 Cleaned 'bnc_spoken.dev' (size 130000 -> 129766)
🧹 Cleaned 'childes.dev' (size 520153 -> 519283)
🧹 Cleaned 'gutenberg.dev' (size 65000 -> 64942)
🧹 Cleaned 'open_subtitles.dev' (size 375000 -> 374552)
🧹 Cleaned 'simple_wiki.dev' (size 60000 -> 38726)
🧹 Cleaned 'switchboard.dev' (size 18000 -> 18000)

## Phonemize

We use our `corpus_phonemizer` code to add a phonemic transcription of every line.

In [3]:
import sys
import os
# Add the directory two levels up to the import path; `corpus_phonemizer` is
# presumably located there.
sys.path.append('../../')
# Tell the phonemizer where the espeak-ng shared library lives. The path below
# looks like a macOS-specific install location, so it is only used as a
# fallback: `setdefault` keeps any value already exported in the environment,
# which lets the notebook run on machines where the library is elsewhere.
os.environ.setdefault('PHONEMIZER_ESPEAK_LIBRARY', '/opt/local/lib/libespeak-ng.dylib')
from corpus_phonemizer import phonemize_utterances

def add_phonemes(df):
    """Return a new frame with a 'phonemized_utterance' column and unusable rows removed.

    Every value of the 'text' column is passed to `phonemize_utterances`, and
    the result is stored in a new 'phonemized_utterance' column. A row is then
    dropped when either column is missing (None/NaN), empty or whitespace-only,
    or one of the literal strings 'None', 'nan', 'NaN'.

    The frame passed in is not modified. Prints a one-line size summary.
    """
    lines = df['text'].tolist()
    len_before = len(lines)

    # The Espeak backend is used for phonemization but will sometimes place word boundaries in places
    # that don't match the orthography. E.g. "that's it" might become one word instead of two. This is
    # not so much a problem for our cases, unless we're interested in the word boundaries themselves.
    phonemized = phonemize_utterances(lines, backend='phonemizer', language='en-us', keep_word_boundaries=True, allow_possibly_faulty_word_boundaries=True)
    # `assign` builds a new frame instead of writing a column into the caller's object.
    df = df.assign(phonemized_utterance=phonemized)

    def _is_unusable(column):
        """Flag values that are missing, blank, or spelled like a missing value."""
        # Stripping catches whitespace-only strings of any length, not just one or two spaces.
        blank = column.astype(str).str.strip() == ''
        return column.isna() | blank | column.isin(['None', 'nan', 'NaN'])

    # Remove lines that are empty or whitespace, or that get saved as NaNs
    df = df[~(_is_unusable(df['phonemized_utterance']) | _is_unusable(df['text']))]
    len_after = len(df)
    print(f"🔠 Added phonemes... (size {len_before} -> {len_after})")
    return df

# Replace every frame in `pds` with its phonemized version, split by split.
# The loop variable is `split_name` rather than `dir` so the builtin `dir()`
# is not shadowed in the notebook's global namespace.
for split_name in pds:
    for filename in pds[split_name]:
        print(f"🔠 Adding phonemes to '{split_name}/{filename}'")
        pds[split_name][filename] = add_phonemes(pds[split_name][filename])

🔠 Adding phonemes to 'train_10M/bnc_spoken.train'
🔠 Added phonemes... (size 89794 -> 89191)
🔠 Adding phonemes to 'train_10M/childes.train'
🔠 Added phonemes... (size 579129 -> 579128)
🔠 Adding phonemes to 'train_10M/gutenberg.train'
🔠 Added phonemes... (size 65963 -> 65859)
🔠 Adding phonemes to 'train_10M/open_subtitles.train'
🔠 Added phonemes... (size 359552 -> 357840)
🔠 Adding phonemes to 'train_10M/simple_wiki.train'
🔠 Added phonemes... (size 40432 -> 40418)
🔠 Adding phonemes to 'train_10M/switchboard.train'
🔠 Added phonemes... (size 18000 -> 18000)
🔠 Adding phonemes to 'train_100M/bnc_spoken.train'
🔠 Added phonemes... (size 817564 -> 812252)
🔠 Adding phonemes to 'train_100M/childes.train'
🔠 Added phonemes... (size 5780103 -> 5780100)
🔠 Adding phonemes to 'train_100M/gutenberg.train'
🔠 Added phonemes... (size 675451 -> 674589)
🔠 Adding phonemes to 'train_100M/open_subtitles.train'
🔠 Added phonemes... (size 3490637 -> 3473703)
🔠 Adding phonemes to 'train_100M/simple_wiki.train'
🔠 Added phonemes... [remaining output truncated]

## Save to Hugging Face Dataset

In [4]:
# Maximum number of rows kept per corpus in the dev split.
DEV_SAMPLE_SIZE = 4000

# Write one CSV per corpus under BabyLM-phonemized/<split>/<corpus>.csv.
# The loop variable is `split_name` rather than `dir` so the builtin `dir()`
# is not shadowed in the notebook's global namespace.
for split_name in pds:
    os.makedirs(f'BabyLM-phonemized/{split_name}', exist_ok=True)
    for filename in pds[split_name]:
        if split_name == 'dev':
            dev_frame = pds[split_name][filename]
            # Cap the sample at the frame's length: asking for more rows than
            # exist (without replacement) raises ValueError.
            sample_size = min(DEV_SAMPLE_SIZE, len(dev_frame))
            # Fixed random_state keeps the subsample reproducible.
            pds[split_name][filename] = dev_frame.sample(n=sample_size, random_state=42)
        csv_name = filename.split('.')[0] + '.csv'
        pds[split_name][filename].to_csv(f'BabyLM-phonemized/{split_name}/{csv_name}', index=False)


### Test import of dataset

The dataset is saved at `BabyLM-phonemized/`. We don't need to push it to Hugging Face to load it here; we can provide a local path instead.

In [5]:
from datasets import load_dataset

# Load the 'train' split of the 'strict_small' configuration from the local
# `BabyLM-phonemized` directory rather than from the Hub.
# NOTE(review): selecting a named configuration from a local path presumably
# relies on a dataset loading script inside that directory, which is not shown
# here — confirm it exists before running this cell.
train_dataset = load_dataset('BabyLM-phonemized', 'strict_small', split='train')

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 1150436 examples [00:04, 276831.25 examples/s]
Generating valid split: 24000 examples [00:00, 135716.03 examples/s]


In [6]:
# Spot-check one arbitrary example. Index the row first so only that row is
# read, instead of materializing the whole 'text' column to pick one entry.
print(train_dataset[24607]['text'])

oh I think we'll take him a bar of this
