# Phonemize the BabyLM Dataset

We produce a Hugging Face dataset that contains the BabyLM dataset (with some cleaning applied) as well as a phonemized version of each line. We begin by loading the original dataset from the `cambridge-climb/BabyLM` repository:

In [1]:
import datasets
import pandas as pd

# The BabyLM repository is loaded through a builder script, so `datasets`
# needs explicit permission to run it. Passing `trust_remote_code=True`
# avoids the interactive warning and is what the library says will be
# required from its next major release.
dataset_strict = datasets.load_dataset(
    "cambridge-climb/BabyLM", "original_strict", trust_remote_code=True
)
dataset_strict_small = datasets.load_dataset(
    "cambridge-climb/BabyLM", "original_strict_small", trust_remote_code=True
)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 13.7k/13.7k [00:00<00:00, 4.51MB/s]
Downloading readme: 100%|██████████| 4.73k/4.73k [00:00<00:00, 3.18MB/s]
Downloading data: 100%|██████████| 99.8M/99.8M [00:18<00:00, 5.31MB/s]
Downloading data: 100%|██████████| 225M/225M [00:41<00:00, 5.36MB/s] 
Downloading data: 100%|██████████| 133M/133M [00:24<00:00, 5.41MB/s] 
Downloading data: 100%|██████████| 90.7M/90.7M [00:16<00:00, 5.34MB/s]
Downloading data: 100%|██████████| 274M/274M [00:52<00:00, 5.18MB/s] 
Downloading data: 100%|██████████| 920M/920M [02:46<00:00, 5.51MB/s] 
Downloading data: 100%|██████████| 292M/292M [00:50<00:00, 5.81MB/s] 
Downloading data: 100%|██████████| 430M/430M [01:15<00:00, 5.68MB/s] 
Downloading data: 100%|█████████

In [2]:
# Display the train split of the small dataset (its features and row count).
dataset_strict_small['train']

Dataset({
    features: ['text', 'tagged_text', 'filename'],
    num_rows: 1015485
})

## Cleaning

We apply some light cleaning to the data.

In [3]:
from cleaning import *

# Maps a corpus name to the cleanup function applied to that corpus.
# The keys match the part of a row's `filename` before the first '.'
# (e.g. 'aochildes.txt' -> 'aochildes'), which is how `cleanup` selects
# the function to call.
CLEANUP_FUNCTIONS = {
    'aochildes': cleanup_aochildes,
    'bnc_spoken': cleanup_bnc_spoken,
    'cbt': cleanup_cbt,
    'children_stories': cleanup_children_stories,
    'gutenberg': cleanup_gutenberg,
    'open_subtitles': cleanup_open_subtitles,
    'qed': cleanup_qed,
    'simple_wikipedia': cleanup_simple_wikipedia,
    'switchboard': cleanup_switchboard,
    'wikipedia': cleanup_wikipedia,
}

def dataset_to_dataframe(dataset):
    """Convert a dataset split into a pandas DataFrame ready for cleaning.

    The 'tagged_text' column is dropped, and rows whose 'text' is exactly
    'None', 'nan' or 'NaN' are removed: when the dataset is saved, those
    strings are converted to None values, which causes problems later on.

    Args:
        dataset: An object exposing `to_pandas()` whose frame has 'text'
            and 'tagged_text' columns.

    Returns:
        The filtered DataFrame (original index labels are kept).
    """
    unsafe_strings = ['None', 'nan', 'NaN']
    frame = dataset.to_pandas().drop(columns=['tagged_text'])
    keep_mask = ~frame['text'].isin(unsafe_strings)
    return frame[keep_mask]

def cleanup(df):
    """Run the corpus-specific cleanup function over every file in `df`.

    For each distinct value in the 'filename' column, the file's text lines
    are joined with newlines, passed to the function registered in
    CLEANUP_FUNCTIONS under the filename's prefix (the part before the first
    '.'), and the result is split back into lines. Lines that are empty or
    whitespace-only after cleaning are discarded. Because the cleaned text is
    re-split on newlines, the number of lines per file may go up or down.

    Args:
        df: DataFrame with 'filename' and 'text' columns.

    Returns:
        A new DataFrame with 'filename' and 'text' columns holding the
        cleaned lines, grouped by file in order of first appearance.

    Raises:
        KeyError: If a filename prefix has no entry in CLEANUP_FUNCTIONS.
    """
    filenames, texts = [], []
    for source_file in df['filename'].unique():
        original = df.loc[df['filename'] == source_file, 'text'].tolist()
        cleaner = CLEANUP_FUNCTIONS[source_file.split('.')[0]]
        cleaned = [
            row
            for row in cleaner('\n'.join(original)).split('\n')
            if row.strip() != ''
        ]
        filenames += [source_file] * len(cleaned)
        texts += cleaned
        print(f"🧹 Cleaned '{source_file}' (size {len(original)} -> {len(cleaned)})")
    return pd.DataFrame({'filename': filenames, 'text': texts})

# Convert every split we need into a DataFrame, keyed by a short split name.
dfs = {
    split_name: dataset_to_dataframe(source[split])
    for split_name, source, split in (
        ('strict_train', dataset_strict, 'train'),
        ('strict_small_train', dataset_strict_small, 'train'),
        ('strict_valid', dataset_strict, 'validation'),
        ('strict_small_valid', dataset_strict_small, 'validation'),
    )
}

# Replace each raw frame with its cleaned counterpart. Only values are
# reassigned (no keys added or removed), so iterating the dict is safe.
for name, df in dfs.items():
    print(f"🧼 Cleaning '{name}'")
    dfs[name] = cleanup(df)


🧼 Cleaning 'strict_train'
🧹 Cleaned 'aochildes.txt' (size 763988 -> 763988)
🧹 Cleaned 'bnc_spoken.txt' (size 848199 -> 848199)
🧹 Cleaned 'cbt.txt' (size 263518 -> 263518)
🧹 Cleaned 'children_stories.txt' (size 76379 -> 76379)
🧹 Cleaned 'gutenberg.txt' (size 898292 -> 898292)
🧹 Cleaned 'open_subtitles.txt' (size 5433930 -> 5433127)
🧹 Cleaned 'qed.txt' (size 959619 -> 959844)
🧹 Cleaned 'simple_wikipedia.txt' (size 567001 -> 567001)
🧹 Cleaned 'switchboard.txt' (size 161739 -> 161739)
🧹 Cleaned 'wikipedia.txt' (size 203011 -> 177085)
🧼 Cleaning 'strict_small_train'
🧹 Cleaned 'aochildes.txt' (size 79999 -> 79999)
🧹 Cleaned 'bnc_spoken.txt' (size 89931 -> 89931)
🧹 Cleaned 'cbt.txt' (size 25999 -> 25999)
🧹 Cleaned 'children_stories.txt' (size 5731 -> 5731)
🧹 Cleaned 'gutenberg.txt' (size 94502 -> 94502)
🧹 Cleaned 'open_subtitles.txt' (size 527394 -> 527316)
🧹 Cleaned 'qed.txt' (size 99928 -> 99932)
🧹 Cleaned 'simple_wikipedia.txt' (size 56616 -> 56616)
🧹 Cleaned 'switchboard.txt' (size 15739 

## Phonemize

Use our phonemizer code to add a phonemic transcription of every line.

In [33]:
import sys
import os
# Tell the phonemizer where the espeak-ng shared library lives. The default
# below is a machine-specific .dylib path (presumably a MacPorts install on
# macOS); `setdefault` keeps any value already exported in the environment so
# the notebook also runs on machines where the library is installed elsewhere.
os.environ.setdefault('PHONEMIZER_ESPEAK_LIBRARY', '/opt/local/lib/libespeak-ng.dylib')
# Make the directory two levels up importable, so that `src.phonemize`
# can be resolved by the import that follows.
sys.path.append('../../')
from src.phonemize import phonemize_utterances, character_split_utterance

def add_phonemes(df):
    """Add phonemized and character-split columns, dropping unusable rows.

    Two columns are written onto `df` itself (the caller's frame is modified
    in place):

    * 'phonemized_utterance': the output of `phonemize_utterances` for each
      line of 'text', with word boundaries kept.
    * 'character_split_utterance': the output of `character_split_utterance`,
      computed from the original 'text' lines (not from the phonemized
      output), for use by the character-level model.

    Rows whose phonemized utterance is None, an empty string, one or two
    spaces, or one of the strings 'None', 'nan', 'NaN' are then filtered out.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        The filtered DataFrame containing the two additional columns.
    """
    utterances = df['text'].tolist()
    # The Espeak backend sometimes places word boundaries where the
    # orthography has none (e.g. "that's it" may come out as a single word).
    # We accept those possibly faulty boundaries: they only matter when the
    # word boundaries themselves are the object of study.
    df['phonemized_utterance'] = phonemize_utterances(
        utterances,
        keep_word_boundaries=True,
        allow_possibly_faulty_word_boundaries=True,
    )
    df['character_split_utterance'] = character_split_utterance(utterances)

    unusable = ['None', 'nan', 'NaN', '', ' ', '  ', None]
    usable_rows = ~df['phonemized_utterance'].isin(unusable)
    result = df[usable_rows]
    print(f"🔠 Added phonemes... (size {len(utterances)} -> {len(result)})")
    return result

# Replace each split's frame with the frame returned by add_phonemes.
# Only existing keys are reassigned, so iterating over the dict is safe.
for name, df in dfs.items():
    print(f"🔠 Adding phonemes to '{name}'")
    dfs[name] = add_phonemes(df) 


🔠 Adding phonemes to 'strict_train'
Phonemizing using language "EnglishNA"...
Using espeak backend with language code "en-us"...
🔠 Added phonemes... (size 10149172 -> 10117701)
🔠 Adding phonemes to 'strict_small_train'
Phonemizing using language "EnglishNA"...
Using espeak backend with language code "en-us"...
🔠 Added phonemes... (size 1012695 -> 1009906)
🔠 Adding phonemes to 'strict_valid'
Phonemizing using language "EnglishNA"...
Using espeak backend with language code "en-us"...
🔠 Added phonemes... (size 982644 -> 979449)
🔠 Adding phonemes to 'strict_small_valid'
Phonemizing using language "EnglishNA"...
Using espeak backend with language code "en-us"...
🔠 Added phonemes... (size 982644 -> 979449)


## Save to Huggingface Dataset

In [34]:
import os

# `to_csv` does not create missing parent directories, so make sure the
# output folder exists before writing any split.
os.makedirs('BabyLM-phonemized', exist_ok=True)

for name, df in dfs.items():
    # We don't need the entire validation set
    if 'valid' in name:
        # Cap the sample size at the number of available rows: `sample`
        # raises a ValueError when asked for more rows than the frame has.
        # The fixed seed keeps the selection reproducible.
        df = df.sample(n=min(10000, len(df)), random_state=42)
    df.to_csv(f'BabyLM-phonemized/{name}.csv', index=False)

### Test import of dataset

The dataset is saved at `BabyLM-phonemized/`. We don't need to push it to Hugging Face to load it here; we can provide a local path instead.

In [35]:
from datasets import load_dataset

# Load the train split of the 'strict' configuration from the local
# `BabyLM-phonemized` directory rather than from the Hub.
train_dataset = load_dataset('BabyLM-phonemized', 'strict', split='train')

Generating train split: 10117701 examples [01:34, 107153.37 examples/s]
Generating valid split: 10000 examples [00:00, 141522.56 examples/s]


In [36]:
# Index the row first and the column second: `train_dataset['text']` would
# read the entire text column just to pick out a single value.
print(train_dataset[764123]['text'])

Good.


In [37]:
# Sanity check: report the position of the first missing phonemized
# utterance, if there is one, and stop scanning at that point.
phonemized_column = train_dataset['phonemized_utterance']
for i, line in enumerate(phonemized_column):
    if line is not None:
        continue
    print('None found')
    print(i)
    print(line)
    break