In [1]:
import datasets

# Load the "original_strict" configuration of the cambridge-climb/BabyLM dataset.
# `datasets` warns that `trust_remote_code=True` will become mandatory for loading
# this dataset, so opt in explicitly instead of relying on the deprecated default.
# NOTE: this flag lets the dataset repository's code run locally; only enable it
# for sources you trust.
dataset = datasets.load_dataset(
    "cambridge-climb/BabyLM",
    "original_strict",
    trust_remote_code=True,
)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [2]:
# Build the training frame in one chain: drop the tagged text, attach the
# constant metadata columns, and rename the raw text column.
df = (
    dataset['train']
    .to_pandas()
    .drop(columns=['tagged_text'])
    .assign(
        is_child=False,
        target_child_age=0.2,  # Arbitrary value
    )
    .rename(columns={'text': 'processed_gloss'})
    .assign(language_code='en-us')
)

In [3]:
import sys
import os

# Tell phonemizer where the espeak-ng shared library lives. A value already
# present in the environment takes precedence, so the hard-coded path below is
# only a fallback and the notebook stays runnable on machines where the
# library is installed elsewhere.
os.environ.setdefault('PHONEMIZER_ESPEAK_LIBRARY', '/opt/local/lib/libespeak-ng.dylib')

# Make the directory two levels up importable so that `src.phonemize` resolves.
# Guarded so that re-running this cell does not append duplicate entries.
if '../../' not in sys.path:
    sys.path.append('../../')
from src.phonemize import phonemize_utterances, character_split_utterance

lines = df['processed_gloss'].tolist()

# We remove word boundaries to keep more data: phonemizing with word boundaries
# would discard around 25% of the data due to word mismatches, and the word
# boundaries are not needed for training.
phonemized_lines = phonemize_utterances(lines, keep_word_boundaries=False)
df['phonemized_utterance'] = phonemized_lines
df['character_split_utterance'] = character_split_utterance(lines)

Phonemizing using language "EnglishNA"...
Using espeak backend with language code "en-us"...


In [4]:
# An empty phonemized string marks an utterance that failed to phonemize.
failed_mask = df['phonemized_utterance'] == ''
num_empty = int(failed_mask.sum())
print(f'WARNING: {num_empty} lines were not phonemized successfully. Dropping these.')

# Keep only the successfully phonemized rows, then rebuild the character-split
# column from the surviving utterances.
df = df[~failed_mask]
surviving_glosses = df['processed_gloss'].tolist()
df['character_split_utterance'] = character_split_utterance(surviving_glosses)



In [5]:
# Write the processed training split. `to_csv` does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('BabyLM-phonemized', exist_ok=True)
df.to_csv('BabyLM-phonemized/train.csv', index=False)

In [7]:
# Upper bound on the number of validation examples written to disk.
VALID_SAMPLE_SIZE = 10000

# Build the validation frame with the same metadata columns as the training frame.
df = dataset['validation'].to_pandas()
df = df.drop(columns=['tagged_text'])
df['is_child'] = False
df['target_child_age'] = 0.2  # Arbitrary value
df = df.rename(columns={'text': 'processed_gloss'})
df['language_code'] = 'en-us'

lines = df['processed_gloss'].tolist()
# NOTE(review): the training split is phonemized with keep_word_boundaries=False,
# whereas this call relies on the function's default. Confirm the difference
# between the two splits is intentional.
phonemized_lines = phonemize_utterances(lines)
df['phonemized_utterance'] = phonemized_lines

# An empty phonemized string marks an utterance that failed to phonemize.
num_empty = len(df[df['phonemized_utterance'] == ''])
print(f'WARNING: {num_empty} lines were not phonemized successfully. Dropping these.')
df = df[df['phonemized_utterance'] != '']
# Computed once, after filtering, so the column lines up with the surviving rows.
df['character_split_utterance'] = character_split_utterance(df['processed_gloss'].tolist())

# Subsample the validation set to at most VALID_SAMPLE_SIZE (10,000) examples.
# The size is clamped to the number of remaining rows so that sampling cannot
# fail when fewer rows survive the filtering above.
df = df.sample(n=min(VALID_SAMPLE_SIZE, len(df)), random_state=42)

# `to_csv` does not create missing parent directories.
os.makedirs('BabyLM-phonemized', exist_ok=True)
df.to_csv('BabyLM-phonemized/valid.csv', index=False)

Phonemizing using language "EnglishNA"...
Using espeak backend with language code "en-us"...


In [11]:
import datasets
from datasets import load_dataset

# Read the contents of the BabyLM-phonemized directory back in as a dataset,
# to check that the files written above load correctly.
dataset_new = load_dataset(path='BabyLM-phonemized')

Generating train split: 10144265 examples [00:56, 178957.75 examples/s]
Generating valid split: 10000 examples [00:00, 177261.31 examples/s]


In [12]:
# Display the reloaded dataset to inspect its splits, columns and row counts.
dataset_new

DatasetDict({
    train: Dataset({
        features: ['processed_gloss', 'filename', 'is_child', 'target_child_age', 'language_code', 'phonemized_utterance', 'character_split_utterance'],
        num_rows: 10144265
    })
    valid: Dataset({
        features: ['processed_gloss', 'filename', 'is_child', 'target_child_age', 'language_code', 'phonemized_utterance', 'character_split_utterance'],
        num_rows: 10000
    })
})