# Prepare BabyLM Evaluation Pipeline Data

This notebook converts the BabyLM evaluation data to phonemes. First, download the [evaluation data](https://github.com/codebyzeb/evaluation-pipeline-2024?tab=readme-ov-file) used in the [BabyLM evaluation pipeline](https://github.com/codebyzeb/evaluation-pipeline-2024?tab=readme-ov-file), then run this notebook. After conversion, the phonemized data was copied into the [forked version](https://github.com/codebyzeb/evaluation-pipeline-2024?tab=readme-ov-file) of the pipeline used in the TransformerSegmentation project.

In [6]:
import json
import os
import sys

# Tell phonemizer where the espeak-ng shared library is. The fallback below is
# a machine-specific macOS-style path (.dylib); setdefault keeps any value that
# is already exported in the environment, so other installs can override it
# without editing this cell.
os.environ.setdefault('PHONEMIZER_ESPEAK_LIBRARY', '/opt/local/lib/libespeak-ng.dylib')
# Make the directory two levels up importable (presumably where
# corpus_phonemizer lives -- the import below relies on it).
sys.path.append('../../')
from corpus_phonemizer import phonemize_utterances

# Root directory of the downloaded evaluation data (read by this notebook).
INPUT_DIR = "evaluation_data"
# Root directory the phonemized copies are written to.
OUTPUT_DIR = "evaluation_data_phonemized"

In [7]:
# Quick sanity check: phonemize one utterance and let the notebook display the
# returned list (this must stay the last expression of the cell).
phonemize_utterances(
    ['what a conundrum'],
    backend='phonemizer',
    language='en-us',
    keep_word_boundaries=True,
)



['w ʌ t WORD_BOUNDARY ʌ WORD_BOUNDARY k ə n ʌ n d ɹ ə m WORD_BOUNDARY']

In [8]:
# Record fields whose values are sent through the phonemizer. A field is only
# processed for a file if at least one record in that file contains it.
keys = [
    'sentence_good',
    'sentence_bad',
    'sentence',
    'question',
    'passage',
    'premise',
    'hypothesis',
    'sentence1',
    'sentence2',
    'paragraph',
    'answer',
    'question1',
    'question2',
    'text',
    'span1_text',
    'span2_text',
    'Context1',
    'Context2',
    'Target1',
    'Target2',
]

# Sub-folders of INPUT_DIR that are processed, in this order.
folders = [
    'blimp_filtered',
    'glue_filtered',
    'supplement_filtered',
    'ewok_filtered',
]

In [9]:
# Phonemize every text field (see `keys`) of every .jsonl file found under each
# evaluation folder and write the converted records to the same folder name
# under OUTPUT_DIR.
for folder in folders:

    print(f"\n----------\n----------\nPhonemizing {folder}\n----------\n----------\n")

    # Collect all .jsonl files below the folder (os.walk recurses).
    files = [
        os.path.join(root, filename)
        for root, _, filenames in os.walk(f'{INPUT_DIR}/{folder}')
        for filename in filenames
        if filename.endswith('.jsonl')
    ]

    for file in files:
        print(f"----------------\nPhonemizing {file}")

        # Read explicitly as UTF-8 (the output below is written as UTF-8 too)
        # instead of relying on the platform default encoding. Blank lines are
        # skipped so they do not crash json.loads.
        with open(file, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f if line.strip()]

        # Union of the keys that appear in any record of this file.
        data_keys = set()
        for line in data:
            data_keys.update(line.keys())

        for key in keys:
            if key not in data_keys:
                continue

            # Only records that actually carry this key. data_keys is a union,
            # so indexing every record with line[key] would raise KeyError for
            # files where the key is present in some records only.
            records = [line for line in data if key in line]
            sentences = [line[key] for line in records]
            phonemized = phonemize_utterances(sentences, backend='phonemizer', language='en-us', keep_word_boundaries=True, allow_possibly_faulty_word_boundaries=True)

            if len(phonemized) != len(sentences):
                # The outputs can no longer be aligned with the records, so this
                # field is left as-is (unphonemized) in the saved file.
                failed = len(sentences) - len(phonemized)
                print(f"Failed to phonemize {failed} sentences ({failed / len(sentences) * 100:.2f}%) out of {len(sentences)} total sentences")
                continue

            # Write the phonemized text back into the matching records, in order.
            for line, phonemes in zip(records, phonemized):
                line[key] = phonemes

        # Save the phonemized data
        filename = os.path.basename(file)
        output_folder = os.path.join(OUTPUT_DIR, folder)
        os.makedirs(output_folder, exist_ok=True)
        with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the IPA symbols readable in the output.
            f.writelines(json.dumps(line, ensure_ascii=False) + '\n' for line in data)

    print("Done phonemizing")


----------
----------
Phonemizing blimp_filtered
----------
----------

----------------
Phonemizing evaluation_data/blimp_filtered/ellipsis_n_bar_2.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/principle_A_case_2.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/existential_there_quantifiers_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/causative.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/distractor_agreement_relative_clause.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/anaphor_number_agreement.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/wh_vs_that_no_gap.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/animate_subject_trans.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/superlative_quantifiers_1.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/determiner_noun_agreement_irregular_2.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/only_npi_scope.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/irregular_plural_subject_verb_agreement_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/sentential_negation_npi_scope.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/principle_A_reconstruction.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/left_branch_island_simple_question.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/passive_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/drop_argument.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/existential_there_quantifiers_2.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/determiner_noun_agreement_with_adj_2.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/wh_island.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/ellipsis_n_bar_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/principle_A_case_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/superlative_quantifiers_2.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/inchoative.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/wh_questions_subject_gap.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/adjunct_island.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/complex_NP_island.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/wh_vs_that_with_gap.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/irregular_plural_subject_verb_agreement_2.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/determiner_noun_agreement_irregular_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/expletive_it_object_raising.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/wh_questions_subject_gap_long_distance.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/coordinate_structure_constraint_complex_left_branch.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/only_npi_licensor_present.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/determiner_noun_agreement_with_adjective_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/passive_2.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/regular_plural_subject_verb_agreement_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/determiner_noun_agreement_with_adj_irregular_2.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/animate_subject_passive.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/npi_present_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/irregular_past_participle_adjectives.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/tough_vs_raising_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/sentential_negation_npi_licensor_present.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/existential_there_object_raising.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/wh_vs_that_with_gap_long_distance.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/sentential_subject_island.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/intransitive.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/principle_A_domain_3.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/principle_A_domain_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/determiner_noun_agreement_1.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/left_branch_island_echo_question.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/anaphor_gender_agreement.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/irregular_past_participle_verbs.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/tough_vs_raising_2.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/matrix_question_npi_licensor_present.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/npi_present_2.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/determiner_noun_agreement_with_adj_irregular_1.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/regular_plural_subject_verb_agreement_2.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/wh_vs_that_no_gap_long_distance.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/existential_there_subject_raising.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/transitive.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/distractor_agreement_relational_noun.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/principle_A_c_command.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/wh_questions_object_gap.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/coordinate_structure_constraint_object_extraction.jsonl




----------------
Phonemizing evaluation_data/blimp_filtered/principle_A_domain_2.jsonl
----------------
Phonemizing evaluation_data/blimp_filtered/determiner_noun_agreement_2.jsonl




Done phonemizing

----------
----------
Phonemizing glue_filtered
----------
----------

----------------
Phonemizing evaluation_data/glue_filtered/multirc.train.jsonl




Failed to phonemize 3 sentences (0.01%) out of 27243 total sentences
----------------
Phonemizing evaluation_data/glue_filtered/sst2.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/rte.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/cola.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/boolq.train.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/mnli.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/mrpc.train.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/qqp.train.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/qnli.train.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/wsc.train.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/boolq.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/mnli.train.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/mnli-mm.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/mrpc.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/qqp.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/qnli.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/wsc.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/multirc.valid.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/sst2.train.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/mnli.subs.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/rte.train.jsonl




----------------
Phonemizing evaluation_data/glue_filtered/cola.train.jsonl




Done phonemizing

----------
----------
Phonemizing supplement_filtered
----------
----------

----------------
Phonemizing evaluation_data/supplement_filtered/qa_congruence_easy.jsonl




----------------
Phonemizing evaluation_data/supplement_filtered/turn_taking.jsonl
----------------
Phonemizing evaluation_data/supplement_filtered/subject_aux_inversion.jsonl




----------------
Phonemizing evaluation_data/supplement_filtered/qa_congruence_tricky.jsonl




----------------
Phonemizing evaluation_data/supplement_filtered/hypernym.jsonl




Done phonemizing

----------
----------
Phonemizing ewok_filtered
----------
----------

----------------
Phonemizing evaluation_data/ewok_filtered/social-relations.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/physical-relations.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/agent-properties.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/physical-interactions.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/material-dynamics.jsonl
----------------
Phonemizing evaluation_data/ewok_filtered/material-properties.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/social-properties.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/social-interactions.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/quantitative-properties.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/spatial-relations.jsonl




----------------
Phonemizing evaluation_data/ewok_filtered/physical-dynamics.jsonl




Done phonemizing


In [10]:
from transformers import AutoTokenizer
import pandas as pd

# Tokenize every text field of every evaluation file and record, per
# (folder, file, key): how many entries exceed MAX_TOKENS tokens, the total
# number of entries, the percentage over the limit and the mean token length.
# The table is written to phonemized_stats.csv.
tokenizer_name = 'phonemetransformers/BABYLM-TOKENIZER-BPE-PHON'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Entries longer than this many tokens are counted in the 'count' column.
MAX_TOKENS = 128
STATS_COLUMNS = ['folder', 'file', 'key', 'count', 'total_lines', 'percentage', 'average_length']

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame with pd.concat inside the loop copies it every time.
rows = []

for folder in folders:

    print(f"\n----------Counting {folder}----------\n")

    # NOTE(review): this walks INPUT_DIR (the original text) although the
    # tokenizer name and the CSV name suggest phonemized data -- confirm whether
    # OUTPUT_DIR was intended here.
    files = sorted(
        os.path.join(root, filename)
        for root, _, filenames in os.walk(f'{INPUT_DIR}/{folder}')
        for filename in filenames
        if filename.endswith('.jsonl')
    )

    for file in files:
        print(f"Counting {file}")

        # Explicit UTF-8 instead of the platform default; skip blank lines.
        with open(file, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f if line.strip()]

        # Union of the keys that appear in any record of this file.
        data_keys = set()
        for line in data:
            data_keys.update(line.keys())

        for key in keys:
            if key not in data_keys:
                continue

            # Skip records lacking this key instead of raising KeyError.
            sentences = [line[key] for line in data if key in line]
            tokenized = tokenizer(sentences, padding=False, truncation=False)

            lengths = [len(input_ids) for input_ids in tokenized['input_ids']]
            total_lines = len(lengths)
            # Count number of lines with more than MAX_TOKENS tokens
            count = sum(length > MAX_TOKENS for length in lengths)

            rows.append({
                'folder': folder,
                'file': os.path.basename(file),
                'key': key,
                'count': count,
                'total_lines': total_lines,
                'percentage': count / total_lines * 100,
                'average_length': sum(lengths) / total_lines,
            })

    print("Done counting")

df = pd.DataFrame(rows, columns=STATS_COLUMNS)
df.to_csv('phonemized_stats.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.



----------Counting blimp_filtered----------

Counting evaluation_data/blimp_filtered/adjunct_island.jsonl
Counting evaluation_data/blimp_filtered/anaphor_gender_agreement.jsonl
Counting evaluation_data/blimp_filtered/anaphor_number_agreement.jsonl
Counting evaluation_data/blimp_filtered/animate_subject_passive.jsonl
Counting evaluation_data/blimp_filtered/animate_subject_trans.jsonl
Counting evaluation_data/blimp_filtered/causative.jsonl
Counting evaluation_data/blimp_filtered/complex_NP_island.jsonl
Counting evaluation_data/blimp_filtered/coordinate_structure_constraint_complex_left_branch.jsonl
Counting evaluation_data/blimp_filtered/coordinate_structure_constraint_object_extraction.jsonl
Counting evaluation_data/blimp_filtered/determiner_noun_agreement_1.jsonl
Counting evaluation_data/blimp_filtered/determiner_noun_agreement_2.jsonl
Counting evaluation_data/blimp_filtered/determiner_noun_agreement_irregular_1.jsonl
Counting evaluation_data/blimp_filtered/determiner_noun_agreement_i