## Tables

Brief bit of code for converting the database info into a README or latex table.



In [1]:
import datasets
import os
import pandas as pd
import json
import numpy as np

config_file = '../childes_processor/phonemizer_config.json'
childes_folder = '../CHILDES-dataset'

collection_map = {
    'basque' : 'Other/Basque',
    'dutch' : 'DutchAfricaans/Dutch',
    'englishNA' : 'Eng-NA',
    'englishUK' : 'Eng-UK',
    'indonesian' : 'EastAsian/Indonesian',
    'mandarin' : 'Chinese/Mandarin',
    'serbian' : 'Slavic/Serbian',
    'estonian' : 'Other/Estonian',
    'cantonese' : 'Chinese/Cantonese',
    'polish' : 'Slavic/Polish',
    'swedish' : 'Scandinavian/Swedish',
    'portuguesept' : 'Romance/Portuguese',
    'portuguesebr' : 'Romance/Portuguese',
    'korean' : 'EastAsian/Korean',
    'italian' : 'Romance/Italian',
    'catalan' : 'Romance/Catalan',
    'croatian' : 'Slavic/Croatian',
    'welsh' : 'Celtic/Welsh',
    'icelandic' : 'Scandinavian/Icelandic',
    'danish' : 'Scandinavian/Danish',
    'norwegian' : 'Scandinavian/Norwegian',
    'hungarian' : 'Other/Hungarian',
    'romaninian' : 'Other/Romanian',
    'irish' : 'Celtic/Irish',
    'turkish' : 'Other/Turkish',
    'quechua' : 'Other/Quechua',
    'farsi' : 'Other/Farsi',
}

PHONEME_SETS = {
    'basque' : 2161,
    'cantonese' : 2309,
    'catalan' : 2555,
    'croatian' : 1139,
    'danish' : 2265,
    'dutch' : 2405,
    'englishna' : 2175,
    'englishuk' : 2252,
    'estonian' : 2181,
    'farsi' : 516,
    'french' : 2269,
    'german' : 2398,
    'hungarian' : 2191,
    'icelandic' : 2568,
    'indonesian' : 1690,
    'italian' : 1145,
    'irish' : 2521,
    'japanese' : 2196,
    'korean' : 423,
    'mandarin' : 2457,
    'norwegian' : 499,
    'polish' : 1046,
    'romanian' : 2443,
    'serbian' : 2499,
    'spanish' : 164,
    'swedish' : 1150,
    'portuguesept' : 2206,
    'portuguesebr' : 2207,
    'quechua' : 104,
    'turkish' : 2217,
    'welsh' : 2406,
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
phoible = pd.read_csv('../../../data/phoible.csv')
phonemes = phoible.Phoneme.unique()
TONES = '˧˥˩̰˨˩˦'

def get_phoneme_set(lines):
    token_counts = {}
    for line in lines:
        # Our tool combines tone markers with the preceeding vowel, we remove tone markers in our comparison so that we don't get many "unknown phonemes" consisting of a known vowel + tone marker.
        #line = line.replace('˧˥', '').replace('˧˩̰', '').replace('˩˧', '').replace('˨', '').replace('˥', '').replace('˧', '').replace('˧˥', '').replace('˧˩̰', '').replace('˩˧','').replace('˩','').replace('˦','')
        tokens = line.strip().split()
        for token in tokens:
            if token == 'WORD_BOUNDARY':
                continue
            if token not in token_counts:
                token_counts[token] = 0
            token_counts[token] += 1
    vowels = []
    consonants = []
    other = []
    for phoneme in token_counts:
        cmp_phoneme = phoneme
        if phoneme not in phonemes:
            has_tones = False
            for tone in TONES:
                if tone in phoneme:
                    has_tones = True
                    cmp_phoneme = cmp_phoneme.replace(tone, '')
            if not has_tones or cmp_phoneme not in phonemes:
                print(f'{phoneme} not in phoible')
                other.append(phoneme)
                continue
        if phoible[phoible.Phoneme == cmp_phoneme].SegmentClass.iloc[0] == 'vowel':
            vowels.append(phoneme)
        elif phoible[phoible.Phoneme == cmp_phoneme].SegmentClass.iloc[0] == 'consonant':
            consonants.append(phoneme)
        else:
            other.append(phoneme)

    return vowels, consonants, other

  phoible = pd.read_csv('../../../data/phoible.csv')


In [3]:
columns = ['Language', 'CHILDES Collection', 'Backend', 'Language Code', 'Inventory ID', 'Description',
           'Speakers', 'Utterances', 'Words', 'Phonemes',
           '% Child', 'Phoneme Types', 'Consonants', 'Vowels']

data = {column: [] for column in columns}

# load json config
config = json.load(open(config_file))

for config_name in datasets.get_dataset_config_names(childes_folder)[::-1]:
    print('\n' + config_name + '...')
    
    dataset = datasets.load_dataset('../CHILDES-dataset', config_name)
    dataset = datasets.concatenate_datasets([dataset['train'], dataset['valid']])

    config_name = 'EnglishNA' if config_name == 'English' else config_name
    language = config_name
    config_name = config_name.lower()

    collection = collection_map[config_name] if config_name in collection_map else language
    backend = config[config_name]['backend']
    lang_code = config[config_name]['language']
    inventory_id = PHONEME_SETS[config_name]
    num_corpora = len(set(dataset['corpus_id']))
    speakers = len(set(dataset['speaker_id']))
    total_utterances = len(dataset)
    total_words = sum([utterance.count('WORD_BOUNDARY') for utterance in list(dataset['phonemized_utterance'])])
    total_phonemes = sum([len(utterance.split()) for utterance in dataset['phonemized_utterance']]) - total_words
    percentage_child = 100 * len([c for c in dataset['is_child'] if c]) / total_utterances
    vowels, consonants, other = get_phoneme_set(dataset['phonemized_utterance'])
    n_phonemes = len(set(vowels + consonants + other))
    description = f"Taken from {num_corpora} corpora in the {collection} collection of CHILDES and phonemized using `{backend}` with language code `{lang_code}`."

    data['Language'].append(language)
    data['CHILDES Collection'].append(collection)
    data['Backend'].append(backend)
    data['Language Code'].append(lang_code)
    data['Inventory ID'].append(inventory_id)
    data['Description'].append(description)
    data['Speakers'].append(speakers)
    data['Utterances'].append(total_utterances)
    data['Words'].append(total_words)
    data['Phonemes'].append(total_phonemes)
    data['% Child'].append(percentage_child)
    data['Phoneme Types'].append(n_phonemes)
    data['Consonants'].append(len(consonants))
    data['Vowels'].append(len(vowels))


Polish...
z̻ʲ not in phoible

Serbian...
ä̈ not in phoible

Romanian...

PortugueseBr...

PortuguesePt...

Italian...

Catalan...

Quechua...

Norwegian...

Swedish...

Korean...

Welsh...
ɪuː not in phoible

Irish...

Indonesian...

Icelandic...

Farsi...

Turkish...

Hungarian...

Basque...

Danish...

Croatian...

Estonian...

Cantonese...

Japanese...

Mandarin...

Dutch...

Spanish...
î not in phoible
k̈ not in phoible
ê̞ not in phoible

German...
oː̈ not in phoible
uː̂ not in phoible
A not in phoible
Ø not in phoible
Z not in phoible
I not in phoible

French...

EnglishUK...

English...


In [4]:
def create_readme_table(columns):
    text = " | ".join(columns) + "\n"
    text += "|:----" * len(columns) + "|"

    # Sort data by number of utterances
    languages = [x for _, x in sorted(zip(data['Phonemes'], data['Language']))][::-1]

    for language in languages:
        idx = data['Language'].index(language)
        text += "\n"
        for column in columns:
            if isinstance(data[column][idx], int):
                text += f"| {data[column][idx]:,}"
            elif isinstance(data[column][idx], float):
                text += f"| {data[column][idx]:.2f}"
            else:
                text += f"| {data[column][idx]}"
            #text += f"| {data[column][idx]}"
        #text += f"\n| {data['Language'][idx]} | {data['Description'][idx]} | {data['Speakers'][idx]} | {data['Total Utterances'][idx]:,} | {data['Total Words'][idx]:,} | {data['Total Phonemes'][idx]:,} |"
        # Number of phonemes with comma between thousands
    return text

def create_latex_table():
    text = """| Language | Description | Speakers | Utterances | Words | Phonemes
|:----|:-----|:-----|:----|:-----|:-----|"""

    # Sort data by number of utterances
    languages = [x for _, x in sorted(zip(data['Total Phonemes'], data['Language']))][::-1]

    for language in languages:
        idx = data['Language'].index(language)
        text += f"\n| {data['Language'][idx]} | {data['Description'][idx]} | {data['Speakers'][idx]} | {data['Total Utterances'][idx]:,} | {data['Total Words'][idx]:,} | {data['Total Phonemes'][idx]:,} |"
        # Number of phonemes with comma between thousands
    return text

In [5]:
print(create_readme_table(['Language', 'Description', 'Speakers', 'Utterances', 'Words', 'Phonemes', '% Child']))

Language | Description | Speakers | Utterances | Words | Phonemes | % Child
|:----|:----|:----|:----|:----|:----|:----|
| EnglishNA| Taken from 49 corpora in the EnglishNA collection of CHILDES and phonemized using `phonemizer` with language code `en-us`.| 3,687| 2,564,614| 9,993,744| 30,986,218| 35.83
| EnglishUK| Taken from 16 corpora in the EnglishUK collection of CHILDES and phonemized using `phonemizer` with language code `en-gb`.| 869| 2,043,115| 7,147,541| 21,589,844| 39.00
| German| Taken from 10 corpora in the German collection of CHILDES and phonemized using `epitran` with language code `deu-Latn`.| 829| 1,525,559| 5,825,166| 21,442,576| 43.61
| Japanese| Taken from 11 corpora in the Japanese collection of CHILDES and phonemized using `phonemizer` with language code `ja`.| 489| 998,642| 2,970,674| 11,985,729| 44.20
| Indonesian| Taken from 1 corpora in the EastAsian/Indonesian collection of CHILDES and phonemized using `epitran` with language code `ind-Latn`.| 438| 813,795| 2

In [6]:
new_lines = ['Language & CHILDES Collection & Backend & Language Code & Speakers & Utterances & Words & Phonemes + \\\\']  
summ = 0
for line in text.split('\n')[2:]:
    line_data = line.split('| ')
    new_lines.append(line_data[1] + ' & ')
    # find text between "in" and "collection" using regex
    collection = re.search(r'in (.*) collection', line_data[2]).group(1)
    corpora = re.search(r'from (.*) corp', line_data[2]).group(1)
    backend = re.search(r'using `(.*)` with', line_data[2]).group(1)
    language_code = re.search(r'code `(.*)`', line_data[2]).group(1)

    new_lines[-1] += collection + ' (' + corpora + ') & ' + backend + ' & ' + language_code + ' & ' + line_data[3] + ' & ' + line_data[4] + ' & ' + line_data[5] + ' & ' + line_data[6] + ' \\\\'
    summ += int(line_data[4].replace(',', ''))
print('\n'.join(new_lines) + '\n')
print(summ)

NameError: name 'text' is not defined

In [23]:
data['% Child']

[0.6325870419446221,
 0.2913797153192089,
 0.42617583936561054,
 0.44422656981148895,
 0.39466192964331104,
 0.3901929822702176,
 0.3649147615680729,
 0.40058936464705097,
 0.42579071495493165,
 0.44634048187766123,
 0.3676351858360008,
 0.6918176201928381,
 0.34373427277302465,
 0.34316996295135754,
 0.352054847085609,
 0.40445761287754833,
 0.5058498482109356,
 0.4795236045343665,
 0.48818094133105944,
 0.4170842309477618,
 0.3923861438368208,
 0.44713007099255836,
 0.3354023983006771,
 0.44202727303678396,
 0.3889395899249918,
 0.3507901415711623,
 0.45927681564874334,
 0.436077529613735,
 0.40074827941496644,
 0.3900436343524471,
 0.35826717003026576]