## Latex Table

Brief bit of code for converting the database info into a latex table.



In [1]:
import datasets
import os
import pandas as pd
import json
import numpy as np

config_file = '../childes_processor/phonemizer_config.json'
childes_folder = '../CHILDES-dataset'

collection_map = {
    'basque' : 'Other/Basque',
    'dutch' : 'DutchAfricaans/Dutch',
    'englishNA' : 'Eng-NA',
    'englishUK' : 'Eng-UK',
    'indonesian' : 'EastAsian/Indonesian',
    'mandarin' : 'Chinese/Mandarin',
    'serbian' : 'Slavic/Serbian',
    'estonian' : 'Other/Estonian',
    'cantonese' : 'Chinese/Cantonese',
    'polish' : 'Slavic/Polish',
    'swedish' : 'Scandinavian/Swedish',
    'portuguesept' : 'Romance/Portuguese',
    'portuguesebr' : 'Romance/Portuguese',
    'korean' : 'EastAsian/Korean',
    'italian' : 'Romance/Italian',
    'catalan' : 'Romance/Catalan',
    'croatian' : 'Slavic/Croatian',
    'welsh' : 'Celtic/Welsh',
    'icelandic' : 'Scandinavian/Icelandic',
    'danish' : 'Scandinavian/Danish',
    'norwegian' : 'Scandinavian/Norwegian',
    'hungarian' : 'Other/Hungarian',
    'romaninian' : 'Other/Romanian',
    'irish' : 'Celtic/Irish',
    'turkish' : 'Other/Turkish',
    'quechua' : 'Other/Quechua',
    'farsi' : 'Other/Farsi',
}

PHONEME_SETS = {
    'basque' : 2161,
    'cantonese' : 2309,
    'catalan' : 2555,
    'croatian' : 1139,
    'danish' : 2265,
    'dutch' : 2405,
    'englishna' : 2175,
    'englishuk' : 2252,
    'estonian' : 2181,
    'farsi' : 516,
    'french' : 2269,
    'german' : 2398,
    'hungarian' : 2191,
    'icelandic' : 2568,
    'indonesian' : 1690,
    'italian' : 1145,
    'irish' : 2521,
    'japanese' : 2196,
    'korean' : 423,
    'mandarin' : 2457,
    'norwegian' : 499,
    'polish' : 1046,
    'romanian' : 2443,
    'serbian' : 2499,
    'spanish' : 164,
    'swedish' : 1150,
    'portuguesept' : 2206,
    'portuguesebr' : 2207,
    'quechua' : 104,
    'turkish' : 2217,
    'welsh' : 2406,
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
phoible = pd.read_csv('../../../data/phoible.csv')
phonemes = phoible.Phoneme.unique()
TONES = '˧˥˩̰˨˩˦'

def get_phoneme_set(lines):
    token_counts = {}
    for line in lines:
        # Our tool combines tone markers with the preceeding vowel, we remove tone markers in our comparison so that we don't get many "unknown phonemes" consisting of a known vowel + tone marker.
        #line = line.replace('˧˥', '').replace('˧˩̰', '').replace('˩˧', '').replace('˨', '').replace('˥', '').replace('˧', '').replace('˧˥', '').replace('˧˩̰', '').replace('˩˧','').replace('˩','').replace('˦','')
        tokens = line.strip().split()
        for token in tokens:
            if token == 'WORD_BOUNDARY':
                continue
            if token not in token_counts:
                token_counts[token] = 0
            token_counts[token] += 1
    vowels = []
    consonants = []
    other = []
    for phoneme in token_counts:
        cmp_phoneme = phoneme
        if phoneme not in phonemes:
            has_tones = False
            for tone in TONES:
                if tone in phoneme:
                    has_tones = True
                    cmp_phoneme = cmp_phoneme.replace(tone, '')
            if not has_tones or cmp_phoneme not in phonemes:
                print(f'{phoneme} not in phoible')
                other.append(phoneme)
                continue
        if phoible[phoible.Phoneme == cmp_phoneme].SegmentClass.iloc[0] == 'vowel':
            vowels.append(phoneme)
        elif phoible[phoible.Phoneme == cmp_phoneme].SegmentClass.iloc[0] == 'consonant':
            consonants.append(phoneme)
        else:
            other.append(phoneme)

    return vowels, consonants, other

  phoible = pd.read_csv('../../../data/phoible.csv')


In [3]:
columns = ['Language', 'CHILDES Collection', 'Backend', 'Language Code', 'Inventory ID', 'Description',
           'Speakers', 'Total Utterances', 'Total Words', 'Total Phonemes',
           '% Child', 'Phonemes', 'Consonants', 'Vowels']

data = {column: [] for column in columns}

# load json config
config = json.load(open(config_file))

for config_name in datasets.get_dataset_config_names(childes_folder)[::-1]:
    print('\n' + config_name + '...')
    
    dataset = datasets.load_dataset('../CHILDES-dataset', config_name)
    dataset = datasets.concatenate_datasets([dataset['train'], dataset['valid']])

    config_name = 'EnglishNA' if config_name == 'English' else config_name
    language = config_name
    config_name = config_name.lower()

    collection = collection_map[config_name] if config_name in collection_map else language
    backend = config[config_name]['backend']
    lang_code = config[config_name]['language']
    inventory_id = PHONEME_SETS[config_name]
    num_corpora = len(set(dataset['corpus_id']))
    speakers = len(set(dataset['speaker_id']))
    total_utterances = len(dataset)
    total_words = sum([utterance.count('WORD_BOUNDARY') for utterance in list(dataset['phonemized_utterance'])])
    total_phonemes = sum([len(utterance.split()) for utterance in dataset['phonemized_utterance']]) - total_words
    percentage_child = len([c for c in dataset['is_child'] if c]) / total_utterances
    vowels, consonants, other = get_phoneme_set(dataset['phonemized_utterance'])
    n_phonemes = len(set(vowels + consonants + other))
    description = f"Taken from {num_corpora} corpora in the {collection} collection of CHILDES and phonemized using `{backend}` with language code `{lang_code}`."

    data['Language'].append(language)
    data['CHILDES Collection'].append(collection)
    data['Backend'].append(backend)
    data['Language Code'].append(lang_code)
    data['Inventory ID'].append(inventory_id)
    data['Description'].append(description)
    data['Speakers'].append(speakers)
    data['Total Utterances'].append(total_utterances)
    data['Total Words'].append(total_words)
    data['Total Phonemes'].append(total_phonemes)
    data['% Child'].append(percentage_child)
    data['Phonemes'].append(n_phonemes)
    data['Consonants'].append(len(consonants))
    data['Vowels'].append(len(vowels))

data

Polish...

Serbian...

́ not in phoible
Romanian...

PortugueseBr...

PortuguesePt...

Italian...

Catalan...

Quechua...

Norwegian...

Swedish...

Korean...

Welsh...

Irish...



Generating train split: 0 examples [00:00, ? examples/s]


KeyboardInterrupt: 

In [4]:
import re

text = """| Language | Description | Speakers | Utterances | Words | Phonemes
|:----|:-----|:-----|:----|:-----|:-----|
| English (US) | Taken from 44 corpora in Eng-NA collection of CHILDES and phonemized using `phonemizer` with language code `en-us`. | 2,692 | 1,645,797 | 7,096,724 | 22,107,530
| English (UK) | Taken from 14 corpora in Eng-NA collection of CHILDES and phonemized using `phonemizer` with language code `en-gb`. | 588 | 1,246,211 | 5,170,088 | 15,710,282
| German | Taken from 10 corpora in German collection of CHILDES and phonemized using `epitran` with language code `deu-Latn`. | 628 | 860,297 | 3,967,699 | 14,821,724
| Japanese | Taken from 9 corpora in Japanese collection of CHILDES and phonemized using `phonemizer` with language code `japanese`. | 329 | 557,215 | 1,773,816 | 7,100,307
| Indonesian | Taken from 1 corpus in EastAsian/Indonesian collection of CHILDES and phonemized using `epitran` with language code `ind-Latn`. | 389 | 534,525 | 2,122,372 | 6,369,459
| French | Taken from 11 corpora in French collection of CHILDES and phonemized using `phonemizer` with language code `fr-fr`. | 722 | 432,133 | 1,995,063 | 5,510,523
| Spanish | Taken from 18 corpora in Spanish collection of CHILDES and phonemized using `epitran` with language code `spa-Latn`. | 562 | 288,372 | 1,567,124 | 4,553,108
| Mandarin | Taken from 15 corpora in Chinese/Mandarin collection of CHILDES and phonemized using `pinyin_to_ipa` with language code `mandarin`. | 883 | 324,071 | 1,506,475 | 4,397,546
| Dutch | Taken from 4 corpora in DutchAfricaans/Dutch collection of CHILDES and phonemized using `phonemizer` with language code `nl`. | 78 | 261,938 | 1,106,865 | 3,585,608
| Serbian | Taken from 1 corpus in Slavic/Serbian collection of CHILDES and phonemized using `epitran` with language code `srp-Latn`. | 199 | 226,266 | 1,054,074 | 3,067,398
| Estonian | Taken from 9 corpora in Other/Estonian collection of CHILDES and phonemized using `phonemizer` with language code `et`. | 118 | 103,343 | 544,680 | 2,226,518
| Polish | Taken from 2 corpora in Slavic/Polish collection of CHILDES and phonemized using `phonemizer` with language code `pl`. | 466 | 80,412 | 381,940 | 1,599,152
| Cantonese | Taken from 2 corpora in Chinese/Cantonese collection of CHILDES and phonemized using `pingyam` with language code `cantonese`. | 80 | 136,727 | 591,314 | 1,425,686
| Swedish | Taken from 3 corpora in Scandinavian/Swedish collection of CHILDES and phonemized using `phonemizer` with language code `sv`. | 32 | 85,299 | 396,800 | 1,242,615
| Portuguese (Portugal) | Taken from 3 corpora in Romance/Portuguese collection of CHILDES and phonemized using `phonemizer` with language code `pt`. | 33 | 81,444 | 368,032 | 1,117,010
| Korean | Taken from 3 corpora in EastAsian/Korean collection of CHILDES and phonemized using `phonemizer` with language code `ko`. | 95 | 66,576 | 201,078 | 1,074,044
| Italian | Taken from 5 corpora in Romance/Italian collection of CHILDES and phonemized using `phonemizer` with language code `it`. | 92 | 57,542 | 264,479 | 996,701
| Catalan | Taken from 5 corpora in Romance/Catalan collection of CHILDES and phonemized using `phonemizer` with language code `ca`. | 159 | 56,588 | 248,999 | 839,462
| Croatian | Taken from 1 corpus in Slavic/Croatian collection of CHILDES and phonemized using `phonemizer` with language code `hr`. | 51 | 55,288 | 214,949 | 805,530
| Welsh | Taken from 2 corpora in Celtic/Welsh collection of CHILDES and phonemized using `phonemizer` with language code `cy`. | 65 | 55,871 | 269,295 | 785,569
| Icelandic | Taken from 2 corpora in Scandinavian/Icelandic collection of CHILDES and phonemized using `phonemizer` with language code `is`. | 15 | 50,657 | 197,519 | 751,804
| Danish | Taken from 1 corpus in Scandinavian/Danish collection of CHILDES and phonemized using `phonemizer` with language code `da`. | 25 | 48,976 | 192,527 | 579,972
| Norwegian | Taken from 2 corpora in Scandinavian/Norwegian collection of CHILDES and phonemized using `phonemizer` with language code `nb`. | 27 | 35,547 | 175,952 | 559,340
| Basque | Taken from 2 corpora in Other/Basque collection of CHILDES and phonemized using `phonemizer` with language code `eu`. | 150 | 36,614 | 135,866 | 565,633
| Hungarian | Taken from 3 corpora in Other/Hungarian collection of CHILDES and phonemized using `epitran` with language code `hun-Latn`. | 65 | 36,272 | 147,334 | 588,934
| Romanian | Taken from 2 corpora in Romance/Romanian collection of CHILDES and phonemized using `phonemizer` with language code `ro`. | 21 | 31,550 | 110,067 | 380,577
| Portuguese (Brazil) | Taken from 2 corpora in Romance/Portuguese collection of CHILDES and phonemized using `phonemizer` with language code `pt-br`. | 163 | 12,471 | 91,484 | 303,998
| Irish | Taken from 2 corpora in Celtic/Irish collection of CHILDES and phonemized using `phonemizer` with language code `ga`. | 20 | 18,256 | 88,388 | 278,558
| Turkish | Taken from 2 corpora in Other/Turkish collection of CHILDES and phonemized using `phonemizer` with language code `tr`. | 35 | 14,487 | 43,823 | 230,737
| Quechua | Taken from 2 corpora in Other/Quechua collection of CHILDES and phonemized using `phonemizer` with language code `qu`. | 7 | 13,425 | 33,102 | 204,692
| Farsi | Taken from 2 corpora in Other/Farsi collection of CHILDES and phonemized using `phonemizer` with language code `fa-latn`. | 23 | 13,467 | 28,080 | 115,089"""

new_lines = ['Language & CHILDES Collection & Backend & Language Code & Speakers & Utterances & Words & Phonemes + \\\\']  
summ = 0
for line in text.split('\n')[2:]:
    line_data = line.split('| ')
    new_lines.append(line_data[1] + ' & ')
    # find text between "in" and "collection" using regex
    collection = re.search(r'in (.*) collection', line_data[2]).group(1)
    corpora = re.search(r'from (.*) corp', line_data[2]).group(1)
    backend = re.search(r'using `(.*)` with', line_data[2]).group(1)
    language_code = re.search(r'code `(.*)`', line_data[2]).group(1)

    new_lines[-1] += collection + ' (' + corpora + ') & ' + backend + ' & ' + language_code + ' & ' + line_data[3] + ' & ' + line_data[4] + ' & ' + line_data[5] + ' & ' + line_data[6] + ' \\\\'
    summ += int(line_data[4].replace(',', ''))
print('\n'.join(new_lines) + '\n')
print(summ)

Language & CHILDES Collection & Backend & Language Code & Speakers & Utterances & Words & Phonemes + \\
English (US)  & Eng-NA (44) & phonemizer & en-us & 2,692  & 1,645,797  & 7,096,724  & 22,107,530 \\
English (UK)  & Eng-NA (14) & phonemizer & en-gb & 588  & 1,246,211  & 5,170,088  & 15,710,282 \\
German  & German (10) & epitran & deu-Latn & 628  & 860,297  & 3,967,699  & 14,821,724 \\
Japanese  & Japanese (9) & phonemizer & japanese & 329  & 557,215  & 1,773,816  & 7,100,307 \\
Indonesian  & EastAsian/Indonesian (1) & epitran & ind-Latn & 389  & 534,525  & 2,122,372  & 6,369,459 \\
French  & French (11) & phonemizer & fr-fr & 722  & 432,133  & 1,995,063  & 5,510,523 \\
Spanish  & Spanish (18) & epitran & spa-Latn & 562  & 288,372  & 1,567,124  & 4,553,108 \\
Mandarin  & Chinese/Mandarin (15) & pinyin_to_ipa & mandarin & 883  & 324,071  & 1,506,475  & 4,397,546 \\
Dutch  & DutchAfricaans/Dutch (4) & phonemizer & nl & 78  & 261,938  & 1,106,865  & 3,585,608 \\
Serbian  & Slavic/Serbi

In [7]:
def create_readme_table():
    text = """| Language | Description | Speakers | Utterances | Words | Phonemes
|:----|:-----|:-----|:----|:-----|:-----|"""

    # Sort data by number of utterances
    languages = [x for _, x in sorted(zip(data['Total Utterances'], data['Language']))][::-1]

    for language in languages:
        idx = data['Language'].index(language)
        text += f"\n| {data['Language'][idx]} | {data['Description'][idx]} | {data['Speakers'][idx]} | {data['Total Utterances'][idx]} | {data['Total Words'][idx]} | {data['Total Phonemes'][idx]} |"

    return text

In [8]:
print(create_readme_table())

| Language | Description | Speakers | Utterances | Words | Phonemes
|:----|:-----|:-----|:----|:-----|:-----|
| EnglishNA | Taken from 44 corpora in the EnglishNA collection of CHILDES and phonemized using `phonemizer` with language code `en-us`. | 2692 | 1645797 | 7096724 | 22107530 |
| EnglishUK | Taken from 14 corpora in the EnglishUK collection of CHILDES and phonemized using `phonemizer` with language code `en-gb`. | 588 | 1246211 | 5170088 | 15710282 |
| German | Taken from 10 corpora in the German collection of CHILDES and phonemized using `epitran` with language code `deu-Latn`. | 628 | 860297 | 3967699 | 14821812 |
| Japanese | Taken from 9 corpora in the Japanese collection of CHILDES and phonemized using `epitran` with language code `ja`. | 329 | 557215 | 1773816 | 7100307 |
| Indonesian | Taken from 1 corpora in the EastAsian/Indonesian collection of CHILDES and phonemized using `epitran` with language code `ind-Latn`. | 389 | 534525 | 1587847 | 6369459 |
| French | Taken f