# Corpus Analysis

This Python notebook contains various calculations relating to the corpora used in the study.

In [1]:
import os
import collections
import numpy as np

In [2]:
# Directory containing the phonemized CHILDES text files.
CHILDES_PHONEMIZED = "../../Corpora/CHILDES_wordseg/phonemized/"
# Full path of every directory entry whose name contains ".txt", in sorted order.
CHILDES_FILES = sorted(
    CHILDES_PHONEMIZED + name
    for name in os.listdir(CHILDES_PHONEMIZED)
    if ".txt" in name
)

# Path to the BR phoneme file.
BR_FILE = "../data/br-phonemes.txt"

In [14]:
def get_phones(files):
    """Return the distinct phone tokens found in `files`, in first-seen order.

    Every line is stripped and split on single spaces; each resulting token
    other than the word-boundary marker ';eword' is treated as a phone.
    Note that an empty line (or two consecutive spaces) therefore contributes
    the empty string '' as a token.

    Args:
        files: iterable of paths to phonemized text files.

    Returns:
        list of unique tokens, ordered by first occurrence across the files.
    """
    phones = []
    seen = set()  # O(1) membership test; `phones` preserves first-seen order
    for file in files:
        # The context manager closes the handle. The corpora hold IPA text,
        # so decode explicitly as UTF-8 instead of the locale default.
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                for p in line.strip().split(' '):
                    if p != ';eword' and p not in seen:
                        seen.add(p)
                        phones.append(p)
    return phones

def find_phone(phone):
    """Print the path of each file in CHILDES_FILES whose phone list contains `phone`."""
    for path in CHILDES_FILES:
        inventory = get_phones([path])
        if phone not in inventory:
            continue
        print(path)

In [4]:
# Language names used to select corpus files; each is matched as a substring
# of the file path.
LANGUAGES = [
    "Basque", "Cantonese", "Croatian", "Danish", "Dutch", "English",
    "Estonian", "Farsi", "French", "German", "Greek", "Hungarian",
    "Icelandic", "Indonesian", "Irish", "Italian", "Japanese", "Korean",
    "Mandarin", "Norwegian", "Portuguese", "Romanian", "Serbian", "Spanish",
    "Swedish", "Turkish",
]
# For each language, the first path in CHILDES_FILES that contains its name
# (IndexError if a language matches no file).
CHILDES_FILES_USED = [
    [path for path in CHILDES_FILES if language in path][0]
    for language in LANGUAGES
]

In [11]:
# Print every distinct token found across all files in CHILDES_FILES.
print(get_phones(CHILDES_FILES))

['a', 'ɣ', 'u', 'ɾ', 'l', 'tʃ', 'aʊ', 'k', 'e', 'o', 's̻', 'β', 'n', 'ɡ', 'eɪ', 'm', 'i', 'p', 't', 'θ', '', 'b', 'aɪ', 'ð', 'ʎ', 'oɪ', 'd', 'ts̻', 'ts̺', 'ɲ', 's̺', 'r', 'ɟ', 'f', 'j', 'ʃ', 'x', 'eʊ', 'c', 'aaɜ', 'z', 'eoɜ', 'onɡ', 'ei', 'inɡɜ', 'au', 'ou', 'yu', 'h', 'ai', 'ɹ', 'iː', 'ɑː', 'aa', 'aɜ', 'oɜ', 'ŋ', 'eɜ', 'w', 'aau', 'inɡ', 'uː', 's', 'ouɜ', 'iu', 'oiɜ', 'nɡ', 'eoiɜ', 'ɒ', 'ɛ', 'oenɡ', 'ə', 'eo', 'ɔːɔː', 'ui', 'onɡɜ', 'ʌ', 'oenɡɜ', 'yuɜ', 'eoi', 'anɡ', 'unɡ', 'iuɜ', 'aai', 'iɜ', 'aaiɜ', 'ɪ', 'unɡɜ', 'enɡɜ', 'dʒ', 'oi', 'anɡɜ', 'ɐɐ', 'v', 'eiɜ', 'auɜ', 'aiɜ', 'enɡ', 'aanɡ', 'əə', 'aauɜ', 'uɜ', 'ələl', 'ɔɪ', 'oeɜ', 'ɑːɑː', 'ii', 'ɜː', 'ʊ', 'əʊəʊ', 'uiɜ', 'ɔː', 'oe', 'eə', 'aɪə', 'æ', 'ɑ', 'ts', 'dʑ', 'l̩', 'tɕ', 'ʒ', 'aː', 'oʊ', 'ɐ̯', 'ʔu', 'ʋ', 'ʔœ', '?ɑ', 'ε', 'ʔe', 'ʔʌ', 'ʔi', 'ʔy', 'ɔ', 'ʁ', 'œ', 'ʔo', '?a', 'ɜ', 'y', 'ɒɒ', 'œː', 'ɑɑ', 'ʌː', 'j-', 'əʊ', 'oː', 'ʔeː', 'œy', 'ɵ', 'eː', 'ʌʊ', 'øː', 'ɪː', 'yʊ', 'tʲ', 'ɛɪ', 'pː', 'əʲ', 'ʔ', 'ɛː', 'ʀ', 'r̩', 'ɕ', '1', 'ɫ', 'p

In [52]:
# Print the corpus files in which the symbol "A" occurs as a phone token.
find_phone("A")

../../Corpora/CHILDES_wordseg/phonemized/Irish_Gaeltacht_Gaeltacht_10000utterances_phonemes.txt


In [26]:
# Phone inventory (and its size) for the selected file(s) whose path contains "Jap".
japanese_files = [path for path in CHILDES_FILES_USED if "Jap" in path]
a = get_phones(japanese_files)
print(a, len(a))

['ɯː', 'n', 'ɯ', 'j', 'o', 'i', 'ʃ', 'd', 'r', 'e', 'g', 'a', 'k', 'oː', 'h', 'dʒ', 'z', 't', 'aː', 'eː', 'm', '�', 'tʃ', 'w', 'b', 'pʲ', 'p', 't͡s', 'ɲ', 'ɸ', 'rʲ', 'kʲ', 'ç', 'bʲ', 'gʲ'] 35


In [5]:
def get_type_token_ratio(file):
    """Return the type-token ratio of the words in `file`.

    Each line is split on the ';eword' marker; every non-empty segment is a
    word, identified by the concatenation of its space-separated phones.
    The ratio is (number of distinct words) / (total number of words).

    Args:
        file: path to a phonemized text file.

    Returns:
        float in (0, 1].

    Raises:
        ZeroDivisionError: if the file contains no words.
    """
    types = collections.Counter()
    # The context manager closes the handle; decode the IPA text as UTF-8
    # rather than relying on the locale default.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            words = (segment.strip().split(' ') for segment in line.split(';eword'))
            # str.split(' ') yields [''] for an empty segment (never []), so
            # that is the only "no word" case to filter out.
            types.update(''.join(word) for word in words if word != [''])
    return len(types) / sum(types.values())

def get_phones_per_word(file):
    """Return the mean number of phones per word in `file`.

    Each line is split on the ';eword' marker; every segment containing at
    least one token is a word whose length is its number of
    whitespace-separated phones.

    Args:
        file: path to a phonemized text file.

    Returns:
        float, total phones divided by total words.

    Raises:
        ZeroDivisionError: if the file contains no words.
    """
    lengths = collections.Counter()
    # The context manager closes the handle; decode the IPA text as UTF-8
    # rather than relying on the locale default.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            for segment in line.split(';eword'):
                # Whitespace split (not split(' ')) so that consecutive
                # spaces do not produce empty tokens counted as phones.
                phones = segment.split()
                if phones:
                    lengths[len(phones)] += 1
    total_phones = sum(length * count for length, count in lengths.items())
    return total_phones / sum(lengths.values())

In [6]:
# Mean phones-per-word of each selected corpus, then the mean and standard
# deviation of those values across corpora.
ppw = np.array([get_phones_per_word(path) for path in CHILDES_FILES_USED])
print(ppw.mean(), ppw.std())

3.7085392993357003 0.6808417183682755


In [7]:
# Type-token ratio of each selected corpus, then the mean and standard
# deviation of those values across corpora.
ttrs = np.array([get_type_token_ratio(path) for path in CHILDES_FILES_USED])
print(ttrs.mean(), ttrs.std())

0.09089396847085207 0.049873222560843575


In [8]:
# Type-token ratio of the BR phoneme file.
print(get_type_token_ratio(BR_FILE))

0.039668034874314646


# Check Syllabic Sounds

In [54]:
# Symbols treated as syllabic: a phone token counts as syllabic if it contains
# any of these as a substring (see check_for_syllabic).
# NOTE(review): some entries look like duplicates (e.g. 'u', 'œ', 'ɵ', 'ɐ');
# they may be distinct code points, so they are left untouched -- confirm.
# 'A' is appended separately from the main list.
IPA_SYLLABIC_SOUNDS = ['ɪ','ɐ','ʊ','i','ĩ','ĭ','ɨ','y','ỹ','ȳ','u','ʉ','ɯ','u','ʏ','ũ','ŭ','ʌ','ɞ','ü',
                        'ø','ɵ','ɤ','o','œ','œ','ö','œ','ɔ','ọ','õ','ŏ','ɵ','e','ɘ','ə','ɜ','ẹ','ɛ','ε','ɚ','ẽ','ĕ',
                        'æ','ɐ','a','ä','ɑ','ɒ','ã','ă','α']+['A']

In [35]:
def check_for_syllabic(file, syllabic_sounds=None):
    """Count the words in `file` that contain no syllabic sound.

    Each line is stripped and split on single spaces. Tokens are accumulated
    into a word until the ';eword' marker is reached; the word is flagged if
    none of its tokens contains any entry of `syllabic_sounds` as a substring.
    Tokens after the last ';eword' on a line are never checked, because the
    state is reset at the start of every line.

    Args:
        file: path to a phonemized text file.
        syllabic_sounds: iterable of strings regarded as syllabic; defaults
            to the module-level IPA_SYLLABIC_SOUNDS.

    Returns:
        collections.Counter mapping each flagged word (its tokens
        concatenated) to the number of times it occurs.
    """
    if syllabic_sounds is None:
        syllabic_sounds = IPA_SYLLABIC_SOUNDS
    wrong = collections.Counter()
    # The context manager closes the handle; decode the IPA text as UTF-8
    # rather than relying on the locale default.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            word = ""
            is_syllabic = False
            for p in line.strip().split(' '):
                if p == ";eword":
                    if not is_syllabic:
                        wrong[word] += 1
                    is_syllabic = False
                    word = ""
                else:
                    word += p
                    # Skip the scan once the word is known to be syllabic,
                    # and stop at the first matching sound.
                    if not is_syllabic and any(s in p for s in syllabic_sounds):
                        is_syllabic = True
    return wrong


In [69]:
# Inspect one selected corpus (index 20 is the "Portuguese" entry of LANGUAGES):
# show its path, the words lacking a syllabic sound, and how many such tokens
# there are in total.
i = 20
selected_file = CHILDES_FILES_USED[i]
print(selected_file)
l = check_for_syllabic(selected_file)
print(l)
print(sum(l.values()))

../../Corpora/CHILDES_wordseg/phonemized/PortugueseBR_Florianopolis_Florianopolis_10000utterances_phonemes.txt
Counter({'sj': 69, 'dʒj': 60, 'kj': 18, 'mj': 7, 'tʃj': 3, 'p': 2, 'fj': 1, 'm': 1, 'vj': 1, 'lj': 1})
163
