# Preparing the English language data

This file shows how we selected the English language data for AutomatergonE, the English version of Automatergon. Given the nature of web data, we still need to apply some filtering to the results.

In [1]:
import re

# Harvest every onset, nucleus and coda spelling that occurs in the word list.
onsets = set()
nuclei = set()
codas = set()

# Each entry: (pattern, group to keep, set that collects the result).
# NOTE(review): 'y' and uppercase letters are in neither character class, so
# lines containing them only match partially or not at all -- confirm that
# this is intended.
extractors = (
    # Consonant run at the start of the line, directly followed by a vowel.
    (re.compile("^([qwrtsdfgzxcvbnmhjklp]+)[aeuio]"), 1, onsets),
    # First run of vowels anywhere in the line.
    (re.compile("([aeuio]+)"), 0, nuclei),
    # Consonant run at the end of the line, directly preceded by a vowel run.
    # `$` also matches just before the trailing newline kept by file iteration.
    (re.compile("[aeuio]+([qwrtsdfgzxcvbnmhjklp]+)$"), 1, codas),
)

with open("./one-syllable-sorted-by-prevalence.txt") as wordlist:
    for entry in wordlist:
        for pattern, group, bucket in extractors:
            hit = pattern.search(entry)
            if hit is not None:
                bucket.add(hit.group(group))

# The five single vowels are always treated as the short nuclei; every other
# vowel run that was found counts as a long nucleus.
short_nuclei = {"a", "e", "i", "o", "u"}
long_nuclei = nuclei.difference(short_nuclei)

In [2]:
# Dump the four collections to langdata.txt: one heading per section, one
# item per line in sorted order, and a blank line between sections.
sections = (
    ("ONSETS:\n", onsets),
    ("SHORT NUCLEI:\n", short_nuclei),
    ("LONG NUCLEI:\n", long_nuclei),
    ("CODAS:\n", codas),
)

with open("langdata.txt", 'w') as out:
    for position, (heading, members) in enumerate(sections):
        if position > 0:
            # Separator goes between sections only; none after the last one.
            out.write('\n')
        out.write(heading)
        out.writelines(member + '\n' for member in sorted(members))

In [11]:
# Manually selected from the files:
# Each inventory is written as one whitespace-separated string and split into
# a list, so the element order below is the order of the resulting list.

# Consonant spellings allowed at the start of a syllable.
onsets = (
    "b bl br c ch chr cl cr d dr dw f fl fr "
    "g gh gl gn gr h j k kn l m n p ph phr "
    "pl pr ps r rh s sc scr sh shr sk sl sm sn "
    "sp sph spl spr st str sw t th thr thw tr tw "
    "v w wh wr z"
).split()

# Two-letter vowel spellings.
longvocals = "ai ea ee ei ia ie oa oo ou".split()

# Single-letter syllable endings.
shortcoda = "b d f g n m p r s t w x".split()

# Multi-letter syllable endings.
longcoda = (
    "bb bs ch ck cks ct dd dds ds dth "
    "ff ffs fs ft gg gh ght ghth ghts gs "
    "lb lch ld lf lfth lk ll lls lm lms "
    "ln lp ls lsh lst lt lth lts mb mbs "
    "mn mp mps mpt ms nch nd nds ng ngs "
    "nk nks nn ns nt nth nts ph ps pt "
    "pth rb rc rch rcs rd rds rf rg rk "
    "rks rld rm rms rmth rn rp rph rps rr "
    "rs rsh rst rsts rt rth rts sc sh sk "
    "sp ss st sts tch th ths ts tsch tt "
    "wls wn wns ws wt wth xt xth zz"
).split()