In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Extract-present/past-tense-orthography-pairings" data-toc-modified-id="Extract-present/past-tense-orthography-pairings-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Extract present/past tense orthography pairings</a></span></li><li><span><a href="#Grab-Transcriptions-from-CMU" data-toc-modified-id="Grab-Transcriptions-from-CMU-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Grab Transcriptions from CMU</a></span></li><li><span><a href="#Aligning-the-morphological-DB-with-the-CMU-dictionary" data-toc-modified-id="Aligning-the-morphological-DB-with-the-CMU-dictionary-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Aligning the morphological DB with the CMU dictionary</a></span><ul class="toc-item"><li><span><a href="#Export-to-file" data-toc-modified-id="Export-to-file-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Export to file</a></span></li></ul></li></ul></div>

# Extract present/past tense orthography pairings

Past/present tense pairings are taken from:
 - http://www.linguistics.ucla.edu/people/hayes/rulesvsanalogy/RuleBasedLearnerEnglish.zip

In [2]:
import csv

In [3]:
%ls

 CELEXFull.in
 CELEXFull_w_CMU_transcriptions.tsv
 cmudict-0.7b_IPA_destressed.tsv
 EngVrbs_present_past.tsv
'Extracting phonological forms of English verbs.ipynb'


In [4]:
db_fn = 'CELEXFull.in'

In [5]:
!cat -n CELEXFull.in | head -30

     1	Phonological Learner File
     2	Adam Albright/Bruce Hayes
     3	English:  CELEX freq 10 or greater
     4	Tue Apr 21 21:27:37  2001
     5	Tue Apr 21 21:27:37  2001
     6	This is a new data set of all the verbs in CELEX with COBUILD freq 10 or higher.
     7	Morphological categories:
     8		Present	Past
     9	Input forms:
    10	d�u	d�Id	80717	do	did	irreg
    11	s�e	s�Ed	76541	say	said	irreg
    12	g�o	w�Ent	51830	go	went	irreg
    13	g�Et	g�at	42717	get	got	irreg
    14	n�o	n�u	38013	know	knew	irreg
    15	s�i	sȍ	36958	see	saw	irreg
    16	T�INk	Tȍt	35874	think	thought	irreg
    17	k��m	k�em	35152	come	came	irreg
    18	t�ek	t�Uk	34323	take	took	irreg
    19	l�Uk	l�Ukt	24165	look	looked	reg
    20	g�Iv	g�ev	22921	give	gave	irreg
    21	wȍnt	wȍnt�d	20077	want	wanted	reg
    22	f�Ynd	f�Wnd	19525	find	found	irreg
    23	t�El	t�old	19040	tell	told	irreg
    24	s�im	s�imd	15911	seem	seemed	reg
    25	f�il	f�Elt	15489	feel	felt	ir

In [6]:
# Read the learner file (decoded as latin-1) into a list with one entry per line,
# removing the trailing newline character from each entry.
with open(db_fn, mode='r', encoding='latin-1') as the_file:
    db_in = [raw_line.rstrip('\n') for raw_line in the_file]

In [7]:
db_in[:10]

['Phonological Learner File',
 'Adam Albright/Bruce Hayes',
 'English:  CELEX freq 10 or greater',
 'Tue Apr 21 21:27:37  2001',
 'Tue Apr 21 21:27:37  2001',
 'This is a new data set of all the verbs in CELEX with COBUILD freq 10 or higher.',
 'Morphological categories:',
 '\tPresent\tPast',
 'Input forms:',
 'dÈu\tdÈId\t80717\tdo\tdid\tirreg']

In [8]:
db_in[9:19]

['dÈu\tdÈId\t80717\tdo\tdid\tirreg',
 'sÈe\tsÈEd\t76541\tsay\tsaid\tirreg',
 'gÈo\twÈEnt\t51830\tgo\twent\tirreg',
 'gÈEt\tgÈat\t42717\tget\tgot\tirreg',
 'nÈo\tnÈu\t38013\tknow\tknew\tirreg',
 'sÈi\tsÈ\x8d\t36958\tsee\tsaw\tirreg',
 'TÈINk\tTÈ\x8dt\t35874\tthink\tthought\tirreg',
 'kÈÃm\tkÈem\t35152\tcome\tcame\tirreg',
 'tÈek\ttÈUk\t34323\ttake\ttook\tirreg',
 'lÈUk\tlÈUkt\t24165\tlook\tlooked\treg']

In [9]:
# The data rows begin on the line right after the 'Input forms:' header.
# Locate the header rather than hard-coding its position, so that a change in
# the length of the file preamble cannot silently shift the slice.
input_header_index = db_in.index('Input forms:')
db = db_in[input_header_index + 1:]

In [10]:
db[1]
db[1].split('\t')

'sÈe\tsÈEd\t76541\tsay\tsaid\tirreg'

['sÈe', 'sÈEd', '76541', 'say', 'said', 'irreg']

In [11]:
len(db)

4489

In [12]:
db[4243:4263]

['swÈab\tswÈabd\t10\tswab\tswabbed\treg',
 'swÈat\tswÈat«d\t10\tswot\tswotted\treg',
 'tÈo\ttÈod\t10\ttoe\t\treg',
 'trÈ\x8dl\ttrÈ\x8dld\t10\ttrawl\ttrawled\treg',
 'ÃndÕplÈe\tÃndÕplÈed\t10\tunderplay\tunderplayed\treg',
 'ÃpbrÈed\tÃpbrÈed«d\t10\tupbraid\tupbraided\treg',
 'vÈ\x8dnt\tvÈ\x8dnt«d\t10\tvaunt\tvaunted\treg',
 'vÈEJItet\tvÈEJItet«d\t10\tvegetate\tvegetated\treg',
 'vÈali\tvÈalid\t10\tvolley\tvolleyed\treg',
 'wÈInC\twÈInCt\t10\twinch\twinched\treg',
 'Test forms:',
 'splÈIN',
 'skrÈIN',
 'sprÈINk',
 'klÈid',
 'prÈid',
 'kwÈid',
 'klÈo',
 'frÈo',
 'plÈer']

In [13]:
db[4253:4263]

['Test forms:',
 'splÈIN',
 'skrÈIN',
 'sprÈINk',
 'klÈid',
 'prÈid',
 'kwÈid',
 'klÈo',
 'frÈo',
 'plÈer']

In [14]:
# Everything from the 'Test forms:' marker onwards is a list of bare test items
# rather than present/past rows, so the data is cut at that marker instead of at
# a hard-coded row number. The membership guard keeps the cell re-runnable: once
# the marker has been trimmed away, `db` is left as it is.
if 'Test forms:' in db:
    db = db[:db.index('Test forms:')]
db[-1]

'wÈInC\twÈInCt\t10\twinch\twinched\treg'

In [15]:
len(db)

4253

In [16]:
def process_line(l):
    """Split one tab-separated data row into a dict keyed by column name.

    Parameters
    ----------
    l : str
        A single row of the learner file with at least six tab-separated
        fields, in the order: present transcription, past transcription,
        count, present orthography, past orthography, regular/irregular tag.

    Returns
    -------
    dict
        Maps 'Present_Transcription', 'Past_Transcription', 'Count',
        'Present_Orthography', 'Past_Orthography' and 'Regular_or_Irregular'
        to the corresponding field. Every value is kept as the raw string
        (the count is not converted to a number). Fields beyond the sixth
        are ignored.

    Raises
    ------
    IndexError
        If the row has fewer than six tab-separated fields.
    """
    columns = l.split('\t')
    field_names = ('Present_Transcription',
                   'Past_Transcription',
                   'Count',
                   'Present_Orthography',
                   'Past_Orthography',
                   'Regular_or_Irregular')
    # Index by position (rather than zip) so a short row still raises IndexError.
    return {name: columns[position] for position, name in enumerate(field_names)}


In [17]:
# Parse every data row into a column-name -> value dict.
morph_db = [process_line(db_row) for db_row in db]
len(morph_db)

4253

In [18]:
morph_db[0]

{'Present_Transcription': 'dÈu',
 'Past_Transcription': 'dÈId',
 'Count': '80717',
 'Present_Orthography': 'do',
 'Past_Orthography': 'did',
 'Regular_or_Irregular': 'irreg'}

In [19]:
# Every distinct spelling that occurs in either tense column of the morphological DB.
present_spellings = {entry['Present_Orthography'] for entry in morph_db}
past_spellings = {entry['Past_Orthography'] for entry in morph_db}
orthographic_wordforms = present_spellings | past_spellings
len(orthographic_wordforms)

8443

# Grab Transcriptions from CMU

The copy of the CMU dictionary used here was generated as described at https://github.com/emeinhardt/cmu-ipa.

In [20]:
%ls

 CELEXFull.in
 CELEXFull_w_CMU_transcriptions.tsv
 cmudict-0.7b_IPA_destressed.tsv
 EngVrbs_present_past.tsv
'Extracting phonological forms of English verbs.ipynb'


In [21]:
transcription_lexicon_fn = 'cmudict-0.7b_IPA_destressed.tsv'

In [22]:
# Load the tab-separated transcription lexicon as a list of row mappings, one per
# line after the header. Quote handling is switched off (csv.QUOTE_NONE).
with open(transcription_lexicon_fn, 'r', newline='', encoding='utf-8') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    lexicon_in = list(my_reader)

len(lexicon_in)
lexicon_in[0].keys()
lexicon_in[0]

133854

odict_keys(['Orthography', 'Transcription'])

OrderedDict([('Orthography', '!EXCLAMATION-POINT'),
             ('Transcription', 'ɛ.k.s.k.l.ʌ.m.eɪ.ʃ.ʌ.n.p.ɔɪ.n.t')])

In [23]:
# Lower-cased spellings of every lexicon entry, for case-insensitive lookup.
transcription_orthographic_wordforms_lc = {entry['Orthography'].lower()
                                           for entry in lexicon_in}
len(transcription_orthographic_wordforms_lc)

133854

In [24]:
# Spellings from the morphological DB that have no entry among the lower-cased
# lexicon spellings.
orthographic_wordforms_in_morph_db_not_in_transcription_dict = orthographic_wordforms.difference(
    transcription_orthographic_wordforms_lc)
len(orthographic_wordforms_in_morph_db_not_in_transcription_dict)
orthographic_wordforms_in_morph_db_not_in_transcription_dict

822

{'',
 'KO',
 "KO'd",
 'absented',
 'acclimatize',
 'acclimatized',
 'adduce',
 'adduced',
 'adjoined',
 'adjure',
 'adjured',
 'aerate',
 'aerated',
 'ailed',
 'alighted',
 'anaesthetize',
 'anaesthetized',
 'analyse',
 'analysed',
 'antedate',
 'antedated',
 'aped',
 'appal',
 'arrogated',
 'articled',
 'asphalted',
 'assented',
 'atoned',
 'averred',
 'backslid',
 'balloted',
 'banqueted',
 'bayoneted',
 'bedevilled',
 'befitted',
 'begrudged',
 'belay',
 'belayed',
 'besought',
 'bespoke',
 'bestride',
 'betoken',
 'betokened',
 'betted',
 'biassed',
 'billeted',
 'bisect',
 'bisected',
 'bitched',
 'bivouacked',
 'blanched',
 'bleat',
 'bleated',
 'blubbered',
 'blustered',
 'bobbed',
 'boded',
 'boozed',
 'bopped',
 'bottle-fed',
 'bottle-feed',
 'bracketed',
 'brayed',
 'breakfasted',
 'broadcasted',
 'brooked',
 'budded',
 'buggered',
 'bumbled',
 'bunged',
 'bungle',
 'burgle',
 'burgled',
 'burked',
 'burped',
 'burrowed',
 'bustled',
 'butted',
 'cackled',
 'cadge',
 'cadged'

In [25]:
# Copy of the lexicon whose spellings are lower-cased; transcriptions are untouched.
lexicon_lc = [{'Orthography': entry['Orthography'].lower(),
               'Transcription': entry['Transcription']}
              for entry in lexicon_in]

In [26]:
def findMatchingTranscriptions(orth_lc):
    """Return every row of ``lexicon_lc`` whose 'Orthography' equals ``orth_lc``.

    The rows are returned in lexicon order, in a fresh list; an unknown
    spelling yields an empty list.

    Scanning the whole lexicon on every call makes the per-word-form lookups
    elsewhere in the notebook quadratic, so the rows are grouped by spelling
    once and the grouping is cached on the function object. The cache is
    rebuilt whenever ``lexicon_lc`` is rebound to a different list or its
    length changes. NOTE: an in-place edit of ``lexicon_lc`` that keeps its
    length the same is not detected; re-run the cell defining this function
    to force a rebuild in that case.

    Parameters
    ----------
    orth_lc : str
        Lower-cased spelling to look up.

    Returns
    -------
    list of dict
        The matching lexicon rows (the same row objects held by ``lexicon_lc``).
    """
    this = findMatchingTranscriptions
    stale = (getattr(this, '_indexed_lexicon', None) is not lexicon_lc
             or getattr(this, '_indexed_size', None) != len(lexicon_lc))
    if stale:
        index = {}
        for r in lexicon_lc:
            index.setdefault(r['Orthography'], []).append(r)
        this._index = index
        this._indexed_lexicon = lexicon_lc
        this._indexed_size = len(lexicon_lc)
    # Hand back a new list so callers cannot corrupt the cached grouping.
    return list(this._index.get(orth_lc, ()))

# Aligning the morphological DB with the CMU dictionary

In [27]:
# Spellings present both in the morphological DB and in the lower-cased lexicon.
alignable_wordforms = orthographic_wordforms & transcription_orthographic_wordforms_lc
len(alignable_wordforms)

7621

In [28]:
# Alignable spellings that map to exactly one lexicon row.
alignable_wordforms_w_unique_transcription = set(
    filter(lambda wordform: len(findMatchingTranscriptions(wordform)) == 1,
           alignable_wordforms))
len(alignable_wordforms_w_unique_transcription)

7621

In [29]:
def findMatchingTranscription(orth_lc):
    """Return the first row that ``findMatchingTranscriptions`` yields for ``orth_lc``.

    Raises IndexError when there is no matching row, since the empty result
    list is indexed directly.
    """
    matches = findMatchingTranscriptions(orth_lc)
    return matches[0]

In [30]:
len(morph_db)
# Keep only the rows whose present AND past spellings are both alignable.
alignable_rows = [entry for entry in morph_db
                  if {entry['Present_Orthography'], entry['Past_Orthography']} <= alignable_wordforms]
len(alignable_rows)

4253

3542

In [31]:
alignable_rows[0]

{'Present_Transcription': 'dÈu',
 'Past_Transcription': 'dÈId',
 'Count': '80717',
 'Present_Orthography': 'do',
 'Past_Orthography': 'did',
 'Regular_or_Irregular': 'irreg'}

In [32]:
findMatchingTranscription('did')

{'Orthography': 'did', 'Transcription': 'd.ɪ.d'}

In [33]:
def replaceTranscriptions(alignable_row):
    """Return a copy of a row with both transcriptions taken from the lexicon.

    The 'Present_Transcription' and 'Past_Transcription' values are replaced
    by the 'Transcription' of the row that ``findMatchingTranscription``
    returns for the row's present and past spellings respectively. All other
    fields are carried over unchanged.

    The input row is NOT modified: the rows passed in are the very same dict
    objects held by ``morph_db`` / ``alignable_rows``, so editing them in place
    would overwrite the original transcriptions there as well.

    Parameters
    ----------
    alignable_row : dict
        A row with at least the keys 'Present_Orthography' and
        'Past_Orthography'.

    Returns
    -------
    dict
        A new dict with the replaced transcriptions.

    Raises
    ------
    Whatever ``findMatchingTranscription`` raises when a spelling has no match.
    """
    r = dict(alignable_row)  # shallow copy; the values are plain strings
    r['Present_Transcription'] = findMatchingTranscription(r['Present_Orthography'])['Transcription']
    r['Past_Transcription'] = findMatchingTranscription(r['Past_Orthography'])['Transcription']
    return r

In [34]:
# Swap in the lexicon transcriptions for every alignable row.
replacedDB = [replaceTranscriptions(entry) for entry in alignable_rows]
replacedDB[0]

{'Present_Transcription': 'd.u',
 'Past_Transcription': 'd.ɪ.d',
 'Count': '80717',
 'Present_Orthography': 'do',
 'Past_Orthography': 'did',
 'Regular_or_Irregular': 'irreg'}

## Export to file

In [35]:
replaced_db_fn = 'CELEXFull_w_CMU_transcriptions.tsv'

In [36]:
# Write the re-transcribed rows as a tab-separated file: one header line, then one line per row.
replaced_db_columns = ['Present_Transcription',
                       'Past_Transcription',
                       'Count',
                       'Present_Orthography',
                       'Past_Orthography',
                       'Regular_or_Irregular']
with open(replaced_db_fn, 'w', newline='', encoding='utf-8') as tsvfile:
    writer = csv.DictWriter(tsvfile,
                            fieldnames=replaced_db_columns,
                            delimiter='\t',
                            quoting=csv.QUOTE_NONE,
                            quotechar='@')
    writer.writeheader()
    writer.writerows(replacedDB)

In [37]:
!cat -n CELEXFull_w_CMU_transcriptions.tsv | head -20

     1	Present_Transcription	Past_Transcription	Count	Present_Orthography	Past_Orthography	Regular_or_Irregular
     2	d.u	d.ɪ.d	80717	do	did	irreg
     3	s.eɪ	s.ɛ.d	76541	say	said	irreg
     4	g.oʊ	w.ɛ.n.t	51830	go	went	irreg
     5	g.ɛ.t	g.ɑ.t	42717	get	got	irreg
     6	n.oʊ	n.u	38013	know	knew	irreg
     7	s.i	s.ɔ	36958	see	saw	irreg
     8	θ.ɪ.ŋ.k	θ.ɔ.t	35874	think	thought	irreg
     9	k.ʌ.m	k.eɪ.m	35152	come	came	irreg
    10	t.eɪ.k	t.ʊ.k	34323	take	took	irreg
    11	l.ʊ.k	l.ʊ.k.t	24165	look	looked	reg
    12	g.ɪ.v	g.eɪ.v	22921	give	gave	irreg
    13	w.ɑ.n.t	w.ɔ.n.t.ɪ.d	20077	want	wanted	reg
    14	f.aɪ.n.d	f.aʊ.n.d	19525	find	found	irreg
    15	t.ɛ.l	t.oʊ.l.d	19040	tell	told	irreg
    16	s.i.m	s.i.m.d	15911	seem	seemed	reg
    17	f.i.l	f.ɛ.l.t	15489	feel	felt	irreg
    18	b.ɪ.k.ʌ.m	b.ɪ.k.eɪ.m	14957	become	became	irreg
    19	æ.s.k	æ.s.k.t	14651	ask	asked	reg
    20	j.u.s	j.u.z.d	14307	use	used	reg
cat: write error: Broken pipe


In [38]:
just_forms_fn = 'EngVrbs_present_past.tsv'

In [39]:
# Reduce each row to its two transcription columns.
just_forms = [{'Present_Transcription': entry['Present_Transcription'],
               'Past_Transcription': entry['Past_Transcription']}
              for entry in replacedDB]
just_forms[0]

{'Present_Transcription': 'd.u', 'Past_Transcription': 'd.ɪ.d'}

In [40]:
# Write the present/past transcription pairs as a two-column tab-separated file with a header line.
just_forms_columns = ['Present_Transcription', 'Past_Transcription']
with open(just_forms_fn, 'w', newline='', encoding='utf-8') as tsvfile:
    writer = csv.DictWriter(tsvfile,
                            fieldnames=just_forms_columns,
                            delimiter='\t',
                            quoting=csv.QUOTE_NONE,
                            quotechar='@')
    writer.writeheader()
    writer.writerows(just_forms)

In [41]:
!cat -n EngVrbs_present_past.tsv | head -20

     1	Present_Transcription	Past_Transcription
     2	d.u	d.ɪ.d
     3	s.eɪ	s.ɛ.d
     4	g.oʊ	w.ɛ.n.t
     5	g.ɛ.t	g.ɑ.t
     6	n.oʊ	n.u
     7	s.i	s.ɔ
     8	θ.ɪ.ŋ.k	θ.ɔ.t
     9	k.ʌ.m	k.eɪ.m
    10	t.eɪ.k	t.ʊ.k
    11	l.ʊ.k	l.ʊ.k.t
    12	g.ɪ.v	g.eɪ.v
    13	w.ɑ.n.t	w.ɔ.n.t.ɪ.d
    14	f.aɪ.n.d	f.aʊ.n.d
    15	t.ɛ.l	t.oʊ.l.d
    16	s.i.m	s.i.m.d
    17	f.i.l	f.ɛ.l.t
    18	b.ɪ.k.ʌ.m	b.ɪ.k.eɪ.m
    19	æ.s.k	æ.s.k.t
    20	j.u.s	j.u.z.d
cat: write error: Broken pipe
