In [5]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li></ul></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Loading-and-Parsing-CELEX-emw" data-toc-modified-id="Loading-and-Parsing-CELEX-emw-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Loading and Parsing CELEX <code>emw</code></a></span></li><li><span><a href="#Filtering-it-for-wordforms-with-inflections-of-interest" data-toc-modified-id="Filtering-it-for-wordforms-with-inflections-of-interest-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Filtering it for wordforms with inflections of interest</a></span></li><li><span><a href="#Filtering-emw-for-just-those-lemmas-with-all-inflections-of-interest" data-toc-modified-id="Filtering-emw-for-just-those-lemmas-with-all-inflections-of-interest-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Filtering <code>emw</code> for just those lemmas with <em>all</em> inflections of interest</a></span></li><li><span><a href="#Create-tsv-file-with-rows-of-interest" data-toc-modified-id="Create-tsv-file-with-rows-of-interest-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Create <code>tsv</code> file with rows of interest</a></span></li><li><span><a href="#Substituting-orthographic-forms-for-transcriptions-from-the-CMU-pronouncing-dictionary" data-toc-modified-id="Substituting-orthographic-forms-for-transcriptions-from-the-CMU-pronouncing-dictionary-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Substituting orthographic forms for transcriptions from the CMU pronouncing dictionary</a></span></li><li><span><a 
href="#Aligning-the-(orthographic)-morphological-database-with-the-CMU-dictionary" data-toc-modified-id="Aligning-the-(orthographic)-morphological-database-with-the-CMU-dictionary-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Aligning the (orthographic) morphological database with the CMU dictionary</a></span></li></ul></div>

# Overview

The goal of this notebook is to produce a tab-separated values (TSV) file where each row contains phonemic transcriptions of 
 - the first person present
 - the third person present
 - the first person past
 - the third person past

inflected wordforms for a single English verb.

The file is produced by reading in data from the CELEX-2 database for English morphological wordforms, and then mapping the orthographic form of each inflected wordform to its transcription in the CMU pronouncing dictionary. The CELEX data is not included in this repository.

## Requirements

 - As noted above, you must have (and further below, correctly set the filepath for) the CELEX database.
 - The notebook makes use of shell command magics for Unix-like systems (though none of these calls are essential).
 - The notebook makes use of `joblib` for parallelizing data processing. In each case, commented-out code in the same cell that does not require joblib is also present.

# Imports

In [6]:
import csv

from collections import OrderedDict

from functools import reduce

def union(Ss):
    """Return the union of all collections in ``Ss`` as a new set.

    Parameters
    ----------
    Ss : iterable of iterables
        The collections (normally sets) to merge. May be any iterable,
        including a generator or a ``map`` object, and may be empty.

    Returns
    -------
    set
        Every element that occurs in at least one member of ``Ss``;
        the empty set when ``Ss`` yields nothing.
    """
    # A single n-ary union avoids building an intermediate set per step and,
    # unlike a reduce without an initial value, is well defined for empty input.
    return set().union(*Ss)

from itertools import chain, zip_longest, compress

# from https://docs.python.org/3/library/itertools.html#itertools-recipes
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    The last tuple is padded with ``fillvalue`` if the input runs out early:
    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    shared_iterator = iter(iterable)
    # The same iterator is passed n times, so each output tuple consumes
    # the next n items of the input.
    return zip_longest(*(shared_iterator for _ in range(n)), fillvalue=fillvalue)

from os import getcwd, chdir, listdir, mkdir, makedirs, path

from copy import deepcopy

In [7]:
repo_dir = getcwd(); repo_dir

'/mnt/cube/home/AD/emeinhar/english-verbs'

In [8]:
from tqdm import tqdm

from joblib import Parallel, delayed, Memory

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def par(gen_expr, j=None, backend=None, verbose=None, prefer=None):
    """Run the ``delayed`` calls yielded by ``gen_expr`` through joblib's ``Parallel``.

    Every argument left as ``None`` falls back to the matching notebook-level
    default: ``J`` (n_jobs), ``BACKEND``, ``V`` (verbosity) and ``PREFER``.
    Returns whatever ``Parallel(...)(gen_expr)`` returns.
    """
    n_jobs = J if j is None else j
    chosen_backend = BACKEND if backend is None else backend
    verbosity = V if verbose is None else verbose
    preference = PREFER if prefer is None else prefer
    runner = Parallel(n_jobs=n_jobs,
                      backend=chosen_backend,
                      verbose=verbosity,
                      prefer=preference)
    return runner(gen_expr)

def identity(x):
    """Return the argument unchanged."""
    return x

# Loading and Parsing CELEX `emw`

In [9]:
CELEX_dir = '/mnt/cube/home/AD/emeinhar/celex-2'
chdir(CELEX_dir)

In [10]:
listdir()

['english',
 'intro_le.ps',
 'c',
 'dutch',
 'intro_a4.ps',
 'intro_let.pdf',
 'intro_a4.pdf',
 'awk',
 'german',
 'README']

In [11]:
chdir('english'); listdir()

['ect',
 'efl',
 'emw',
 'eug_let.ps',
 'eol',
 'epl',
 'epw',
 'eow',
 'esl',
 'efs',
 'efw',
 'eug_a4.ps',
 'eml',
 'readme.']

In [12]:
!cat -n readme. | head -1000

     1	This directory contains the subdirectories:
     2	
     3	         eol       English Orthography, Lemmas
     4	                       (filesize eol.cd:  2,068,262 bytes)
     5	         epl       English Phonology, Lemmas
     6	                       (filesize epl.cd:  5,480,381 bytes)
     7	         eml       English Morphology, Lemmas
     8	                       (filesize eml.cd:  4,918,610 bytes)
     9	         esl       English Syntax, Lemmas
    10	                       (filesize esl.cd:  5,560,364 bytes)
    11	         efl       English Frequency, Lemmas
    12	                       (filesize efl.cd:  2,143,469 bytes)
    13	
    14	         eow       English Orthography, Wordforms
    15	                       (filesize eow.cd:  7,351,883 bytes)
    16	         epw       English Phonology, Wordforms
    17	                       (filesize epw.cd: 15,567,897 bytes)
    18	         emw       English Morphology, Wordforms
    19	                  

In [13]:
listdir('emw'); chdir('emw')

['awk', 'emw.cd', 'readme.']

In [14]:
!cat -n readme. | head -1000

     1	ENGLISH MORPHOLOGY, WORDFORMS
     2	
     3	The emw.cd file contains the following fields:
     4	
     5	1.    IdNum
     6	2.    Word
     7	3.    Cob
     8	4.    IdNumLemma
     9	5.    FlectType
    10	6.    TransInfl
    11	
    12	The awk directory contains the script
    13	
    14	   script TypeToInflectionalFeatures(String):    type2fea.awk
    15	
    16	for reconstructing the 13 Inflectional Features fields outlined in the
    17	CELEX User Guide:
    18	
    19	   1. Sing
    20	   2. Plu
    21	   3. Pos
    22	   4. Comp
    23	   5. Sup
    24	   6. Inf
    25	   7. Part
    26	   8. Pres
    27	   9. Past
    28	  10. Sin1
    29	  11. Sin2
    30	  12. Sin3
    31	  13. Rare


In [15]:
!cat -n emw.cd | head -10

     1	1\a\413887\1\X\@
     2	2\a\422336\2\S\@
     3	3\A\422334\4\X\@
     4	4\a\8448\3\X\@
     5	5\AA\52\5\S\@
     6	6\AA\95\6\X\@
     7	7\AAs\0\5\P\@+s
     8	8\abaci\0\8\P\
     9	9\aback\59\7\b\@
    10	10\abacus\8\8\S\@
cat: write error: Broken pipe


In [16]:
# Each line of emw.cd is one record whose fields are separated by backslashes.
# Empty strings are discarded, so a record whose last field is empty
# (e.g. `8\abaci\0\8\P\`) yields one field fewer than the others.
with open("emw.cd", 'r') as emw_file:
    emw_lines = [[field for field in record.rstrip().split("\\") if field != ""]
                 for record in emw_file]
len(emw_lines)
emw_lines[:10]

160595

[['1', 'a', '413887', '1', 'X', '@'],
 ['2', 'a', '422336', '2', 'S', '@'],
 ['3', 'A', '422334', '4', 'X', '@'],
 ['4', 'a', '8448', '3', 'X', '@'],
 ['5', 'AA', '52', '5', 'S', '@'],
 ['6', 'AA', '95', '6', 'X', '@'],
 ['7', 'AAs', '0', '5', 'P', '@+s'],
 ['8', 'abaci', '0', '8', 'P'],
 ['9', 'aback', '59', '7', 'b', '@'],
 ['10', 'abacus', '8', '8', 'S', '@']]

In [17]:
emw_fields = ('IdNum','Word','Cob','IdNumLemma','FlectType','TransInfl')

In [18]:
def parseEMWline(split_line):
    """Turn one split ``emw.cd`` record into an ordered field -> value mapping.

    Field names come from ``emw_fields`` and are paired positionally with
    ``split_line``. The three numeric fields are converted to ``int``;
    all other values stay strings.
    """
    entry = OrderedDict(zip(emw_fields, split_line))
    numeric_fields = ('IdNum', 'Cob', 'IdNumLemma')
    entry.update((name, int(entry[name])) for name in numeric_fields)
    return entry

In [19]:
emw = list(map(parseEMWline,
               emw_lines))
emw[455:465]

[OrderedDict([('IdNum', 456),
              ('Word', 'accordance'),
              ('Cob', 134),
              ('IdNumLemma', 249),
              ('FlectType', 'S'),
              ('TransInfl', '@')]),
 OrderedDict([('IdNum', 457),
              ('Word', 'accorded'),
              ('Cob', 16),
              ('IdNumLemma', 248),
              ('FlectType', 'a1S'),
              ('TransInfl', '@+ed')]),
 OrderedDict([('IdNum', 458),
              ('Word', 'according'),
              ('Cob', 1884),
              ('IdNumLemma', 248),
              ('FlectType', 'pe'),
              ('TransInfl', '@+ing')]),
 OrderedDict([('IdNum', 459),
              ('Word', 'according as'),
              ('Cob', 0),
              ('IdNumLemma', 250),
              ('FlectType', 'X'),
              ('TransInfl', '@ @')]),
 OrderedDict([('IdNum', 460),
              ('Word', 'accordingly'),
              ('Cob', 249),
              ('IdNumLemma', 251),
              ('FlectType', 'b'),
              ('Trans

# Filtering it for wordforms with inflections of interest

We want wordforms that are 
 - first or third person verbs
 - and also either present or past
 - and not phrasal (i.e. the wordform itself contains no space)

In [20]:
# Each predicate takes one parsed emw entry and tests its 'FlectType' code
# (or, for the phrasal check, its 'Word') for a single marker character.

def isPresent(e):
    """True when the entry's FlectType contains 'e'."""
    return 'e' in e['FlectType']


def isPast(e):
    """True when the entry's FlectType contains 'a'."""
    return 'a' in e['FlectType']


def isFirstPerson(e):
    """True when the entry's FlectType contains '1'."""
    return '1' in e['FlectType']


def isSecondPerson(e):
    """True when the entry's FlectType contains '2'."""
    return '2' in e['FlectType']


def isThirdPerson(e):
    """True when the entry's FlectType contains '3'."""
    return '3' in e['FlectType']


def isParticiple(e):
    """True when the entry's FlectType contains 'p'."""
    return 'p' in e['FlectType']


def isNotParticiple(e):
    """Negation of ``isParticiple``."""
    return not isParticiple(e)


def isNotPhrasal(e):
    """True when the entry's Word contains no space."""
    return ' ' not in e['Word']


def isOfInterest(e):
    """True for non-phrasal first/third person entries that are present or past."""
    if not (isFirstPerson(e) or isThirdPerson(e)):
        return False
    if not (isPresent(e) or isPast(e)):
        return False
    return isNotPhrasal(e)

In [21]:
emw_of_interest = list(filter(isOfInterest,
                              emw))
len(emw_of_interest)

24065

In [22]:
IdNums_of_interest = set(map(lambda e: e['IdNum'],
                              emw_of_interest))
len(IdNums_of_interest)
list(IdNums_of_interest)[:10]

24065

[131073, 131075, 131081, 131083, 131087, 17, 131089, 131091, 20, 22]

In [23]:
IdNumLemmas_of_interest = set(map(lambda e: e['IdNumLemma'],
                                  emw_of_interest))
len(IdNumLemmas_of_interest)
list(IdNumLemmas_of_interest)[:10]

5985

[32768, 32770, 32774, 12, 15, 32783, 17, 18, 32789, 25]

In [24]:
emw_of_interest[3255:3265]

[OrderedDict([('IdNum', 25373),
              ('Word', 'dissociated'),
              ('Cob', 2),
              ('IdNumLemma', 12956),
              ('FlectType', 'a1S'),
              ('TransInfl', '@+d')]),
 OrderedDict([('IdNum', 25374),
              ('Word', 'dissociates'),
              ('Cob', 0),
              ('IdNumLemma', 12956),
              ('FlectType', 'e3S'),
              ('TransInfl', '@+s')]),
 OrderedDict([('IdNum', 25385),
              ('Word', 'dissolved'),
              ('Cob', 28),
              ('IdNumLemma', 12964),
              ('FlectType', 'a1S'),
              ('TransInfl', '@+d')]),
 OrderedDict([('IdNum', 25386),
              ('Word', 'dissolves'),
              ('Cob', 39),
              ('IdNumLemma', 12964),
              ('FlectType', 'e3S'),
              ('TransInfl', '@+s')]),
 OrderedDict([('IdNum', 25393),
              ('Word', 'dissuaded'),
              ('Cob', 3),
              ('IdNumLemma', 12968),
              ('FlectType', 'a1S'),
  

In [25]:
def allWordforms_with_lemma(IdNumLemma, emw_entries, present=None, firstPerson=None):
    """Return the entries of ``emw_entries`` that belong to lemma ``IdNumLemma``.

    Optional filters (each is skipped when left as ``None``):

    present     -- truthy keeps entries passing ``isPresent``,
                   falsy keeps entries passing ``isPast``.
    firstPerson -- truthy keeps entries passing ``isFirstPerson``,
                   falsy keeps entries passing ``isThirdPerson``.

    The result is a new list in the original order of ``emw_entries``.
    """
    matches = [e for e in emw_entries if e['IdNumLemma'] == IdNumLemma]

    if present is not None:
        tense_test = isPresent if present else isPast
        matches = [e for e in matches if tense_test(e)]

    if firstPerson is not None:
        person_test = isFirstPerson if firstPerson else isThirdPerson
        matches = [e for e in matches if person_test(e)]

    return matches

# Filtering `emw` for just those lemmas with *all* inflections of interest

With respect to the two morphosyntactic properties previously mentioned and the four possible combinations of values for those properties, we only want wordforms for lemmas where *all four* possible inflections of interest are documented.

In [26]:
def lemma_has_inflections_of_interest(IdNumLemma, emw_entries):
    """Check that a lemma has a wordform for every tense/person cell.

    The four cells are first/third person crossed with present/past.
    A message is printed for each cell that has no wordform.

    Parameters
    ----------
    IdNumLemma : the lemma id to look up.
    emw_entries : iterable of parsed emw entries to search.

    Returns
    -------
    bool
        True when all four cells have at least one wordform, else False.
    """
    # Scan the full entry list once; the per-cell filters below then only
    # touch this lemma's few entries instead of re-scanning everything.
    lemma_entries = allWordforms_with_lemma(IdNumLemma, emw_entries)

    cells = (
        ("1p.pres", True, True),
        ("1p.past", False, True),
        ("3p.pres", True, False),
        ("3p.past", False, False),
    )

    complete = True
    for label, present, firstPerson in cells:
        matches = allWordforms_with_lemma(IdNumLemma, lemma_entries,
                                          present=present, firstPerson=firstPerson)
        if len(matches) == 0:
            print("IdNumLemma {0} has no entries for inflection {1}".format(IdNumLemma, label))
            complete = False
    return complete

In [29]:
len(IdNumLemmas_of_interest)

# Serial alternative that does not need joblib:
#takes ~3.5m on Wittgenstein
# IdNumLemmas_of_interest_with_all_inflections = list(filter(lambda IdNumLemma: lemma_has_inflections_of_interest(IdNumLemma, emw_of_interest),
#                                                            IdNumLemmas_of_interest))


# Parallel version (via `par`, i.e. joblib):
# takes 11s
def include_IdNumLemma(IdNumLemma):
    """Return whether ``IdNumLemma`` passes ``lemma_has_inflections_of_interest`` against the global ``emw_of_interest``."""
    return lemma_has_inflections_of_interest(IdNumLemma, emw_of_interest)
# One boolean per lemma id, produced by iterating over the set IdNumLemmas_of_interest.
IdNumLemma_mask = par(delayed(include_IdNumLemma)(IdNumLemma) for IdNumLemma in IdNumLemmas_of_interest)

# compress() iterates over the same set a second time and keeps the ids whose
# mask value is truthy. The pairing relies on both passes visiting the ids in
# the same order and on `par` returning results in submission order.
# NOTE(review): confirm the latter against joblib's documentation.
IdNumLemmas_of_interest_with_all_inflections = list(compress(IdNumLemmas_of_interest, IdNumLemma_mask))

len(IdNumLemmas_of_interest_with_all_inflections)

5985

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1406s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1564s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 308 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:    1.1s
[P

IdNumLemma 3831 has no entries for inflection 1p.past
IdNumLemma 3831 has no entries for inflection 3p.pres
IdNumLemma 3831 has no entries for inflection 3p.past


[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 884 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 992 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 1108 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1224 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 1348 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 1472 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1604 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 1876 tasks      | elapsed:    3.5s


IdNumLemma 41818 has no entries for inflection 1p.past
IdNumLemma 41818 has no entries for inflection 3p.past


[Parallel(n_jobs=-1)]: Done 2016 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 2164 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 2312 tasks      | elapsed:    4.2s


IdNumLemma 11151 has no entries for inflection 1p.past
IdNumLemma 11151 has no entries for inflection 3p.pres


[Parallel(n_jobs=-1)]: Done 2468 tasks      | elapsed:    4.5s


IdNumLemma 11151 has no entries for inflection 3p.past
IdNumLemma 44435 has no entries for inflection 1p.pres


[Parallel(n_jobs=-1)]: Done 2624 tasks      | elapsed:    4.8s


IdNumLemma 44435 has no entries for inflection 3p.pres


[Parallel(n_jobs=-1)]: Done 2788 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 2952 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 3124 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 3296 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 3476 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 3656 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 3844 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 4032 tasks      | elapsed:    7.2s


IdNumLemma 49753 has no entries for inflection 3p.pres


[Parallel(n_jobs=-1)]: Done 4228 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 4424 tasks      | elapsed:    7.9s


IdNumLemma 52098 has no entries for inflection 1p.past
IdNumLemma 52098 has no entries for inflection 3p.past


[Parallel(n_jobs=-1)]: Done 4628 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 4832 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 5044 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 5256 tasks      | elapsed:    9.4s


IdNumLemma 28326 has no entries for inflection 1p.past
IdNumLemma 28326 has no entries for inflection 3p.past


[Parallel(n_jobs=-1)]: Done 5476 tasks      | elapsed:    9.7s


IdNumLemma 29546 has no entries for inflection 1p.past
IdNumLemma 29546 has no entries for inflection 3p.past


[Parallel(n_jobs=-1)]: Done 5696 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 5985 out of 5985 | elapsed:   10.6s finished


5977

In [30]:
allWordforms_with_lemma(3831, emw_of_interest)
allWordforms_with_lemma(41818, emw_of_interest)
allWordforms_with_lemma(11151, emw_of_interest)
allWordforms_with_lemma(44435, emw_of_interest)
allWordforms_with_lemma(49753, emw_of_interest)
allWordforms_with_lemma(52098, emw_of_interest)
allWordforms_with_lemma(28326, emw_of_interest)
allWordforms_with_lemma(29546, emw_of_interest)

[OrderedDict([('IdNum', 101737),
              ('Word', 'beware'),
              ('Cob', 23),
              ('IdNumLemma', 3831),
              ('FlectType', 'e1S'),
              ('TransInfl', '@')])]

[OrderedDict([('IdNum', 114492),
              ('Word', 'should'),
              ('Cob', 3552),
              ('IdNumLemma', 41818),
              ('FlectType', 'e1S'),
              ('TransInfl', '@')]),
 OrderedDict([('IdNum', 148394),
              ('Word', 'should'),
              ('Cob', 3552),
              ('IdNumLemma', 41818),
              ('FlectType', 'e3S'),
              ('TransInfl', 'IRR')])]

[OrderedDict([('IdNum', 104314),
              ('Word', 'daresay'),
              ('Cob', 21),
              ('IdNumLemma', 11151),
              ('FlectType', 'e1S'),
              ('TransInfl', '@')])]

[OrderedDict([('IdNum', 86310),
              ('Word', 'stove'),
              ('Cob', 1),
              ('IdNumLemma', 44435),
              ('FlectType', 'a1S')]),
 OrderedDict([('IdNum', 132579),
              ('Word', 'stove'),
              ('Cob', 1),
              ('IdNumLemma', 44435),
              ('FlectType', 'a3S')])]

[OrderedDict([('IdNum', 95574),
              ('Word', 'used'),
              ('Cob', 554),
              ('IdNumLemma', 49753),
              ('FlectType', 'a1S'),
              ('TransInfl', '@+d')]),
 OrderedDict([('IdNum', 117159),
              ('Word', 'use'),
              ('Cob', 0),
              ('IdNumLemma', 49753),
              ('FlectType', 'e1S'),
              ('TransInfl', '@')]),
 OrderedDict([('IdNum', 134093),
              ('Word', 'used'),
              ('Cob', 554),
              ('IdNumLemma', 49753),
              ('FlectType', 'a3S'),
              ('TransInfl', '@+d')])]

[OrderedDict([('IdNum', 99942),
              ('Word', 'woulds'),
              ('Cob', 0),
              ('IdNumLemma', 52098),
              ('FlectType', 'e3S'),
              ('TransInfl', '@+s')]),
 OrderedDict([('IdNum', 117842),
              ('Word', 'would'),
              ('Cob', 4612),
              ('IdNumLemma', 52098),
              ('FlectType', 'e1S'),
              ('TransInfl', '@')])]

[OrderedDict([('IdNum', 110064),
              ('Word', 'might'),
              ('Cob', 1116),
              ('IdNumLemma', 28326),
              ('FlectType', 'e1S'),
              ('TransInfl', '@')]),
 OrderedDict([('IdNum', 144068),
              ('Word', 'might'),
              ('Cob', 1116),
              ('IdNumLemma', 28326),
              ('FlectType', 'e3S'),
              ('TransInfl', 'IRR')])]

[OrderedDict([('IdNum', 110461),
              ('Word', 'must'),
              ('Cob', 3304),
              ('IdNumLemma', 29546),
              ('FlectType', 'e1S'),
              ('TransInfl', '@')]),
 OrderedDict([('IdNum', 144452),
              ('Word', 'must'),
              ('Cob', 3304),
              ('IdNumLemma', 29546),
              ('FlectType', 'e3S'),
              ('TransInfl', 'IRR')])]

In [35]:
len(emw_of_interest)

# Gather, lemma by lemma, every entry of interest belonging to a lemma that
# has all four inflections.
# takes ~50s on wittgenstein
emw_of_interest_with_all_inflections = [
    entry
    for lemma_id in IdNumLemmas_of_interest_with_all_inflections
    for entry in allWordforms_with_lemma(lemma_id, emw_of_interest)
]

# takes >>50s
# emw_of_interest_with_all_inflections = par(delayed(allWordforms_with_lemma)(IdNumLemma, emw_of_interest)
#                                            for IdNumLemma in IdNumLemmas_of_interest_with_all_inflections)
# emw_of_interest_with_all_inflections = list(chain.from_iterable(emw_of_interest_with_all_inflections))

len(emw_of_interest_with_all_inflections)

24065

24050

# Create `tsv` file with rows of interest

For any lemma with multiple wordforms for a given inflection, we will take the wordform for that inflection with the highest frequency.

In [36]:
def createRows(IdNumLemma, emw_entries):
    """Build one output row (four orthographic wordforms) for a lemma.

    For each tense/person cell, the wordform with the highest ``Cob`` value
    is chosen; ties go to the entry that comes first in ``emw_entries``.
    A message is printed when the chosen first and third person past forms
    differ.

    Parameters
    ----------
    IdNumLemma : the lemma id to build the row for.
    emw_entries : iterable of parsed emw entries to search.

    Returns
    -------
    OrderedDict
        Keys '1p.prs', '1p.pst', '3p.prs', '3p.pst' (in that order), each
        mapped to the chosen wordform string.

    Raises
    ------
    AssertionError
        If the lemma has no entry for one of the four cells.
    """
    # Scan the full entry list once; the four per-cell filters below then
    # only touch this lemma's handful of entries.
    lemma_entries = allWordforms_with_lemma(IdNumLemma, emw_entries)

    # (output column, label used in the assertion message, present?, firstPerson?)
    cells = (
        ('1p.prs', '1p.pres', True, True),
        ('1p.pst', '1p.past', False, True),
        ('3p.prs', '3p.pres', True, False),
        ('3p.pst', '3p.past', False, False),
    )

    row = OrderedDict()
    for column, label, present, firstPerson in cells:
        candidates = allWordforms_with_lemma(IdNumLemma, lemma_entries,
                                             present=present, firstPerson=firstPerson)
        assert len(candidates) > 0, "IdNumLemma {0} has no entries for {1}".format(IdNumLemma, label)
        # max() returns the first maximal item, which is the same entry a
        # stable descending sort would put first, without sorting the list.
        row[column] = max(candidates, key=lambda d: d['Cob'])['Word']

    if row['1p.pst'] != row['3p.pst']:
        print("1p.pst vs. 3p.pst = {0} vs. {1}".format(row['1p.pst'], row['3p.pst']))
    return row

In [37]:
# Serial alternative that does not need joblib:
# takes ~4m on wittgenstein
# rows_of_interest = list(map(lambda IdNumLemma: createRows(IdNumLemma, emw_of_interest_with_all_inflections),
#                             IdNumLemmas_of_interest_with_all_inflections))

# Parallel version (via `par`, i.e. joblib):
# takes 11s on wittgenstein
def toExportableRow(IdNumLemma):
    """Return the ``createRows`` result for ``IdNumLemma`` against the global ``emw_of_interest_with_all_inflections``."""
    return createRows(IdNumLemma, emw_of_interest_with_all_inflections)
# One row per lemma id in IdNumLemmas_of_interest_with_all_inflections.
rows_of_interest = par(delayed(toExportableRow)(IdNumLemma) 
                       for IdNumLemma in IdNumLemmas_of_interest_with_all_inflections)

len(rows_of_interest)
rows_of_interest[0]

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1358s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1767s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 308 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:    1.1s
[P

5977

OrderedDict([('1p.prs', 'pauperize'),
             ('1p.pst', 'pauperized'),
             ('3p.prs', 'pauperizes'),
             ('3p.pst', 'pauperized')])

In [38]:
chdir(repo_dir)

In [39]:
morph_db = rows_of_interest

In [40]:
# Write the orthographic table: a header line, then one tab-separated row
# per entry of morph_db, with quoting disabled.
with open("english-verbs-orth.tsv", 'w', newline='', encoding='utf-8') as orth_file:
    orth_writer = csv.DictWriter(orth_file,
                                 fieldnames=['1p.prs','1p.pst','3p.prs','3p.pst'],
                                 delimiter='\t',
                                 quoting=csv.QUOTE_NONE,
                                 quotechar='@')
    orth_writer.writeheader()
    for morph_row in morph_db:
        orth_writer.writerow(morph_row)

In [41]:
def get_orth_wordforms(row):
    """Return the set of distinct wordforms stored in one row."""
    return set(row.values())

# All distinct wordforms that occur anywhere in the exported rows.
orthographic_wordforms = union(get_orth_wordforms(row) for row in rows_of_interest)

# Substituting orthographic forms for transcriptions from the CMU pronouncing dictionary

The copy of the CMU dictionary used here was generated as described at https://github.com/emeinhardt/cmu-ipa.

In [42]:
transcription_lexicon_fn = 'cmudict-0.7b_IPA_destressed.tsv'

In [43]:
# Read the whole transcription lexicon into memory, one mapping per TSV row.
with open(transcription_lexicon_fn, 'r', newline='', encoding='utf-8') as lexicon_file:
    lexicon_reader = csv.DictReader(lexicon_file, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    lexicon_in = list(lexicon_reader)

len(lexicon_in)
lexicon_in[0].keys()
lexicon_in[0]

133854

odict_keys(['Orthography', 'Transcription'])

OrderedDict([('Orthography', '!EXCLAMATION-POINT'),
             ('Transcription', 'ɛ.k.s.k.l.ʌ.m.eɪ.ʃ.ʌ.n.p.ɔɪ.n.t')])

In [44]:
# Lower-cased orthographic forms of every lexicon entry, for membership tests.
transcription_orthographic_wordforms_lc = {row['Orthography'].lower() for row in lexicon_in}
len(transcription_orthographic_wordforms_lc)

133854

In [45]:
orthographic_wordforms_in_morph_db_not_in_transcription_dict = orthographic_wordforms - transcription_orthographic_wordforms_lc
len(orthographic_wordforms_in_morph_db_not_in_transcription_dict)
orthographic_wordforms_in_morph_db_not_in_transcription_dict

5139

{'attorn',
 'sensitizes',
 'moil',
 'poniard',
 'touch-type',
 'confab',
 'eviscerates',
 'conciliate',
 'elided',
 'libelled',
 'unfurls',
 'sulks',
 'disinters',
 'over-exerts',
 'splodge',
 'crenellate',
 'velarize',
 'undersells',
 'whops',
 'de-escalate',
 'waddles',
 'swinge',
 'miscalled',
 'redacts',
 'macadamize',
 'outstayed',
 'placates',
 'stupefies',
 'desalinize',
 'machinate',
 'flavoured',
 'disrobed',
 'rehabilitates',
 'metabolized',
 'tousled',
 'farrowed',
 'levitates',
 'sermonizes',
 'misreads',
 'defames',
 'tousle',
 'sculled',
 'composts',
 'shinned',
 'dappled',
 'embitters',
 'slenderized',
 'mistranslate',
 'adduces',
 'debark',
 'demonetizes',
 'oxygenized',
 'barbarized',
 'orated',
 'reconstructs',
 'diffracted',
 'gormandizes',
 'hie',
 'droops',
 'cerebrates',
 'congratulates',
 'prinks',
 'berthed',
 'disarranged',
 'soldered',
 'cohabited',
 'interpellated',
 'bumbled',
 'dry-cleaned',
 'decoke',
 'pottered',
 'squealed',
 'ad-libbed',
 'vilifies',
 '

In [46]:
# Copy of the lexicon whose orthographic forms are lower-cased;
# transcriptions are carried over unchanged.
lexicon_lc = [
    {'Orthography': row['Orthography'].lower(),
     'Transcription': row['Transcription']}
    for row in lexicon_in
]

In [47]:
def findMatchingTranscriptions(orth_lc):
    """Return every ``lexicon_lc`` row whose 'Orthography' equals ``orth_lc``.

    The result is a (possibly empty) list in lexicon order; each call scans
    the whole lexicon.
    """
    hits = []
    for record in lexicon_lc:
        if record['Orthography'] == orth_lc:
            hits.append(record)
    return hits

# Aligning the (orthographic) morphological database with the CMU dictionary

In [48]:
# Wordforms from the morphological database that also have a lexicon entry.
alignable_wordforms = orthographic_wordforms & transcription_orthographic_wordforms_lc
len(alignable_wordforms)

12588

In [49]:
alignable_wordforms_w_unique_transcription = {w for w in alignable_wordforms if len(findMatchingTranscriptions(w)) == 1}
len(alignable_wordforms_w_unique_transcription)

12588

In [54]:
def findMatchingTranscription(orth_lc):
    """Return the transcription of the first lexicon row matching ``orth_lc``.

    Raises ``IndexError`` when the lexicon has no row for ``orth_lc``.
    """
    matches = findMatchingTranscriptions(orth_lc)
    first_match = matches[0]
    return first_match['Transcription']

In [55]:
findMatchingTranscription('did')

'd.ɪ.d'

In [52]:
def isAlignableRow(morph_db_row):
    """Return True when every wordform in the row is in ``alignable_wordforms``."""
    for wordform in set(morph_db_row.values()):
        if wordform not in alignable_wordforms:
            return False
    return True

In [53]:
len(morph_db)
# Keep only rows in which all four wordforms can be transcribed.
alignable_rows = [row for row in morph_db if isAlignableRow(row)]
len(alignable_rows)

5977

3147

In [56]:
def alignRow(morph_db_row):
    """Return a copy of the row with each wordform replaced by its transcription.

    The input row is not modified; keys and their order are kept.
    """
    new_row = deepcopy(morph_db_row)
    # Snapshot the items so values can be reassigned while looping.
    for column, wordform in list(new_row.items()):
        new_row[column] = findMatchingTranscription(wordform)
    return new_row

In [59]:
len(alignable_rows)

# Serial alternative that does not need joblib:
# takes ~1m on wittgenstein
# aligned_db = list(map(alignRow,
#                       alignable_rows))

# Parallel version (via `par`, i.e. joblib): one transcribed row per alignable row.
# takes ~11s on wittgenstein
aligned_db = par(delayed(alignRow)(row) for row in alignable_rows)

len(aligned_db)

3147

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1253s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: 

3147

In [60]:
aligned_db[:10]

[OrderedDict([('1p.prs', 'p.ɔ.z'),
              ('1p.pst', 'p.ɔ.z.d'),
              ('3p.prs', 'p.ɔ.z.ʌ.z'),
              ('3p.pst', 'p.ɔ.z.d')]),
 OrderedDict([('1p.prs', 'p.eɪ.v'),
              ('1p.pst', 'p.eɪ.v.d'),
              ('3p.prs', 'p.eɪ.v.z'),
              ('3p.pst', 'p.eɪ.v.d')]),
 OrderedDict([('1p.prs', 'ʌ.b.æ.n.d.ʌ.n'),
              ('1p.pst', 'ʌ.b.æ.n.d.ʌ.n.d'),
              ('3p.prs', 'ʌ.b.æ.n.d.ʌ.n.z'),
              ('3p.pst', 'ʌ.b.æ.n.d.ʌ.n.d')]),
 OrderedDict([('1p.prs', 'p.ɔ'),
              ('1p.pst', 'p.ɔ.d'),
              ('3p.prs', 'p.ɔ.z'),
              ('3p.pst', 'p.ɔ.d')]),
 OrderedDict([('1p.prs', 'ʌ.b.eɪ.t'),
              ('1p.pst', 'ʌ.b.eɪ.t.ɪ.d'),
              ('3p.prs', 'ʌ.b.eɪ.t.s'),
              ('3p.pst', 'ʌ.b.eɪ.t.ɪ.d')]),
 OrderedDict([('1p.prs', 'p.ɔ.n'),
              ('1p.pst', 'p.ɔ.n.d'),
              ('3p.prs', 'p.ɔ.n.z'),
              ('3p.pst', 'p.ɔ.n.d')]),
 OrderedDict([('1p.prs', 'ʌ.b.ɹ.i.v.i.eɪ.t'),
              ('1p.p

In [61]:
# Write the phonemic table: a header line, then one tab-separated row
# per entry of aligned_db, with quoting disabled.
with open("english-verbs-phon.tsv", 'w', newline='', encoding='utf-8') as phon_file:
    phon_writer = csv.DictWriter(phon_file,
                                 fieldnames=['1p.prs','1p.pst','3p.prs','3p.pst'],
                                 delimiter='\t',
                                 quoting=csv.QUOTE_NONE,
                                 quotechar='@')
    phon_writer.writeheader()
    for aligned_row in aligned_db:
        phon_writer.writerow(aligned_row)