In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview-and-requirements" data-toc-modified-id="Overview-and-requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview and requirements</a></span></li><li><span><a href="#Import-data" data-toc-modified-id="Import-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import data</a></span><ul class="toc-item"><li><span><a href="#Hammond's-newdic" data-toc-modified-id="Hammond's-newdic-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Hammond's newdic</a></span></li><li><span><a href="#IPhOD" data-toc-modified-id="IPhOD-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>IPhOD</a></span></li></ul></li><li><span><a href="#Convert-phonological-representations-to-IPA---Hammond" data-toc-modified-id="Convert-phonological-representations-to-IPA---Hammond-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Convert phonological representations to IPA - Hammond</a></span><ul class="toc-item"><li><span><a href="#Define-Hammond-inventory--&gt;-IPA-mapping" data-toc-modified-id="Define-Hammond-inventory-->-IPA-mapping-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Define Hammond inventory -&gt; IPA mapping</a></span></li><li><span><a href="#Transform-transcriptions-to-IPA..." data-toc-modified-id="Transform-transcriptions-to-IPA...-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Transform transcriptions to IPA...</a></span></li><li><span><a href="#Transform-entries-to-have-IPA-transcriptions..." 
data-toc-modified-id="Transform-entries-to-have-IPA-transcriptions...-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Transform entries to have IPA transcriptions...</a></span></li><li><span><a href="#Write-to-file-/-read-back-in" data-toc-modified-id="Write-to-file-/-read-back-in-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Write to file / read back in</a></span></li></ul></li><li><span><a href="#Convert-phonological-representations-to-IPA---IPhOD" data-toc-modified-id="Convert-phonological-representations-to-IPA---IPhOD-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Convert phonological representations to IPA - IPhOD</a></span><ul class="toc-item"><li><span><a href="#Identify-fields-to-modify" data-toc-modified-id="Identify-fields-to-modify-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Identify fields to modify</a></span></li><li><span><a href="#Define-arpabet-⟶-IPA-mapping" data-toc-modified-id="Define-arpabet-⟶-IPA-mapping-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Define arpabet ⟶ IPA mapping</a></span></li><li><span><a href="#IPA-ify-each-field-in-the-full-dataset" data-toc-modified-id="IPA-ify-each-field-in-the-full-dataset-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>IPA-ify each field in the full dataset</a></span></li><li><span><a href="#Write-to-file-/-import-and-check-contents" data-toc-modified-id="Write-to-file-/-import-and-check-contents-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Write to file / import and check contents</a></span><ul class="toc-item"><li><span><a href="#...read-back-in" data-toc-modified-id="...read-back-in-4.4.1"><span class="toc-item-num">4.4.1&nbsp;&nbsp;</span>...read back in</a></span></li></ul></li></ul></li><li><span><a href="#Add-probability-annotations-to-IPhOD" data-toc-modified-id="Add-probability-annotations-to-IPhOD-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Add probability annotations to IPhOD</a></span><ul class="toc-item"><li><span><a 
href="#Export/import" data-toc-modified-id="Export/import-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Export/import</a></span><ul class="toc-item"><li><span><a href="#Import" data-toc-modified-id="Import-5.1.1"><span class="toc-item-num">5.1.1&nbsp;&nbsp;</span>Import</a></span></li></ul></li></ul></li></ul></div>

# Overview and requirements

**Notebook author:** emeinhardt@ucsd.edu

This is a notebook documenting the conversion of transcriptions of the English lexicon in 
 - Hammond's newdic 
 - IPhOD

to IPA.

This notebook was developed with Python 3.6.5, Jupyter Notebook 5.5.0, and otherwise the Anaconda 5.2 distribution. It assumes the current working directory contains
 - a copy of Hammond's mysterious 'newdic' transcribed lexicon of English http://dingo.sbs.arizona.edu/~hammond/lsasummer11/newdic
 - a copy of the data associated with IPhOD (available from http://www.iphod.com/)

# Import data

In [2]:
%pwd

'/home/AD/emeinhar/c2-jn'

In [3]:
import csv

## Hammond's newdic

In [4]:
%ls Hammond*

"Hammond's mysterious newdic.txt"


In [5]:
hammond_fn = "Hammond's mysterious newdic.txt"

In [6]:
# Hammond's newdic is read here as a header-less, tab-separated file, so the
# column names are supplied explicitly.
fieldnames = ['Transcription', 'stressInfoA', 'stressInfoB', 'Orthography', 'Frequency', 'PoSs']
# newline='' lets the csv module do its own line-ending handling, as its
# documentation asks for any file object handed to a reader.
with open(hammond_fn, newline='') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=fieldnames)
    # One dict per line of the file, in file order.
    newdic_raw = list(my_reader)

newdic_raw[0]
len(newdic_raw[0].keys())

OrderedDict([('Transcription', 'x'),
             ('stressInfoA', '_'),
             ('stressInfoB', 'S1'),
             ('Orthography', 'a'),
             ('Frequency', '23178'),
             ('PoSs', '(N IA VB PP)')])

6

## IPhOD

In [7]:
%ls IPhOD*

IPhOD2_Words.txt  IPhODv2.0_REALS.zip


In [8]:
#%cd IPhODv2.0_REALS

In [9]:
IPhOD_raw_filename = 'IPhOD2_Words.txt'

# The IPhOD word list is tab-separated; its first line supplies the column names.
# newline='' lets the csv module do its own line-ending handling, as its
# documentation asks for any file object handed to a reader.
with open(IPhOD_raw_filename, newline='') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t')
    # One dict per data line, in file order.
    lexicon_raw = list(my_reader)

lexicon_raw[0]
len(lexicon_raw[0].keys())

OrderedDict([('Indx', '1'),
             ('Word', 'a'),
             ('UnTrn', 'AH'),
             ('StTrn', 'AH0'),
             ('NSyll', '1'),
             ('NPhon', '1'),
             ('unsDENS', '26'),
             ('unsFDEN', '150377.45'),
             ('unsLDEN', '91.03'),
             ('unsCDEN', '136251'),
             ('strDENS', '21'),
             ('strFDEN', '145035.32'),
             ('strLDEN', '77.56'),
             ('strCDEN', '114167'),
             ('unsBPAV', '0'),
             ('unsFBPAV', '0'),
             ('unsLBPAV', '0'),
             ('unsCBPAV', '0'),
             ('strBPAV', '0'),
             ('strFBPAV', '0'),
             ('strLBPAV', '0'),
             ('strCBPAV', '0'),
             ('unsTPAV', '0'),
             ('unsFTPAV', '0'),
             ('unsLTPAV', '0'),
             ('unsCTPAV', '0'),
             ('strTPAV', '0'),
             ('strFTPAV', '0'),
             ('strLTPAV', '0'),
             ('strCTPAV', '0'),
             ('unsPOSPAV', '0.044

49

Let's trim the extra empty field:

In [10]:
def trimEmptyField(a_dict):
    '''
    Drop the entry whose key is the empty string from a row, in place.

    The same dict object is returned so the function can be used with map().
    A row that has no '' key is returned unchanged; this keeps the function
    safe to apply more than once (e.g. when the cell is re-run on rows that
    were already trimmed).
    '''
    # pop() with a default never raises, unlike `del a_dict['']`.
    a_dict.pop('', None)
    return a_dict

lexicon_raw = list(map(trimEmptyField, lexicon_raw))
lexicon_raw[0]

OrderedDict([('Indx', '1'),
             ('Word', 'a'),
             ('UnTrn', 'AH'),
             ('StTrn', 'AH0'),
             ('NSyll', '1'),
             ('NPhon', '1'),
             ('unsDENS', '26'),
             ('unsFDEN', '150377.45'),
             ('unsLDEN', '91.03'),
             ('unsCDEN', '136251'),
             ('strDENS', '21'),
             ('strFDEN', '145035.32'),
             ('strLDEN', '77.56'),
             ('strCDEN', '114167'),
             ('unsBPAV', '0'),
             ('unsFBPAV', '0'),
             ('unsLBPAV', '0'),
             ('unsCBPAV', '0'),
             ('strBPAV', '0'),
             ('strFBPAV', '0'),
             ('strLBPAV', '0'),
             ('strCBPAV', '0'),
             ('unsTPAV', '0'),
             ('unsFTPAV', '0'),
             ('unsLTPAV', '0'),
             ('unsCTPAV', '0'),
             ('strTPAV', '0'),
             ('strFTPAV', '0'),
             ('strLTPAV', '0'),
             ('strCTPAV', '0'),
             ('unsPOSPAV', '0.044

In [11]:
#%cd ..

# Convert phonological representations to IPA - Hammond

In [12]:
newdic_raw[2]

OrderedDict([('Transcription', 'xb@k'),
             ('stressInfoA', "_'"),
             ('stressInfoB', 'S2'),
             ('Orthography', 'aback'),
             ('Frequency', '2'),
             ('PoSs', '(AV)')])

We want to convert the transcription field into a string of IPA symbols, with each segment separated by a '.'

## Define Hammond inventory -> IPA mapping

In [13]:
# Relation between Hammond newdic segment symbols (first element of each pair)
# and the IPA strings used in this notebook (second element).
# Each Hammond symbol is a single character and appears exactly once, so
# dict(hammond_IPA_relation) is a well-defined Hammond -> IPA lookup.
# The relation is NOT one-to-one: 'I' and '|' both map to 'ɪ', and 'X' and 'R'
# both map to 'ɚ', so any IPA -> Hammond inversion keeps only one of each pair.
# NOTE(review): the IPA strings for 'N' and 'M' look like they use a different
# syllabicity diacritic than the one used for 'L' -- confirm this is intended.
hammond_IPA_relation = [
 ('h', 'h'),
 ('S', 'ʃ'),
 ('p', 'p'),
 ('x', 'ə'),
 ('m', 'm'),
 ('y', 'j'),
 ('v', 'v'),
 ('^', 'ʌ'),
 ('o', 'oʊ'),
 ('u', 'u'),
 ('I', 'ɪ'),
 ('G', 'ŋ'),
 ('N', 'ṇ'), # syllabic n is NOT in the diphone gating inventory
 ('|', 'ɪ'), # as near as I can tell, this is unstressed/reduced 'ɪ'
 ('Z', 'ʒ'),
 ('L', 'l̩'),
 ('M', 'ṃ'), # syllabic m is NOT in the diphone gating inventory
 ('i', 'i'),
 ('r', 'r'),
 ('g', 'g'),
 ('O', 'ɔɪ'),
 ('T', 'θ'),
 ('n', 'n'),
 ('J', 'dʒ'),
 ('d', 'd'),
 ('k', 'k'),
 ('W', 'aʊ'),
 ('f', 'f'),
 ('D', 'ð'),
 ('U', 'ʊ'),
 ('z', 'z'),
 ('Y', 'aɪ'),
 ('b', 'b'),
 ('X', 'ɚ'), # r-colored schwa - stressed
 ('a', 'ɑ'),
 ('s', 's'),
 ('e', 'eɪ'),
 ('C', 'tʃ'),
 ('t', 't'),
 ('R', 'ɚ'), # r-colored schwa - UNstressed
 ('E', 'ɛ'),
 ('w', 'w'),
 ('l', 'l'),
 ('@', 'æ'),
 ('c', 'ɔ')] # ɔ is NOT in the diphone gating data inventory
def hammondToUnicodeIPA(diphoneSymbol):
    '''
    Return the IPA string paired with one Hammond segment symbol.

    The lookup dict is built from hammond_IPA_relation on each call.
    Raises KeyError if the symbol has no entry in the relation.
    '''
    return dict(hammond_IPA_relation)[diphoneSymbol]
print(hammondToUnicodeIPA('T'))
def invertMapping(mydict):
    '''
    Return a new dict with the keys and values of mydict swapped.

    If several keys share a value, the key that comes last in iteration
    order is the one kept in the result.
    '''
    return {val: key for key, val in mydict.items()}
def unicodeIPAToHammond(unicodeIPAsymbol):
    '''
    Return a Hammond symbol whose IPA string is unicodeIPAsymbol.

    The forward table is built from hammond_IPA_relation and inverted on each
    call. Where two Hammond symbols share an IPA string, invertMapping keeps
    only one of them. Raises KeyError for an IPA string not in the relation.
    '''
    hammondToIPA = dict(hammond_IPA_relation)
    IPAToHammond = invertMapping(hammondToIPA)
    return IPAToHammond[unicodeIPAsymbol]
print(unicodeIPAToHammond('θ'))

θ
T


## Transform transcriptions to IPA...

In [14]:
t = newdic_raw[2]['Transcription']
t
tuple(t)
tuple(map(hammondToUnicodeIPA, tuple(t)))
'.'.join(tuple(map(hammondToUnicodeIPA, tuple(t))))

'xb@k'

('x', 'b', '@', 'k')

('ə', 'b', 'æ', 'k')

'ə.b.æ.k'

In [15]:
def dottedStringToTuple(ds):
    '''
    Split a '.'-separated string into a tuple of its pieces.

    'ə.b.æ.k' -> ('ə', 'b', 'æ', 'k'). A string without any '.' gives a
    one-element tuple (so '' gives ('',)).
    '''
    return tuple(ds.split('.'))


def tupleToDottedString(t):
    '''
    Join an iterable of strings into one string with '.' between the pieces.

    ('ə', 'b', 'æ', 'k') -> 'ə.b.æ.k'. Inverse of dottedStringToTuple as long
    as no piece itself contains a '.'.
    '''
    return '.'.join(t)

In [16]:
tupleToDottedString( tuple(map(hammondToUnicodeIPA, tuple(t))) )

'ə.b.æ.k'

In [17]:
def hammondTranscriptionToIPA(hammond_trn):
    '''
    Convert a Hammond transcription string to a dotted IPA string.

    Every character of hammond_trn is treated as one segment and translated
    with hammondToUnicodeIPA; the IPA pieces are joined with '.'.
    Example from this notebook: 'xb@k' -> 'ə.b.æ.k'.
    '''
    ipa_segments = [hammondToUnicodeIPA(segment) for segment in hammond_trn]
    return tupleToDottedString(tuple(ipa_segments))
hammondTranscriptionToIPA(t)

'ə.b.æ.k'

## Transform entries to have IPA transcriptions...

In [18]:
def edit_dict(the_dict, the_key, the_new_value):
    '''
    Set the_dict[the_key] to the_new_value IN PLACE and return the same dict.

    Returning the (mutated) argument makes the call usable inside map() or
    other expressions; the caller's dict is changed.
    '''
    the_dict[the_key] = the_new_value
    return the_dict

def modify_dict(the_dict, the_key, the_new_value):
    '''
    Return a NEW plain dict equal to the_dict except that the_key maps to
    the_new_value. The argument itself is left untouched.

    The copy is shallow, and the result is a plain dict whatever mapping
    type was passed in.
    '''
    return {**the_dict, the_key: the_new_value}

In [19]:
def IPAify_hammond_entry(entry):
    '''
    Return a copy of a newdic entry whose 'Transcription' field has been
    converted to a dotted IPA string. The original entry is not modified.
    '''
    ipa_transcription = hammondTranscriptionToIPA(entry['Transcription'])
    return modify_dict(entry, 'Transcription', ipa_transcription)

In [20]:
newdic_raw[2]

OrderedDict([('Transcription', 'xb@k'),
             ('stressInfoA', "_'"),
             ('stressInfoB', 'S2'),
             ('Orthography', 'aback'),
             ('Frequency', '2'),
             ('PoSs', '(AV)')])

In [21]:
IPAify_hammond_entry(newdic_raw[2])
newdic_raw[2]

{'Transcription': 'ə.b.æ.k',
 'stressInfoA': "_'",
 'stressInfoB': 'S2',
 'Orthography': 'aback',
 'Frequency': '2',
 'PoSs': '(AV)'}

OrderedDict([('Transcription', 'xb@k'),
             ('stressInfoA', "_'"),
             ('stressInfoB', 'S2'),
             ('Orthography', 'aback'),
             ('Frequency', '2'),
             ('PoSs', '(AV)')])

In [22]:
hammond_IPA = list(map(IPAify_hammond_entry, newdic_raw))
hammond_IPA[2]

{'Transcription': 'ə.b.æ.k',
 'stressInfoA': "_'",
 'stressInfoB': 'S2',
 'Orthography': 'aback',
 'Frequency': '2',
 'PoSs': '(AV)'}

## Write to file / read back in

In [23]:
%pwd

'/home/AD/emeinhar/c2-jn'

In [24]:
%ls *Hammond*

"Hammond's mysterious newdic.txt"


In [25]:
import csv

In [26]:
hammond_IPA[0]

{'Transcription': 'ə',
 'stressInfoA': '_',
 'stressInfoB': 'S1',
 'Orthography': 'a',
 'Frequency': '23178',
 'PoSs': '(N IA VB PP)'}

In [27]:
theFieldnames = ['Transcription', 'stressInfoA', 'stressInfoB', 'Orthography', 'Frequency', 'PoSs']
len(theFieldnames)
theFieldnames

6

['Transcription',
 'stressInfoA',
 'stressInfoB',
 'Orthography',
 'Frequency',
 'PoSs']

In [28]:
newdic_IPA_stem = 'Hammond_newdic_IPA'

In [29]:
# Write the IPA-converted newdic entries as a tab-delimited table with a header
# row (the file gets a '.csv' extension even though the delimiter is a tab).
# UTF-8 is requested explicitly because the transcriptions contain IPA characters.
with open(newdic_IPA_stem + '.csv', mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=theFieldnames, delimiter='\t')
    writer.writeheader()
    writer.writerows(hammond_IPA)

In [30]:
%pwd

'/home/AD/emeinhar/c2-jn'

In [31]:
%ls Hammond*

 Hammond_newdic_IPA.csv  "Hammond's mysterious newdic.txt"


# Convert phonological representations to IPA - IPhOD

## Identify fields to modify

In [32]:
phonological_rep_fieldnames = [
                      'UnTrn', #unstressed CMU pronouncing dictionary transcription
                      'StTrn', #stressed CMU pronouncing dictionary transcription
]

In [33]:
lexicon_raw[12345]['UnTrn']
lexicon_raw[12345]['StTrn']

'D.EH.F.AH.N.AH.T.L.IY'

'D.EH1.F.AH0.N.AH0.T.L.IY0'

In [34]:
lexicon_raw[12345]['UnTrn'].split('.')
lexicon_raw[12345]['StTrn'].split('.')

['D', 'EH', 'F', 'AH', 'N', 'AH', 'T', 'L', 'IY']

['D', 'EH1', 'F', 'AH0', 'N', 'AH0', 'T', 'L', 'IY0']

In [35]:
def dottedStringToTuple(ds):
    '''
    Split a '.'-separated string into a tuple of its pieces,
    e.g. 'D.EH.F' -> ('D', 'EH', 'F').

    (Repeats the definition given in the Hammond section of this notebook.)
    '''
    return tuple(ds.split('.'))
dottedStringToTuple( lexicon_raw[12345]['UnTrn'] )

('D', 'EH', 'F', 'AH', 'N', 'AH', 'T', 'L', 'IY')

In [36]:
'.'.join(  lexicon_raw[12345]['UnTrn'].split('.')  )

'D.EH.F.AH.N.AH.T.L.IY'

In [37]:
def tupleToDottedString(t):
    '''
    Join an iterable of strings into one string with '.' between the pieces,
    e.g. ('D', 'EH', 'F') -> 'D.EH.F'.

    (Repeats the definition given in the Hammond section of this notebook.)
    '''
    return '.'.join(t)
tupleToDottedString( dottedStringToTuple( lexicon_raw[12345]['UnTrn'] ) )

'D.EH.F.AH.N.AH.T.L.IY'

## Define arpabet ⟶ IPA mapping

In [38]:
from functools import reduce

In [39]:
def getArpabet(entry):
    '''
    Return the entry's unstressed transcription ('UnTrn') as a tuple of symbols.
    '''
    unstressed = entry['UnTrn']
    return dottedStringToTuple(unstressed)
# One tuple of symbols per lexicon entry, then the set of distinct symbols per entry.
list_arpabet_seqs = [getArpabet(entry) for entry in lexicon_raw]
list_arpabet_sets = [set(seq) for seq in list_arpabet_seqs]

# set().union(*sets) builds the union of any number of sets in one call and,
# unlike reduce() without an initial value, also works when the list is empty.
# Small sanity check of the idiom:
testSet = set().union(*[{0}, {0, 1}, {2}, {2}, {3, 4}, {1,3}])
testSet

# Every distinct symbol used in any unstressed transcription of the lexicon.
arpabetSymbolsInLexicon = set().union(*list_arpabet_sets)
len(arpabetSymbolsInLexicon)
arpabetSymbolsInLexicon

{0, 1, 2, 3, 4}

39

{'AA',
 'AE',
 'AH',
 'AO',
 'AW',
 'AY',
 'B',
 'CH',
 'D',
 'DH',
 'EH',
 'ER',
 'EY',
 'F',
 'G',
 'HH',
 'IH',
 'IY',
 'JH',
 'K',
 'L',
 'M',
 'N',
 'NG',
 'OW',
 'OY',
 'P',
 'R',
 'S',
 'SH',
 'T',
 'TH',
 'UH',
 'UW',
 'V',
 'W',
 'Y',
 'Z',
 'ZH'}

In [40]:
# Relation between ARPAbet symbols WITHOUT stress digits (first element of each
# pair) and the IPA strings used in this notebook (second element).
# It is stored as a set of pairs, so iteration order is arbitrary; consumers
# turn it into a dict before looking anything up.
# The table is a superset of the 39 symbols found in the lexicon above: it also
# lists AX, EM, EN, ENG, EL, DX, NX and Q.
# Note that 'AH' maps to 'ʌ' whatever its stress; 'ə' is only produced by 'AX'.
arpabetUnicodeIPArelation = set([ \
('AO', 'ɔ'),
('AA', 'ɑ'),
('IY', 'i'),
('UW', 'u'),
('EH', 'ɛ'),
('IH', 'ɪ'),
('UH', 'ʊ'),
('AH', 'ʌ'),
('AX', 'ə'),
('AE', 'æ'),
('EY', 'eɪ'),
('AY', 'aɪ'),
('OW', 'oʊ'),
('AW', 'aʊ'),
('OY', 'ɔɪ'),
('ER', 'ɚ'),
('P', 'p'),
('B', 'b'),
('T', 't'),
('D', 'd'),
('K', 'k'),
('G', 'g'),
('CH', 'tʃ'),
('JH', 'dʒ'),
('F', 'f'),
('V', 'v'),
('TH', 'θ'),
('DH', 'ð'),
('S', 's'),
('Z', 'z'),
('SH', 'ʃ'),
('ZH', 'ʒ'),
('HH', 'h'),
('M', 'm'),
('EM', 'm̩'),
('N', 'n'),
('EN', 'n̩'),
('NG', 'ŋ'),
('ENG', 'ŋ̩'),
('L', 'l'),
('EL', 'l̩'),
('R', 'r'),
('DX', 'ɾ'),
('NX', 'ɾ̃'),
('Y', 'j'),
('W', 'w'),
('Q', 'ʔ')
])

In [41]:
# ARPAbet symbols that the relation above can translate (first element of each pair).
IPAableArpabet = [pair[0] for pair in arpabetUnicodeIPArelation]
# Display the lexicon symbols that have no IPA translation (expected: none) ...
[symbol for symbol in arpabetSymbolsInLexicon if symbol not in IPAableArpabet]
# ... and stop the notebook here if there are any.
assert not any(symbol not in IPAableArpabet for symbol in arpabetSymbolsInLexicon)

[]

In [42]:
def arpabetToUnicodeIPA(arpabetSymbol):
    '''
    Return the IPA string paired with one ARPAbet symbol (no stress digit).

    The lookup dict is built from arpabetUnicodeIPArelation on each call.
    Raises KeyError if the symbol has no entry in the relation.
    '''
    return dict(arpabetUnicodeIPArelation)[arpabetSymbol]
arpabetToUnicodeIPA('TH')

def invertMapping(mydict):
    '''
    Swap keys and values: return a new dict mapping each value of mydict to
    its key. For a value shared by several keys, the last key iterated wins.

    (Repeats the definition given in the Hammond section of this notebook.)
    '''
    inverted = {}
    for key, val in mydict.items():
        inverted[val] = key
    return inverted
def unicodeIPAToArpabet(unicodeIPAsymbol):
    '''
    Return the ARPAbet symbol whose IPA string is unicodeIPAsymbol.

    The forward table is built from arpabetUnicodeIPArelation and inverted on
    each call. Raises KeyError for an IPA string not in the relation.
    '''
    arpabetToIPA = dict(arpabetUnicodeIPArelation)
    IPAToArpabet = invertMapping(arpabetToIPA)
    return IPAToArpabet[unicodeIPAsymbol]
unicodeIPAToArpabet('θ')

'θ'

'TH'

In [43]:
def IPAifyUnstressedRep(unstressedRep):
    '''
    Convert a dotted ARPAbet string without stress digits to dotted IPA.

    Example from this notebook: 'T.R.IH.P.AH.L.EY' -> 't.r.ɪ.p.ʌ.l.eɪ'.
    '''
    IPAsymbols = [arpabetToUnicodeIPA(symb) for symb in dottedStringToTuple( unstressedRep )]
    return tupleToDottedString( tuple(IPAsymbols) )

def IPAifyStressedRep(stressedRep):
    '''
    Convert a dotted ARPAbet string WITH stress digits to dotted IPA, keeping
    each stress digit attached to the translated symbol.

    A symbol whose last character is '0', '1' or '2' is split into the bare
    ARPAbet symbol and that digit; the bare symbol is translated and the digit
    is appended again. Symbols without such a digit are translated as they are.
    Example from this notebook: 'T.R.IH2.P.AH0.L.EY1' -> 't.r.ɪ2.p.ʌ0.l.eɪ1'.
    '''
    stressDigits = ('0', '1', '2')

    # First pass: separate every symbol into (bare ARPAbet symbol, stress digit or '').
    split_symbols = []
    for symb in dottedStringToTuple( stressedRep ):
        if symb[-1] in stressDigits:
            split_symbols.append((symb[:-1], symb[-1]))
        else:
            split_symbols.append((symb, ''))

    # Second pass: translate the bare symbols, then put the digits back.
    IPAsymbols = [arpabetToUnicodeIPA(bare) for bare, _ in split_symbols]
    stressedIPAtuple = tuple(ipa + stress
                             for ipa, (_, stress) in zip(IPAsymbols, split_symbols))

    return tupleToDottedString( stressedIPAtuple )

In [44]:
lexicon_raw[12345]['StTrn']

'D.EH1.F.AH0.N.AH0.T.L.IY0'

In [45]:
# t = dottedStringToTuple( lexicon_raw[12345]['StTrn'] )
t = dottedStringToTuple( lexicon_raw[2]['StTrn'] )
t

('T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1')

In [46]:
symbs = [symb[:-1] if symb[-1] == '0' or symb[-1] == '1' or symb[-1] == '2' else symb[:] for symb in t]
symbs

['T', 'R', 'IH', 'P', 'AH', 'L', 'EY']

In [47]:
strss = [symb[-1] if symb[-1] == '0' or symb[-1] == '1' or symb[-1] == '2' else '' for symb in t]
strss

['', '', '2', '', '0', '', '1']

In [48]:
list(zip(symbs,strss))

[('T', ''),
 ('R', ''),
 ('IH', '2'),
 ('P', ''),
 ('AH', '0'),
 ('L', ''),
 ('EY', '1')]

In [49]:
tuple(map(lambda s01: s01[0] + s01[1], zip(symbs,strss)))

('T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1')

In [50]:
tupleToDottedString(tuple(map(lambda s01: s01[0] + s01[1], zip(symbs,strss))))

'T.R.IH2.P.AH0.L.EY1'

In [51]:
ipaSymbs = list(map(arpabetToUnicodeIPA, symbs))
ipaSymbs

['t', 'r', 'ɪ', 'p', 'ʌ', 'l', 'eɪ']

In [52]:
tupleToDottedString(tuple(map(lambda s01: s01[0] + s01[1], zip(ipaSymbs,strss))))

't.r.ɪ2.p.ʌ0.l.eɪ1'

In [53]:
lexicon_raw[12345]['StTrn']
lexicon_raw[0]['StTrn']
lexicon_raw[2]['StTrn']

'D.EH1.F.AH0.N.AH0.T.L.IY0'

'AH0'

'T.R.IH2.P.AH0.L.EY1'

In [54]:
IPAifyUnstressedRep( lexicon_raw[12345]['UnTrn'] )
IPAifyStressedRep( lexicon_raw[12345]['StTrn'] )

'd.ɛ.f.ʌ.n.ʌ.t.l.i'

'd.ɛ1.f.ʌ0.n.ʌ0.t.l.i0'

In [55]:
lexicon_raw[0]['UnTrn']
lexicon_raw[0]['StTrn']
IPAifyUnstressedRep( lexicon_raw[0]['UnTrn'] )
IPAifyStressedRep( lexicon_raw[0]['StTrn'] )

'AH'

'AH0'

'ʌ'

'ʌ0'

In [56]:
lexicon_raw[2]['UnTrn']
lexicon_raw[2]['StTrn']
IPAifyUnstressedRep( lexicon_raw[2]['UnTrn'] )
IPAifyStressedRep( lexicon_raw[2]['StTrn'] )

'T.R.IH.P.AH.L.EY'

'T.R.IH2.P.AH0.L.EY1'

't.r.ɪ.p.ʌ.l.eɪ'

't.r.ɪ2.p.ʌ0.l.eɪ1'

## IPA-ify each field in the full dataset

In [57]:
def edit_dict(the_dict, the_key, the_new_value):
    '''
    In-place update that can be chained: stores the_new_value under the_key in
    the_dict and hands back that very same (now modified) dict.

    (Repeats the definition given in the Hammond section of this notebook.)
    '''
    the_dict[the_key] = the_new_value
    return the_dict

def modify_dict(the_dict, the_key, the_new_value):
    '''
    Non-mutating update that can be chained: builds a shallow copy of the_dict
    as a plain dict, stores the_new_value under the_key in the copy, and
    returns the copy. the_dict itself is not changed.

    (Repeats the definition given in the Hammond section of this notebook.)
    '''
    updated = dict(the_dict)
    updated[the_key] = the_new_value
    return updated

In [58]:
lexicon_raw[0:3]

[OrderedDict([('Indx', '1'),
              ('Word', 'a'),
              ('UnTrn', 'AH'),
              ('StTrn', 'AH0'),
              ('NSyll', '1'),
              ('NPhon', '1'),
              ('unsDENS', '26'),
              ('unsFDEN', '150377.45'),
              ('unsLDEN', '91.03'),
              ('unsCDEN', '136251'),
              ('strDENS', '21'),
              ('strFDEN', '145035.32'),
              ('strLDEN', '77.56'),
              ('strCDEN', '114167'),
              ('unsBPAV', '0'),
              ('unsFBPAV', '0'),
              ('unsLBPAV', '0'),
              ('unsCBPAV', '0'),
              ('strBPAV', '0'),
              ('strFBPAV', '0'),
              ('strLBPAV', '0'),
              ('strCBPAV', '0'),
              ('unsTPAV', '0'),
              ('unsFTPAV', '0'),
              ('unsLTPAV', '0'),
              ('unsCTPAV', '0'),
              ('strTPAV', '0'),
              ('strFTPAV', '0'),
              ('strLTPAV', '0'),
              ('strCTPAV', '0'),
   

In [59]:
def IPAify(entry):
    '''
    Return a copy of an IPhOD entry whose 'UnTrn' and 'StTrn' fields have been
    converted from ARPAbet to IPA. The original entry is not modified.

    Both conversions read from the ORIGINAL entry, so the stressed conversion
    never sees the already-converted unstressed field.
    '''
    unstressedIPA = IPAifyUnstressedRep(entry['UnTrn'])
    updated_entry = modify_dict(entry, 'UnTrn', unstressedIPA)

    stressedIPA = IPAifyStressedRep(entry['StTrn'])
    return modify_dict(updated_entry, 'StTrn', stressedIPA)

lexicon_IPA = [IPAify(entry) for entry in lexicon_raw]
lexicon_IPA[:10]
# lexicon_IPA

[{'Indx': '1',
  'Word': 'a',
  'UnTrn': 'ʌ',
  'StTrn': 'ʌ0',
  'NSyll': '1',
  'NPhon': '1',
  'unsDENS': '26',
  'unsFDEN': '150377.45',
  'unsLDEN': '91.03',
  'unsCDEN': '136251',
  'strDENS': '21',
  'strFDEN': '145035.32',
  'strLDEN': '77.56',
  'strCDEN': '114167',
  'unsBPAV': '0',
  'unsFBPAV': '0',
  'unsLBPAV': '0',
  'unsCBPAV': '0',
  'strBPAV': '0',
  'strFBPAV': '0',
  'strLBPAV': '0',
  'strCBPAV': '0',
  'unsTPAV': '0',
  'unsFTPAV': '0',
  'unsLTPAV': '0',
  'unsCTPAV': '0',
  'strTPAV': '0',
  'strFTPAV': '0',
  'strLTPAV': '0',
  'strCTPAV': '0',
  'unsPOSPAV': '0.04449866',
  'unsFPOSPAV': '0.07192868',
  'unsLPOSPAV': '0.03999447',
  'unsCPOSPAV': '0.04159586',
  'strPOSPAV': '0.03588891',
  'strFPOSPAV': '0.06061931',
  'strLPOSPAV': '0.03338777',
  'strCPOSPAV': '0.03349570',
  'unsLCPOSPAV': '0.07692308',
  'unsFLCPOSPAV': '0.22760866',
  'unsLLCPOSPAV': '0.12570126',
  'unsCLCPOSPAV': '0.15987477',
  'strLCPOSPAV': '0.06666667',
  'strFLCPOSPAV': '0.21988359

## Write to file / import and check contents

In [60]:
%pwd

'/home/AD/emeinhar/c2-jn'

In [61]:
#%cd IPhODv2.0_REALS

In [62]:
%ls *IPhOD*

IPhOD2_Words.txt  IPhODv2.0_REALS.zip


In [63]:
import csv

In [64]:
lexicon_IPA[0]

{'Indx': '1',
 'Word': 'a',
 'UnTrn': 'ʌ',
 'StTrn': 'ʌ0',
 'NSyll': '1',
 'NPhon': '1',
 'unsDENS': '26',
 'unsFDEN': '150377.45',
 'unsLDEN': '91.03',
 'unsCDEN': '136251',
 'strDENS': '21',
 'strFDEN': '145035.32',
 'strLDEN': '77.56',
 'strCDEN': '114167',
 'unsBPAV': '0',
 'unsFBPAV': '0',
 'unsLBPAV': '0',
 'unsCBPAV': '0',
 'strBPAV': '0',
 'strFBPAV': '0',
 'strLBPAV': '0',
 'strCBPAV': '0',
 'unsTPAV': '0',
 'unsFTPAV': '0',
 'unsLTPAV': '0',
 'unsCTPAV': '0',
 'strTPAV': '0',
 'strFTPAV': '0',
 'strLTPAV': '0',
 'strCTPAV': '0',
 'unsPOSPAV': '0.04449866',
 'unsFPOSPAV': '0.07192868',
 'unsLPOSPAV': '0.03999447',
 'unsCPOSPAV': '0.04159586',
 'strPOSPAV': '0.03588891',
 'strFPOSPAV': '0.06061931',
 'strLPOSPAV': '0.03338777',
 'strCPOSPAV': '0.03349570',
 'unsLCPOSPAV': '0.07692308',
 'unsFLCPOSPAV': '0.22760866',
 'unsLLCPOSPAV': '0.12570126',
 'unsCLCPOSPAV': '0.15987477',
 'strLCPOSPAV': '0.06666667',
 'strFLCPOSPAV': '0.21988359',
 'strLLCPOSPAV': '0.07556101',
 'strCLCPO

In [65]:
theFieldnames = ['Indx', 'NPhon', 'NSyll', 'SCDcnt', 'SFreq', 'StTrn', 'UnTrn', 'Word', 'strBPAV', 'strCBPAV', 'strCDEN', 'strCLCPOSPAV', 'strCPOSPAV', 'strCTPAV', 'strDENS', 'strFBPAV', 'strFDEN', 'strFLCPOSPAV', 'strFPOSPAV', 'strFTPAV', 'strLBPAV', 'strLCPOSPAV', 'strLDEN', 'strLLCPOSPAV', 'strLPOSPAV', 'strLTPAV', 'strPOSPAV', 'strTPAV', 'unsBPAV', 'unsCBPAV', 'unsCDEN', 'unsCLCPOSPAV', 'unsCPOSPAV', 'unsCTPAV', 'unsDENS', 'unsFBPAV', 'unsFDEN', 'unsFLCPOSPAV', 'unsFPOSPAV', 'unsFTPAV', 'unsLBPAV', 'unsLCPOSPAV', 'unsLDEN', 'unsLLCPOSPAV', 'unsLPOSPAV', 'unsLTPAV', 'unsPOSPAV', 'unsTPAV']
len(theFieldnames)
theFieldnames

48

['Indx',
 'NPhon',
 'NSyll',
 'SCDcnt',
 'SFreq',
 'StTrn',
 'UnTrn',
 'Word',
 'strBPAV',
 'strCBPAV',
 'strCDEN',
 'strCLCPOSPAV',
 'strCPOSPAV',
 'strCTPAV',
 'strDENS',
 'strFBPAV',
 'strFDEN',
 'strFLCPOSPAV',
 'strFPOSPAV',
 'strFTPAV',
 'strLBPAV',
 'strLCPOSPAV',
 'strLDEN',
 'strLLCPOSPAV',
 'strLPOSPAV',
 'strLTPAV',
 'strPOSPAV',
 'strTPAV',
 'unsBPAV',
 'unsCBPAV',
 'unsCDEN',
 'unsCLCPOSPAV',
 'unsCPOSPAV',
 'unsCTPAV',
 'unsDENS',
 'unsFBPAV',
 'unsFDEN',
 'unsFLCPOSPAV',
 'unsFPOSPAV',
 'unsFTPAV',
 'unsLBPAV',
 'unsLCPOSPAV',
 'unsLDEN',
 'unsLLCPOSPAV',
 'unsLPOSPAV',
 'unsLTPAV',
 'unsPOSPAV',
 'unsTPAV']

In [66]:
IPhOD_IPA_filename_stem = 'IPhOD2_Words_IPA'

# Write the IPA-converted lexicon as a tab-delimited table with a header row
# (the file gets a '.csv' extension even though the delimiter is a tab).
# Column order follows theFieldnames. UTF-8 is requested explicitly because the
# transcriptions contain IPA characters.
with open(IPhOD_IPA_filename_stem + '.csv', mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=theFieldnames, delimiter='\t')
    writer.writeheader()
    writer.writerows(lexicon_IPA)

In [67]:
%pwd

'/home/AD/emeinhar/c2-jn'

In [68]:
%ls *IPhOD2*

IPhOD2_Words_IPA.csv  IPhOD2_Words.txt


### ...read back in

In [69]:
import csv

In [70]:
IPhOD_IPA_filename = 'IPhOD2_Words_IPA.csv'

theFieldnames = ['Indx', 'NPhon', 'NSyll', 'SCDcnt', 'SFreq', 'StTrn', 'UnTrn', 'Word', 'strBPAV', 'strCBPAV', 'strCDEN', 'strCLCPOSPAV', 'strCPOSPAV', 'strCTPAV', 'strDENS', 'strFBPAV', 'strFDEN', 'strFLCPOSPAV', 'strFPOSPAV', 'strFTPAV', 'strLBPAV', 'strLCPOSPAV', 'strLDEN', 'strLLCPOSPAV', 'strLPOSPAV', 'strLTPAV', 'strPOSPAV', 'strTPAV', 'unsBPAV', 'unsCBPAV', 'unsCDEN', 'unsCLCPOSPAV', 'unsCPOSPAV', 'unsCTPAV', 'unsDENS', 'unsFBPAV', 'unsFDEN', 'unsFLCPOSPAV', 'unsFPOSPAV', 'unsFTPAV', 'unsLBPAV', 'unsLCPOSPAV', 'unsLDEN', 'unsLLCPOSPAV', 'unsLPOSPAV', 'unsLTPAV', 'unsPOSPAV', 'unsTPAV']

# Read the exported file back in. It was written with encoding='utf-8' and
# newline='', so it is opened the same way here; relying on the platform's
# default encoding could mis-decode (or fail on) the IPA characters.
with open(IPhOD_IPA_filename, newline='', encoding='utf-8') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t')
    # One dict per data line, in file order; all values come back as strings.
    lexicon_IPA_in = list(my_reader)

lexicon_IPA_in[0].keys()
set(lexicon_IPA_in[0].keys()) == set(theFieldnames)
len(lexicon_IPA_in[0].keys())
lexicon_IPA_in[0]


odict_keys(['Indx', 'NPhon', 'NSyll', 'SCDcnt', 'SFreq', 'StTrn', 'UnTrn', 'Word', 'strBPAV', 'strCBPAV', 'strCDEN', 'strCLCPOSPAV', 'strCPOSPAV', 'strCTPAV', 'strDENS', 'strFBPAV', 'strFDEN', 'strFLCPOSPAV', 'strFPOSPAV', 'strFTPAV', 'strLBPAV', 'strLCPOSPAV', 'strLDEN', 'strLLCPOSPAV', 'strLPOSPAV', 'strLTPAV', 'strPOSPAV', 'strTPAV', 'unsBPAV', 'unsCBPAV', 'unsCDEN', 'unsCLCPOSPAV', 'unsCPOSPAV', 'unsCTPAV', 'unsDENS', 'unsFBPAV', 'unsFDEN', 'unsFLCPOSPAV', 'unsFPOSPAV', 'unsFTPAV', 'unsLBPAV', 'unsLCPOSPAV', 'unsLDEN', 'unsLLCPOSPAV', 'unsLPOSPAV', 'unsLTPAV', 'unsPOSPAV', 'unsTPAV'])

True

48

OrderedDict([('Indx', '1'),
             ('NPhon', '1'),
             ('NSyll', '1'),
             ('SCDcnt', '8382'),
             ('SFreq', '20415.27'),
             ('StTrn', 'ʌ0'),
             ('UnTrn', 'ʌ'),
             ('Word', 'a'),
             ('strBPAV', '0'),
             ('strCBPAV', '0'),
             ('strCDEN', '114167'),
             ('strCLCPOSPAV', '0.10933994'),
             ('strCPOSPAV', '0.03349570'),
             ('strCTPAV', '0'),
             ('strDENS', '21'),
             ('strFBPAV', '0'),
             ('strFDEN', '145035.32'),
             ('strFLCPOSPAV', '0.21988359'),
             ('strFPOSPAV', '0.06061931'),
             ('strFTPAV', '0'),
             ('strLBPAV', '0'),
             ('strLCPOSPAV', '0.06666667'),
             ('strLDEN', '77.56'),
             ('strLLCPOSPAV', '0.07556101'),
             ('strLPOSPAV', '0.03338777'),
             ('strLTPAV', '0'),
             ('strPOSPAV', '0.03588891'),
             ('strTPAV', '0'),
            

In [71]:
def match(dictA, dictB):
    '''
    Return True when both mappings have the same set of keys and equal values
    under every key; key order is ignored. Values are only compared when the
    key sets agree.
    '''
    if dictA.keys() != dictB.keys():
        return False
    return all(dictA[k] == dictB[k] for k in dictA.keys())

# Round-trip check: what was read back must equal what was written.
# zip() stops at the shorter list, so the row counts are compared first;
# otherwise a truncated read-back would pass unnoticed.
assert len(lexicon_IPA) == len(lexicon_IPA_in)
assert all(match(written, reread) for written, reread in zip(lexicon_IPA, lexicon_IPA_in))

In [72]:
lexicon_IPA = lexicon_IPA_in

# Add probability annotations to IPhOD

In [73]:
lexicon_IPA[2]

OrderedDict([('Indx', '3'),
             ('NPhon', '7'),
             ('NSyll', '3'),
             ('SCDcnt', '23'),
             ('SFreq', '0.49'),
             ('StTrn', 't.r.ɪ2.p.ʌ0.l.eɪ1'),
             ('UnTrn', 't.r.ɪ.p.ʌ.l.eɪ'),
             ('Word', 'Aaa'),
             ('strBPAV', '0.00479395'),
             ('strCBPAV', '0.00393286'),
             ('strCDEN', '0'),
             ('strCLCPOSPAV', '0.06229064'),
             ('strCPOSPAV', '0.04821133'),
             ('strCTPAV', '0.00011896'),
             ('strDENS', '0'),
             ('strFBPAV', '0.00221983'),
             ('strFDEN', '0'),
             ('strFLCPOSPAV', '0.06127196'),
             ('strFPOSPAV', '0.04222206'),
             ('strFTPAV', '0.00010726'),
             ('strLBPAV', '0.00441717'),
             ('strLCPOSPAV', '0.05360625'),
             ('strLDEN', '0'),
             ('strLLCPOSPAV', '0.05900326'),
             ('strLPOSPAV', '0.05090395'),
             ('strLTPAV', '0.00011330'),
             ('s

In [74]:
lexicon_IPA[2]['Word']
lexicon_IPA[2]['SFreq'] #SUBTLEX_US frequency count of the orthographic word

'Aaa'

'0.49'

In [75]:
def castCountsToFloats(the_dict):
    '''
    Replace the entry's 'SFreq' value with its float conversion IN PLACE and
    return the same entry.
    '''
    freq = float(the_dict['SFreq'])
    return edit_dict(the_dict, 'SFreq', freq)

lexicon_IPA = list(map(castCountsToFloats, lexicon_IPA))
lexicon_IPA[2]['Word']
lexicon_IPA[2]['SFreq']

'Aaa'

0.49

In [76]:
sumFreqs = sum(row['SFreq'] for row in lexicon_IPA)
sumFreqs

1405131.4800001306

In [77]:
from math import log, log2, pow, isclose

def addProbs(the_dict):
    '''
    Return a copy of an entry with two extra fields derived from 'SFreq':

      'Prob'   -- SFreq divided by the global sumFreqs
      'Nlprob' -- negative base-2 log of that probability, computed as
                  -(log2(SFreq) - log2(sumFreqs))

    An entry whose SFreq is 0 gets Prob 0.0 and Nlprob +inf instead of
    raising ValueError from log2(0). The original entry is not modified.
    Reads the module-level name sumFreqs at call time.
    '''
    freq = float(the_dict['SFreq'])

    if freq == 0:
        # log2(0) is a math domain error; the surprisal of a zero-probability
        # item is infinite.
        nlprob = float('inf')
    else:
        nlprob = -1.0 * ( log2( freq ) - log2( sumFreqs ) )

    new_dict = modify_dict(the_dict, 'Prob', freq / sumFreqs)
    new_dict = modify_dict(new_dict, 'Nlprob', nlprob)
    return new_dict

lexicon_IPA_probanno = [addProbs(row) for row in lexicon_IPA]
lexicon_IPA_probanno[2]['Word']
lexicon_IPA_probanno[2]['SFreq']
lexicon_IPA_probanno[2]['Prob']
lexicon_IPA_probanno[2]['Nlprob']

'Aaa'

0.49

3.487218149862776e-07

21.451420046618022

## Export/import

In [78]:
#%cd IPhODv2.0_REALS

In [79]:
%pwd

'/home/AD/emeinhar/c2-jn'

In [80]:
%ls *IPhOD*

IPhOD2_Words_IPA.csv  IPhOD2_Words.txt  IPhODv2.0_REALS.zip


In [81]:
import csv

In [82]:
lexicon_IPA_probanno[2]

{'Indx': '3',
 'NPhon': '7',
 'NSyll': '3',
 'SCDcnt': '23',
 'SFreq': 0.49,
 'StTrn': 't.r.ɪ2.p.ʌ0.l.eɪ1',
 'UnTrn': 't.r.ɪ.p.ʌ.l.eɪ',
 'Word': 'Aaa',
 'strBPAV': '0.00479395',
 'strCBPAV': '0.00393286',
 'strCDEN': '0',
 'strCLCPOSPAV': '0.06229064',
 'strCPOSPAV': '0.04821133',
 'strCTPAV': '0.00011896',
 'strDENS': '0',
 'strFBPAV': '0.00221983',
 'strFDEN': '0',
 'strFLCPOSPAV': '0.06127196',
 'strFPOSPAV': '0.04222206',
 'strFTPAV': '0.00010726',
 'strLBPAV': '0.00441717',
 'strLCPOSPAV': '0.05360625',
 'strLDEN': '0',
 'strLLCPOSPAV': '0.05900326',
 'strLPOSPAV': '0.05090395',
 'strLTPAV': '0.00011330',
 'strPOSPAV': '0.05032317',
 'strTPAV': '0.00012595',
 'unsBPAV': '0.00663829',
 'unsCBPAV': '0.00546635',
 'unsCDEN': '395',
 'unsCLCPOSPAV': '0.06879694',
 'unsCPOSPAV': '0.05521636',
 'unsCTPAV': '0.00027130',
 'unsDENS': '4',
 'unsFBPAV': '0.00318710',
 'unsFDEN': '10.94',
 'unsFLCPOSPAV': '0.06801497',
 'unsFPOSPAV': '0.04886090',
 'unsFTPAV': '0.00018534',
 'unsLBPAV': '0.0

In [83]:
theFieldnames = ['Indx', 'NPhon', 'NSyll', 'Nlprob', 'Prob', 'SCDcnt', 'SFreq', 'StTrn', 'UnTrn', 'Word', 'strBPAV', 'strCBPAV', 'strCDEN', 'strCLCPOSPAV', 'strCPOSPAV', 'strCTPAV', 'strDENS', 'strFBPAV', 'strFDEN', 'strFLCPOSPAV', 'strFPOSPAV', 'strFTPAV', 'strLBPAV', 'strLCPOSPAV', 'strLDEN', 'strLLCPOSPAV', 'strLPOSPAV', 'strLTPAV', 'strPOSPAV', 'strTPAV', 'unsBPAV', 'unsCBPAV', 'unsCDEN', 'unsCLCPOSPAV', 'unsCPOSPAV', 'unsCTPAV', 'unsDENS', 'unsFBPAV', 'unsFDEN', 'unsFLCPOSPAV', 'unsFPOSPAV', 'unsFTPAV', 'unsLBPAV', 'unsLCPOSPAV', 'unsLDEN', 'unsLLCPOSPAV', 'unsLPOSPAV', 'unsLTPAV', 'unsPOSPAV', 'unsTPAV']
len(theFieldnames)
theFieldnames

50

['Indx',
 'NPhon',
 'NSyll',
 'Nlprob',
 'Prob',
 'SCDcnt',
 'SFreq',
 'StTrn',
 'UnTrn',
 'Word',
 'strBPAV',
 'strCBPAV',
 'strCDEN',
 'strCLCPOSPAV',
 'strCPOSPAV',
 'strCTPAV',
 'strDENS',
 'strFBPAV',
 'strFDEN',
 'strFLCPOSPAV',
 'strFPOSPAV',
 'strFTPAV',
 'strLBPAV',
 'strLCPOSPAV',
 'strLDEN',
 'strLLCPOSPAV',
 'strLPOSPAV',
 'strLTPAV',
 'strPOSPAV',
 'strTPAV',
 'unsBPAV',
 'unsCBPAV',
 'unsCDEN',
 'unsCLCPOSPAV',
 'unsCPOSPAV',
 'unsCTPAV',
 'unsDENS',
 'unsFBPAV',
 'unsFDEN',
 'unsFLCPOSPAV',
 'unsFPOSPAV',
 'unsFTPAV',
 'unsLBPAV',
 'unsLCPOSPAV',
 'unsLDEN',
 'unsLLCPOSPAV',
 'unsLPOSPAV',
 'unsLTPAV',
 'unsPOSPAV',
 'unsTPAV']

In [84]:
# Base name (without extension) of the probability-annotated export.
IPhOD_IPA_probanno_filename_stem = 'IPhOD2_Words_IPA_prob'

# Write the annotated lexicon as a tab-separated file: one header row built
# from theFieldnames, then one row per entry. UTF-8 is requested explicitly
# and newline='' leaves line-ending handling to the csv module.
with open(IPhOD_IPA_probanno_filename_stem + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, delimiter='\t', fieldnames=theFieldnames)
    writer.writeheader()
    writer.writerows(lexicon_IPA_probanno)

In [85]:
%pwd

'/home/AD/emeinhar/c2-jn'

In [86]:
%ls *IPhOD*

IPhOD2_Words_IPA.csv       IPhOD2_Words.txt
IPhOD2_Words_IPA_prob.csv  [0m[01;31mIPhODv2.0_REALS.zip[0m


### Import

In [87]:
%pwd

'/home/AD/emeinhar/c2-jn'

In [88]:
%ls *IPhOD*

IPhOD2_Words_IPA.csv       IPhOD2_Words.txt
IPhOD2_Words_IPA_prob.csv  [0m[01;31mIPhODv2.0_REALS.zip[0m


In [89]:
import csv

In [90]:
IPhOD_IPA_prob_filename = 'IPhOD2_Words_IPA_prob.csv'

theFieldnames = ['Indx', 'NPhon', 'NSyll', 'Nlprob', 'Prob', 'SCDcnt', 'SFreq', 'StTrn', 'UnTrn', 'Word', 'strBPAV', 'strCBPAV', 'strCDEN', 'strCLCPOSPAV', 'strCPOSPAV', 'strCTPAV', 'strDENS', 'strFBPAV', 'strFDEN', 'strFLCPOSPAV', 'strFPOSPAV', 'strFTPAV', 'strLBPAV', 'strLCPOSPAV', 'strLDEN', 'strLLCPOSPAV', 'strLPOSPAV', 'strLTPAV', 'strPOSPAV', 'strTPAV', 'unsBPAV', 'unsCBPAV', 'unsCDEN', 'unsCLCPOSPAV', 'unsCPOSPAV', 'unsCTPAV', 'unsDENS', 'unsFBPAV', 'unsFDEN', 'unsFLCPOSPAV', 'unsFPOSPAV', 'unsFTPAV', 'unsLBPAV', 'unsLCPOSPAV', 'unsLDEN', 'unsLLCPOSPAV', 'unsLPOSPAV', 'unsLTPAV', 'unsPOSPAV', 'unsTPAV']

# Read the tab-separated export back in: one dict per data row, keyed by the
# file's header row (csv.DictReader yields every value as a string).
# The transcription fields contain IPA characters, so the encoding is given
# explicitly as UTF-8 instead of relying on the platform default, and
# newline='' is passed as the csv module recommends for files it parses.
with open(IPhOD_IPA_prob_filename, newline='', encoding='utf-8') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t')
    lexicon_IPA_probanno_in = list(my_reader)

lexicon_IPA_probanno_in[0].keys()
set(lexicon_IPA_probanno_in[0].keys()) == set(theFieldnames)
len(lexicon_IPA_probanno_in[0].keys())
lexicon_IPA_probanno_in[0]


odict_keys(['Indx', 'NPhon', 'NSyll', 'Nlprob', 'Prob', 'SCDcnt', 'SFreq', 'StTrn', 'UnTrn', 'Word', 'strBPAV', 'strCBPAV', 'strCDEN', 'strCLCPOSPAV', 'strCPOSPAV', 'strCTPAV', 'strDENS', 'strFBPAV', 'strFDEN', 'strFLCPOSPAV', 'strFPOSPAV', 'strFTPAV', 'strLBPAV', 'strLCPOSPAV', 'strLDEN', 'strLLCPOSPAV', 'strLPOSPAV', 'strLTPAV', 'strPOSPAV', 'strTPAV', 'unsBPAV', 'unsCBPAV', 'unsCDEN', 'unsCLCPOSPAV', 'unsCPOSPAV', 'unsCTPAV', 'unsDENS', 'unsFBPAV', 'unsFDEN', 'unsFLCPOSPAV', 'unsFPOSPAV', 'unsFTPAV', 'unsLBPAV', 'unsLCPOSPAV', 'unsLDEN', 'unsLLCPOSPAV', 'unsLPOSPAV', 'unsLTPAV', 'unsPOSPAV', 'unsTPAV'])

True

50

OrderedDict([('Indx', '1'),
             ('NPhon', '1'),
             ('NSyll', '1'),
             ('Nlprob', '6.10491267350873'),
             ('Prob', '0.014529081648642661'),
             ('SCDcnt', '8382'),
             ('SFreq', '20415.27'),
             ('StTrn', 'ʌ0'),
             ('UnTrn', 'ʌ'),
             ('Word', 'a'),
             ('strBPAV', '0'),
             ('strCBPAV', '0'),
             ('strCDEN', '114167'),
             ('strCLCPOSPAV', '0.10933994'),
             ('strCPOSPAV', '0.03349570'),
             ('strCTPAV', '0'),
             ('strDENS', '21'),
             ('strFBPAV', '0'),
             ('strFDEN', '145035.32'),
             ('strFLCPOSPAV', '0.21988359'),
             ('strFPOSPAV', '0.06061931'),
             ('strFTPAV', '0'),
             ('strLBPAV', '0'),
             ('strLCPOSPAV', '0.06666667'),
             ('strLDEN', '77.56'),
             ('strLLCPOSPAV', '0.07556101'),
             ('strLPOSPAV', '0.03338777'),
             ('strLTPAV',

In [91]:
assert(all(map(lambda out_in_tuple: match(out_in_tuple[0], out_in_tuple[1]), zip(lexicon_IPA_probanno, lexicon_IPA_probanno_in))))

AssertionError: 

In [92]:
lexicon_IPA_probanno[0].keys() == lexicon_IPA_probanno_in[0].keys()

def mismatches(dictA, dictB):
    """Return the set of keys on which two dicts disagree.

    A key is reported when it is present in only one of the two dicts, or
    when it is present in both but the associated values compare unequal
    with `!=`.
    """
    keysA = set(dictA.keys())
    keysB = set(dictB.keys())

    # Keys that appear in exactly one of the two dicts.
    differing = keysA ^ keysB

    # Keys shared by both dicts whose values differ.
    for key in keysA & keysB:
        if dictA[key] != dictB[key]:
            differing.add(key)

    return differing

mismatches(lexicon_IPA_probanno[0], lexicon_IPA_probanno_in[0])

True

{'Nlprob', 'Prob', 'SFreq'}

In [93]:
lexicon_IPA_probanno[0]
lexicon_IPA_probanno_in[0]

{'Indx': '1',
 'NPhon': '1',
 'NSyll': '1',
 'SCDcnt': '8382',
 'SFreq': 20415.27,
 'StTrn': 'ʌ0',
 'UnTrn': 'ʌ',
 'Word': 'a',
 'strBPAV': '0',
 'strCBPAV': '0',
 'strCDEN': '114167',
 'strCLCPOSPAV': '0.10933994',
 'strCPOSPAV': '0.03349570',
 'strCTPAV': '0',
 'strDENS': '21',
 'strFBPAV': '0',
 'strFDEN': '145035.32',
 'strFLCPOSPAV': '0.21988359',
 'strFPOSPAV': '0.06061931',
 'strFTPAV': '0',
 'strLBPAV': '0',
 'strLCPOSPAV': '0.06666667',
 'strLDEN': '77.56',
 'strLLCPOSPAV': '0.07556101',
 'strLPOSPAV': '0.03338777',
 'strLTPAV': '0',
 'strPOSPAV': '0.03588891',
 'strTPAV': '0',
 'unsBPAV': '0',
 'unsCBPAV': '0',
 'unsCDEN': '136251',
 'unsCLCPOSPAV': '0.15987477',
 'unsCPOSPAV': '0.04159586',
 'unsCTPAV': '0',
 'unsDENS': '26',
 'unsFBPAV': '0',
 'unsFDEN': '150377.45',
 'unsFLCPOSPAV': '0.22760866',
 'unsFPOSPAV': '0.07192868',
 'unsFTPAV': '0',
 'unsLBPAV': '0',
 'unsLCPOSPAV': '0.07692308',
 'unsLDEN': '91.03',
 'unsLLCPOSPAV': '0.12570126',
 'unsLPOSPAV': '0.03999447',
 'u

OrderedDict([('Indx', '1'),
             ('NPhon', '1'),
             ('NSyll', '1'),
             ('Nlprob', '6.10491267350873'),
             ('Prob', '0.014529081648642661'),
             ('SCDcnt', '8382'),
             ('SFreq', '20415.27'),
             ('StTrn', 'ʌ0'),
             ('UnTrn', 'ʌ'),
             ('Word', 'a'),
             ('strBPAV', '0'),
             ('strCBPAV', '0'),
             ('strCDEN', '114167'),
             ('strCLCPOSPAV', '0.10933994'),
             ('strCPOSPAV', '0.03349570'),
             ('strCTPAV', '0'),
             ('strDENS', '21'),
             ('strFBPAV', '0'),
             ('strFDEN', '145035.32'),
             ('strFLCPOSPAV', '0.21988359'),
             ('strFPOSPAV', '0.06061931'),
             ('strFTPAV', '0'),
             ('strLBPAV', '0'),
             ('strLCPOSPAV', '0.06666667'),
             ('strLDEN', '77.56'),
             ('strLLCPOSPAV', '0.07556101'),
             ('strLPOSPAV', '0.03338777'),
             ('strLTPAV',

In [94]:
lexicon_IPA_probanno[2]['Nlprob']
lexicon_IPA_probanno_in[2]['Nlprob']
lexicon_IPA_probanno[2]['Prob']
lexicon_IPA_probanno_in[2]['Prob']

21.451420046618022

'21.451420046618022'

3.487218149862776e-07

'3.487218149862776e-07'

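The mismatches are only type mismatches: `SFreq`, `Prob`, and `Nlprob` are floats in `lexicon_IPA_probanno` but strings in the re-imported `lexicon_IPA_probanno_in`, because `csv.DictReader` returns every field as a string. The values themselves agree, so this is not really a problem.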