**Eric Meinhardt / emeinhardt@ucsd.edu**

In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Reading the CMU dict in

In [2]:
import csv

The CMU Pronouncing Dictionary (v0.7b) was obtained from `http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b`.

In [3]:
%ls

 cmudict-0.7b   cmudict-0.7b.tsv  'CMU to IPA.ipynb'


In [4]:
# Read the raw CMUdict file (decoded as Latin-1) into a list with one string per
# line; only the trailing newline is stripped from each line.
cmudict_raw_filename = 'cmudict-0.7b'

cmudict_raw = []
with open(cmudict_raw_filename, 'r', encoding='latin-1') as the_file:
    cmudict_raw.extend(line.rstrip('\n') for line in the_file)

In [5]:
%cat -n cmudict-0.7b | head -60

     1	;;; # CMUdict  --  Major Version: 0.07
     2	;;; 
     3	;;; # $HeadURL$
     4	;;; # $Date::                                                   $:
     5	;;; # $Id::                                                     $:
     6	;;; # $Rev::                                                    $: 
     7	;;; # $Author::                                                 $:
     8	;;;
     9	;;; #
    11	;;; # Copyright (C) 1993-2015 Carnegie Mellon University. All rights reserved.
    12	;;; #
    13	;;; # Redistribution and use in source and binary forms, with or without
    14	;;; # modification, are permitted provided that the following conditions
    15	;;; # are met:
    16	;;; #
    17	;;; # 1. Redistributions of source code must retain the above copyright
    18	;;; #    notice, this list of conditions and the following disclaimer.
    19	;;; #    The contents of this file are deemed to be source code.
    20	;;; #
    21	;;; # 2. Redistributions in binary f

In [6]:
cmudict_raw[0] #header...
cmudict_raw[55] #end header
cmudict_raw[56]

';;; # CMUdict  --  Major Version: 0.07'

';;; '

'!EXCLAMATION-POINT  EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T'

In [7]:
# Skip the comment header: per the spot checks above, index 55 is the last
# ';;;' header line and index 56 is the first dictionary entry.
cmudict_entry_lines = cmudict_raw[56:]
cmudict_entry_lines[0]  # first entry
cmudict_entry_lines[2434]  # spot check of an entry further into the list

'!EXCLAMATION-POINT  EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T'

'ALBINIA  AA0 L B IY1 N IY0 AH0'

# Convert to tsv

Each entry line consists of an orthographic form and its transcription, separated by a double space:

In [8]:
cmudict_entry_lines[5].split('  ')
cmudict_entry_lines[2434].split('  ')
cmudict_entry_lines[89347].split('  ')

['"IN-QUOTES', 'IH1 N K W OW1 T S']

['ALBINIA', 'AA0 L B IY1 N IY0 AH0']

['PATRICOF', 'P AE1 T R IH0 K AO2 F']

In [9]:
# Break every entry line at the double-space separator, giving a list of
# [orthography, transcription] lists.
tab_separated_cmudict_lines = list(map(lambda line: line.split('  '),
                                       cmudict_entry_lines))
tab_separated_cmudict_lines[:10]

[['!EXCLAMATION-POINT', 'EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T'],
 ['"CLOSE-QUOTE', 'K L OW1 Z K W OW1 T'],
 ['"DOUBLE-QUOTE', 'D AH1 B AH0 L K W OW1 T'],
 ['"END-OF-QUOTE', 'EH1 N D AH0 V K W OW1 T'],
 ['"END-QUOTE', 'EH1 N D K W OW1 T'],
 ['"IN-QUOTES', 'IH1 N K W OW1 T S'],
 ['"QUOTE', 'K W OW1 T'],
 ['"UNQUOTE', 'AH1 N K W OW1 T'],
 ['#HASH-MARK', 'HH AE1 M AA2 R K'],
 ['#POUND-SIGN', 'P AW1 N D S AY2 N']]

In [10]:
# Turn each [orthography, transcription] list into a record keyed by column name.
cmudict = [{'Orthography': fields[0], 'Transcription': fields[1]}
           for fields in tab_separated_cmudict_lines]

In [11]:
from csv import DictWriter

In [12]:
cmudict_tsv_fn = 'cmudict-0.7b.tsv'

# Export the lexicon as a tab-separated file with a header row.
#  * encoding='utf-8' is explicit so that the bytes written do not depend on the
#    platform's default locale encoding (the source file was decoded as Latin-1,
#    so any non-ASCII characters must be re-encoded deterministically).
#  * newline='\n' turns off newline translation, so the csv writer's own line
#    terminator is written through unchanged.
#  * QUOTE_NONE writes the fields verbatim. Entries such as '"CLOSE-QUOTE' contain
#    a double quote; the quote character is presumably moved to '@' so that the
#    writer does not treat '"' as a character that needs escaping.
with open(cmudict_tsv_fn, 'w', newline='\n', encoding='utf-8') as dl:
    fieldnames = ['Orthography', 'Transcription']
    writer = csv.DictWriter(dl, fieldnames=fieldnames, delimiter='\t',
                            quoting=csv.QUOTE_NONE, quotechar='@')
    writer.writeheader()
    # One call writes every record; nothing is assigned to `_`, so IPython's
    # last-output variable is left alone.
    writer.writerows(cmudict)

# Convert wordform representations to dotted strings

In [13]:
def modify_dict(the_dict, the_key, the_new_value):
    '''
    Return a shallow copy of ``the_dict`` in which ``the_key`` maps to
    ``the_new_value``.

    The dictionary passed in is not modified, so calls can be composed freely.
    '''
    updated = dict(the_dict)
    updated[the_key] = the_new_value
    return updated

In [23]:
'AH1 N K W OW1 T'.split(' ')
'.'.join('AH1 N K W OW1 T'.split(' '))

['AH1', 'N', 'K', 'W', 'OW1', 'T']

'AH1.N.K.W.OW1.T'

In [24]:
# Same records as `cmudict`, but with each transcription's single-space
# separators replaced by dots ('AH1 N' -> 'AH1.N').
ds_lexicon = [modify_dict(row,
                          'Transcription',
                          row['Transcription'].replace(' ', '.'))
              for row in cmudict]
ds_lexicon[:5]

[{'Orthography': '!EXCLAMATION-POINT',
  'Transcription': 'EH2.K.S.K.L.AH0.M.EY1.SH.AH0.N.P.OY2.N.T'},
 {'Orthography': '"CLOSE-QUOTE', 'Transcription': 'K.L.OW1.Z.K.W.OW1.T'},
 {'Orthography': '"DOUBLE-QUOTE', 'Transcription': 'D.AH1.B.AH0.L.K.W.OW1.T'},
 {'Orthography': '"END-OF-QUOTE', 'Transcription': 'EH1.N.D.AH0.V.K.W.OW1.T'},
 {'Orthography': '"END-QUOTE', 'Transcription': 'EH1.N.D.K.W.OW1.T'}]

In [25]:
def tupleToDottedString(pair):
    '''Join the strings in ``pair`` into a single string, separated by dots.'''
    separator = '.'
    return separator.join(pair)

def dottedStringToTuple(s):
    '''Split ``s`` at every dot and return the pieces as a tuple.'''
    pieces = s.split('.')
    return tuple(pieces)

# Short aliases for the two converters.
ds2t = dottedStringToTuple
t2ds = tupleToDottedString

In [26]:
ds2t('AH1.N.K.W.OW1.T')

('AH1', 'N', 'K', 'W', 'OW1', 'T')

# Convert ARPABET to IPA

In [27]:
from functools import reduce

In [30]:
def union(Ss):
    '''
    Return the union of all the collections in ``Ss`` as a new set.

    ``Ss`` is an iterable of sets (any iterables of hashable items are accepted).
    An empty ``Ss`` yields the empty set; folding with ``reduce(set.union, Ss)``
    would raise ``TypeError`` in that case because there is no initial value.
    '''
    # set.union accepts any number of iterables, so a single call suffices.
    return set().union(*Ss)

## Identify all ARPABET symbols in use in the data

In [29]:
# Collect just the dotted transcription string of every lexicon entry.
transcriptions = [entry['Transcription'] for entry in ds_lexicon]
transcriptions[:10]

['EH2.K.S.K.L.AH0.M.EY1.SH.AH0.N.P.OY2.N.T',
 'K.L.OW1.Z.K.W.OW1.T',
 'D.AH1.B.AH0.L.K.W.OW1.T',
 'EH1.N.D.AH0.V.K.W.OW1.T',
 'EH1.N.D.K.W.OW1.T',
 'IH1.N.K.W.OW1.T.S',
 'K.W.OW1.T',
 'AH1.N.K.W.OW1.T',
 'HH.AE1.M.AA2.R.K',
 'P.AW1.N.D.S.AY2.N']

In [37]:
# Inventory of distinct symbols (stress digits included) used across all transcriptions.
ARPABET_in_cmudict = union([set(dottedStringToTuple(transcription))
                            for transcription in transcriptions])
len(ARPABET_in_cmudict)
print(ARPABET_in_cmudict)

69

{'IH1', 'IY0', 'S', 'EY2', 'AE0', 'AW1', 'AA0', 'ER1', 'UW0', 'UH2', 'AY1', 'F', 'DH', 'AE1', 'EH1', 'AH2', 'AE2', 'M', 'AW0', 'OW0', 'NG', 'EH0', 'L', 'G', 'SH', 'D', 'Z', 'T', 'AO2', 'R', 'EH2', 'OW1', 'ZH', 'ER0', 'AH0', 'AY2', 'AO1', 'UH0', 'AO0', 'IH2', 'B', 'JH', 'OY0', 'AY0', 'OY2', 'UH1', 'IY1', 'K', 'CH', 'UW1', 'P', 'TH', 'AA1', 'AA2', 'OY1', 'AH1', 'EY1', 'UW2', 'W', 'V', 'ER2', 'IH0', 'OW2', 'IY2', 'Y', 'N', 'HH', 'EY0', 'AW2'}


In [38]:
# Relation (a set of pairs) between stress-less ARPABET symbols and Unicode IPA
# strings. Vowel symbols in the CMUdict data carry a trailing stress digit
# (e.g. 'AH0'), which is not part of the keys here, so it has to be removed
# before a lookup. The table also lists symbols (AX, EM, EN, ENG, EL, DX, NX, Q)
# that do not occur in the CMUdict symbol inventory printed above.
arpabetUnicodeIPArelation = set([ \
('AO', 'ɔ'),
('AA', 'ɑ'),
('IY', 'i'),
('UW', 'u'),
('EH', 'ɛ'),
('IH', 'ɪ'),
('UH', 'ʊ'),
('AH', 'ʌ'),
('AX', 'ə'),
('AE', 'æ'),
('EY', 'eɪ'),
('AY', 'aɪ'),
('OW', 'oʊ'),
('AW', 'aʊ'),
('OY', 'ɔɪ'),
('ER', 'ɚ'),
('P', 'p'),
('B', 'b'),
('T', 't'),
('D', 'd'),
('K', 'k'),
('G', 'g'),
('CH', 'tʃ'),
('JH', 'dʒ'),
('F', 'f'),
('V', 'v'),
('TH', 'θ'),
('DH', 'ð'),
('S', 's'),
('Z', 'z'),
('SH', 'ʃ'),
('ZH', 'ʒ'),
('HH', 'h'),
('M', 'm'),
('EM', 'm̩'),
('N', 'n'),
('EN', 'n̩'),
('NG', 'ŋ'),
('ENG', 'ŋ̩'),
('L', 'l'),
('EL', 'l̩'),
('R', 'r'),
('DX', 'ɾ'),
('NX', 'ɾ̃'),
('Y', 'j'),
('W', 'w'),
('Q', 'ʔ')
])

In [42]:
# Symbols attested in CMUdict that have no key of their own in the ARPABET-to-IPA
# relation (the key set is built once and subtracted).
ARPABET_in_cmudict - {arpabet for arpabet, _ in arpabetUnicodeIPArelation}

{'AA0',
 'AA1',
 'AA2',
 'AE0',
 'AE1',
 'AE2',
 'AH0',
 'AH1',
 'AH2',
 'AO0',
 'AO1',
 'AO2',
 'AW0',
 'AW1',
 'AW2',
 'AY0',
 'AY1',
 'AY2',
 'EH0',
 'EH1',
 'EH2',
 'ER0',
 'ER1',
 'ER2',
 'EY0',
 'EY1',
 'EY2',
 'IH0',
 'IH1',
 'IH2',
 'IY0',
 'IY1',
 'IY2',
 'OW0',
 'OW1',
 'OW2',
 'OY0',
 'OY1',
 'OY2',
 'UH0',
 'UH1',
 'UH2',
 'UW0',
 'UW1',
 'UW2'}

In [39]:
def arpabetToUnicodeIPA(arpabetSymbol):
    '''
    Return the IPA string paired with ``arpabetSymbol`` in
    ``arpabetUnicodeIPArelation``.

    The lookup table is rebuilt from the relation on every call. A symbol that
    is not in the relation raises ``KeyError``.
    '''
    return dict(arpabetUnicodeIPArelation)[arpabetSymbol]
arpabetToUnicodeIPA('TH')

def invertMapping(mydict):
    '''
    Return a new dict mapping each value of ``mydict`` back to its key.

    If several keys share a value, the key that comes last in iteration order wins.
    '''
    return {val: key for key, val in mydict.items()}
def unicodeIPAToArpabet(unicodeIPAsymbol):
    '''
    Return the ARPABET symbol paired with ``unicodeIPAsymbol`` in
    ``arpabetUnicodeIPArelation``.

    The inverse table is rebuilt on every call. An IPA string that is not in
    the relation raises ``KeyError``.
    '''
    ipa_to_arpabet = invertMapping(dict(arpabetUnicodeIPArelation))
    return ipa_to_arpabet[unicodeIPAsymbol]
unicodeIPAToArpabet('θ')

'θ'

'TH'

In [40]:
def IPAifyUnstressedRep(unstressedRep):
    '''
    Convert a dotted string of stress-less ARPABET symbols into a dotted string
    of the corresponding IPA symbols.

    Each dot-separated symbol is looked up with ``arpabetToUnicodeIPA``.
    '''
    ipa_symbols = tuple(arpabetToUnicodeIPA(symbol)
                        for symbol in dottedStringToTuple(unstressedRep))
    return tupleToDottedString(ipa_symbols)

def IPAifyStressedRep(stressedRep):
    '''
    Convert a dotted string of ARPABET symbols, some ending in a stress digit,
    into a dotted string of IPA symbols that keep those digits.

    A trailing '0', '1' or '2' is split off each symbol, the remainder is looked
    up with ``arpabetToUnicodeIPA``, and the digit (if any) is appended to the
    IPA string, e.g. 'EH2.K' -> 'ɛ2.k'.
    '''
    stress_digits = ('0', '1', '2')

    # First pass: separate every symbol into its base and its stress mark.
    bases = []
    stress_marks = []
    for symbol in dottedStringToTuple(stressedRep):
        if symbol[-1] in stress_digits:
            bases.append(symbol[:-1])
            stress_marks.append(symbol[-1])
        else:
            bases.append(symbol[:])
            stress_marks.append('')

    # Second pass: translate the bases, then glue the stress marks back on.
    ipa_bases = [arpabetToUnicodeIPA(base) for base in bases]
    stressed_ipa = tuple(ipa + mark for ipa, mark in zip(ipa_bases, stress_marks))

    return tupleToDottedString(stressed_ipa)

In [41]:
IPAifyStressedRep('EH2.K.S.K.L.AH0.M.EY1.SH.AH0.N.P.OY2.N.T')

'ɛ2.k.s.k.l.ʌ0.m.eɪ1.ʃ.ʌ0.n.p.ɔɪ2.n.t'