In [1]:
# Display the value of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Import-data" data-toc-modified-id="Import-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Project-down-to-orthography-and-transcription-columns" data-toc-modified-id="Project-down-to-orthography-and-transcription-columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Project down to orthography and transcription columns</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Each notebook of this kind takes an arbitrary source lexicon and produces a `.tsv` file with two columns:
 - orthographic wordforms
 - transcribed wordforms
 
i.e. to define a relation between orthographic wordforms and transcribed wordforms.

The transcribed lexicon relation file can then be used somewhat more uniformly by downstream processing notebooks.

# Parameters

In [4]:
from os import chdir, getcwd, listdir, path

In [5]:
# Input: tab-separated source lexicon, read in the "Import data" section.
lexiconDataInFilename = 'cmudict-0.7b_IPA_stressed.tsv'

# Output: the two-column relation file written in the "Export" section.
lexiconDataOutFilename = 'LTR_CMU_stressed.tsv'

In [6]:
# The input and output filenames are relative, so they resolve against this directory.
getcwd()

'/mnt/cube/home/AD/emeinhar/wr/LTR_CMU_stressed'

# Import data

In [7]:
# Header names of the exported relation file, in column order:
# orthographic wordform first, transcription second.
fieldnames_out = ('Orthographic_Wordform', 'Transcription')
orthography_out_fieldname, transcription_out_fieldname = fieldnames_out

In [8]:
import csv

In [9]:
# Read every row of the tab-separated source lexicon into a list of mappings
# keyed by the file's header names. QUOTE_NONE tells the reader to give quote
# characters no special treatment, so they stay part of the field text.
with open(lexiconDataInFilename, 'r', newline='', encoding='utf-8') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    lexicon_in = list(my_reader)

# Sanity checks: number of rows, column names, and the first entry.
len(lexicon_in)
lexicon_in[0].keys()
lexicon_in[0]

133854

odict_keys(['Orthography', 'Transcription'])

OrderedDict([('Orthography', '!EXCLAMATION-POINT'),
             ('Transcription', 'ɛ2.k.s.k.l.ʌ0.m.eɪ1.ʃ.ʌ0.n.p.ɔɪ2.n.t')])

# Project down to orthography and transcription columns

In [10]:
from collections import OrderedDict

In [11]:
def project(raw_row):
    """Return a two-field row holding only the orthography and transcription.

    Reads the 'Orthography' and 'Transcription' entries of ``raw_row`` and
    stores them, in that order, under the output field names
    ``orthography_out_fieldname`` and ``transcription_out_fieldname``.
    Any other entries of ``raw_row`` are ignored.

    Raises:
        KeyError: if ``raw_row`` lacks either source key.
    """
    return OrderedDict([
        (orthography_out_fieldname, raw_row['Orthography']),
        (transcription_out_fieldname, raw_row['Transcription']),
    ])

In [12]:
# Project every imported row onto the two output columns, preserving row order.
lexicon_out = [project(raw_row) for raw_row in lexicon_in]

# Sanity checks: the row count and the first projected entry.
len(lexicon_out)
lexicon_out[0]

133854

OrderedDict([('Orthographic_Wordform', '!EXCLAMATION-POINT'),
             ('Transcription', 'ɛ2.k.s.k.l.ʌ0.m.eɪ1.ʃ.ʌ0.n.p.ɔɪ2.n.t')])

# Export

In [13]:
# Write the projected rows as a tab-separated file with a header line.
# encoding='utf-8' is given explicitly, matching the import step: the
# transcriptions contain IPA characters, and without it open() falls back to a
# locale-dependent default that may fail on or mis-encode them.
with open(lexiconDataOutFilename, 'w', newline='\n', encoding='utf-8') as csvfile:
    # QUOTE_NONE writes fields without quoting. quotechar is set to '@',
    # presumably so the '"' characters present in the orthography are not
    # treated as quote characters needing an escape (assumes '@' never occurs
    # in the data -- TODO confirm).
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames_out, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')

    writer.writeheader()
    writer.writerows(lexicon_out)

In [14]:
# Show the working directory and its contents to confirm the output file was written.
getcwd()
listdir()

'/mnt/cube/home/AD/emeinhar/wr/LTR_CMU_stressed'

['Making a Transcribed Lexicon Relation - CMU_stressed.ipynb',
 '.ipynb_checkpoints',
 'cmudict-0.7b_IPA_stressed.tsv',
 'LTR_CMU_stressed.tsv']

In [16]:
!cat -n LTR_CMU_stressed.tsv | head -20

     1	Orthographic_Wordform	Transcription
     2	!EXCLAMATION-POINT	ɛ2.k.s.k.l.ʌ0.m.eɪ1.ʃ.ʌ0.n.p.ɔɪ2.n.t
     3	"CLOSE-QUOTE	k.l.oʊ1.z.k.w.oʊ1.t
     4	"DOUBLE-QUOTE	d.ʌ1.b.ʌ0.l.k.w.oʊ1.t
     5	"END-OF-QUOTE	ɛ1.n.d.ʌ0.v.k.w.oʊ1.t
     6	"END-QUOTE	ɛ1.n.d.k.w.oʊ1.t
     7	"IN-QUOTES	ɪ1.n.k.w.oʊ1.t.s
     8	"QUOTE	k.w.oʊ1.t
     9	"UNQUOTE	ʌ1.n.k.w.oʊ1.t
    10	#HASH-MARK	h.æ1.m.ɑ2.ɹ.k
    11	#POUND-SIGN	p.aʊ1.n.d.s.aɪ2.n
    12	#SHARP-SIGN	ʃ.ɑ1.ɹ.p.s.aɪ2.n
    13	%PERCENT	p.ɚ0.s.ɛ1.n.t
    14	&AMPERSAND	æ1.m.p.ɚ0.s.æ2.n.d
    15	'ALLO	ɑ2.l.oʊ1
    16	'APOSTROPHE	ʌ0.p.ɑ1.s.t.ɹ.ʌ0.f.i0
    17	'BOUT	b.aʊ1.t
    18	'CAUSE	k.ʌ0.z
    19	'COURSE	k.ɔ1.ɹ.s
    20	'CUSE	k.j.u1.z
cat: write error: Broken pipe
