In [1]:
# Display the value of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Import-data" data-toc-modified-id="Import-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Project-down-to-orthography-and-transcription-columns" data-toc-modified-id="Project-down-to-orthography-and-transcription-columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Project down to orthography and transcription columns</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Each notebook of this kind produces a `.tsv` file with two columns, derived from an arbitrary source lexicon:
 - orthographic wordforms
 - transcribed wordforms
 
i.e. to define a relation between orthographic wordforms and transcribed wordforms.

The transcribed lexicon relation file can then be used somewhat more uniformly by downstream processing notebooks.

# Parameters

In [2]:
from os import chdir, getcwd, listdir, path

In [3]:
# Tab-separated source lexicon that this notebook reads.
lexiconDataInFilename = 'newdic_IPA.tsv'

# Two-column (orthography, transcription) relation file that this notebook writes.
lexiconDataOutFilename = 'LTR_newdic_destressed.tsv'

In [4]:
# The data filenames are relative paths; show the directory they resolve against.
getcwd()

'/mnt/cube/home/AD/emeinhar/wr/LTR_newdic_destressed'

# Import data

In [5]:
# Column headers of the exported relation file.
orthography_out_fieldname = 'Orthographic_Wordform'
transcription_out_fieldname = 'Transcription'
# Header order used by the csv.DictWriter in the export step.
fieldnames_out = (orthography_out_fieldname, transcription_out_fieldname)

In [6]:
import csv

In [7]:
# Load every row of the tab-separated source lexicon into memory as dicts
# keyed by the file's header row.
#
# The encoding is given explicitly because the transcriptions are IPA
# (non-ASCII) text and the platform default encoding is not guaranteed to be
# UTF-8 (assumes the source file is UTF-8 encoded -- TODO confirm).
# newline='' is what the csv module requires so that it can interpret line
# endings itself.
with open(lexiconDataInFilename, encoding='utf-8', newline='') as csvfile:
    lexicon_in = list(csv.DictReader(csvfile, delimiter='\t'))

# Sanity checks: number of rows, available columns, and a sample row.
len(lexicon_in)
lexicon_in[0].keys()
lexicon_in[0]

19528

odict_keys(['Transcription', 'stressInfoA', 'stressInfoB', 'Orthography', 'Frequency', 'PoSs'])

OrderedDict([('Transcription', 'ə'),
             ('stressInfoA', '_'),
             ('stressInfoB', 'S1'),
             ('Orthography', 'a'),
             ('Frequency', '23178'),
             ('PoSs', '(N IA VB PP)')])

# Project down to orthography and transcription columns

In [8]:
from collections import OrderedDict

In [9]:
def project(raw_row):
    """Reduce one source row to its orthography and transcription.

    Parameters
    ----------
    raw_row : mapping
        A row of the source lexicon; must provide the keys 'Orthography'
        and 'Transcription'. Any other keys are ignored.

    Returns
    -------
    OrderedDict
        A new two-entry mapping keyed by ``orthography_out_fieldname`` and
        ``transcription_out_fieldname``, in that order.

    Raises
    ------
    KeyError
        If either required key is missing from ``raw_row``.
    """
    return OrderedDict([
        (orthography_out_fieldname, raw_row['Orthography']),
        (transcription_out_fieldname, raw_row['Transcription']),
    ])

In [10]:
# Project every source row down to the two output columns, keeping row order.
lexicon_out = [project(entry) for entry in lexicon_in]
len(lexicon_out)
lexicon_out[0]

19528

OrderedDict([('Orthographic_Wordform', 'a'), ('Transcription', 'ə')])

# Export

In [11]:
# Write the projected rows as a tab-separated file with a header row.
#
# The encoding is given explicitly because the transcriptions are IPA
# (non-ASCII) text, so the result must not depend on the platform default.
# newline='' is the form the csv module documents for output files; like the
# previous newline='\n', it leaves the writer's line terminators untranslated.
with open(lexiconDataOutFilename, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames_out, delimiter='\t')

    writer.writeheader()
    writer.writerows(lexicon_out)

In [12]:
# Show the working directory and its contents; the exported file should be listed.
getcwd()
listdir()

'/mnt/cube/home/AD/emeinhar/wr/LTR_newdic_destressed'

['LTR_newdic_destressed.tsv',
 'Making a Transcribed Lexicon Relation - newdic_destressed.ipynb',
 'newdic_IPA.tsv',
 '.ipynb_checkpoints']

In [14]:
!cat -n LTR_newdic_destressed.tsv | head -20

     1	Orthographic_Wordform	Transcription
     2	a	ə
     3	aardvark	ɑ.ɹ.d.v.ɑ.ɹ.k
     4	aback	ə.b.æ.k
     5	abacus	æ.b.ə.k.ə.s
     6	abaft	ə.b.æ.f.t
     7	abalone	æ.b.ə.l.oʊ.n.i
     8	abandon	ə.b.æ.n.d.ɪ.n
     9	abase	ə.b.eɪ.s
    10	abash	ə.b.æ.ʃ
    11	abate	ə.b.eɪ.t
    12	abatis	æ.b.ə.t.i
    13	abattoir	æ.b.ə.t.w.ɑ.ɹ
    14	abbacy	æ.b.ə.s.i
    15	abbe	æ.b.eɪ
    16	abbess	æ.b.ə.s
    17	abbey	æ.b.i
    18	abbot	æ.b.ə.t
    19	abbreviate	ə.b.ɹ.i.v.i.eɪ.t
    20	abbreviation	ə.b.ɹ.i.v.i.eɪ.ʃ.ɪ.n
cat: write error: Broken pipe
