In [1]:
# Display the value of every top-level expression in a cell, not just the last
# one. Later cells rely on this to show several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Import-data" data-toc-modified-id="Import-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Rename-columns" data-toc-modified-id="Rename-columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Rename columns</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Each notebook of this kind is designed to produce, from an arbitrary source, a `.tsv` file with two columns:
 - orthographic wordforms
 - transcribed wordforms
 
i.e. to define a relation between orthographic wordforms and transcribed wordforms.

The transcribed lexicon relation file can then be used more uniformly by downstream processing notebooks. Note that downstream code expects the resulting `.tsv` file to have the same filename (sans extension) as the directory this notebook is in.

# Parameters

In [2]:
from os import chdir, getcwd, listdir, path

In [3]:
# Remember the directory the notebook was started in; a later cell changes the
# working directory temporarily and uses `ltr_dir` to come back here.
ltr_dir = getcwd()
ltr_dir

'/mnt/cube/home/AD/emeinhar/wr/LTR_NXT_swbd_destressed'

In [4]:
# Show what is currently in the working directory (inputs and prior outputs).
listdir()

['nxt_swbd_orthography_transcription_relation.tsv',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - NXT_swbd.ipynb',
 'LTR_NXT_swbd_destressed.tsv']

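The input `.tsv` (`nxt_swbd_orthography_transcription_relation.tsv`) is produced by running the notebook `Preprocessing NXT Switchboard corpus transcriptions for ease of processing and use with kenlm` in the repository `switchboard-lm`. (See that repository and notebook for details.) Once it has been produced, copy it into the same directory as this notebook (the next cell does this, assuming the `switchboard-lm` repository is located at `../../switchboard-lm` relative to this directory) and then run the rest of this notebook.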

In [5]:
# Copy the input relation file from the `switchboard-lm` checkout (expected two
# directory levels up) into the current directory, overwriting any existing copy.
!cp ../../switchboard-lm/nxt_swbd_orthography_transcription_relation.tsv ./nxt_swbd_orthography_transcription_relation.tsv

In [6]:
# Input: the orthography/transcription relation copied into this directory.
lexiconDataInFilename = "nxt_swbd_orthography_transcription_relation.tsv"
# Output: the transcribed lexicon relation file written by this notebook.
lexiconDataOutFilename = "LTR_NXT_swbd_destressed.tsv"

# Import data

In [7]:
# Import the shared helpers from `boilerplate`, which is expected to be
# importable from the parent directory of this notebook's directory.
#
# The target directory is derived from `ltr_dir` (not from the current working
# directory) so that re-running this cell always lands in the same place, and
# the `finally` clause restores the working directory even if the import fails;
# otherwise later relative file reads/writes would hit the wrong directory.
chdir(path.join(ltr_dir, '..'))
try:
    repo_dir = getcwd()
    from boilerplate import *
finally:
    chdir(ltr_dir)

In [8]:
# Column names of the exported relation, in output order:
# orthographic wordform first, transcription second.
fieldnames_out = ('Orthographic_Wordform', 'Transcription')
orthography_out_fieldname, transcription_out_fieldname = fieldnames_out

In [9]:
# Load the input relation as a list of row mappings.
# NOTE(review): `loadTSV_as_dictlist` is presumably provided by the
# `boilerplate` star import -- confirm. In the saved run each row is an
# OrderedDict with the keys 'Orthography' and 'Transcription'.
lexicon_in = loadTSV_as_dictlist(lexiconDataInFilename)
len(lexicon_in)

# Peek at the first few rows.
lexicon_in[:5]

# Inspect a single row and its column names.
lexicon_in[0]
lexicon_in[0].keys()

15833

[OrderedDict([('Orthography', '007'),
              ('Transcription', 'd.ʌ.b.ə.l.oʊ.s.ɛ.v.ɪ.n')]),
 OrderedDict([('Orthography', '1'), ('Transcription', 'w.ʌ.n')]),
 OrderedDict([('Orthography', '101'), ('Transcription', 'w.ʌ.n.oʊ.w.ʌ.n')]),
 OrderedDict([('Orthography', '128'), ('Transcription', 'w.ʌ.n.t.u.eɪ.t')]),
 OrderedDict([('Orthography', '2'), ('Transcription', 't.u')])]

OrderedDict([('Orthography', '007'),
             ('Transcription', 'd.ʌ.b.ə.l.oʊ.s.ɛ.v.ɪ.n')])

odict_keys(['Orthography', 'Transcription'])

In [10]:
# List every input row whose orthography is exactly the string '<rem>'.
[entry for entry in lexicon_in if entry['Orthography'] == '<rem>']

[]

# Rename columns

In [10]:
from collections import OrderedDict

In [11]:
# Deduplicate the (orthography, transcription) pairs. The input columns are read
# by name, so the result does not depend on the column order of the input file
# and a missing column fails loudly with a KeyError.
lexicon_pairs = {(d['Orthography'], d['Transcription']) for d in lexicon_in}
len(lexicon_pairs)

# Rename the columns and sort on the full pair (orthography first, transcription
# as tie-breaker). Set iteration order is arbitrary, so sorting on orthography
# alone would leave the relative order of a wordform's alternative
# transcriptions unspecified and the exported file non-reproducible.
# The OrderedDict is built from a list of pairs so the column order is explicit.
lexicon_out = [OrderedDict([(orthography_out_fieldname, orthography),
                            (transcription_out_fieldname, transcription)])
               for orthography, transcription in sorted(lexicon_pairs)]
lexicon_out[:10]

15833

[OrderedDict([('Orthographic_Wordform', '007'),
              ('Transcription', 'd.ʌ.b.ə.l.oʊ.s.ɛ.v.ɪ.n')]),
 OrderedDict([('Orthographic_Wordform', '1'), ('Transcription', 'w.ʌ.n')]),
 OrderedDict([('Orthographic_Wordform', '101'),
              ('Transcription', 'w.ʌ.n.oʊ.w.ʌ.n')]),
 OrderedDict([('Orthographic_Wordform', '128'),
              ('Transcription', 'w.ʌ.n.t.u.eɪ.t')]),
 OrderedDict([('Orthographic_Wordform', '2'), ('Transcription', 't.u')]),
 OrderedDict([('Orthographic_Wordform', '286'),
              ('Transcription', 't.u.eɪ.t.i.s.ɪ.k.s')]),
 OrderedDict([('Orthographic_Wordform', '302'),
              ('Transcription', 'θ.ɹ.i.oʊ.t.u')]),
 OrderedDict([('Orthographic_Wordform', '365'),
              ('Transcription', 'θ.ɹ.i.s.ɪ.k.s.t.i.f.aɪ.v')]),
 OrderedDict([('Orthographic_Wordform', '380'),
              ('Transcription', 'θ.ɹ.i.eɪ.t.i')]),
 OrderedDict([('Orthographic_Wordform', '386'),
              ('Transcription', 'θ.ɹ.i.eɪ.t.i.s.ɪ.k.s')])]

# Export

In [12]:
# Write the relation as a tab-separated file with a header row.
# - encoding='utf-8' is explicit because the transcriptions contain IPA
#   characters; without it the encoding depends on the machine's locale.
# - newline='' is what the csv module documents for files handed to a writer;
#   like the previous newline='\n', it performs no newline translation on write.
# NOTE(review): the csv default dialect terminates rows with '\r\n'. If
# downstream code expects '\n' line endings, pass lineterminator='\n' to
# DictWriter -- confirm with the consumers before changing the output bytes.
with open(lexiconDataOutFilename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames_out, delimiter='\t')

    writer.writeheader()
    writer.writerows(lexicon_out)

In [13]:
# Confirm the working directory and that the output file is present in it.
getcwd()
listdir()

'/mnt/cube/home/AD/emeinhar/wr/LTR_NXT_swbd_destressed'

['nxt_swbd_orthography_transcription_relation.tsv',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - NXT_swbd.ipynb',
 'LTR_NXT_swbd_destressed.tsv']

In [14]:
!cat -n LTR_NXT_swbd_destressed.tsv | head -20

     1	Orthographic_Wordform	Transcription
     2	007	d.ʌ.b.ə.l.oʊ.s.ɛ.v.ɪ.n
     3	1	w.ʌ.n
     4	101	w.ʌ.n.oʊ.w.ʌ.n
     5	128	w.ʌ.n.t.u.eɪ.t
     6	2	t.u
     7	286	t.u.eɪ.t.i.s.ɪ.k.s
     8	302	θ.ɹ.i.oʊ.t.u
     9	365	θ.ɹ.i.s.ɪ.k.s.t.i.f.aɪ.v
    10	380	θ.ɹ.i.eɪ.t.i
    11	386	θ.ɹ.i.eɪ.t.i.s.ɪ.k.s
    12	401k	f.oʊ.ɹ.oʊ.w.ʌ.n.k.eɪ
    13	486	f.oʊ.ɹ.eɪ.t.i.s.ɪ.k.s
    14	49ers	f.oʊ.ɹ.t.i.n.aɪ.n.ɚ.z
    15	5	f.aɪ.v
    16	635's	ɪ.k.s.θ.ɚ.t.i.f.aɪ.v
    17	69	s.ɪ.k.s.t.i.n.aɪ.n
    18	6s	s.ɪ.k.s.ɪ.z
    19	7-eleven	s.ɛ.v.ɪ.n.ɪ.l.ɛ.v.ɪ.n
    20	747	s.ɛ.v.ɪ.n.f.oʊ.ɹ.s.ɛ.v.ɪ.n
cat: write error: Broken pipe
