In [1]:
# Display the value of every top-level expression in a cell, not just the
# last one (several cells below rely on this to show more than one result).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Import-data" data-toc-modified-id="Import-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Rename-columns" data-toc-modified-id="Rename-columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Rename columns</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Each notebook like this one is designed to produce, from an arbitrary source, a `.tsv` file with two columns:
 - orthographic wordforms
 - transcribed wordforms
 
i.e. to define a relation between orthographic wordforms and transcribed wordforms.

Downstream processing notebooks can then consume the transcribed lexicon relation file somewhat more uniformly. Note that downstream code expects the resulting `.tsv` file to have the same filename (sans extension) as the directory this notebook is in (here: directory `LTR_Buckeye`, file `LTR_Buckeye.tsv`).

# Parameters

In [2]:
from os import chdir, getcwd, listdir, path

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.


In [3]:
# Remember the directory this notebook was started in so the working
# directory can be restored after it is changed later on.
ltr_dir = getcwd()
ltr_dir

'/mnt/cube/home/AD/emeinhar/wr/LTR_Buckeye'

In [4]:
# List the contents of the current working directory.
listdir()

['LTR_Buckeye.tsv',
 'buckeye_orthography_phonemic_transcription_relation.tsv',
 'buckeye_words_analysis_relation.json',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - Buckeye.ipynb',
 'Making a Transcribed Lexicon Relation - Buckeye-Copy1.ipynb']

The input file `buckeye_orthography_phonemic_transcription_relation.tsv` is produced by running the notebook `Preprocessing Buckeye corpus transcriptions for ease of processing and use with kenlm` in the repository `buckeye-lm`. (See that repository and notebook for details.) Once it has been produced, copy the `.tsv` into the same directory as this notebook (the next cell does this, assuming `buckeye-lm` sits two directory levels above this one) and then run the rest of this notebook.

In [1]:
# Copy the input relation file from the `buckeye-lm` directory (two levels
# up from here) into the current directory, keeping its filename.
!cp ../../buckeye-lm/buckeye_orthography_phonemic_transcription_relation.tsv ./buckeye_orthography_phonemic_transcription_relation.tsv

In [5]:
# Output: the two-column relation file written in the Export section.
lexiconDataOutFilename = 'LTR_Buckeye.tsv'

# Input: the orthography/transcription relation file to be loaded.
# (Disabled alternative input: 'buckeye_words_analysis_relation.json'.)
lexiconDataInFilename = 'buckeye_orthography_phonemic_transcription_relation.tsv'

# Import data

In [6]:
# `boilerplate` is imported while the working directory is the parent of the
# notebook directory (presumably that is where the module lives -- confirm).
# Anchor on `ltr_dir` first so the step up does not depend on whatever the
# working directory happens to be when this cell is (re-)run.
chdir(ltr_dir)
chdir('..')
repo_dir = getcwd()
try:
    from boilerplate import *
finally:
    # Always return to the notebook directory, even if the import fails;
    # otherwise every later relative path would resolve against the parent.
    chdir(ltr_dir)

In [7]:
# Header of the exported file, in column order: orthography, transcription.
fieldnames_out = ('Orthographic_Wordform', 'Transcription')
orthography_out_fieldname, transcription_out_fieldname = fieldnames_out

In [8]:
# Load the input relation; `loadTSV_as_dictlist` comes from `boilerplate` and
# (judging by the output below) yields one mapping per row keyed by header.
lexicon_in = loadTSV_as_dictlist(lexiconDataInFilename)

# Fail early, with a readable message, if the input is empty or its columns
# are not the ones the renaming step further down is written for.
expected_in_fieldnames = ['Orthography', 'Transcription']
assert lexicon_in, 'no rows loaded from %s' % lexiconDataInFilename
assert list(lexicon_in[0].keys()) == expected_in_fieldnames, \
    'unexpected columns in %s: %r' % (lexiconDataInFilename,
                                      list(lexicon_in[0].keys()))

# Row count, a small sample, and the shape of a single entry.
len(lexicon_in)

lexicon_in[:5]

lexicon_in[0]
lexicon_in[0].keys()

7998

[OrderedDict([('Orthography', "'em"), ('Transcription', 'ɛ.m')]),
 OrderedDict([('Orthography', 'a'), ('Transcription', 'eɪ')]),
 OrderedDict([('Orthography', "a's"), ('Transcription', 'eɪ.z.z')]),
 OrderedDict([('Orthography', "aaron's"), ('Transcription', 'eɪ.ɹ.ʌ.n.z')]),
 OrderedDict([('Orthography', 'abandoned'),
              ('Transcription', 'ʌ.b.æ.n.d.ʌ.n.d')])]

OrderedDict([('Orthography', "'em"), ('Transcription', 'ɛ.m')])

odict_keys(['Orthography', 'Transcription'])

# Rename columns

In [11]:
from collections import OrderedDict

In [12]:
# Column names of the input relation.
orthography_in_fieldname = 'Orthography'
transcription_in_fieldname = 'Transcription'

# Deduplicate on the (orthography, transcription) pair. Fields are read by
# name rather than by position, so the result does not depend on the column
# order (or on any extra columns) of the input file.
unique_pairs = {(entry[orthography_in_fieldname],
                 entry[transcription_in_fieldname])
                for entry in lexicon_in}
len(unique_pairs)

# Sort on the whole pair, not on orthography alone: the iteration order of a
# set of strings can differ between kernel sessions, so a wordform with more
# than one transcription would otherwise be exported in a varying order.
# The OrderedDict is built from a list of pairs so that the column order is
# explicit rather than inherited from a dict literal.
lexicon_out = [OrderedDict([(orthography_out_fieldname, orthography),
                            (transcription_out_fieldname, transcription)])
               for orthography, transcription in sorted(unique_pairs)]
lexicon_out[:10]

7998

[OrderedDict([('Orthographic_Wordform', "'em"), ('Transcription', 'ɛ.m')]),
 OrderedDict([('Orthographic_Wordform', 'a'), ('Transcription', 'eɪ')]),
 OrderedDict([('Orthographic_Wordform', "a's"), ('Transcription', 'eɪ.z.z')]),
 OrderedDict([('Orthographic_Wordform', "aaron's"),
              ('Transcription', 'eɪ.ɹ.ʌ.n.z')]),
 OrderedDict([('Orthographic_Wordform', 'abandoned'),
              ('Transcription', 'ʌ.b.æ.n.d.ʌ.n.d')]),
 OrderedDict([('Orthographic_Wordform', 'abercrombie'),
              ('Transcription', 'æ.b.ɚ.k.ɹ.ɑ.m.b.i')]),
 OrderedDict([('Orthographic_Wordform', 'abhorrent'),
              ('Transcription', 'ʌ.b.h.oʊ.ɹ.ʌ.n.t')]),
 OrderedDict([('Orthographic_Wordform', 'abide'),
              ('Transcription', 'ʌ.b.aɪ.d')]),
 OrderedDict([('Orthographic_Wordform', 'ability'),
              ('Transcription', 'ʌ.b.ɪ.l.ʌ.t.i')]),
 OrderedDict([('Orthographic_Wordform', 'able'),
              ('Transcription', 'eɪ.b.l̩')])]

# Export

In [13]:
# Write the relation as a tab-separated file with a header row.
#
# The encoding is pinned to UTF-8: the transcriptions contain IPA characters,
# and without an explicit encoding `open` falls back to the locale default,
# which may be unable to encode them or may differ between machines.
#
# NOTE(review): `csv` is not imported in any cell visible here; presumably it
# arrives via `from boilerplate import *` -- confirm or import it explicitly.
# NOTE(review): newline='\n' turns newline translation off, so the csv
# writer's default row terminator ('\r\n') is written as-is -- confirm that
# CRLF row endings are what downstream consumers expect.
with open(lexiconDataOutFilename, 'w', newline='\n', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames_out, delimiter='\t')

    writer.writeheader()
    writer.writerows(lexicon_out)

In [14]:
# Show the working directory and its contents after the export.
getcwd()
listdir()

'/mnt/cube/home/AD/emeinhar/wr/LTR_Buckeye'

['LTR_Buckeye.tsv',
 'buckeye_orthography_phonemic_transcription_relation.tsv',
 'buckeye_words_analysis_relation.json',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - Buckeye.ipynb',
 'Making a Transcribed Lexicon Relation - Buckeye-Copy1.ipynb']

In [15]:
!cat -n LTR_Buckeye.tsv | head -20

     1	Orthographic_Wordform	Transcription
     2	'em	ɛ.m
     3	a	eɪ
     4	a's	eɪ.z.z
     5	aaron's	eɪ.ɹ.ʌ.n.z
     6	abandoned	ʌ.b.æ.n.d.ʌ.n.d
     7	abercrombie	æ.b.ɚ.k.ɹ.ɑ.m.b.i
     8	abhorrent	ʌ.b.h.oʊ.ɹ.ʌ.n.t
     9	abide	ʌ.b.aɪ.d
    10	ability	ʌ.b.ɪ.l.ʌ.t.i
    11	able	eɪ.b.l̩
    12	abortion	ʌ.b.oʊ.ɹ.ʃ.ʌ.n
    13	abortions	ʌ.b.oʊ.ɹ.ʃ.ʌ.n.z
    14	about	ʌ.b.aʊ.t
    15	above	ʌ.b.ʌ.v
    16	abraham	eɪ.b.ɹ.ʌ.h.æ.m
    17	abroad	ʌ.b.ɹ.ɑ.d
    18	abrupt	ʌ.b.ɹ.ʌ.p.t
    19	abruptly	ʌ.b.ɹ.ʌ.p.t.l.i
    20	absence	æ.b.s.ʌ.n.s
cat: write error: Broken pipe


In [1]:
# Write the first column (orthographic wordforms) of the exported file,
# skipping its header row, to a plain-text vocabulary list.
# NOTE(review): a wordform listed with several transcriptions would be
# emitted once per row -- confirm whether downstream expects unique entries.
!tail -n +2 LTR_Buckeye.tsv | cut -f1 > buckeye_vocabulary_main.txt