In [1]:
# Display the result of every top-level expression in a cell, not just the last one.
# Later cells rely on this to show several values (e.g. a length and a sample) at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Import-data" data-toc-modified-id="Import-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Project-down-to-orthography-and-transcription-columns" data-toc-modified-id="Project-down-to-orthography-and-transcription-columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Project down to orthography and transcription columns</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Each notebook of this kind takes an arbitrary source lexicon and produces a `.tsv` file with two columns:
 - orthographic wordforms
 - transcribed wordforms
 
i.e. to define a relation between orthographic wordforms and transcribed wordforms.

The transcribed lexicon relation file can then be used somewhat more uniformly by downstream processing notebooks.

# Parameters

In [2]:
from os import chdir, getcwd, listdir, path

In [3]:
# List the current working directory to check which files are present before loading.
listdir()

['LTR_Buckeye.tsv',
 'buckeye_words_analysis_relation.json',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - Buckeye.ipynb']

In [4]:
# Input: JSON file that is opened and parsed in the "Import data" section.
lexiconDataInFilename = 'buckeye_words_analysis_relation.json'

# Output: tab-separated file written in the "Export" section.
lexiconDataOutFilename = 'LTR_Buckeye.tsv'

# Import data

In [5]:
# Column headers of the exported relation, in output order
# (orthography first, transcription second).
fieldnames_out = ('Orthographic_Wordform', 'Transcription')
orthography_out_fieldname, transcription_out_fieldname = fieldnames_out

In [6]:
import json

In [7]:
# Parse the raw lexicon straight from the UTF-8 file handle.
with open(lexiconDataInFilename, encoding='utf-8') as data_file:
    lexicon_in = json.load(data_file)

# Quick look at what was loaded: record count, available fields, one sample record.
len(lexicon_in)
lexicon_in[0].keys()
lexicon_in[0]

216062

dict_keys(['hasAdjacentPauseOrDisfluency', 'phonemes', 'phones', 'hasAdjacentFilledPause', 'POS', 'duration', 'hasSyllabicSegsInPhones', 'end', 'hasClitic', 'preceding_4_wordforms', 'preceding_wordforms', 'syllables', 'beg', 'misalgined', 'orthographic_wordform', 'speaker_age', 'speech_rate', 'speaker_interviewer', 'speaker_sex', 'speaker_name', 'track_name', 'dist_from_left_edge', 'dist_from_right_edge'])

{'hasAdjacentPauseOrDisfluency': False,
 'phonemes': 'aɪ',
 'phones': 'aɪ',
 'hasAdjacentFilledPause': True,
 'POS': 'PRP',
 'duration': 0.3510480000000058,
 'hasSyllabicSegsInPhones': True,
 'end': 40.735815,
 'hasClitic': False,
 'preceding_4_wordforms': [],
 'preceding_wordforms': [],
 'syllables': 1,
 'beg': 40.384767,
 'misalgined': False,
 'orthographic_wordform': 'i',
 'speaker_age': 'o',
 'speech_rate': 5.078163086223139,
 'speaker_interviewer': 'f',
 'speaker_sex': 'f',
 'speaker_name': 's05',
 'track_name': 's0501a',
 'dist_from_left_edge': 0,
 'dist_from_right_edge': 4}

# Project down to orthography and transcription columns

In [8]:
from collections import OrderedDict

In [9]:
def project(raw_row):
    """Reduce one raw lexicon record to its orthography/transcription pair.

    Reads ``raw_row['orthographic_wordform']`` and ``raw_row['phonemes']`` and
    returns them in an ``OrderedDict`` keyed by the output column names
    (orthography first, transcription second). Raises ``KeyError`` if either
    source key is missing from ``raw_row``.
    """
    return OrderedDict([
        (orthography_out_fieldname, raw_row['orthographic_wordform']),
        (transcription_out_fieldname, raw_row['phonemes']),
    ])

In [10]:
# Project every raw record; this yields one output row per input record.
lexicon_out = [project(raw_row) for raw_row in lexicon_in]
len(lexicon_out)
lexicon_out[0]

216062

OrderedDict([('Orthographic_Wordform', 'i'), ('Transcription', 'aɪ')])

In [11]:
# NOTE(review): `od` is not referenced by any other cell visible in this notebook;
# it looks like leftover scratch work — confirm and consider removing this cell.
od = lexicon_out[0]

In [12]:
# Many records share the same wordform/transcription, so collapse the projected
# rows to the set of distinct (orthography, transcription) pairs. The columns are
# looked up by name rather than relying on the value order of each row.
unique_pairs = {(row[orthography_out_fieldname], row[transcription_out_fieldname])
                for row in lexicon_out}
len(unique_pairs)

# Sort on the *whole* pair, not just the orthography: set iteration order for
# strings varies between kernel runs (hash randomisation), so sorting on the
# wordform alone would leave rows that share a wordform in a run-dependent order
# and make the exported file non-reproducible.
# Assumes both fields are strings, as in the sample record shown above.
lexicon_out = [OrderedDict(zip(fieldnames_out, pair))
               for pair in sorted(unique_pairs)]
lexicon_out[:10]

7998

[OrderedDict([('Orthographic_Wordform', "'em"), ('Transcription', 'ɛ.m')]),
 OrderedDict([('Orthographic_Wordform', 'Ellimen'),
              ('Transcription', 'ɛ.l.ʌ.m.ɛ.n')]),
 OrderedDict([('Orthographic_Wordform', 'Ellison'),
              ('Transcription', 'ɛ.l.ɪ.s.ʌ.n')]),
 OrderedDict([('Orthographic_Wordform', 'Ralph'),
              ('Transcription', 'ɹ.æ.l.f')]),
 OrderedDict([('Orthographic_Wordform', 'a'), ('Transcription', 'eɪ')]),
 OrderedDict([('Orthographic_Wordform', "a's"), ('Transcription', 'eɪ.z.z')]),
 OrderedDict([('Orthographic_Wordform', "aaron's"),
              ('Transcription', 'eɪ.ɹ.ʌ.n.z')]),
 OrderedDict([('Orthographic_Wordform', 'abandoned'),
              ('Transcription', 'ʌ.b.æ.n.d.ʌ.n.d')]),
 OrderedDict([('Orthographic_Wordform', 'abercrombie'),
              ('Transcription', 'æ.b.ɚ.k.ɹ.ɑ.m.b.i')]),
 OrderedDict([('Orthographic_Wordform', 'abhorrent'),
              ('Transcription', 'ʌ.b.h.oʊ.ɹ.ʌ.n.t')])]

# Export

In [13]:
import csv

In [14]:
# The transcriptions contain IPA characters, so write the file explicitly as UTF-8
# (the same encoding used to read the input) instead of relying on the platform's
# default encoding, which may be unable to encode them.
with open(lexiconDataOutFilename, 'w', newline='\n', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames_out, delimiter='\t')

    # Header row first, then one tab-separated row per lexicon entry.
    writer.writeheader()
    writer.writerows(lexicon_out)

In [15]:
# Show where the notebook ran and list that directory to confirm the export is there.
getcwd()
listdir()

'/mnt/cube/home/AD/emeinhar/wr/LTR_Buckeye'

['LTR_Buckeye.tsv',
 'buckeye_words_analysis_relation.json',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - Buckeye.ipynb']

In [16]:
# Preview the first 20 lines of the exported file with line numbers.
# `head` runs first so the upstream command is never cut off mid-write (the old
# `cat | head` order printed "cat: write error: Broken pipe"), and the filename
# comes from the parameter instead of being repeated as a literal.
!head -n 20 {lexiconDataOutFilename} | cat -n

     1	Orthographic_Wordform	Transcription
     2	'em	ɛ.m
     3	Ellimen	ɛ.l.ʌ.m.ɛ.n
     4	Ellison	ɛ.l.ɪ.s.ʌ.n
     5	Ralph	ɹ.æ.l.f
     6	a	eɪ
     7	a's	eɪ.z.z
     8	aaron's	eɪ.ɹ.ʌ.n.z
     9	abandoned	ʌ.b.æ.n.d.ʌ.n.d
    10	abercrombie	æ.b.ɚ.k.ɹ.ɑ.m.b.i
    11	abhorrent	ʌ.b.h.oʊ.ɹ.ʌ.n.t
    12	abide	ʌ.b.aɪ.d
    13	ability	ʌ.b.ɪ.l.ʌ.t.i
    14	able	eɪ.b.l̩
    15	abortion	ʌ.b.oʊ.ɹ.ʃ.ʌ.n
    16	abortions	ʌ.b.oʊ.ɹ.ʃ.ʌ.n.z
    17	about	ʌ.b.aʊ.t
    18	above	ʌ.b.ʌ.v
    19	abraham	eɪ.b.ɹ.ʌ.h.æ.m
    20	abroad	ʌ.b.ɹ.ɑ.d
cat: write error: Broken pipe
