In [1]:
# Display the result of every top-level expression in a cell, not just the
# last one. Later cells rely on this to show several values (e.g. a length
# followed by a sample row) from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Import-data" data-toc-modified-id="Import-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Project-down-to-orthography-and-transcription-columns" data-toc-modified-id="Project-down-to-orthography-and-transcription-columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Project down to orthography and transcription columns</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Each notebook of this kind produces a `.tsv` file, derived from an arbitrary source, with two columns:
 - orthographic wordforms
 - transcribed wordforms
 
i.e. to define a relation between orthographic wordforms and transcribed wordforms.

The transcribed lexicon relation file can then be used somewhat more uniformly by downstream processing notebooks.

# Parameters

In [2]:
from os import chdir, getcwd, listdir, path

In [3]:
# List the current working directory (listdir() with no argument).
listdir()

['buckeye_words_analysis_relation.json',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - Buckeye.ipynb']

In [4]:
# Input: JSON file read in the "Import data" section.
lexiconDataInFilename = 'buckeye_words_analysis_relation.json'

# Output: tab-separated file written in the "Export" section.
lexiconDataOutFilename = 'LTR_buckeye.tsv'

# Import data

In [5]:
# Column names of the exported file; the tuple order is the column order
# passed to the csv writer.
orthography_out_fieldname = 'Orthographic_Wordform'
transcription_out_fieldname = 'Transcription'
fieldnames_out = (orthography_out_fieldname, transcription_out_fieldname)

In [6]:
import json

In [9]:
# Parse the whole input file as JSON straight from the file handle.
with open(lexiconDataInFilename, encoding='utf-8') as json_file:
    lexicon_in = json.load(json_file)

# Quick look at what was loaded: record count, field names, first record.
len(lexicon_in)
lexicon_in[0].keys()
lexicon_in[0]

216062

dict_keys(['hasAdjacentPauseOrDisfluency', 'phonemes', 'phones', 'hasAdjacentFilledPause', 'POS', 'duration', 'hasSyllabicSegsInPhones', 'end', 'hasClitic', 'preceding_4_wordforms', 'preceding_wordforms', 'syllables', 'beg', 'misalgined', 'orthographic_wordform', 'speaker_age', 'speech_rate', 'speaker_interviewer', 'speaker_sex', 'speaker_name', 'track_name', 'dist_from_left_edge', 'dist_from_right_edge'])

{'hasAdjacentPauseOrDisfluency': False,
 'phonemes': 'aɪ',
 'phones': 'aɪ',
 'hasAdjacentFilledPause': True,
 'POS': 'PRP',
 'duration': 0.3510480000000058,
 'hasSyllabicSegsInPhones': True,
 'end': 40.735815,
 'hasClitic': False,
 'preceding_4_wordforms': [],
 'preceding_wordforms': [],
 'syllables': 1,
 'beg': 40.384767,
 'misalgined': False,
 'orthographic_wordform': 'i',
 'speaker_age': 'o',
 'speech_rate': 5.078163086223139,
 'speaker_interviewer': 'f',
 'speaker_sex': 'f',
 'speaker_name': 's05',
 'track_name': 's0501a',
 'dist_from_left_edge': 0,
 'dist_from_right_edge': 4}

# Project down to orthography and transcription columns

In [10]:
from collections import OrderedDict

In [11]:
def project(raw_row):
    """Reduce one input record to the two output columns.

    Reads ``raw_row['orthographic_wordform']`` and ``raw_row['phonemes']``
    and returns an ``OrderedDict`` keyed by the output field names, with
    orthography first and transcription second.
    """
    return OrderedDict((
        (orthography_out_fieldname, raw_row['orthographic_wordform']),
        (transcription_out_fieldname, raw_row['phonemes']),
    ))

In [12]:
# Project every input record, preserving the input order.
lexicon_out = [project(raw_row) for raw_row in lexicon_in]

# Sanity check: same number of rows as the input, plus a sample row.
len(lexicon_out)
lexicon_out[0]

216062

OrderedDict([('Orthographic_Wordform', 'i'), ('Transcription', 'aɪ')])

# Export

In [14]:
import csv

In [15]:
# Write the projected lexicon as a tab-separated file with a header row.
# encoding='utf-8' is explicit because the transcriptions contain IPA
# characters; relying on the platform default encoding can fail or mis-encode
# them, and the input file is read as UTF-8 as well.
# newline='' is the form the csv module documents for files handed to a
# writer: the writer emits its own line terminators and the file object must
# not translate them.
with open(lexiconDataOutFilename, 'w', newline='', encoding='utf-8') as tsv_file:
    writer = csv.DictWriter(tsv_file, fieldnames=fieldnames_out, delimiter='\t')

    writer.writeheader()
    writer.writerows(lexicon_out)

In [16]:
# Show the working directory and its contents after the export cell has run.
getcwd()
listdir()

'/mnt/cube/home/AD/emeinhar/wr/LTR_Buckeye'

['LTR_buckeye.tsv',
 'buckeye_words_analysis_relation.json',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - Buckeye.ipynb']

In [17]:
!cat -n LTR_buckeye.tsv | head -20

     1	Orthographic_Wordform	Transcription
     2	i	aɪ
     3	uh	ʌ
     4	grew	g.ɹ.u
     5	up	ʌ.p
     6	in	ɪ.n
     7	is	ɪ.z
     8	this	ð.ɪ.s
     9	is	ɪ.z
    10	this	ð.ɪ.s
    11	because	b.ɪ.k.ʌ.z
    12	it	ɪ.t
    13	slipped	s.l.ɪ.p.t
    14	i	aɪ
    15	since	s.ɪ.n.s
    16	i	aɪ
    17	set	s.ɛ.t
    18	it	ɪ.t
    19	it's	ɪ.t.s
    20	okay	oʊ.k.eɪ
cat: write error: Broken pipe
