In [8]:
# Display the result of *every* bare expression in a cell, not just the last one.
# Later cells rely on this to show several values (e.g. a length and a sample) at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Calculate-triphones-in-each-model" data-toc-modified-id="Calculate-triphones-in-each-model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Calculate triphones in each model</a></span></li><li><span><a href="#Filter-channel-models-against-lexicon" data-toc-modified-id="Filter-channel-models-against-lexicon-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Filter channel models against lexicon</a></span></li><li><span><a href="#Export-new-channel-model" data-toc-modified-id="Export-new-channel-model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Export new channel model</a></span></li></ul></div>

# Overview

Given
 - a transcribed lexicon relation filepath $l$
 - a triphone channel model filepath $c$
 - an output filepath $o$
 
this notebook produces a new channel model defined only with the stimuli triphones that can be found in $l$ and writes it to $o$. Note that $l$ must not contain any triphones not present in the stimuli triphones of $c$.

Later addendum: this notebook also looks for 'preview' and 'postview' diphone channel distributions in the same directory as $c$, assuming they are named `p3Y1X01.json` and `p6Y0X01.json` respectively. It applies the same lexicon-based filtering to them and writes the results to the same directory as $o$, naming each output by taking the part of $o$'s filename that precedes `pY1X0X1X2.json` and appending the corresponding diphone model filename.

## Requirements

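There are no salient third-party package requirements. The notebook does import the project-local modules `probdist`, `boilerplate`, and `string_utils`, which must be on the Python import path.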

## Usage

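**FIXME:** full usage instructions have not been written yet. In the meantime: set `l`, `c`, and `o` in the Parameters section below, then run all cells in order.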

# Parameters

In [3]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [6]:
# parameters
# All three are empty placeholders here and must be set before the rest of the
# notebook can run; the commented-out lines show example values.

# l: filepath of the transcribed lexicon relation (read below as a tab-separated file)
l = ''
# l = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv'

# c: filepath of the triphone channel model to be filtered
c = ''
# c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/pY1X0X1X2.json'

# o: filepath the filtered triphone channel model is written to
o = ''
# o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

In [9]:
# Directory that receives the filtered channel models.
# When `o` has no directory component, path.dirname(o) is '' -- fall back to the
# current directory so makedirs() / listdir() are never handed an empty path
# (both raise FileNotFoundError on '').
output_dir = path.dirname(o) or '.'
if not path.exists(output_dir):
    print("Creating output path '{0}'".format(output_dir))
    # exist_ok tolerates the directory being created between the check and this call.
    makedirs(output_dir, exist_ok=True)

In [7]:
# Preview diphone model: expected alongside `c`; its filtered copy goes alongside `o`,
# named with whatever precedes 'pY1X0X1X2.json' in o's filename as a prefix.
e = path.join(path.dirname(c), 'p3Y1X01.json')
e_o = path.join(path.dirname(o), path.basename(o).partition('pY1X0X1X2.json')[0] + 'p3Y1X01.json')
e
e_o


# Postview diphone model: same conventions as the preview model.
s = path.join(path.dirname(c), 'p6Y0X01.json')
s_o = path.join(path.dirname(o), path.basename(o).partition('pY1X0X1X2.json')[0] + 'p6Y0X01.json')
s
s_o

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/p3Y1X01.json'

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json'

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/p6Y0X01.json'

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json'

# Imports / load data

In [10]:
import csv
import json

from probdist import *
from boilerplate import *
from string_utils import *

In [11]:
# Read the lexicon relation as a list of row dicts (one per wordform).
# - encoding='utf-8': the transcriptions are IPA text, so do not depend on the
#   platform's default encoding.
# - newline='': recommended by the csv module so it can handle line endings itself.
# - QUOTE_NONE: apostrophes/quotes in wordforms (e.g. 'em, a's) are literal characters.
with open(l, encoding='utf-8', newline='') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    lexicon = list(my_reader)
# Show the first few rows as a sanity check.
lexicon[:5]

[OrderedDict([('Orthographic_Wordform', "'em"), ('Transcription', 'ɛ.m')]),
 OrderedDict([('Orthographic_Wordform', 'a'), ('Transcription', 'eɪ')]),
 OrderedDict([('Orthographic_Wordform', "a's"), ('Transcription', 'eɪ.z.z')]),
 OrderedDict([('Orthographic_Wordform', "aaron's"),
              ('Transcription', 'eɪ.ɹ.ʌ.n.z')]),
 OrderedDict([('Orthographic_Wordform', 'abandoned'),
              ('Transcription', 'ʌ.b.æ.n.d.ʌ.n.d')])]

In [22]:
# Load the triphone channel model from the path given by parameter `c`.
center_channel_model = importProbDist(c)

In [23]:
# Load the preview diphone channel model from the path `e` derived from `c`.
preview_channel_model = importProbDist(e)

In [24]:
# Load the postview diphone channel model from the path `s` derived from `c`.
postview_channel_model = importProbDist(s)

# Calculate triphones in each model

In [25]:
# Number of entries in the triphone channel model (displayed as cell output).
len(center_channel_model)
# The model's keys are the stimuli triphones it is defined over.
stimuli_triphones = set(center_channel_model.keys())

46860

In [26]:
# Number of entries in the preview diphone channel model (displayed as cell output).
len(preview_channel_model)
# The model's keys are the diphones it is defined over.
preview_diphones = set(preview_channel_model.keys())

1323

In [27]:
# Number of entries in the postview diphone channel model (displayed as cell output).
len(postview_channel_model)
# The model's keys are the diphones it is defined over.
postview_diphones = set(postview_channel_model.keys())

1323

In [29]:
# The preview and postview models must be defined over exactly the same diphones,
# so that a single set can stand for both below. On failure, report the diphones
# that appear in only one of the two models.
assert preview_diphones == postview_diphones, f'Preview and postview channel models are defined over different diphones:\n{preview_diphones ^ postview_diphones}'

stimuli_diphones = preview_diphones

In [18]:
def padInputSequenceWithBoundaries(inputSeq):
    """Return `inputSeq` (a dotted string) with the word edge symbols added.

    `leftEdge` is prepended and `rightEdge` is appended to the sequence's
    segments, and the result is converted back to a dotted string.
    """
    segments = dottedStringToTuple(inputSeq)
    padded = (leftEdge, *segments, rightEdge)
    return tupleToDottedString(padded)

def trimBoundariesFromSequence(seq):
    """Return `seq` (a dotted string) without its word edge symbols.

    A leading `leftEdge` and/or a trailing `rightEdge` is removed if present;
    a sequence with neither is returned unchanged (after a round trip through
    dottedStringToTuple / tupleToDottedString).
    """
    temp = list(dottedStringToTuple(seq))
    # Guard both checks against an empty list: a sequence consisting only of the
    # left edge symbol is empty after the first trim, and indexing it would raise
    # IndexError.
    if temp and temp[0] == leftEdge:
        temp = temp[1:]
    if temp and temp[-1] == rightEdge:
        temp = temp[:-1]
    # NOTE(review): assumes tupleToDottedString accepts an empty tuple -- confirm in string_utils.
    return tupleToDottedString(tuple(temp))

def rowToTriphones(row):
    """Return the 3-factors (triphones) of a lexicon row's transcription.

    The row's 'Transcription' is padded with the word edge symbols before
    being handed to dsTo3factors, so edge-adjacent triphones are included.
    """
    padded_transcription = padInputSequenceWithBoundaries(row['Transcription'])
    return dsTo3factors(padded_transcription)

def rowToDiphones(row):
    """Return the 2-factors (diphones) of a lexicon row's transcription.

    The row's 'Transcription' is padded with the word edge symbols before
    being handed to dsTo2factors, so edge-adjacent diphones are included.
    """
    padded_transcription = padInputSequenceWithBoundaries(row['Transcription'])
    return dsTo2factors(padded_transcription)

In [15]:
# Spot check: the triphones of the first ten lexicon rows.
[rowToTriphones(entry) for entry in lexicon[:10]]

[{'ɛ.m.⋉', '⋊.ɛ.m'},
 {'⋊.eɪ.⋉'},
 {'eɪ.z.z', 'z.z.⋉', '⋊.eɪ.z'},
 {'eɪ.ɹ.ʌ', 'n.z.⋉', 'ɹ.ʌ.n', 'ʌ.n.z', '⋊.eɪ.ɹ'},
 {'b.æ.n', 'd.ʌ.n', 'n.d.ʌ', 'n.d.⋉', 'æ.n.d', 'ʌ.b.æ', 'ʌ.n.d', '⋊.ʌ.b'},
 {'b.i.⋉',
  'b.ɚ.k',
  'k.ɹ.ɑ',
  'm.b.i',
  'æ.b.ɚ',
  'ɑ.m.b',
  'ɚ.k.ɹ',
  'ɹ.ɑ.m',
  '⋊.æ.b'},
 {'b.h.oʊ', 'h.oʊ.ɹ', 'n.t.⋉', 'oʊ.ɹ.ʌ', 'ɹ.ʌ.n', 'ʌ.b.h', 'ʌ.n.t', '⋊.ʌ.b'},
 {'aɪ.d.⋉', 'b.aɪ.d', 'ʌ.b.aɪ', '⋊.ʌ.b'},
 {'b.ɪ.l', 'l.ʌ.t', 't.i.⋉', 'ɪ.l.ʌ', 'ʌ.b.ɪ', 'ʌ.t.i', '⋊.ʌ.b'},
 {'b.oʊ.ɹ', 'oʊ.ɹ.ʃ', 'ɹ.ʃ.ʌ', 'ʃ.ʌ.n', 'ʌ.b.oʊ', 'ʌ.n.⋉', '⋊.ʌ.b'}]

In [16]:
# Pool the per-row triphones into one collection covering the whole lexicon.
# NOTE(review): `union` comes from a star import; presumably it returns the set union of its arguments -- confirm.
lexicon_triphs = union(map(rowToTriphones, lexicon))
# Display how many triphones the lexicon uses, plus a sample of ten.
len(lexicon_triphs)
list(lexicon_triphs)[:10]

5760

['æ.s.i',
 't.s.ʌ',
 '⋊.k.j',
 's.ʌ.t',
 'ɹ.ɪ.l',
 'p.ɚ.m',
 'p.i.t',
 'eɪ.n.dʒ',
 'b.i.p',
 'ɛ.v.ɚ']

In [17]:
# Every triphone used by the lexicon must be one the channel model is defined over;
# collect any that are not and fail loudly, listing them.
illegal_lexicon_triphs = [triph for triph in lexicon_triphs if triph not in stimuli_triphones]
assert not illegal_lexicon_triphs, f'Found triphones in \n\t{l}\nnot defined in\n\t{c}\n{illegal_lexicon_triphs}'

In [19]:
# Spot check: the diphones of the first ten lexicon rows.
[rowToDiphones(entry) for entry in lexicon[:10]]

[{'m.⋉', 'ɛ.m', '⋊.ɛ'},
 {'eɪ.⋉', '⋊.eɪ'},
 {'eɪ.z', 'z.z', 'z.⋉', '⋊.eɪ'},
 {'eɪ.ɹ', 'n.z', 'z.⋉', 'ɹ.ʌ', 'ʌ.n', '⋊.eɪ'},
 {'b.æ', 'd.ʌ', 'd.⋉', 'n.d', 'æ.n', 'ʌ.b', 'ʌ.n', '⋊.ʌ'},
 {'b.i', 'b.ɚ', 'i.⋉', 'k.ɹ', 'm.b', 'æ.b', 'ɑ.m', 'ɚ.k', 'ɹ.ɑ', '⋊.æ'},
 {'b.h', 'h.oʊ', 'n.t', 'oʊ.ɹ', 't.⋉', 'ɹ.ʌ', 'ʌ.b', 'ʌ.n', '⋊.ʌ'},
 {'aɪ.d', 'b.aɪ', 'd.⋉', 'ʌ.b', '⋊.ʌ'},
 {'b.ɪ', 'i.⋉', 'l.ʌ', 't.i', 'ɪ.l', 'ʌ.b', 'ʌ.t', '⋊.ʌ'},
 {'b.oʊ', 'n.⋉', 'oʊ.ɹ', 'ɹ.ʃ', 'ʃ.ʌ', 'ʌ.b', 'ʌ.n', '⋊.ʌ'}]

In [20]:
# Pool the per-row diphones into one collection covering the whole lexicon.
# NOTE(review): `union` comes from a star import; presumably it returns the set union of its arguments -- confirm.
lexicon_diphs = union(map(rowToDiphones, lexicon))
# Display how many diphones the lexicon uses, plus a sample of ten.
len(lexicon_diphs)
list(lexicon_diphs)[:10]

904

['⋊.k', 'f.æ', 'u.b', 'l.s', 'n.u', 'v.ɛ', 'oʊ.h', 'f.aɪ', 'm.eɪ', 'aɪ.f']

Unlike with the triphones, there *should* be exactly two classes of diphones in the lexicon that are not defined in the preview or postview distributions: diphones where one of the segments is a word edge symbol.

In [35]:
# Diphones used by the lexicon that the preview/postview models are not defined over.
illegal_lexicon_diphs = [diph for diph in lexicon_diphs if diph not in stimuli_diphones]

print(illegal_lexicon_diphs)

# Diphones containing a word edge symbol are expected to be missing from the
# diphone models; anything else that is missing is an error. (A bare assert
# expressing the same condition used to precede this one, which meant the
# informative message below could never be shown.)
remaining_illegal_lexicon_diphs = [diph for diph in illegal_lexicon_diphs
                                   if not (leftEdge in diph or rightEdge in diph)]

# The diphone models live at `e` and `s` (not `c`), so report those paths.
assert len(remaining_illegal_lexicon_diphs) == 0, f'Found diphones in \n\t{l}\nnot defined in\n\t{e}\n\t{s}\n{remaining_illegal_lexicon_diphs}'

['⋊.k', 'u.⋉', 'g.⋉', 'dʒ.⋉', '⋊.l', '⋊.eɪ', 'b.⋉', 'm.⋉', 'tʃ.⋉', '⋊.h', 'ʃ.⋉', 'ɑ.⋉', '⋊.ɛ', '⋊.oʊ', 'k.⋉', '⋊.æ', 'aɪ.⋉', '⋊.ʃ', '⋊.v', '⋊.w', '⋊.ɪ', '⋊.g', 'ŋ.⋉', '⋊.z', 'ʒ.⋉', '⋊.t', 't.⋉', 'æ.⋉', 'ɔɪ.⋉', 'f.⋉', 'z.⋉', 'eɪ.⋉', '⋊.tʃ', '⋊.θ', '⋊.ɹ', 'd.⋉', 's.⋉', '⋊.d', 'l.⋉', 'oʊ.⋉', 'ɚ.⋉', 'i.⋉', '⋊.s', '⋊.p', '⋊.f', '⋊.n', '⋊.aʊ', 'aʊ.⋉', 'n.⋉', '⋊.i', '⋊.aɪ', 'ð.⋉', 'ɛ.⋉', 'θ.⋉', 'ɹ.⋉', '⋊.ð', '⋊.m', 'v.⋉', 'p.⋉', '⋊.ɑ', 'ʌ.⋉', '⋊.dʒ', '⋊.j', '⋊.ɔɪ', '⋊.ɚ', '⋊.b', '⋊.ʌ']


# Filter channel models against lexicon

In [17]:
# def existsWordformWithTriphone(triph_ds):
#     return triph_ds in lexicon_triphs

In [36]:
# Restrict the triphone channel model to the triphones used by the lexicon.
# NOTE(review): project_dict / condDistsAsProbDists come from star imports; presumably the former
# keeps only the given keys and the latter wraps each conditional distribution -- confirm.
projected_center_channel_model = condDistsAsProbDists(project_dict(center_channel_model, lexicon_triphs))

In [37]:
# Report how many stimuli triphones survive the projection onto the lexicon.
n_old = len(stimuli_triphones)
n_new = len(projected_center_channel_model.keys())
print('# stimuli triphones in old channel model: {0}'.format(n_old))
print('# stimuli triphones in new channel model: {0}'.format(n_new))
print('|Loss| = {0}'.format(n_old - n_new))
print('% loss = {0:3}'.format((n_old - n_new) / n_old * 100.0))

# stimuli triphones in old channel model: 46860
# stimuli triphones in new channel model: 5760
|Loss| = 41100
% loss = 87.70806658130603


In [38]:
# Restrict the preview diphone channel model to the diphones used by the lexicon.
# NOTE(review): project_dict / condDistsAsProbDists come from star imports; presumably the former
# keeps only the given keys that the model defines and the latter wraps each conditional distribution -- confirm.
projected_preview_channel_model = condDistsAsProbDists(project_dict(preview_channel_model, lexicon_diphs))

In [40]:
# Report how many stimuli diphones survive the projection of the preview model.
n_old = len(stimuli_diphones)
n_new = len(projected_preview_channel_model.keys())
print('# stimuli diphones in old channel model: {0}'.format(n_old))
print('# stimuli diphones in new channel model: {0}'.format(n_new))
print('|Loss| = {0}'.format(n_old - n_new))
print('% loss = {0:3}'.format((n_old - n_new) / n_old * 100.0))

# stimuli diphones in old channel model: 1323
# stimuli diphones in new channel model: 837
|Loss| = 486
% loss = 36.734693877551024


In [39]:
# Restrict the postview diphone channel model to the diphones used by the lexicon.
# NOTE(review): project_dict / condDistsAsProbDists come from star imports; presumably the former
# keeps only the given keys that the model defines and the latter wraps each conditional distribution -- confirm.
projected_postview_channel_model = condDistsAsProbDists(project_dict(postview_channel_model, lexicon_diphs))

In [41]:
# Report how many stimuli diphones survive the projection of the postview model.
n_old = len(stimuli_diphones)
n_new = len(projected_postview_channel_model.keys())
print('# stimuli diphones in old channel model: {0}'.format(n_old))
print('# stimuli diphones in new channel model: {0}'.format(n_new))
print('|Loss| = {0}'.format(n_old - n_new))
print('% loss = {0:3}'.format((n_old - n_new) / n_old * 100.0))

# stimuli diphones in old channel model: 1323
# stimuli diphones in new channel model: 837
|Loss| = 486
% loss = 36.734693877551024


# Export new channel model

In [35]:
# Write the filtered triphone channel model to the output path `o`.
exportProbDist(o, condProbDistAsDicts_for_export(projected_center_channel_model))

In [None]:
# Display the path the filtered triphone model was written to.
o

In [None]:
# Write the filtered preview diphone channel model to the derived path `e_o`.
exportProbDist(e_o, condProbDistAsDicts_for_export(projected_preview_channel_model))

In [None]:
# Display the path the filtered preview diphone model was written to.
e_o

In [None]:
# Write the filtered postview diphone channel model to the derived path `s_o`.
exportProbDist(s_o, condProbDistAsDicts_for_export(projected_postview_channel_model))

In [None]:
# Display the path the filtered postview diphone model was written to.
s_o

In [None]:
# List the output directory's contents to confirm the exported files are present.
listdir(output_dir)