In [7]:
# Display the value of every top-level expression statement in a cell,
# not just the last one (IPython's default is "last_expr").
# Cells below that have a bare expression in the middle (e.g. `len(...)`)
# rely on this setting to show that value.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Calculate-triphones-in-each-model" data-toc-modified-id="Calculate-triphones-in-each-model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Calculate triphones in each model</a></span></li><li><span><a href="#Filter-channel-model-against-lexicon" data-toc-modified-id="Filter-channel-model-against-lexicon-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Filter channel model against lexicon</a></span></li><li><span><a href="#Export-new-channel-model" data-toc-modified-id="Export-new-channel-model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Export new channel model</a></span></li></ul></div>

# Overview

Given
 - a transcribed lexicon relation filepath $l$
 - a triphone channel model filepath $c$
 - an output filepath $o$
 
this notebook produces a new channel model restricted to the stimuli triphones that occur in $l$ and writes it to $o$. Every triphone in $l$ must also be a stimulus triphone of $c$; otherwise the notebook stops with an assertion error that lists the offending triphones.

## Requirements

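There are no salient third-party package requirements. The notebook does import the project-local modules `probdist`, `boilerplate`, and `string_utils`, which must be importable from the directory the notebook runs in.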

## Usage

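Set the three filepaths `l`, `c`, and `o` in the [Parameters](#Parameters) section, then run all cells from top to bottom.

**FIXME:** expand this section.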

# Parameters

In [1]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [37]:
# parameters

# l: filepath of the transcribed lexicon relation. It is read below as a
#    tab-separated file and must have a 'Transcription' column.
l = ''
# e.g. l = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv'

# c: filepath of the triphone channel model to be filtered (loaded with importProbDist).
c = ''
# e.g. c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/pY1X0X1X2.json'

# o: filepath the filtered channel model is written to (with exportProbDist).
o = ''
# e.g. o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

In [38]:
# Make sure the directory that will receive the output file exists.

# Fail early, with a readable message, if the output parameter was never set.
if not o:
    raise ValueError("Output filepath 'o' is empty; set it in the Parameters section.")

# path.dirname() returns '' when `o` is a bare filename. makedirs('') and
# listdir('') both raise FileNotFoundError, so fall back to the current directory.
output_dir = path.dirname(o) or '.'
if not path.exists(output_dir):
    print("Creating output path '{0}'".format(output_dir))
    # exist_ok=True: harmless if the directory appears between the check and the call.
    makedirs(output_dir, exist_ok=True)

# Imports / load data

In [3]:
import csv
import json

from probdist import *
from boilerplate import *
from string_utils import *

In [23]:
# Load the lexicon relation at `l` as a list with one dict per row.
# - encoding='utf-8': the transcriptions are IPA text, so do not depend on the
#   platform's default encoding (assumes the file is UTF-8 -- confirm if it
#   was produced elsewhere).
# - newline='': recommended by the csv module so the reader handles line
#   endings itself.
# - QUOTE_NONE: fields are taken literally; apostrophes/quotes in wordforms
#   (e.g. "aaron's") are ordinary characters.
with open(l, encoding='utf-8', newline='') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    lexicon = list(my_reader)
lexicon[:5]

[OrderedDict([('Orthographic_Wordform', "aaron's"),
              ('Transcription', 'eɪ.ɹ.ʌ.n.z')]),
 OrderedDict([('Orthographic_Wordform', 'abandoned'),
              ('Transcription', 'ʌ.b.æ.n.d.ʌ.n.d')]),
 OrderedDict([('Orthographic_Wordform', 'abercrombie'),
              ('Transcription', 'æ.b.ɚ.k.ɹ.ɑ.m.b.i')]),
 OrderedDict([('Orthographic_Wordform', 'abhorrent'),
              ('Transcription', 'ʌ.b.h.oʊ.ɹ.ʌ.n.t')]),
 OrderedDict([('Orthographic_Wordform', 'abide'),
              ('Transcription', 'ʌ.b.aɪ.d')])]

In [8]:
# Load the triphone channel model from filepath `c` using the project helper
# importProbDist. Later cells use the result like a mapping (len(), .keys()).
channel_model = importProbDist(c)

# Calculate triphones in each model

In [9]:
# Number of entries in the channel model (shown because all expression values are displayed).
len(channel_model)
# The channel model's keys, as a set for fast membership tests further down.
stimuli_triphones = set(channel_model.keys())

46860

In [10]:
def padInputSequenceWithBoundaries(inputSeq):
    """Return `inputSeq` with a word-edge symbol added at each end.

    `inputSeq` is a dotted string. It is split with dottedStringToTuple,
    `leftEdge` is placed before the first symbol and `rightEdge` after the
    last, and the result is re-joined with tupleToDottedString.
    """
    symbols = dottedStringToTuple(inputSeq)
    padded = (leftEdge, *symbols, rightEdge)
    return tupleToDottedString(padded)

def trimBoundariesFromSequence(seq):
    """Return dotted string `seq` without a leading `leftEdge` / trailing `rightEdge`.

    Each edge symbol is removed only if it is actually present at that end;
    a sequence without edge symbols comes back unchanged.

    Sequences that are empty, or that become empty once the left edge is
    removed (e.g. `seq` is just `leftEdge`), no longer raise IndexError: the
    emptiness checks below skip the indexing in that case. The result is then
    whatever tupleToDottedString returns for an empty tuple.
    """
    symbols = list(dottedStringToTuple(seq))
    # Guard with `symbols and ...` so that indexing never happens on an empty list.
    if symbols and symbols[0] == leftEdge:
        symbols = symbols[1:]
    if symbols and symbols[-1] == rightEdge:
        symbols = symbols[:-1]
    return tupleToDottedString(tuple(symbols))

def rowToTriphones(row):
    """Return the 3-factors of a lexicon row's edge-padded transcription.

    `row` must have a 'Transcription' entry holding a dotted string. The
    return value is whatever dsTo3factors produces; in this notebook's
    example output that is a set of dotted triphone strings.
    """
    padded_transcription = padInputSequenceWithBoundaries(row['Transcription'])
    return dsTo3factors(padded_transcription)

In [24]:
# Sanity check: triphones of the first ten lexicon entries.
[rowToTriphones(entry) for entry in lexicon[:10]]

[{'eɪ.ɹ.ʌ', 'n.z.⋉', 'ɹ.ʌ.n', 'ʌ.n.z', '⋊.eɪ.ɹ'},
 {'b.æ.n', 'd.ʌ.n', 'n.d.ʌ', 'n.d.⋉', 'æ.n.d', 'ʌ.b.æ', 'ʌ.n.d', '⋊.ʌ.b'},
 {'b.i.⋉',
  'b.ɚ.k',
  'k.ɹ.ɑ',
  'm.b.i',
  'æ.b.ɚ',
  'ɑ.m.b',
  'ɚ.k.ɹ',
  'ɹ.ɑ.m',
  '⋊.æ.b'},
 {'b.h.oʊ', 'h.oʊ.ɹ', 'n.t.⋉', 'oʊ.ɹ.ʌ', 'ɹ.ʌ.n', 'ʌ.b.h', 'ʌ.n.t', '⋊.ʌ.b'},
 {'aɪ.d.⋉', 'b.aɪ.d', 'ʌ.b.aɪ', '⋊.ʌ.b'},
 {'b.ɪ.l', 'l.ʌ.t', 't.i.⋉', 'ɪ.l.ʌ', 'ʌ.b.ɪ', 'ʌ.t.i', '⋊.ʌ.b'},
 {'b.oʊ.ɹ', 'oʊ.ɹ.ʃ', 'ɹ.ʃ.ʌ', 'ʃ.ʌ.n', 'ʌ.b.oʊ', 'ʌ.n.⋉', '⋊.ʌ.b'},
 {'b.oʊ.ɹ', 'n.z.⋉', 'oʊ.ɹ.ʃ', 'ɹ.ʃ.ʌ', 'ʃ.ʌ.n', 'ʌ.b.oʊ', 'ʌ.n.z', '⋊.ʌ.b'},
 {'aʊ.t.⋉', 'b.aʊ.t', 'ʌ.b.aʊ', '⋊.ʌ.b'},
 {'b.ʌ.v', 'ʌ.b.ʌ', 'ʌ.v.⋉', '⋊.ʌ.b'}]

In [25]:
# Pool the triphones of every lexicon entry into one collection using the
# project helper `union`.
lexicon_triphs = union(map(rowToTriphones, lexicon))
# How many distinct triphones the lexicon uses.
len(lexicon_triphs)
# A small sample for eyeballing.
list(lexicon_triphs)[:10]

5756

['ŋ.k.⋉',
 't.ʌ.f',
 'ɪ.s.ɛ',
 's.ɛ.t',
 'l.oʊ.ɚ',
 '⋊.k.l',
 'æ.θ.w',
 'ʌ.v.ɔɪ',
 't.ɚ.f',
 'ɛ.t.z']

In [26]:
# Precondition check: every triphone that occurs in the lexicon must also be a
# stimulus triphone of the channel model. Anything else is collected and
# reported in the assertion message.
illegal_lexicon_triphs = [triph for triph in lexicon_triphs
                          if triph not in stimuli_triphones]
assert not illegal_lexicon_triphs, f'Found triphones in \n\t{l}\nnot defined in\n\t{c}\n{illegal_lexicon_triphs}'

# Filter channel model against lexicon

In [17]:
# def existsWordformWithTriphone(triph_ds):
#     return triph_ds in lexicon_triphs

In [28]:
# Build the filtered channel model: project_dict is given the full channel
# model and the lexicon's triphones (presumably it keeps only those keys --
# confirm against its definition), and the result is passed through
# condDistsAsProbDists.
projected_channel_model = condDistsAsProbDists(project_dict(channel_model, lexicon_triphs))

5756

In [34]:
# Report how many stimuli triphones were dropped by filtering against the lexicon.
n_old_triphones = len(stimuli_triphones)
n_new_triphones = len(projected_channel_model.keys())
n_lost_triphones = n_old_triphones - n_new_triphones

print('# stimuli triphones in old channel model: {0}'.format(n_old_triphones))
print('# stimuli triphones in new channel model: {0}'.format(n_new_triphones))
print('|Loss| = {0}'.format(n_lost_triphones))
print('% loss = {0:3}'.format(n_lost_triphones / n_old_triphones * 100.0))

# stimuli triphones in old channel model: 46860
# stimuli triphones in new channel model: 5756
|Loss| = 41104
% loss = 87.7166026461801


# Export new channel model

In [35]:
# Write the filtered channel model to filepath `o`, after converting it with
# condProbDistAsDicts_for_export.
exportProbDist(o, condProbDistAsDicts_for_export(projected_channel_model))

In [None]:
# Show the filepath the new channel model was written to.
o

In [None]:
# List the output directory to confirm the exported file is there.
# output_dir is '' when `o` is a bare filename, and listdir('') raises
# FileNotFoundError, so fall back to the current directory in that case.
listdir(output_dir or '.')