In [1]:
# Display the value of every top-level expression in a cell, not just the
# last one (IPython's default for this setting is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Usage" data-toc-modified-id="Usage-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Ceate-and-write-uniform-distribution-to-file" data-toc-modified-id="Ceate-and-write-uniform-distribution-to-file-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Ceate and write uniform distribution to file</a></span></li></ul></div>

# Overview

Given 
 - a filepath to a channel model $c$ (a `.json` file)
 - an output filepath prefix $o$
 
this notebook produces a uniform distribution on the stimuli triphones of the model in $c$ (only those that consist of three non-word-edge symbols) and writes it as a `.json` file to $o$.json and as a binary/pickled `numpy` array to $o$.npy, where the ordering is given by sorting the stimuli triphones (without any word-edge symbols in them) in $c$.

Note: this is roughly an analogue to the notebook for Step 3e (`Define a conditional distribution on segmental wordforms given an orthographic one`) that produces a distribution $p(W|V)$ on segmental wordforms given an orthographic one.

That means, among other things, that word-edge symbols are added to segmental triphone-words here.

**If $r$ is `'False'`, then only one word edge symbol will be appended to each side of each transcription.** If $r$ is `'True'` (or left unspecified), then one left word edge symbol will be prepended to each transcription and two right word edge symbols will be appended.

## Usage

#FIXME

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
# Parameters
# All three are given as strings; the empty strings are placeholders to be
# overridden (the commented-out lines show example values).

# c: filepath to the channel model (a `.json` file) that is loaded below.
c = ''
# c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

# o: output filepath prefix; results are written to o + '.json' and o + '.npy'.
o = ''
# o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2'

# r: '' or 'True' -> an extra right word-edge symbol is appended to each
#    padded wordform; 'False' -> no extra symbol is appended.
r = ''
# r = 'False'

In [None]:
# Normalize the string parameter `r` into a bool:
#   '' or 'True' -> True, 'False' -> False, anything else is rejected.
# A value that is already a bool is left untouched so that re-running this
# cell (after `r` has been converted) does not raise.
if isinstance(r, bool):
    pass
elif r in {'', 'True'}:
    r = True
elif r == 'False':
    r = False
else:
    raise ValueError(f"r must be one of ['', 'True', 'False'], got '{r}' instead")

In [4]:
# Directory that will hold the output files. `path.dirname` returns '' when
# `o` has no directory component, and `makedirs('')` raises, so fall back to
# the current directory in that case.
output_dir = path.dirname(o) or '.'
if not path.exists(output_dir):
    # f-string so the actual directory name is shown in the message.
    print(f"Making output path '{output_dir}'")
    # exist_ok avoids a failure if the directory appears between the check
    # above and this call.
    makedirs(output_dir, exist_ok=True)

# Imports / load data

In [5]:
from probdist import *

In [6]:
# Load the channel model from the filepath given by the parameter `c`.
# (`importProbDist` is not defined in this notebook; presumably it comes from
# the `probdist` star-import above.)
channel_model = importProbDist(c)

In [7]:
# Keys of the channel model (the stimuli triphones), in sorted order,
# followed by how many there are.
stimuli_triphones = sorted(channel_model.keys())
len(stimuli_triphones)

5760

In [8]:
def has_no_edge_symbols(stimuli_triphone):
    """Return True iff no element of the triphone is a word-edge symbol.

    The triphone is first converted with `ds2t` (presumably dotted string ->
    tuple of symbols; confirm against `probdist`), and each resulting element
    is tested for membership in `edgeSymbols`.
    """
    symbols = ds2t(stimuli_triphone)
    for symbol in symbols:
        if symbol in edgeSymbols:
            return False
    return True

In [9]:
from itertools import tee, filterfalse

# slightly adapted from itertools recipes
def partition(pred, iterable):
    """Split the entries of `iterable` into two sets according to `pred`.

    Parameters
    ----------
    pred : callable or None
        Predicate applied once to each entry. If None, the truthiness of the
        entry itself is used (same convention as `filter(None, ...)`).
    iterable : iterable of hashable entries
        Consumed exactly once.

    Returns
    -------
    (set, set)
        First the entries for which the predicate is truthy, then the entries
        for which it is falsy. Because sets are returned, duplicates collapse
        and the original order is not preserved.

    Example
    -------
    partition(is_odd, range(10)) --> ({1, 3, 5, 7, 9}, {0, 2, 4, 6, 8})
    """
    if pred is None:
        pred = bool
    true_entries, false_entries = set(), set()
    # Single pass: the predicate is evaluated once per entry rather than once
    # for each of the two output sets.
    for entry in iterable:
        if pred(entry):
            true_entries.add(entry)
        else:
            false_entries.add(entry)
    return true_entries, false_entries

In [10]:
# Split the stimuli triphones into two sets: those for which
# has_no_edge_symbols is true (noEdges) and the rest (hasEdges),
# then show the size of each.
noEdges, hasEdges = partition(has_no_edge_symbols, stimuli_triphones)
len(noEdges)
len(hasEdges)

4889

871

In [11]:
# When `r` is truthy, one more right-edge symbol (joined with '.') is
# appended after the padding added by padInputSequenceWithBoundaries.
if r:
    extraSuffix = '.' + rightEdge
else:
    extraSuffix = ''

# Padded wordforms as a set, plus a sorted tuple of the same wordforms.
Ws = {padInputSequenceWithBoundaries(triphone) + extraSuffix for triphone in noEdges}
Ws_t = tuple(sorted(Ws))
Ws_t[:5]

('⋊.aɪ.b.z.⋉.⋉',
 '⋊.aɪ.b.ɪ.⋉.⋉',
 '⋊.aɪ.b.ɹ.⋉.⋉',
 '⋊.aɪ.d.aɪ.⋉.⋉',
 '⋊.aɪ.d.eɪ.⋉.⋉')

# Create and write uniform distribution to file

In [12]:
# Build the distribution over the padded wordforms in Ws (presumably uniform,
# going by the helper's name; `Uniform` is not defined in this notebook).
# NOTE(review): Ws is an unordered set, and the sorted tuple Ws_t is not
# passed to anything in the cells visible here, so the element order of the
# array derived from pW depends on distToNP. Confirm that it sorts the keys.
pW = Uniform(Ws)

In [13]:
# Convert the distribution to an array and inspect its shape, dtype and
# memory footprint.
pW_np = distToNP(pW)
pW_np.shape
pW_np.dtype
pW_np.nbytes / 1e6 #MB (bytes divided by 10^6)

(4889,)

dtype('float64')

0.039112

In [14]:
# Write the distribution to o + '.json'. The values are passed through
# `float` first (presumably so that they are plain JSON-serializable numbers;
# confirm against `mapValues` / `exportProbDist` in `probdist`).
exportProbDist(o + '.json', mapValues(float, dict(pW)))

In [15]:
# Save the array to o + '.npy'.
# NOTE(review): `np` is not imported explicitly in the cells visible here;
# presumably it is provided by `from probdist import *`. TODO confirm.
np.save(o + '.npy', pW_np)

In [16]:
# Provenance record for the saved array: the source filepath, a description
# of how the wordforms were derived, and how many there are.
pW_md = {
    'W': {
        'from fp': c,
        'changes': 'only includes triphones containing no word edge symbols; subsequently padded with edge symbols and then sorted.',
        'size': len(Ws),
    },
}
exportMatrixMetadata(
    o + '.npy' + '_metadata.json',
    o + '.npy',
    pW_np,
    pW_md,
    'Step 4a',
    'Generate triphone lexicon distribution from channel model',
    {},
)

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.npy
 to 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.npy_metadata.json


In [17]:
# Show the output filepath prefix that was used.
o

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2'

In [18]:
# List the contents of the output directory (presumably as a check that the
# output files were written).
listdir(output_dir)

['pX0X1X2.npy',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy_metadata.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'p6Y0X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_wordform_index.pickle_metadata.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_wordform_index.pickle',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2.npy',
 'Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'p3YX.json',
 'p3Y0X01.json',
 'p3Y01X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.npy',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle_metadata.json',
 'Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb',
 'p6Y01X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.npy_metadata.json',
 'p3Y1X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtere