In [4]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#numpy-representations" data-toc-modified-id="numpy-representations-5"><span class="toc-item-num">5&nbsp;&nbsp;</span><code>numpy</code> representations</a></span></li><li><span><a href="#Calculation" data-toc-modified-id="Calculation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Calculation</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Export</a></span><ul class="toc-item"><li><span><a href="#Segment-sequence-(all-prefixes-or-just-wordforms)-channel-matrices" data-toc-modified-id="Segment-sequence-(all-prefixes-or-just-wordforms)-channel-matrices-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Segment sequence (all prefixes or just wordforms) channel matrices</a></span></li><li><span><a href="#Representations-of-$p_3(Y_1|X_0,-X_1;-X2)$-(and-$p_3(Y_1|X_0;-X_1)$)" data-toc-modified-id="Representations-of-$p_3(Y_1|X_0,-X_1;-X2)$-(and-$p_3(Y_1|X_0;-X_1)$)-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Representations of $p_3(Y_1|X_0, X_1; X2)$ (and $p_3(Y_1|X_0; X_1)$)</a></span></li></ul></li></ul></div>

# Overview

Given
 - a filepath to a triphone channel model $c$
 - a filepath $w$ to a `.json` file specifying a conditional distribution $p(W|V)$ on segmental wordforms given orthographic ones
 - an output filepath prefix $o$
 - an optional filepath $p$ to a `.json` file specifying a 'preview' channel distribution to be included in calculated channel matrices.

this notebook calculates a channel matrix for each source prefix and writes these channel matrices to file (with prefix given by $o$), with each file corresponding to a block of source prefixes of the same length. Within a block, the ordering of source prefixes/wordforms is given by alphabetically sorting the relevant set of prefixes (or just full wordforms, if $f$).

#FIXME update to reflect other exports (including the channel matrix stacks acccctually used in subsequent notebooks...

## Requirements

 - `numpy`
 - `pytorch`

## Usage

#FIXME

# Parameters

In [5]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [6]:
from boilerplate import *

In [19]:
# Parameters

c = ''
# c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'
# c = "CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json"

w = ''
# w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'
# w = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'

o = ''
# o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_'
# o = 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_'

p = ''
# p = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/p3Y1X01.json'

In [8]:
ensure_dir_exists(path.dirname(o))

In [20]:
if p == '':
    r = False
else:
    r = True
    print('Including preview distribution in channel matrix calculations.')

# Imports

In [10]:
from probdist import *

In [11]:
from string_utils import *

In [12]:
import numpy as np
import torch

In [13]:
import pickle

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
    if torch.cuda.device_count() > 1:
        print(torch.cuda.get_device_name(1))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(1)/1024**3,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_cached(1)/1024**3,1), 'GB')

Using device: cuda

GeForce GTX 1080
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [15]:
gpu = torch.device('cuda')
cpu = torch.device('cpu')

my_device = cpu

In [16]:
cuda_ft = torch.cuda.FloatTensor
cuda_dt = torch.cuda.DoubleTensor

ft = torch.FloatTensor
dt = torch.DoubleTensor

my_ft = ft
my_dt = dt

torch.set_default_tensor_type(my_ft)

# Load data

In [17]:
p3Y1X012 = condDistsAsProbDists(importProbDist(c))

assert uniformOutcomes(p3Y1X012)

In [21]:
if r:
    p3Y1X01 = condDistsAsProbDists(importProbDist(p))
    assert uniformOutcomes(pY1X01)

In [22]:
pW_V = condDistsAsProbDists(importProbDist(w))

In [23]:
#extract segmental wordforms from w
Ws = union(list(map(lambda d: set(conditions(d)), 
                    pW_V.values())))
Ws_t = tuple(sorted(list(Ws)))
print(f'|Wordforms| = {len(Ws)}')

#extract prefixes from w
Ps = union(map(getPrefixes, Ws))
prefixes = Ps
print(f'|Prefixes| = {len(Ps)}')
Ps_t = tuple(sorted(list(Ws)))
prefixes_t = Ps_t

#extract inventory from w
Xs = lexiconToInventory(Ws)
    
#extract triphones from w
lexiconTriphones = lexiconTo3factors(Ws)
print(f'|triphones| in lexicon = {len(lexiconTriphones)}')

|Wordforms| = 9172
|Prefixes| = 42231
|triphones| in lexicon = 7412


In [24]:
#extract triphones from c
channelTriphones = set(p3Y1X012.keys())

print(f'|triphones| in channel model = {len(channelTriphones)}')

X012s = channelTriphones
X012s_t = tuple(sorted(list(X012s)))

#extract response phones
Y1s = outcomes(p3Y1X012)
Y1s_t = tuple(sorted(list(Y1s)))
print(f'|Y1s| = {len(Y1s)}')

leftEdge in Y1s
rightEdge in Y1s

|triphones| in channel model = 7381
|Y1s| = 38


False

False

In [25]:
missing_from_channel = {triph for triph in lexiconTriphones if triph not in channelTriphones}
if len(missing_from_channel) > 0:
    print(len(missing_from_channel))
    print(missing_from_channel)

assert len(missing_from_channel) == 0 or all({rightEdge + '.' + rightEdge in triph for triph in missing_from_channel})
# assert all({triph in channelTriphones for triph in lexiconTriphones})

31
{'ŋ.⋉.⋉', 'k.⋉.⋉', 'eɪ.⋉.⋉', 'i.⋉.⋉', 'ɹ.⋉.⋉', 'ʃ.⋉.⋉', 'g.⋉.⋉', 'oʊ.⋉.⋉', 'p.⋉.⋉', 'θ.⋉.⋉', 'd.⋉.⋉', 'u.⋉.⋉', 'n.⋉.⋉', 'ʒ.⋉.⋉', 'l.⋉.⋉', 'b.⋉.⋉', 'aɪ.⋉.⋉', 'ð.⋉.⋉', 'm.⋉.⋉', 'ɔɪ.⋉.⋉', 'f.⋉.⋉', 'ɚ.⋉.⋉', 's.⋉.⋉', 'dʒ.⋉.⋉', 'v.⋉.⋉', 't.⋉.⋉', 'ə.⋉.⋉', 'aʊ.⋉.⋉', 'z.⋉.⋉', 'ɑ.⋉.⋉', 'tʃ.⋉.⋉'}


If there are $x_0.⋉.⋉$ triphones in the lexicon triphone set, then we need to modify the central distribution:
 - we need to add all the relevant $x_0.⋉.⋉$ triphones to the conditions
 - the $p(Y1|\cdot)$ distribution for each of them needs to have all its mass on ⋉
 - ⋉ needs to be added to the outcomes of every other kind of condition with probability zero.

In [26]:
from copy import deepcopy

In [27]:
if len(missing_from_channel) > 0:
    assert not r
    
    #Convert p3Y1X012 back to a dictionary of dictionaries
    p3Y1X012 = condProbDistAsDicts(p3Y1X012)
    
    #add the new conditions
    for triph in missing_from_channel:
        p3Y1X012[triph] = {y1:0.0 for y1 in Y1s}
    
    #add the new outcome, with appropriate probability mass
    for x012 in p3Y1X012:
        if x012 not in missing_from_channel:
            p3Y1X012[x012][rightEdge] = 0.0
        else:
            p3Y1X012[x012][rightEdge] = 1.0
    
    #define a new set of outcomes and conditions
    Y1s_RE = Y1s | {rightEdge}
    Y1s_RE_t = tuple(sorted(list(Y1s_RE)))
    
    X012s_RE = X012s | missing_from_channel
    X012s_RE_t = tuple(sorted(list(X012s_RE)))
    
    #define a placeholder for the old outcomes and conditions
    Y1s_old = deepcopy(Y1s)
    Y1s_t_old = deepcopy(Y1s_t)
    
    X012s_old = deepcopy(X012s)
    X012s_t_old = deepcopy(X012s_t)
    
    #replace the old variables
    Y1s = Y1s_RE
    Y1s_t = Y1s_RE_t
    
    X012s = X012s_RE
    X012s_t = X012s_RE_t
    
    assert areNormalized(p3Y1X012)
    assert uniformOutcomes(p3Y1X012)
    
    #convert p3Y1X012 back to a dictionary of ProbDists
    p3Y1X012 = condDistsAsProbDists(p3Y1X012)

In [28]:
if r:
    channelDiphones = set(p3Y1X01.keys())
    print(f'|X012s| in channel model = {len(channelDiphones)}')
    
    lexiconDiphones = lexiconTo2factors(Ws)
    unmodelableLexiconDiphones = {diph for diph in lexiconDiphones if diph not in channelDiphones}
    print(f'unmodelable lexicon diphones = \n{unmodelableLexiconDiphones}')
    assert all({diph in channelDiphones for diph in lexiconDiphones if ds2t(diph)[0] != leftEdge and ds2t(diph)[1] != rightEdge})
    print(f'|X012s| in lexicon = {len(lexiconDiphones)}')
    
    X01s = lexiconDiphones
    assert outcomes(p3Y1X01) == Y1s
    
    

There are no gating trials that bear on $p(Y_{i+1}|X_i; X_{i+1} = ⋉)$, but a reasonable assumption is that there are plenty of good acoustic cues that any given segment $X_i$ is the end of the word (i.e. that $X_{i+1} = ⋉$) given the context of an isolated word recognition task, and that there are plenty of good acoustic cues that any given segment is NOT the end of the word.

In [29]:
if r:
    p3Y1X01 = condProbDistAsDicts(p3Y1X01)
    
    # add ⋉ to the outcomes of every existing conditioning outcome
    for x01 in p3Y1X01:
        p3Y1X01[x01].update({rightEdge:0.0})

    # create new conditioning events
    wordEndDiphones = {x + '.' + rightEdge for x in Xs}
    list(wordEndDiphones)[:5]

    # create their distribution over outcomes
    deltaDist = {y1:0.0 for y1 in Y1s}
    deltaDist.update({rightEdge:1.0})

    # add the new wordend conditioning events to the preview distribution
    p3Y1X01.update({wordEnd:deltaDist for wordEnd in wordEndDiphones})
    p3Y1X01['aʊ.s']['s']
    p3Y1X01['ɑ.⋉']

    # check that everything worked
    for x01 in p3Y1X01:
        assert rightEdge in p3Y1X01[x01]
    #     if rightEdge not in p3Y1X01[x01]:
    #         p3Y1X01[x01][rightEdge] = 0.0

    assert areNormalized(p3Y1X01)
    assert uniformOutcomes(p3Y1X01)

    channelDiphones = set(p3Y1X01.keys())

    unmodelableLexiconDiphones = {diph for diph in lexiconDiphones if diph not in channelDiphones}
    print(f'unmodelable lexicon diphones = \n{unmodelableLexiconDiphones}')
    assert all({diph in channelDiphones for diph in lexiconDiphones if ds2t(diph)[0] != leftEdge and ds2t(diph)[1] != rightEdge})
    
    #we'll worry about left-edge initial diphones later
    
    # let's trim the preview model's conditioning events
    p3Y1X01 = {x01:p3Y1X01[x01] for x01 in p3Y1X01 if x01 in lexiconDiphones}
    
    p3Y1X01 = condDistsAsProbDists(p3Y1X01)
    
    X01s_RE = set(p3Y1X01.keys())
    len(X01s_RE)
    
#     print(X01s_RE - X01s)

# `numpy` representations

In [30]:
Xmap = seqsToIndexMap(Xs)
XOHmap = seqsToOneHotMap(Xs)

In [31]:
X012map = seqsToIndexMap(X012s)
# X012OHs = seqMapToOneHots(X012map)
X012OHmap = seqsToOneHotMap(X012s)

In [32]:
Y1map = seqsToIndexMap(Y1s)

In [33]:
if r:
    X01REmap = seqsToIndexMap(X01s_RE)
    X01REOHs = seqMapToOneHots(X01REmap)
    X01REOHmap = seqsToOneHotMap(X01s_RE)
    
    Y1s_RE = outcomes(p3Y1X01)
    len(Y1s_RE)
    Y1s_RE_list = sorted(list(Y1s_RE))

    print(Y1s_RE - Y1s)

    Y1REmap = seqsToIndexMap(Y1s_RE)

    Y1REOHs = seqMapToOneHots(Y1REmap)
    Y1REOHmap = seqsToOneHotMap(Y1s_RE)
    OHY1REmap = oneHotToSeqMap(Y1s_RE)

If `r` is `True`, then to ensure uniformity of event spaces between the triphone channel distribution and the preview distribution, we'll add a $⋉$ outcome (with probability 0.0) to each conditional distribution in the triphone channel distribution.

In [34]:
if r:
    for x012 in p3Y1X012:
        p3Y1X012[x012].update({rightEdge:0.0})
        assert rightEdge in p3Y1X012[x012]
        assert p3Y1X012[x012][rightEdge] == 0.0

    outcomes(p3Y1X012) == Y1s
    outcomes(p3Y1X012) == Y1s_RE
    areNormalized(p3Y1X012)
    uniformOutcomes(p3Y1X012)

In [35]:
def dsToUniphoneIndices(ds, uniphoneToIndexMap):
    uniphoneSeq = ds2t(ds)
    return np.array([uniphoneToIndexMap[uniphone] for uniphone in uniphoneSeq])

def dsToUniphoneOHs(ds, uniphoneToOHmap):
    uniphoneSeq = ds2t(ds)
    return np.array([uniphoneToOHmap[uniphone] for uniphone in uniphoneSeq])

def dsToTriphoneSeq(ds):
    return dsToKfactorSequence(3, ds)

def dsToTriphoneIndices(ds, triphoneToIndexMap):
    triphoneSeq = dsToTriphoneSeq(ds)
    return np.array([triphoneToIndexMap[triphone] for triphone in triphoneSeq])

def dsToTriphoneOHs(ds, triphoneToOHmap):
    triphoneSeq = dsToTriphoneSeq(ds)
    return np.array([triphoneToOHmap[triphone] for triphone in triphoneSeq])

dsToUniphoneIndices('t.i.f.l', Xmap)
dsToUniphoneOHs('t.i.f.l', XOHmap)
dsToTriphoneSeq('t.i.f.l')
dsToTriphoneIndices('t.i.f.l', X012map)
dsToTriphoneOHs('t.i.f.l', X012OHmap)
dsToTriphoneOHs('t.i.f.l', X012OHmap).shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0].shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0][5528]
dsToTriphoneOHs('t.i.f.l', X012OHmap)[1][5352]

array([18,  9,  6, 12])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.]])

('t.i.f', 'i.f.l')

array([3756, 1449])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

(2, 7412)

(7412,)

0.0

0.0

In [36]:
p3Y1X012_np = condDistFamilyToNP(p3Y1X012)
if r:
    testNPcondDist(p3Y1X012_np, X012map, Y1REmap, p3Y1X012)
else:
    testNPcondDist(p3Y1X012_np, X012map, Y1map, p3Y1X012)
p3Y1X012_np.shape

(39, 7412)

In [37]:
if r:
    p3Y1X01_np = condDistFamilyToNP(p3Y1X01)
    testNPcondDist(p3Y1X01_np, X01REmap, Y1REmap, p3Y1X01)
    p3Y1X01_np.shape

In [38]:
from random import choice

In [39]:
random_source_wordform = choice(list(Ws))
random_source_wordform

'⋊.ʌ.l.t.ɪ.ɹ.i.ɚ.⋉.⋉'

In [40]:
random_source_prefix = choice(list(Ps))
random_source_prefix

'⋊.g.l.ʌ.t.n'

In [41]:
def randomPrefix(l, alphabet=Xs):
    return randomString(alphabet, l, hasLeftEdge=True)

In [42]:
random_channel_prefix2 = randomPrefix(len(ds2t(random_source_wordform))-1, alphabet=Y1s)
random_channel_prefix2

'⋊.oʊ.ɛ.æ.h.ð.θ.ɪ.dʒ.z'

In [43]:
# random_source_prefix = getRandomKey(pX0i)
random_source_prefix = choice(list(Ps))
while ds2t(random_source_prefix)[-1] == rightEdge:
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
while len(ds2t(random_source_prefix)) > len(ds2t(random_source_wordform)):
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
random_source_prefix
random_channel_prefix = randomPrefix(len(ds2t(random_source_prefix))-1, alphabet=Y1s)
random_channel_prefix

'⋊.p.ɑ.l.ɪ.t.ɛ'

'⋊.ɑ.b.f.ɪ.ʌ.ɑ'

# Calculation

In [44]:
def sourcePrefixToTriphones(x0k):
    xp_t = ds2t(x0k) #"x prefix"
    
#     xi = xp_t[-2] #just-completed segment
#     xk = xp_t[-1] #upcoming segment that we only get coarticulatory information about
    
#     xik_ds = t2ds((xi, xk))
#     preview_dist = p3Y1X01[xik_ds]
    
    x012s = dsToKfactorSequence(3, t2ds(xp_t))
    return x012s

random_triphoneSeq = sourcePrefixToTriphones(random_source_prefix)
random_triphoneSeq
threeFactorSequenceToDS(random_triphoneSeq)

('⋊.p.ɑ', 'p.ɑ.l', 'ɑ.l.ɪ', 'l.ɪ.t', 'ɪ.t.ɛ')

'⋊.p.ɑ.l.ɪ.t.ɛ'

In [45]:
def sourcePrefixToTriphoneIndices(x0k):
    triphoneSequence = sourcePrefixToTriphones(x0k)
    return tuple(map(lambda x012: X012map[x012], triphoneSequence))

sourcePrefixToTriphoneIndices(random_source_prefix)

(7163, 3247, 4953, 2272, 6045)

In [46]:
blah = np.zeros((len(Y1s), 1))
blah[-1] = 1.0
blah

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.]])

In [47]:
def sourcePrefixToChannelMatrix_l(x0k, debug=False):
    triphoneOHs = dsToTriphoneOHs(x0k, X012OHmap)
    if debug:
        print('x0k = {0}'.format(x0k))
        print('|x0k| = {0}'.format(len(x0k)))
        print('triphoneIdxs = {0}'.format(sourcePrefixToTriphoneIndices(x0k)))
        print('triphoneOHs.shape = {0}'.format(triphoneOHs.shape))
        print('p3Y1X012_np.shape = {0}'.format(p3Y1X012_np.shape))
        print('result = p3Y1X012_np * triphoneOHs.T')
    result = np.matmul(p3Y1X012_np, triphoneOHs.T)
    return result
# sourcePrefixToChannelMatrix_l(random_source_prefix, True)

if r:
    def sourcePrefixToChannelMatrix(x0k):
        triphoneIndices = sourcePrefixToTriphoneIndices(x0k)
        C = np.array([[p3Y1X012_np[Y1REmap[y1], x012_idx] for x012_idx in triphoneIndices] for y1 in sorted(Y1s_RE)])
    #     C = np.array([[p3Y1X012_np[Y1map[y1], x012_idx] 
    #                    for x012_idx in triphoneIndices] 
    #                   for y1 in Y1s_t])
        if x0k == leftEdge or (len(ds2t(x0k)) == 2 and ds2t(x0k)[0] == leftEdge):
            C = np.zeros((len(Y1s_RE), 1))
    #         C = np.zeros((len(Y1s), 1))
            C[-1] = 1.0
            return C.reshape(len(Y1s_RE),1)
    #         return C.reshape(len(Y1s),1)
        return C
else:
    def sourcePrefixToChannelMatrix(x0k):
        triphoneIndices = sourcePrefixToTriphoneIndices(x0k)
#         C = np.array([[p3Y1X012_np[Y1REmap[y1], x012_idx] for x012_idx in triphoneIndices] for y1 in sorted(Y1s_RE)])
        C = np.array([[p3Y1X012_np[Y1map[y1], x012_idx] 
                       for x012_idx in triphoneIndices] 
                      for y1 in Y1s_t])
        if x0k == leftEdge or (len(ds2t(x0k)) == 2 and ds2t(x0k)[0] == leftEdge):
#             C = np.zeros((len(Y1s_RE), 1))
            C = np.zeros((len(Y1s), 1))
            C[-1] = 1.0
#             return C.reshape(len(Y1s_RE),1)
            return C.reshape(len(Y1s),1)
        assert len(triphoneIndices) == len(dsToKfactorSequence(3, x0k)), f"{len(triphoneIndices)} != {len(dsToKfactorSequence(3, x0k))}\n\t x0k = {x0k}\n\t {dsToKfactorSequence(3, x0k)}\n\t {triphoneIndices}"
        assert len(dsToKfactorSequence(3, x0k)) == C.shape[1], f"{C.shape[1]} != {len(dsToKfactorSequence(3, x0k))}\n\t x0f = {wordform}"
        return C


# sourcePrefixToChannelMatrix(random_source_prefix)

random_source_prefix
sourcePrefixToChannelMatrix_l(random_source_prefix).shape
print(sourcePrefixToChannelMatrix_l(random_source_prefix) == sourcePrefixToChannelMatrix(random_source_prefix))

'⋊.p.ɑ.l.ɪ.t.ɛ'

(39, 5)

[[ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  True  True  True  True]
 [ True  T

In [48]:
from random import choices

In [49]:
random_source_prefixes = choices(Ps_t, k=5000)

In [50]:
%%timeit

sourcePrefixToChannelMatrix_l(choice(random_source_prefixes))

97.1 µs ± 311 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [51]:
if r:
    def sourcePrefixToPreviewVector(x0k):
        xp_t = ds2t(x0k) #"x prefix"

        if len(xp_t) < 2:
            raise Exception('|x0k| must be > 1.')
        if len(xp_t) == 2 and xp_t[0] == leftEdge:
    #         raise Exception("There's no gating data that bears on this calculation, nor is it that interesting.")
            uniformProb = 1.0 / len(Y1s_RE)
            preview_dist = uniformProb * np.ones((len(Y1s_RE), 1))#garbage
            return preview_dist.reshape(len(Y1s_RE),1)

        xi = xp_t[-2] #just-completed segment
        xk = xp_t[-1] #upcoming segment that we only get coarticulatory information about

        xik_ds = t2ds((xi, xk))
        preview_dist = p3Y1X01[xik_ds]
    #     assert Y1s_RE == set(preview_dist.keys()) #comment out once you are reasonably confident this is true by construction

        return np.array([preview_dist[y1] for y1 in sorted(Y1s_RE)])

    sourcePrefixToPreviewVector(random_source_prefix)

In [52]:
if r:
    # returns p(Y0K|x0k)
    def makeExtendedChannelMatrixByPrefix(prefix):
        # NB:
        # if len(prefix) == n (including leftEdge), 
        # then the extended channel matrix will have dimensions 39 x (n-1)

        p = prefix
        if prefix != leftEdge:# and not (len(ds2t(p)) == 2 and ds2t(p)[0] == leftEdge):
    #     if prefix != leftEdge and not (len(ds2t(p)) == 2 and ds2t(p)[0] == leftEdge):
            return np.hstack( (sourcePrefixToChannelMatrix(p) , sourcePrefixToPreviewVector(p).reshape(39,1)))
        else: #the extended channel matrix is garbage that should never be asked for
            l = len(ds2t(p))
            return np.zeros((39, l-1))

In [53]:
# if f:
#     print('Source sequences = wordforms and prefixes')
#     source_seqs = prefixes_t #prefixes include full wordforms
# else:
#     print('Source sequences = just full wordforms')
#     source_seqs = Ws_t

In [54]:
# if r:
#     xCMsByPrefixIndex = [makeExtendedChannelMatrixByPrefix(s)
#                          for s in source_seqs]
#     xCMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in xCMsByPrefixIndex[1:]]

#     xCMsByPrefixIndex[3].shape
if r:
    xCMsByPrefixIndex = [makeExtendedChannelMatrixByPrefix(p)
                         for p in prefixes_t]
    xCMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in xCMsByPrefixIndex[1:]]

    xCMsByPrefixIndex[3].shape

In [55]:
# CMsByPrefixIndex = [sourcePrefixToChannelMatrix_l(s)
#                      for s in source_seqs]
# CMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in CMsByPrefixIndex[1:]]

# CMsByPrefixIndex[3].shape
CMsByPrefixIndex = [sourcePrefixToChannelMatrix_l(p)
                     for p in prefixes_t]
CMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in CMsByPrefixIndex[1:]]

CMsByPrefixIndex[3].shape

(39, 8)

In [56]:
CMsByWordformIndex = [sourcePrefixToChannelMatrix_l(w)
                     for w in Ws_t]
CMsByWordformIndex_torch = [None] + [torch.from_numpy(each) for each in CMsByWordformIndex[1:]]

CMsByWordformIndex[3].shape

(39, 8)

In [57]:
# def wordformsOfLength(l, includingEdges = False):
#     if includingEdges:
#         return {w for w in Ws if len(ds2t(w)) == l}
#     return {w for w in Ws if len(ds2t(w)) == l + 2}

In [58]:
wordformsOfLength(16, Ws)

{'⋊.aɪ.s.ə.l.eɪ.ʃ.ɪ.n.ɪ.z.m.⋉.⋉',
 '⋊.b.aɪ.oʊ.k.ɛ.m.ə.s.t.ɹ.i.⋉.⋉',
 '⋊.b.ɪ.b.l.i.ɑ.g.ɹ.ə.f.i.⋉.⋉',
 '⋊.d.ɑ.k.j.ʊ.m.ɛ.n.t.ɚ.i.⋉.⋉',
 '⋊.d.ə.l.æ.p.ə.d.eɪ.t.ɪ.d.⋉.⋉',
 '⋊.d.ɛ.m.ə.n.s.t.ɹ.eɪ.t.ɚ.⋉.⋉',
 '⋊.d.ɪ.s.k.w.ə.z.ɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.d.ɪ.s.t.ɹ.ɪ.b.j.u.t.ɚ.⋉.⋉',
 '⋊.d.ɪ.s.ə.b.i.d.i.ə.n.s.⋉.⋉',
 '⋊.d.ɪ.s.ɪ.d.v.æ.n.t.ɪ.dʒ.⋉.⋉',
 '⋊.d.ɪ.s.ɪ.n.t.ɪ.g.ɹ.eɪ.t.⋉.⋉',
 '⋊.d.ɪ.t.ɚ.m.ə.n.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.dʒ.i.oʊ.p.ɑ.l.ə.t.ɪ.k.s.⋉.⋉',
 '⋊.dʒ.ʊ.ɹ.ɪ.s.d.ɪ.k.ʃ.ɪ.n.⋉.⋉',
 '⋊.f.ɑ.ɹ.m.æ.l.d.ə.h.aɪ.d.⋉.⋉',
 '⋊.f.ɪ.l.ɪ.n.θ.ɹ.ɑ.p.ɪ.k.⋉.⋉',
 '⋊.h.aɪ.p.ə.k.ɑ.n.d.ɹ.i.ə.⋉.⋉',
 '⋊.h.ɑ.s.p.ə.t.æ.l.ɪ.t.i.⋉.⋉',
 '⋊.h.ɪ.s.t.ɚ.ɛ.k.t.ə.m.i.⋉.⋉',
 '⋊.j.u.n.ə.f.ɑ.ɹ.m.ə.t.i.⋉.⋉',
 '⋊.j.u.n.ə.f.ə.k.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.j.ʊ.n.æ.n.ə.m.ə.s.l.i.⋉.⋉',
 '⋊.j.ʊ.ɹ.ə.n.æ.l.ɪ.s.ɪ.s.⋉.⋉',
 '⋊.k.aɪ.ɹ.oʊ.p.ɹ.æ.k.t.ɪ.k.⋉.⋉',
 '⋊.k.aʊ.n.t.ɚ.b.æ.l.ɪ.n.s.⋉.⋉',
 '⋊.k.j.u.m.j.ʊ.l.ɪ.t.ɪ.v.⋉.⋉',
 '⋊.k.w.ɑ.n.t.ɪ.t.eɪ.t.ɪ.v.⋉.⋉',
 '⋊.k.æ.l.k.j.ʊ.l.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.k.æ.l.ɪ.s.θ.ɛ.n.ɪ.k.s.⋉.⋉',
 '⋊.k.æ.t.ɪ.s.t.ɹ.ɑ.f.ɪ.k.⋉.⋉',
 '⋊.k.ɑ.m.j.u.n.ɪ.s.

In [59]:
wordlengthsInclEdges = set(len(ds2t(w)) for w in Ws)
wordlengthsInclEdges
numWordsOfExactlyLength = {l:len(wordformsOfLength(l, Ws, True)) for l in wordlengthsInclEdges}
numWordsOfExactlyLength

{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}

{4: 5,
 5: 173,
 6: 1266,
 7: 1790,
 8: 1646,
 9: 1328,
 10: 1052,
 11: 828,
 12: 508,
 13: 329,
 14: 150,
 15: 66,
 16: 24,
 17: 6,
 19: 1}

In [60]:
wordlengthsNotIncludingEdges = {each-2 for each in wordlengthsInclEdges}
wordlengthsNotIncludingEdges

{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}

In [61]:
# def wordformsAtLeastLlong(l, includingEdges = False):
#     if includingEdges:
#         maxL = max(wordlengthsInclEdges)
#         return union([wordformsOfLength(eachl, includingEdges) for eachl in range(l, maxL+1)])
#     else:
#         maxL = max(wordlengthsNotIncludingEdges)
#         return union([wordformsOfLength(eachl, includingEdges) for eachl in range(l, maxL+1)])

In [62]:
lengthFreqs = {l:len(wordformsAtLeastLlong(l, Ws, True)) for l in wordlengthsInclEdges}
lengthFreqs

{4: 9172,
 5: 9167,
 6: 8994,
 7: 7728,
 8: 5938,
 9: 4292,
 10: 2964,
 11: 1912,
 12: 1084,
 13: 576,
 14: 247,
 15: 97,
 16: 31,
 17: 7,
 19: 1}

In [67]:
wordformsByLength = {l:{w for w in wordformsOfLength(l, Ws, True)}
                       for l in wordlengthsInclEdges}
wordformsByLength_t = {l:tuple(sorted([w for w in wordformsOfLength(l, Ws, True)]))
                       for l in wordlengthsInclEdges}

In [69]:
wordformsByLength[4]
wordformsByLength_t[4]
wordformsByLength_t[15]
wordformsByLength_t[4]

{'⋊.aɪ.⋉.⋉', '⋊.i.⋉.⋉', '⋊.oʊ.⋉.⋉', '⋊.ɑ.⋉.⋉', '⋊.ə.⋉.⋉'}

('⋊.aɪ.⋉.⋉', '⋊.i.⋉.⋉', '⋊.oʊ.⋉.⋉', '⋊.ɑ.⋉.⋉', '⋊.ə.⋉.⋉')

('⋊.d.ɛ.m.ə.n.s.t.ɹ.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.d.ɪ.s.t.ɹ.ɪ.b.j.u.ʃ.ɪ.n.⋉.⋉',
 '⋊.d.ɹ.ɑ.m.ə.t.ə.z.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.dʒ.ʌ.s.t.ɪ.f.ə.k.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.f.l.ɛ.k.s.ə.b.ɪ.l.ɪ.t.i.⋉.⋉',
 '⋊.f.ə.l.æ.n.θ.ɹ.ə.p.ɪ.s.t.⋉.⋉',
 '⋊.g.l.oʊ.ɹ.ə.f.ə.k.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.g.ɹ.æ.t.ɪ.f.ə.k.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.j.u.n.j.ɪ.n.ɪ.z.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.k.l.æ.s.ə.f.ə.k.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.k.l.ɑ.s.t.ɹ.ə.f.oʊ.b.i.ə.⋉.⋉',
 '⋊.k.w.ɑ.l.ɪ.f.ə.k.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.k.æ.p.ə.t.ə.l.ɪ.s.t.ɪ.k.⋉.⋉',
 '⋊.k.ɑ.m.p.l.ə.m.ɛ.n.t.ɚ.i.⋉.⋉',
 '⋊.k.ɑ.n.s.ɪ.n.t.ɹ.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.k.ɑ.n.t.ɹ.ə.b.j.u.ʃ.ɪ.n.⋉.⋉',
 '⋊.k.ə.n.s.t.ɪ.tʃ.u.ə.n.s.i.⋉.⋉',
 '⋊.k.ə.n.t.ɪ.n.j.u.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.m.æ.g.n.ə.f.ə.k.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.m.æ.n.ə.f.ɪ.s.t.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.m.ə.n.ɪ.p.j.ʊ.l.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.m.ɛ.l.oʊ.d.ɹ.ə.m.æ.t.ɪ.k.⋉.⋉',
 '⋊.m.ɛ.t.ə.m.ɑ.ɹ.f.ə.s.ɪ.s.⋉.⋉',
 '⋊.m.ɪ.s.ɪ.n.f.ɚ.m.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.m.ɪ.s.ʌ.n.d.ɚ.s.t.æ.n.d.⋉.⋉',
 '⋊.n.aɪ.t.ɹ.oʊ.g.l.ɪ.s.ɚ.ə.n.⋉.⋉',
 '⋊.p.eɪ.l.i.ɑ.n.t.ɑ.l.ə.dʒ.i.⋉.⋉',
 '⋊.p.j.ʊ.ɹ.ə.f.ə.k.eɪ.ʃ.ɪ.n.⋉.⋉',
 '⋊.p.ɹ.æ.k.t.ɪ.k.æ.l.ɪ.

('⋊.aɪ.⋉.⋉', '⋊.i.⋉.⋉', '⋊.oʊ.⋉.⋉', '⋊.ɑ.⋉.⋉', '⋊.ə.⋉.⋉')

In [70]:
# returns p(Y0i|x0f), padded if necessary
def makeChannelMatrixByWordformAndLength(wordform, key_length, exact_length_only = False):
    x0f = wordform
    x0f_t = ds2t(x0f)
    x0f_length = len(x0f_t)
    if x0f_length == key_length:
        return sourcePrefixToChannelMatrix(x0f)
    elif exact_length_only:
        cm = np.zeros(shape=(len(Y1s), key_length - 2))
        return cm
    elif x0f_length > key_length:
#         print('middle case')
        #trim the wordform to be a prefix of length = key_length
        x0k_t = x0f_t[:key_length]
#         assert len(x0k_t) == key_length
        x0k = t2ds(x0k_t)
#         print('x0k: {0}'.format(x0k))
        cm = sourcePrefixToChannelMatrix(x0k)
        assert len(dsToKfactorSequence(3, x0k)) == cm.shape[1], f"{cm.shape[1]} != {len(dsToKfactorSequence(3, x0k))}\n\t x0f = {wordform}\n\t key_length = {key_length}"
        return cm
    else:
        #grab the source 
        my_CM = sourcePrefixToChannelMatrix(x0f)
        goal_l = key_length
        #extend the channel matrix with padding
        cm = np.pad(my_CM, ((0,0), (0, goal_l - my_CM.shape[1] - 2)), 
                      'constant', constant_values=0.0)
        assert key_length - 2 == cm.shape[1], f"{cm.shape[1]} != {key_length - 2}\n\t x0f = {wordform}\n\t key_length = {key_length}"
        return cm

In [71]:
if r:
    # returns p(Y0K|x0f)
    def makeExtendedChannelMatrixByWordformAndLength(wordform, key_length):
        x0f = wordform
        x0f_t = ds2t(x0f)
        x0f_length = len(x0f_t)
        if x0f_length == key_length:
            return makeExtendedChannelMatrixByPrefix(x0f)
        elif x0f_length > key_length:
    #         print('middle case')
            #trim the wordform to be a prefix of length = key_length
            x0k_t = x0f_t[:key_length]
            x0k = t2ds(x0k_t)
    #         print('x0k: {0}'.format(x0k))
            return makeExtendedChannelMatrixByPrefix(x0k)
        else:
            #grab the source 
            my_xCM = makeExtendedChannelMatrixByPrefix(x0f)
            goal_l = key_length
            return np.pad(my_xCM, ((0,0), (0, goal_l - my_xCM.shape[1] - 1)), 
                          'constant', constant_values=0.0)

In [72]:
wordlengthsInclEdges

{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}

In [73]:
if sorted(list(wordlengthsInclEdges)) != sorted(list(range(min(wordlengthsInclEdges), \
                                                           max(wordlengthsInclEdges)+1))):
    print("Missing word lengths:")
    print({l for l in range(min(wordlengthsInclEdges), max(wordlengthsInclEdges)+1) if l not in wordlengthsInclEdges})
    wordlengthsInclEdges_range = list(range(min(wordlengthsInclEdges), max(wordlengthsInclEdges)+1))
else:
    wordlengthsInclEdges_range = sorted(list(wordlengthsInclEdges))

Missing word lengths:
{18}


In [74]:
# ~17s on wittgenstein under load
offset = [np.zeros(shape=(0,0)) for each in range(min(wordlengthsInclEdges))]
cmsByLengthByWordformIndex = offset + [np.array([makeChannelMatrixByWordformAndLength(w, l)
                                                 for w in Ws_t])
                                       for l in wordlengthsInclEdges_range]
cmsByLengthByWordformIndex_torch = list(map(lambda cm: torch.from_numpy(cm).type(my_ft), cmsByLengthByWordformIndex))

In [77]:
for l in wordlengthsInclEdges_range:
    assert all(cm.shape[1] == l - 2 for cm in cmsByLengthByWordformIndex[l])

In [78]:
if r:
    xCMsByLengthByWordformIndex = offset + [np.array([makeExtendedChannelMatrixByWordformAndLength(w, l)
                                                      for w in Ws_t])
                                            for l in wordlengthsInclEdges_range]
    xCMsByLengthByWordformIndex_torch = list(map(lambda xCM: torch.from_numpy(xCM).type(my_ft), xCMsByLengthByWordformIndex))

In [95]:
wordformsOfLength(18, Ws, True)

set()

In [96]:
exactCMsByLengthByWordformIndex = offset + [np.array([makeChannelMatrixByWordformAndLength(w, l, exact_length_only = True)
                                                 for w in wordformsByLength_t[l]]) if len(wordformsOfLength(l, Ws, True)) > 0 else np.zeros(shape=(0,0))
                                            for l in wordlengthsInclEdges_range]
exactCMsByLengthByWordformIndex_torch = list(map(lambda cm: torch.from_numpy(cm).type(my_ft), exactCMsByLengthByWordformIndex))

# Export

## Segment sequence (all prefixes or just wordforms) channel matrices

We want to save 
 - `CMsByPrefixIndex`
 - `CMsByWordformIndex`
 - `cmsByLengthByWordformIndex`
 - `exactCMsByLengthByWordformIndex`
 
(and/or their extended analogues, if `r`) to disk, and when importing, we will need to know
 - the set/sequence of key strings (prefixes or just wordforms)

In [81]:
len(CMsByWordformIndex)

9172

In [82]:
len(CMsByPrefixIndex)
# CMsByPrefixIndex.nbytes / 1e9

9172

In [83]:
len(cmsByLengthByWordformIndex)
cmsByLengthByWordformIndex[0].shape
cmsByLengthByWordformIndex[1].shape
cmsByLengthByWordformIndex[2].shape
cmsByLengthByWordformIndex[3].shape
cmsByLengthByWordformIndex[10].nbytes / 1e9

20

(0, 0)

(0, 0)

(0, 0)

(0, 0)

0.022893312

In [84]:
if r:
    pickle.dump(xCMsByPrefixIndex, open(o + 'xCMs_by_prefix_index.pickle', 'wb'))
else:
    pickle.dump(CMsByPrefixIndex, open(o + 'CMs_by_prefix_index.pickle', 'wb'))
    pickle.dump(CMsByWordformIndex, open(o + 'CMs_by_wordform_index.pickle', 'wb'))

In [85]:
if not r:
    CMsByPrefixIndex_in = pickle.load(open(o + 'CMs_by_prefix_index.pickle', 'rb'))
    len(CMsByPrefixIndex_in)

    assert all(np.array_equal(CMsByPrefixIndex_in[i], CMsByPrefixIndex[i]) for i in range(len(CMsByPrefixIndex)))

9172

In [86]:
if not r:
    CMsByPrefixIndex_in[3].shape
    CMsByPrefixIndex[3].shape

(39, 8)

(39, 8)

In [87]:
CMs_by_prefix_idx_md = {
    'r':r,
    'length':len(xCMsByPrefixIndex) if r else len(CMsByPrefixIndex),
    'W':{'from fp':w,
         'changes':'(x)CMs constructed from sorted prefixes of W',
         'size':len(Ws_t)},
    'P':{'from fp':w,
         'changes':'(x)CMs constructed from sorted prefixes of W',
         'size':len(Ps_t)},
    'C':{'from fp':c,
         'changes':"Added ⋉ to the outcomes of every existing conditioning outcome; added new conditioning events X⋉" if r else 'None'}
}

my_fp = o + 'xCMs_by_prefix_index.pickle' if r else o + 'CMs_by_prefix_index.pickle'
exportMatrixMetadata(my_fp + '_metadata.json',
                     my_fp,
                     None,
                     CMs_by_prefix_idx_md,
                     'Step 4e',
                     'Calculate segmental wordform and prefix channel matrices',
                     {})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle
 to 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle_metadata.json


In [88]:
CMs_by_wordform_idx_md = {
    'r':r,
    'length':len(CMsByWordformIndex),
    'W':{'from fp':w,
         'changes':'CMs constructed from sorted W',
         'size':len(Ws_t)},
    'C':{'from fp':c,
         'changes':'None'}
}

my_fp = o + 'CMs_by_wordform_index.pickle'
exportMatrixMetadata(my_fp + '_metadata.json',
                     my_fp,
                     None,
                     CMs_by_wordform_idx_md,
                     'Step 4e',
                     'Calculate segmental wordform and prefix channel matrices',
                     {})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_CMs_by_wordform_index.pickle
 to 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_CMs_by_wordform_index.pickle_metadata.json


In [89]:
# importDict(o + '.pW_C' + '_metadata.json')

In [90]:
if r:
    pickle.dump(xCMsByLengthByWordformIndex, open(o + 'xCMs_by_length_by_wordform_index.pickle', 'wb'))
else:
    pickle.dump(cmsByLengthByWordformIndex, open(o + 'CMs_by_length_by_wordform_index.pickle', 'wb'))
    pickle.dump(exactCMsByLengthByWordformIndex, open(o + 'exact_CMs_by_length_by_wordform_index.pickle', 'wb'))
    

In [91]:
CMs_by_length_by_wordform_idx_md = {
    'r':r,
    'length':len(xCMsByLengthByWordformIndex) if r else len(cmsByLengthByWordformIndex),
    'W':{'from fp':w,
         'changes':'(x)CMs constructed from sorted wordforms of W',
         'size':len(Ws_t)},
    'P':{'from fp':w,
         'changes':'(x)CMs constructed from sorted prefixes of W',
         'size':len(Ps_t)},
    'C':{'from fp':c,
         'changes':"Added ⋉ to the outcomes of every existing conditioning outcome; added new conditioning events X⋉" if r else 'None'}
}

my_fp = o + 'xCMs_by_length_by_wordform_index.pickle' if r else o + 'CMs_by_length_by_wordform_index.pickle'
exportMatrixMetadata(my_fp + '_metadata.json',
                     my_fp,
                     None,
                     CMs_by_length_by_wordform_idx_md,
                     'Step 4e',
                     'Calculate segmental wordform and prefix channel matrices',
                     {})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle
 to 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle_metadata.json


In [97]:
exact_CMs_by_length_by_wordform_idx_md = {
    'r':r,
    'length':len(exactCMsByLengthByWordformIndex),
    'W':{'from fp':w,
         'changes':'CMs organized by sorting wordforms of W by length and then subsorting all wordforms of the same length alphabetically (ie as unicode sequences)',
         'size':len(Ws_t)},
    'C':{'from fp':c,
         'changes':'None'}
}

my_fp = o + 'exact_CMs_by_length_by_wordform_index.pickle'
exportMatrixMetadata(my_fp + '_metadata.json',
                     my_fp,
                     None,
                     exact_CMs_by_length_by_wordform_idx_md,
                     'Step 4e',
                     'Calculate segmental wordform and prefix channel matrices',
                     {'comment':'Each index corresponds to a possible full wordform length (including word edges); any unnattested wordlength l between 0 and the maximum wordlength is included. Each index maps to a list of channel matrices associated with a wordform of length l and where each list is organized alphabetically.'})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle
 to 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle_metadata.json


In [98]:
o

'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_'

In [99]:
listdir(path.dirname(o))

['LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'p6Y0X01.json',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_ODexact_CMs_by_length_by_prefix_index.pickle_metadata.json',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle_metadata.json',
 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_newdic_destressed, pc=0.01.ipynb',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_CMs_by_wordform_index.pickle',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_pC1X012Y012s.txt',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle_metadata.json',
 'pYX.json',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_ODCMs_by_prefix_index.pickle_metadata.json',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtere

## Representations of $p_3(Y_1|X_0, X_1; X2)$ (and $p_3(Y_1|X_0; X_1)$)

In [100]:
#if not r, export numpy representation of triphone channel distribution
if not r:
    print(f"Saving p3Y1X012_np to filepath '{o + 'p3Y1X012' + '.npy'}'")
    np.save(o + 'p3Y1X012' + '.npy', p3Y1X012_np)

Saving p3Y1X012_np to filepath 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.npy'


In [101]:
if not r:
    CM_md = {
        'r':r,
        'C':{'from fp':c,
             'changes':'None' if len(missing_from_channel) == 0 else 'adjustment of conditions and outcomes as noted in adjacent dictionaries'},
        'X012s':{'from fp':c,
                 'changes':"None" if len(missing_from_channel) == 0 else "For every stimulus triphone x_0.x_1.⋉ in the original, added x1.⋉.⋉"},
        'Y1s':{'from fp':c,
               'changes':"None" if len(missing_from_channel) == 0 else "Added ⋉, where p(⋉|x012) = 1 iff x_1 == ⋉ and otherwise = 0"}
    }

    my_fp = o + 'p3Y1X012' + '.npy'
    exportMatrixMetadata(my_fp + '_metadata.json',
                         my_fp,
                         p3Y1X012_np,
                         CM_md,
                         'Step 4e',
                         'Calculate segmental wordform and prefix channel matrices',
                         {})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.npy
 to 
	CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.npy_metadata.json


In [102]:
#if not r, export json representation of (modified!) triphone channel distribution
if not r:
    print(f"Saving p3Y1X012 to filepath '{o + 'p3Y1X012' + '.json'}'")
    exportProbDist(o + 'p3Y1X012' + '.json', p3Y1X012)

Saving p3Y1X012 to filepath 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'


In [103]:
#if r, export .json of modified triphone channel distribution and preview distribution
if r:
    print(f"Saving extended, human-readable version of p3Y1X012 to filepath '{o + 'p3Y1X012_RE' + '.json'}'")
    exportDict(o + 'p3Y1X012_RE' + '.json', condProbDistAsDicts(p3Y1X012))
          
    print(f"Saving extended, human-readable version of p3Y1X01 to filepath '{o + 'p3Y1X01_RE' + '.json'}'")
    exportDict(o + 'p3Y1X01_RE' + '.json', condProbDistAsDicts(p3Y1X01))

#if r, export numpy representation of triphone channel distribution and preview distribution
if r:
    print(f"Saving p3Y1X012_np to filepath '{o + 'p3Y1X012_RE' + '.npy'}'")
    np.save(o + 'p3Y1X012_RE' + '.npy', p3Y1X012_np)
    print(f"Saving p3Y1X01_np to filepath '{o + 'p3Y1X01_RE' + '.npy'}'")
    np.save(o + 'p3Y1X01_RE' + '.npy', p3Y1X01_np)

In [104]:
if r:
    CD_md = {
        'r':r,
        'C':{'from fp':c,
         'changes':"Added ⋉ to the outcomes of every existing conditioning outcome; added new conditioning events X⋉" if r else 'None'}
    }

    my_fp = o + 'p3Y1X012_RE' + '.npy'
    exportMatrixMetadata(my_fp + '_metadata.json',
                         my_fp,
                         p3Y1X012_np,
                         PD_md,
                         'Step 4e',
                         'Calculate segmental wordform and prefix channel matrices',
                         {'Comment':f"See also corresponding .json file @ {o + 'p3Y1X012_RE' + '.json'}"})

In [105]:
if r:
    PD_md = {
        'r':r,
        'C':{'from fp':c,
         'changes':"Added ⋉ to the outcomes of every existing conditioning outcome; added new conditioning events X⋉" if r else 'None'}
    }

    my_fp = o + 'p3Y1X01_RE' + '.npy'
    exportMatrixMetadata(my_fp + '_metadata.json',
                         my_fp,
                         p3Y1X01_np,
                         PD_md,
                         'Step 4e',
                         'Calculate segmental wordform and prefix channel matrices',
                         {'Comment':f"See also corresponding .json file @ {o + 'p3Y1X01_RE' + '.json'}"})