In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#numpy-representations" data-toc-modified-id="numpy-representations-5"><span class="toc-item-num">5&nbsp;&nbsp;</span><code>numpy</code> representations</a></span></li><li><span><a href="#Calculation" data-toc-modified-id="Calculation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Calculation</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Export</a></span><ul class="toc-item"><li><span><a href="#Segment-sequence-(all-prefixes-or-just-wordforms)-channel-matrices" data-toc-modified-id="Segment-sequence-(all-prefixes-or-just-wordforms)-channel-matrices-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Segment sequence (all prefixes or just wordforms) channel matrices</a></span></li><li><span><a href="#Representations-of-$p_3(Y_1|X_0,-X_1;-X2)$-(and-$p_3(Y_1|X_0;-X_1)$)" data-toc-modified-id="Representations-of-$p_3(Y_1|X_0,-X_1;-X2)$-(and-$p_3(Y_1|X_0;-X_1)$)-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Representations of $p_3(Y_1|X_0, X_1; X2)$ (and $p_3(Y_1|X_0; X_1)$)</a></span></li></ul></li></ul></div>

# Overview

Given
 - a filepath to a triphone channel model $c$
 - a filepath $w$ to a `.json` file specifying a conditional distribution $p(W|V)$ on segmental wordforms given orthographic ones
 - an output filepath prefix $o$
 - an optional flag $f$ indicating whether to do calculations for both full wordforms and prefixes (`True`, default) or just full wordforms (`False`)
 - an optional filepath $p$ to a `.json` file specifying a 'preview' channel distribution to be included in calculated channel matrices.

this notebook calculates a channel matrix for each source prefix (if $f$, otherwise just for full source wordforms) and writes these channel matrices to file (with prefix given by $o$), with each file corresponding to a block of source prefixes (if $f$, else full source wordforms) of the same length. Within a block, the ordering of source prefixes/wordforms is given by alphabetically sorting the relevant set of prefixes (or just full wordforms, if $f$).

#FIXME update to reflect other exports

!!!! #FIXME **magic numbers "38" and "39" should be replaced with references to the size of Y1s / Y1s_RE**

## Requirements

 - `numpy`
 - `pytorch`

## Usage

#FIXME

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
# Parameters

c = ''
# c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

w = ''
# w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

o = ''
# o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_'

f = ''
# f = 'True'

p = ''
# p = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/p3Y1X01.json'

In [5]:
ensure_dir_exists(path.dirname(o))

In [6]:
if f == '':
    f = 'True'

if p == '':
    r = False
else:
    r = True
    print('Including preview distribution in channel matrix calculations.')

In [7]:
if f == 'True':
    f = True
elif f == 'False':
    f = False
else:
    raise Exception(f"f must be either 'True' or 'False', got '{f}'")

# Imports

In [8]:
from probdist import *

In [9]:
from string_utils import *

In [10]:
import numpy as np
import torch

In [11]:
import pickle

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
    if torch.cuda.device_count() > 1:
        print(torch.cuda.get_device_name(1))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(1)/1024**3,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_cached(1)/1024**3,1), 'GB')

Using device: cuda

GeForce RTX 2070
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
GeForce RTX 2070
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [13]:
gpu = torch.device('cuda')
cpu = torch.device('cpu')

my_device = cpu

In [14]:
cuda_ft = torch.cuda.FloatTensor
cuda_dt = torch.cuda.DoubleTensor

ft = torch.FloatTensor
dt = torch.DoubleTensor

my_ft = ft
my_dt = dt

torch.set_default_tensor_type(my_ft)

# Load data

In [15]:
p3Y1X012 = condDistsAsProbDists(importProbDist(c))

assert uniformOutcomes(p3Y1X012)

In [16]:
if r:
    p3Y1X01 = condDistsAsProbDists(importProbDist(p))
    assert uniformOutcomes(pY1X01)

In [17]:
pW_V = condDistsAsProbDists(importProbDist(w))

In [18]:
#extract segmental wordforms from w
Ws = union(list(map(lambda d: set(conditions(d)), 
                    pW_V.values())))
Ws_t = tuple(sorted(list(Ws)))
print(f'|Wordforms| = {len(Ws)}')

#extract prefixes from w
Ps = union(map(getPrefixes, Ws))
prefixes = Ps
print(f'|Prefixes| = {len(Ps)}')
Ps_t = tuple(sorted(list(Ws)))
prefixes_t = Ps_t

#extract inventory from w
Xs = lexiconToInventory(Ws)
    
#extract triphones from w
lexiconTriphones = lexiconTo3factors(Ws)
print(f'|triphones| in lexicon = {len(lexiconTriphones)}')

|Wordforms| = 6403
|Prefixes| = 21475
|triphones| in lexicon = 5760


In [19]:
#extract triphones from c
channelTriphones = set(p3Y1X012.keys())

print(f'|triphones| in channel model = {len(channelTriphones)}')

X012s = channelTriphones
X012s_t = tuple(sorted(list(X012s)))

#extract response phones
Y1s = outcomes(p3Y1X012)
Y1s_t = tuple(sorted(list(Y1s)))
print(f'|Y1s| = {len(Y1s)}')

leftEdge in Y1s
rightEdge in Y1s

|triphones| in channel model = 5760
|Y1s| = 38


False

False

In [20]:
assert all({triph in channelTriphones for triph in lexiconTriphones})

In [21]:
if r:
    channelDiphones = set(p3Y1X01.keys())
    print(f'|X012s| in channel model = {len(channelDiphones)}')
    
    lexiconDiphones = lexiconTo2factors(Ws)
    unmodelableLexiconDiphones = {diph for diph in lexiconDiphones if diph not in channelDiphones}
    print(f'unmodelable lexicon diphones = \n{unmodelableLexiconDiphones}')
    assert all({diph in channelDiphones for diph in lexiconDiphones if ds2t(diph)[0] != leftEdge and ds2t(diph)[1] != rightEdge})
    print(f'|X012s| in lexicon = {len(lexiconDiphones)}')
    
    X01s = lexiconDiphones
    assert outcomes(p3Y1X01) == Y1s
    
    

There are no gating trials that bear on $p(Y_{i+1}|X_i; X_{i+1} = ⋉)$, but a reasonable assumption is that there are plenty of good acoustic cues that any given segment $X_i$ is the end of the word (i.e. that $X_{i+1} = ⋉$) given the context of an isolated word recognition task, and that there are plenty of good acoustic cues that any given segment is NOT the end of the word.

In [22]:
if r:
    p3Y1X01 = condProbDistAsDicts(p3Y1X01)
    
    # add ⋉ to the outcomes of every existing conditioning outcome
    for x01 in p3Y1X01:
        p3Y1X01[x01].update({rightEdge:0.0})

    # create new conditioning events
    wordEndDiphones = {x + '.' + rightEdge for x in Xs}
    list(wordEndDiphones)[:5]

    # create their distribution over outcomes
    deltaDist = {y1:0.0 for y1 in Y1s}
    deltaDist.update({rightEdge:1.0})

    # add the new wordend conditioning events to the preview distribution
    p3Y1X01.update({wordEnd:deltaDist for wordEnd in wordEndDiphones})
    p3Y1X01['aʊ.s']['s']
    p3Y1X01['ɑ.⋉']

    # check that everything worked
    for x01 in p3Y1X01:
        assert rightEdge in p3Y1X01[x01]
    #     if rightEdge not in p3Y1X01[x01]:
    #         p3Y1X01[x01][rightEdge] = 0.0

    assert areNormalized(p3Y1X01)
    assert uniformOutcomes(p3Y1X01)

    channelDiphones = set(p3Y1X01.keys())

    unmodelableLexiconDiphones = {diph for diph in lexiconDiphones if diph not in channelDiphones}
    print(f'unmodelable lexicon diphones = \n{unmodelableLexiconDiphones}')
    assert all({diph in channelDiphones for diph in lexiconDiphones if ds2t(diph)[0] != leftEdge and ds2t(diph)[1] != rightEdge})
    
    #we'll worry about left-edge initial diphones later
    
    # let's trim the preview model's conditioning events
    p3Y1X01 = {x01:p3Y1X01[x01] for x01 in p3Y1X01 if x01 in lexiconDiphones}
    
    p3Y1X01 = condDistsAsProbDists(p3Y1X01)
    
    X01s_RE = set(p3Y1X01.keys())
    len(X01s_RE)
    
#     print(X01s_RE - X01s)

# `numpy` representations

In [23]:
Xmap = seqsToIndexMap(Xs)
XOHmap = seqsToOneHotMap(Xs)

In [24]:
X012map = seqsToIndexMap(X012s)
# X012OHs = seqMapToOneHots(X012map)
X012OHmap = seqsToOneHotMap(X012s)

In [25]:
Y1map = seqsToIndexMap(Y1s)

In [26]:
if r:
    X01REmap = seqsToIndexMap(X01s_RE)
    X01REOHs = seqMapToOneHots(X01REmap)
    X01REOHmap = seqsToOneHotMap(X01s_RE)
    
    Y1s_RE = outcomes(p3Y1X01)
    len(Y1s_RE)
    Y1s_RE_list = sorted(list(Y1s_RE))

    print(Y1s_RE - Y1s)

    Y1REmap = seqsToIndexMap(Y1s_RE)

    Y1REOHs = seqMapToOneHots(Y1REmap)
    Y1REOHmap = seqsToOneHotMap(Y1s_RE)
    OHY1REmap = oneHotToSeqMap(Y1s_RE)

If `r` is `True`, then to ensure uniformity of event spaces between the triphone channel distribution and the preview distribution, we'll add a $⋉$ outcome (with probability 0.0) to each conditional distribution in the triphone channel distribution.

In [27]:
if r:
    for x012 in p3Y1X012:
        p3Y1X012[x012].update({rightEdge:0.0})
        assert rightEdge in p3Y1X012[x012]
        assert p3Y1X012[x012][rightEdge] == 0.0

    outcomes(p3Y1X012) == Y1s
    outcomes(p3Y1X012) == Y1s_RE
    areNormalized(p3Y1X012)
    uniformOutcomes(p3Y1X012)

In [28]:
def dsToUniphoneIndices(ds, uniphoneToIndexMap):
    uniphoneSeq = ds2t(ds)
    return np.array([uniphoneToIndexMap[uniphone] for uniphone in uniphoneSeq])

def dsToUniphoneOHs(ds, uniphoneToOHmap):
    uniphoneSeq = ds2t(ds)
    return np.array([uniphoneToOHmap[uniphone] for uniphone in uniphoneSeq])

def dsToTriphoneSeq(ds):
    return dsToKfactorSequence(3, ds)

def dsToTriphoneIndices(ds, triphoneToIndexMap):
    triphoneSeq = dsToTriphoneSeq(ds)
    return np.array([triphoneToIndexMap[triphone] for triphone in triphoneSeq])

def dsToTriphoneOHs(ds, triphoneToOHmap):
    triphoneSeq = dsToTriphoneSeq(ds)
    return np.array([triphoneToOHmap[triphone] for triphone in triphoneSeq])

dsToUniphoneIndices('t.i.f.l', Xmap)
dsToUniphoneOHs('t.i.f.l', XOHmap)
dsToTriphoneSeq('t.i.f.l')
dsToTriphoneIndices('t.i.f.l', X012map)
dsToTriphoneOHs('t.i.f.l', X012OHmap)
dsToTriphoneOHs('t.i.f.l', X012OHmap).shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0].shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0][5528]
dsToTriphoneOHs('t.i.f.l', X012OHmap)[1][5352]

array([18,  9,  6, 12])

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.]])

('t.i.f', 'i.f.l')

array([2904, 1146])

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

(2, 5760)

(5760,)

0.0

0.0

In [29]:
p3Y1X012_np = condDistFamilyToNP(p3Y1X012)
if r:
    testNPcondDist(p3Y1X012_np, X012map, Y1REmap, p3Y1X012)
else:
    testNPcondDist(p3Y1X012_np, X012map, Y1map, p3Y1X012)
p3Y1X012_np.shape

(38, 5760)

In [30]:
if r:
    p3Y1X01_np = condDistFamilyToNP(p3Y1X01)
    testNPcondDist(p3Y1X01_np, X01REmap, Y1REmap, p3Y1X01)
    p3Y1X01_np.shape

In [31]:
from random import choice

In [32]:
random_source_wordform = choice(list(Ws))
random_source_wordform

'⋊.ɛ.m.p.ʌ.θ.i.⋉'

In [33]:
random_source_prefix = choice(list(Ps))
random_source_prefix

'⋊.ɹ.ɪ.k.ʌ'

In [34]:
def randomPrefix(l, alphabet=Xs):
    return randomString(alphabet, l, hasLeftEdge=True)

In [35]:
random_channel_prefix2 = randomPrefix(len(ds2t(random_source_wordform))-1, alphabet=Y1s)
random_channel_prefix2

'⋊.s.ŋ.ɹ.b.b.ɚ.ɛ'

In [36]:
# random_source_prefix = getRandomKey(pX0i)
random_source_prefix = choice(list(Ps))
while ds2t(random_source_prefix)[-1] == rightEdge:
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
while len(ds2t(random_source_prefix)) > len(ds2t(random_source_wordform)):
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
random_source_prefix
random_channel_prefix = randomPrefix(len(ds2t(random_source_prefix))-1, alphabet=Y1s)
random_channel_prefix

'⋊.aʊ.t.g.oʊ.ɪ'

'⋊.h.ʌ.dʒ.oʊ.ɹ'

# Calculation

In [37]:
def sourcePrefixToTriphones(x0k):
    xp_t = ds2t(x0k) #"x prefix"
    
#     xi = xp_t[-2] #just-completed segment
#     xk = xp_t[-1] #upcoming segment that we only get coarticulatory information about
    
#     xik_ds = t2ds((xi, xk))
#     preview_dist = p3Y1X01[xik_ds]
    
    x012s = dsToKfactorSequence(3, t2ds(xp_t))
    return x012s

random_triphoneSeq = sourcePrefixToTriphones(random_source_prefix)
random_triphoneSeq
threeFactorSequenceToDS(random_triphoneSeq)

('⋊.aʊ.t', 'aʊ.t.g', 't.g.oʊ', 'g.oʊ.ɪ')

'⋊.aʊ.t.g.oʊ.ɪ'

In [38]:
def sourcePrefixToTriphoneIndices(x0k):
    triphoneSequence = sourcePrefixToTriphones(x0k)
    return tuple(map(lambda x012: X012map[x012], triphoneSequence))

sourcePrefixToTriphoneIndices(random_source_prefix)

(5341, 160, 2898, 953)

In [39]:
blah = np.zeros((len(Y1s), 1))
blah[-1] = 1.0
blah

array([[ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.]])

In [65]:
def sourcePrefixToChannelMatrix_l(x0k, debug=False):
    triphoneOHs = dsToTriphoneOHs(x0k, X012OHmap)
    if debug:
        print('x0k = {0}'.format(x0k))
        print('|x0k| = {0}'.format(len(x0k)))
        print('triphoneIdxs = {0}'.format(sourcePrefixToTriphoneIndices(x0k)))
        print('triphoneOHs.shape = {0}'.format(triphoneOHs.shape))
        print('p3Y1X012_np.shape = {0}'.format(p3Y1X012_np.shape))
        print('result = p3Y1X012_np * triphoneOHs.T')
    result = np.matmul(p3Y1X012_np, triphoneOHs.T)
    return result
# sourcePrefixToChannelMatrix_l(random_source_prefix, True)

if r:
    def sourcePrefixToChannelMatrix(x0k):
        triphoneIndices = sourcePrefixToTriphoneIndices(x0k)
        C = np.array([[p3Y1X012_np[Y1REmap[y1], x012_idx] for x012_idx in triphoneIndices] for y1 in sorted(Y1s_RE)])
    #     C = np.array([[p3Y1X012_np[Y1map[y1], x012_idx] 
    #                    for x012_idx in triphoneIndices] 
    #                   for y1 in Y1s_t])
        if x0k == leftEdge or (len(ds2t(x0k)) == 2 and ds2t(x0k)[0] == leftEdge):
            C = np.zeros((len(Y1s_RE), 1))
    #         C = np.zeros((len(Y1s), 1))
            C[-1] = 1.0
            return C.reshape(len(Y1s_RE),1)
    #         return C.reshape(len(Y1s),1)
        return C
else:
    def sourcePrefixToChannelMatrix(x0k):
        triphoneIndices = sourcePrefixToTriphoneIndices(x0k)
#         C = np.array([[p3Y1X012_np[Y1REmap[y1], x012_idx] for x012_idx in triphoneIndices] for y1 in sorted(Y1s_RE)])
        C = np.array([[p3Y1X012_np[Y1map[y1], x012_idx] 
                       for x012_idx in triphoneIndices] 
                      for y1 in Y1s_t])
        if x0k == leftEdge or (len(ds2t(x0k)) == 2 and ds2t(x0k)[0] == leftEdge):
#             C = np.zeros((len(Y1s_RE), 1))
            C = np.zeros((len(Y1s), 1))
            C[-1] = 1.0
#             return C.reshape(len(Y1s_RE),1)
            return C.reshape(len(Y1s),1)
        assert len(triphoneIndices) == len(dsToKfactorSequence(3, x0k)), f"{len(triphoneIndices)} != {len(dsToKfactorSequence(3, x0k))}\n\t x0k = {x0k}\n\t {dsToKfactorSequence(3, x0k)}\n\t {triphoneIndices}"
        assert len(dsToKfactorSequence(3, x0k)) == C.shape[1], f"{C.shape[1]} != {len(dsToKfactorSequence(3, x0k))}\n\t x0f = {wordform}"
        return C


# sourcePrefixToChannelMatrix(random_source_prefix)

random_source_prefix
sourcePrefixToChannelMatrix_l(random_source_prefix).shape
print(sourcePrefixToChannelMatrix_l(random_source_prefix) == sourcePrefixToChannelMatrix(random_source_prefix))

'⋊.aʊ.t.g.oʊ.ɪ'

(38, 4)

[[ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]
 

In [41]:
if r:
    def sourcePrefixToPreviewVector(x0k):
        xp_t = ds2t(x0k) #"x prefix"

        if len(xp_t) < 2:
            raise Exception('|x0k| must be > 1.')
        if len(xp_t) == 2 and xp_t[0] == leftEdge:
    #         raise Exception("There's no gating data that bears on this calculation, nor is it that interesting.")
            uniformProb = 1.0 / len(Y1s_RE)
            preview_dist = uniformProb * np.ones((len(Y1s_RE), 1))#garbage
            return preview_dist.reshape(len(Y1s_RE),1)

        xi = xp_t[-2] #just-completed segment
        xk = xp_t[-1] #upcoming segment that we only get coarticulatory information about

        xik_ds = t2ds((xi, xk))
        preview_dist = p3Y1X01[xik_ds]
    #     assert Y1s_RE == set(preview_dist.keys()) #comment out once you are reasonably confident this is true by construction

        return np.array([preview_dist[y1] for y1 in sorted(Y1s_RE)])

    sourcePrefixToPreviewVector(random_source_prefix)

In [42]:
if r:
    # returns p(Y0K|x0k)
    def makeExtendedChannelMatrixByPrefix(prefix):
        # NB:
        # if len(prefix) == n (including leftEdge), 
        # then the extended channel matrix will have dimensions 39 x (n-1)

        p = prefix
        if prefix != leftEdge:# and not (len(ds2t(p)) == 2 and ds2t(p)[0] == leftEdge):
    #     if prefix != leftEdge and not (len(ds2t(p)) == 2 and ds2t(p)[0] == leftEdge):
            return np.hstack( (sourcePrefixToChannelMatrix(p) , sourcePrefixToPreviewVector(p).reshape(39,1)))
        else: #the extended channel matrix is garbage that should never be asked for
            l = len(ds2t(p))
            return np.zeros((39, l-1))

In [43]:
if f:
    print('Source sequences = wordforms and prefixes')
    source_seqs = prefixes_t #prefixes include full wordforms
else:
    print('Source sequences = just full wordforms')
    source_seqs = Ws_t

Source sequences = wordforms and prefixes


In [44]:
if r:
    xCMsByPrefixIndex = [makeExtendedChannelMatrixByPrefix(s)
                         for s in source_seqs]
    xCMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in xCMsByPrefixIndex[1:]]

    xCMsByPrefixIndex[3].shape

In [45]:
CMsByPrefixIndex = [sourcePrefixToChannelMatrix_l(s)
                     for s in source_seqs]
CMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in CMsByPrefixIndex[1:]]

CMsByPrefixIndex[3].shape

(38, 5)

In [46]:
# def wordformsOfLength(l, includingEdges = False):
#     if includingEdges:
#         return {w for w in Ws if len(ds2t(w)) == l}
#     return {w for w in Ws if len(ds2t(w)) == l + 2}

In [49]:
wordformsOfLength(16, Ws)

{'⋊.d.ɛ.m.ʌ.n.s.t.ɹ.eɪ.t.ɪ.ŋ.⋉',
 '⋊.d.ɪ.n.ɑ.m.ɪ.n.eɪ.ʃ.ʌ.n.z.⋉',
 '⋊.d.ɪ.s.t.ɹ.ɪ.b.j.u.t.ɪ.ŋ.⋉',
 '⋊.d.ɪ.s.t.ɹ.ɪ.b.j.u.t.ʌ.d.⋉',
 '⋊.d.ɪ.s.ʌ.p.ɔɪ.n.t.m.ʌ.n.t.⋉',
 '⋊.d.ɪ.v.ɛ.l.ʌ.p.m.ʌ.n.t.s.⋉',
 '⋊.g.ɹ.æ.n.d.tʃ.ɪ.l.d.ɹ.ʌ.n.⋉',
 '⋊.k.ɑ.m.p.l.ɪ.k.eɪ.ʃ.ʌ.n.z.⋉',
 '⋊.k.ɑ.m.p.ɹ.ɪ.h.ɛ.n.ʃ.ʌ.n.⋉',
 '⋊.k.ɑ.n.s.ɪ.k.w.ɛ.n.s.ʌ.z.⋉',
 '⋊.k.ɑ.n.s.ɪ.k.w.ɛ.n.t.l.i.⋉',
 '⋊.k.ɑ.n.s.ʌ.n.t.ɹ.eɪ.t.ʌ.d.⋉',
 '⋊.k.ɑ.z.m.ʌ.p.ɑ.l.ɪ.t.ʌ.n.⋉',
 '⋊.k.ʌ.m.j.u.n.ɪ.k.eɪ.t.ʌ.d.⋉',
 '⋊.k.ʌ.m.j.u.n.ɪ.k.eɪ.ʃ.ʌ.n.⋉',
 '⋊.k.ʌ.n.t.ɪ.n.j.u.ʌ.s.l.i.⋉',
 '⋊.k.ʌ.n.t.ɪ.n.j.ʊ.eɪ.ʃ.ʌ.n.⋉',
 '⋊.k.ʌ.n.t.ɹ.ɪ.b.j.u.t.ɪ.ŋ.⋉',
 '⋊.k.ʌ.n.v.i.n.i.ʌ.n.s.ʌ.z.⋉',
 '⋊.m.ɛ.t.ɹ.ʌ.p.ɑ.l.ɪ.t.ʌ.n.⋉',
 '⋊.m.ʌ.n.ɪ.p.j.ʊ.l.eɪ.t.ɪ.ŋ.⋉',
 '⋊.oʊ.ɹ.g.ʌ.n.aɪ.z.eɪ.ʃ.ʌ.n.z.⋉',
 '⋊.p.i.d.i.ʌ.t.ɹ.ɪ.ʃ.ʌ.n.z.⋉',
 '⋊.p.ɑ.ɹ.t.ɪ.s.ɪ.p.eɪ.t.ɪ.ŋ.⋉',
 '⋊.p.ɑ.ɹ.t.ɪ.s.ɪ.p.eɪ.t.ʌ.d.⋉',
 '⋊.p.ɑ.ɹ.t.ɪ.s.ɪ.p.eɪ.ʃ.ʌ.n.⋉',
 '⋊.p.ɹ.ɛ.z.b.ɪ.t.ɪ.ɹ.i.ʌ.n.⋉',
 '⋊.p.ɹ.ɪ.s.k.ɹ.ɪ.p.ʃ.ʌ.n.z.⋉',
 '⋊.s.p.ɪ.ɹ.ɪ.tʃ.ʊ.æ.l.ʌ.t.i.⋉',
 '⋊.s.ʌ.b.s.k.ɹ.ɪ.p.ʃ.ʌ.n.z.⋉',
 '⋊.t.ɛ.l.ʌ.m.ɑ.ɹ.k.ɛ.d

In [52]:
wordlengthsInclEdges = set(len(ds2t(w)) for w in Ws)
wordlengthsInclEdges
numWordsOfExactlyLength = {l:len(wordformsOfLength(l, Ws, True)) for l in wordlengthsInclEdges}
numWordsOfExactlyLength

{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

{3: 11,
 4: 144,
 5: 808,
 6: 1201,
 7: 1183,
 8: 974,
 9: 779,
 10: 555,
 11: 354,
 12: 211,
 13: 98,
 14: 51,
 15: 25,
 16: 5,
 17: 2,
 18: 1,
 19: 1}

In [53]:
wordlengthsNotIncludingEdges = {each-2 for each in wordlengthsInclEdges}
wordlengthsNotIncludingEdges

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}

In [None]:
# def wordformsAtLeastLlong(l, includingEdges = False):
#     if includingEdges:
#         maxL = max(wordlengthsInclEdges)
#         return union([wordformsOfLength(eachl, includingEdges) for eachl in range(l, maxL+1)])
#     else:
#         maxL = max(wordlengthsNotIncludingEdges)
#         return union([wordformsOfLength(eachl, includingEdges) for eachl in range(l, maxL+1)])

In [55]:
lengthFreqs = {l:len(wordformsAtLeastLlong(l, Ws, True)) for l in wordlengthsInclEdges}
lengthFreqs

{3: 6403,
 4: 6392,
 5: 6248,
 6: 5440,
 7: 4239,
 8: 3056,
 9: 2082,
 10: 1303,
 11: 748,
 12: 394,
 13: 183,
 14: 85,
 15: 34,
 16: 9,
 17: 4,
 18: 2,
 19: 1}

In [66]:
# returns p(Y0i|x0f), padded if necessary
def makeChannelMatrixByWordformAndLength(wordform, key_length):
    x0f = wordform
    x0f_t = ds2t(x0f)
    x0f_length = len(x0f_t)
    if x0f_length == key_length:
        return sourcePrefixToChannelMatrix(x0f)
    elif x0f_length > key_length:
#         print('middle case')
        #trim the wordform to be a prefix of length = key_length
        x0k_t = x0f_t[:key_length]
#         assert len(x0k_t) == key_length
        x0k = t2ds(x0k_t)
#         print('x0k: {0}'.format(x0k))
        cm = sourcePrefixToChannelMatrix(x0k)
        assert len(dsToKfactorSequence(3, x0k)) == cm.shape[1], f"{cm.shape[1]} != {len(dsToKfactorSequence(3, x0k))}\n\t x0f = {wordform}\n\t key_length = {key_length}"
        return cm
    else:
        #grab the source 
        my_CM = sourcePrefixToChannelMatrix(x0f)
        goal_l = key_length
        #extend the channel matrix with padding
        cm = np.pad(my_CM, ((0,0), (0, goal_l - my_CM.shape[1] - 2)), 
                      'constant', constant_values=0.0)
        assert key_length - 2 == cm.shape[1], f"{cm.shape[1]} != {key_length - 2}\n\t x0f = {wordform}\n\t key_length = {key_length}"
        return cm

In [57]:
if r:
    # returns p(Y0K|x0f)
    def makeExtendedChannelMatrixByWordformAndLength(wordform, key_length):
        x0f = wordform
        x0f_t = ds2t(x0f)
        x0f_length = len(x0f_t)
        if x0f_length == key_length:
            return makeExtendedChannelMatrixByPrefix(x0f)
        elif x0f_length > key_length:
    #         print('middle case')
            #trim the wordform to be a prefix of length = key_length
            x0k_t = x0f_t[:key_length]
            x0k = t2ds(x0k_t)
    #         print('x0k: {0}'.format(x0k))
            return makeExtendedChannelMatrixByPrefix(x0k)
        else:
            #grab the source 
            my_xCM = makeExtendedChannelMatrixByPrefix(x0f)
            goal_l = key_length
            return np.pad(my_xCM, ((0,0), (0, goal_l - my_xCM.shape[1] - 1)), 
                          'constant', constant_values=0.0)

In [58]:
wordlengthsInclEdges

{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

In [67]:
# ~17s on wittgenstein under load
offset = [np.zeros(shape=(0,0)) for each in range(min(wordlengthsInclEdges))]
cmsByLengthByWordformIndex = offset + [np.array([makeChannelMatrixByWordformAndLength(w, l)
                                                 for w in Ws_t])
                                       for l in sorted(list(wordlengthsInclEdges))]
cmsByLengthByWordformIndex_torch = list(map(lambda cm: torch.from_numpy(cm).type(my_ft), cmsByLengthByWordformIndex))

In [68]:
for l in wordlengthsInclEdges:
    assert all(cm.shape[1] == l - 2 for cm in cmsByLengthByWordformIndex[l])

In [69]:
if r:
    xCMsByLengthByWordformIndex = offset + [np.array([makeExtendedChannelMatrixByWordformAndLength(w, l)
                                                      for w in Ws_t])
                                            for l in sorted(list(wordlengthsInclEdges))]
    xCMsByLengthByWordformIndex_torch = list(map(lambda xCM: torch.from_numpy(xCM).type(my_ft), xCMsByLengthByWordformIndex))

# Export

## Segment sequence (all prefixes or just wordforms) channel matrices

We want to save 
 - `CMsByPrefixIndex`
 - `cmsByLengthByWordformIndex`
 
(and/or their extended analogues, if `r`) to disk, and when importing, we will need to know
 - the set/sequence of key strings (prefixes or just wordforms)

In [70]:
len(CMsByPrefixIndex)
# CMsByPrefixIndex.nbytes / 1e9

6403

In [71]:
len(cmsByLengthByWordformIndex)
cmsByLengthByWordformIndex[0].shape
cmsByLengthByWordformIndex[1].shape
cmsByLengthByWordformIndex[2].shape
cmsByLengthByWordformIndex[3].shape
cmsByLengthByWordformIndex[10].nbytes / 1e9

20

(0, 0)

(0, 0)

(0, 0)

(6403, 38, 1)

0.015572096

In [72]:
if r:
    pickle.dump(xCMsByPrefixIndex, open(o + 'xCMs_by_prefix_index.pickle', 'wb'))
else:
    pickle.dump(CMsByPrefixIndex, open(o + 'CMs_by_prefix_index.pickle', 'wb'))

In [73]:
if not r:
    CMsByPrefixIndex_in = pickle.load(open(o + 'CMs_by_prefix_index.pickle', 'rb'))
    len(CMsByPrefixIndex_in)

    assert all(np.array_equal(CMsByPrefixIndex_in[i], CMsByPrefixIndex[i]) for i in range(len(CMsByPrefixIndex)))

6403

In [74]:
if not r:
    CMsByPrefixIndex_in[3].shape
    CMsByPrefixIndex[3].shape

(38, 5)

(38, 5)

In [75]:
CMs_by_prefix_idx_md = {
    'r':r,
    'length':len(xCMsByPrefixIndex) if r else len(CMsByPrefixIndex),
    'W':{'from fp':w,
         'changes':'(x)CMs constructed from sorted prefixes of W',
         'size':len(Ws_t)},
    'P':{'from fp':w,
         'changes':'(x)CMs constructed from sorted prefixes of W',
         'size':len(Ps_t)},
    'C':{'from fp':c,
         'changes':"Added ⋉ to the outcomes of every existing conditioning outcome; added new conditioning events X⋉" if r else 'None'}
}

my_fp = o + 'xCMs_by_prefix_index.pickle' if r else o + 'CMs_by_prefix_index.pickle'
exportMatrixMetadata(my_fp + '_metadata.json',
                     my_fp,
                     None,
                     CMs_by_prefix_idx_md,
                     'Step 4e',
                     'Calculate segmental wordform and prefix channel matrices',
                     {})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle
 to 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle_metadata.json


In [76]:
# importDict(o + '.pW_C' + '_metadata.json')

In [77]:
if r:
    pickle.dump(xCMsByLengthByWordformIndex, open(o + 'xCMs_by_length_by_prefix_index.pickle', 'wb'))
else:
    pickle.dump(cmsByLengthByWordformIndex, open(o + 'CMs_by_length_by_prefix_index.pickle', 'wb'))

In [78]:
CMs_by_length_by_prefix_idx_md = {
    'r':r,
    'length':len(xCMsByLengthByWordformIndex) if r else len(cmsByLengthByWordformIndex),
    'W':{'from fp':w,
         'changes':'(x)CMs constructed from sorted prefixes of W',
         'size':len(Ws_t)},
    'P':{'from fp':w,
         'changes':'(x)CMs constructed from sorted prefixes of W',
         'size':len(Ps_t)},
    'C':{'from fp':c,
         'changes':"Added ⋉ to the outcomes of every existing conditioning outcome; added new conditioning events X⋉" if r else 'None'}
}

my_fp = o + 'xCMs_by_length_by_prefix_index.pickle' if r else o + 'CMs_by_length_by_prefix_index.pickle'
exportMatrixMetadata(my_fp + '_metadata.json',
                     my_fp,
                     None,
                     CMs_by_length_by_prefix_idx_md,
                     'Step 4e',
                     'Calculate segmental wordform and prefix channel matrices',
                     {})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_prefix_index.pickle
 to 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_prefix_index.pickle_metadata.json


In [79]:
o

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_'

In [80]:
listdir(path.dirname(o))

['pX0X1X2.npy',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy_metadata.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'p6Y0X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2.npy',
 'Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'p3YX.json',
 'p3Y0X01.json',
 'p3Y01X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.npy',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle_metadata.json',
 'Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb',
 'p6Y01X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.npy_metadata.json',
 'p3Y1X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2.npy_metadata.json',
 'pYX.json',
 'Generating  uniform triphone lexicon dist.ipynb',
 'p6Y1X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X

## Representations of $p_3(Y_1|X_0, X_1; X2)$ (and $p_3(Y_1|X_0; X_1)$)

In [81]:
#if not r, export numpy representation of triphone channel distribution
if not r:
    print(f"Saving p3Y1X012_np to filepath '{o + 'p3Y1X012' + '.npy'}'")
    np.save(o + 'p3Y1X012' + '.npy', p3Y1X012_np)

Saving p3Y1X012_np to filepath 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.npy'


In [82]:
if not r:
    CM_md = {
        'r':r,
        'C':{'from fp':c,
             'changes':'None'}
    }

    my_fp = o + 'p3Y1X012' + '.npy'
    exportMatrixMetadata(my_fp + '_metadata.json',
                         my_fp,
                         p3Y1X012_np,
                         CM_md,
                         'Step 4e',
                         'Calculate segmental wordform and prefix channel matrices',
                         {})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.npy
 to 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.npy_metadata.json


In [83]:
#if r, export .json of modified triphone channel distribution and preview distribution
if r:
    print(f"Saving extended, human-readable version of p3Y1X012 to filepath '{o + 'p3Y1X012_RE' + '.json'}'")
    exportDict(o + 'p3Y1X012_RE' + '.json', condProbDistAsDicts(p3Y1X012))
          
    print(f"Saving extended, human-readable version of p3Y1X01 to filepath '{o + 'p3Y1X01_RE' + '.json'}'")
    exportDict(o + 'p3Y1X01_RE' + '.json', condProbDistAsDicts(p3Y1X01))

#if r, export numpy representation of triphone channel distribution and preview distribution
if r:
    print(f"Saving p3Y1X012_np to filepath '{o + 'p3Y1X012_RE' + '.npy'}'")
    np.save(o + 'p3Y1X012_RE' + '.npy', p3Y1X012_np)
    print(f"Saving p3Y1X01_np to filepath '{o + 'p3Y1X01_RE' + '.npy'}'")
    np.save(o + 'p3Y1X01_RE' + '.npy', p3Y1X01_np)

In [84]:
if r:
    CD_md = {
        'r':r,
        'C':{'from fp':c,
         'changes':"Added ⋉ to the outcomes of every existing conditioning outcome; added new conditioning events X⋉" if r else 'None'}
    }

    my_fp = o + 'p3Y1X012_RE' + '.npy'
    exportMatrixMetadata(my_fp + '_metadata.json',
                         my_fp,
                         p3Y1X012_np,
                         PD_md,
                         'Step 4e',
                         'Calculate segmental wordform and prefix channel matrices',
                         {'Comment':f"See also corresponding .json file @ {o + 'p3Y1X012_RE' + '.json'}"})

In [85]:
if r:
    PD_md = {
        'r':r,
        'C':{'from fp':c,
         'changes':"Added ⋉ to the outcomes of every existing conditioning outcome; added new conditioning events X⋉" if r else 'None'}
    }

    my_fp = o + 'p3Y1X01_RE' + '.npy'
    exportMatrixMetadata(my_fp + '_metadata.json',
                         my_fp,
                         p3Y1X01_np,
                         PD_md,
                         'Step 4e',
                         'Calculate segmental wordform and prefix channel matrices',
                         {'Comment':f"See also corresponding .json file @ {o + 'p3Y1X01_RE' + '.json'}"})