In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#numpy-representations" data-toc-modified-id="numpy-representations-5"><span class="toc-item-num">5&nbsp;&nbsp;</span><code>numpy</code> representations</a></span></li><li><span><a href="#Calculation" data-toc-modified-id="Calculation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Calculation</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Given
 - a filepath to a triphone channel model $c$
 - a filepath $w$ to a `.json` file specifying a conditional distribution $p(W|V)$ on segmental wordforms given orthographic ones
 - an output filepath prefix $o$
 - an optional flag $f$ indicating whether to do calculations for both full wordforms and prefixes (`True`, default) or just full wordforms (`False`)
 - an optional filepath $p$ to a `.json` file specifying a 'preview' channel distribution to be included in calculated channel matrices.

this notebook calculates a channel matrix for each source prefix (if $f$, otherwise just for full source wordforms) and writes these channel matrices to file (with prefix given by $o$), with each file corresponding to a block of source prefixes (if $f$, else full source wordforms) of the same length. Within a block, the ordering of source prefixes/wordforms is given by alphabetically sorting the relevant set of prefixes (or just full wordforms, if $f$).

## Requirements

 - `numpy`
 - `pytorch`

## Usage

#FIXME

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [152]:
# Parameters

c = ''
c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

w = ''
w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

o = ''
o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_'

f = ''
f = 'True'

p = ''
p = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/p3Y1X01.json'

In [5]:
ensure_dir_exists(path.dirname(o))

In [154]:
if f == '':
    f = 'True'

if p == '':
    r = False
else:
    r = True
    print('Including preview distribution in channel matrix calculations.')

Including preview distribution in channel matrix calculations.


In [7]:
if f == 'True':
    f = True
elif f == 'false':
    f = False
else:
    raise Exception(f"f must be either 'True' or 'False', got '{f}'")

# Imports

In [8]:
from probdist import *

In [9]:
from string_utils import *

In [10]:
import numpy as np
import torch

In [208]:
import pickle

In [117]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
    print(torch.cuda.get_device_name(1))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(1)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(1)/1024**3,1), 'GB')

Using device: cuda

GeForce RTX 2070
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
GeForce RTX 2070
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [118]:
gpu = torch.device('cuda')
cpu = torch.device('cpu')

my_device = cpu

In [119]:
cuda_ft = torch.cuda.FloatTensor
cuda_dt = torch.cuda.DoubleTensor

ft = torch.FloatTensor
dt = torch.DoubleTensor

my_ft = ft
my_dt = dt

torch.set_default_tensor_type(my_ft)

# Load data

In [186]:
p3Y1X012 = condDistsAsProbDists(importProbDist(c))

assert uniformOutcomes(p3Y1X012)

In [166]:
if r:
    p3Y1X01 = condDistsAsProbDists(importProbDist(p))
    assert uniformOutcomes(pY1X01)

In [12]:
pW_V = condDistsAsProbDists(importProbDist(w))

In [64]:
#extract segmental wordforms from w
Ws = union(list(map(lambda d: set(conditions(d)), 
                    pW_V.values())))
Ws_t = tuple(sorted(list(Ws)))
print(f'|Wordforms| = {len(Ws)}')

#extract prefixes from w
Ps = union(map(getPrefixes, Ws))
prefixes = Ps
print(f'|Prefixes| = {len(Ps)}')
Ps_t = tuple(sorted(list(Ws)))
prefixes_t = Ps_t

#extract inventory from w
Xs = lexiconToInventory(Ws)
    
#extract triphones from w
lexiconTriphones = lexiconTo3factors(Ws)
print(f'|triphones| in lexicon = {len(lexiconTriphones)}')

|Wordforms| = 6403
|Prefixes| = 21475
|triphones| in lexicon = 5760


In [187]:
#extract triphones from c
channelTriphones = set(p3Y1X012.keys())

print(f'|triphones| in channel model = {len(channelTriphones)}')

X012s = channelTriphones
X012s_t = tuple(sorted(list(X012s)))

#extract response phones
Y1s = outcomes(p3Y1X012)
Y1s_t = tuple(sorted(list(Y1s)))
print(f'|Y1s| = {len(Y1s)}')

leftEdge in Y1s
rightEdge in Y1s

|triphones| in channel model = 5760
|Y1s| = 38


False

False

In [31]:
assert all({triph in channelTriphones for triph in lexiconTriphones})

In [169]:
if r:
    channelDiphones = set(p3Y1X01.keys())
    print(f'|X012s| in channel model = {len(channelDiphones)}')
    
    lexiconDiphones = lexiconTo2factors(Ws)
    unmodelableLexiconDiphones = {diph for diph in lexiconDiphones if diph not in channelDiphones}
    print(f'unmodelable lexicon diphones = \n{unmodelableLexiconDiphones}')
    assert all({diph in channelDiphones for diph in lexiconDiphones if ds2t(diph)[0] != leftEdge and ds2t(diph)[1] != rightEdge})
    print(f'|X012s| in lexicon = {len(lexiconDiphones)}')
    
    X01s = lexiconDiphones
    assert outcomes(p3Y1X01) == Y1s
    
    

|X012s| in channel model = 1323
unmodelable lexicon diphones = 
{'s.⋉', '⋊.h', '⋊.ɚ', '⋊.ʃ', '⋊.s', 'oʊ.⋉', '⋊.æ', '⋊.ð', 'ɔɪ.⋉', 'd.⋉', 'b.⋉', '⋊.ɪ', 'ð.⋉', 'ɚ.⋉', 'ɹ.⋉', 'ʌ.⋉', '⋊.b', 't.⋉', '⋊.m', '⋊.g', 'p.⋉', 'æ.⋉', 'n.⋉', '⋊.k', '⋊.p', 'ʒ.⋉', 'dʒ.⋉', '⋊.t', 'v.⋉', 'aʊ.⋉', '⋊.dʒ', '⋊.n', '⋊.ɔɪ', '⋊.l', 'z.⋉', '⋊.ʌ', 'ŋ.⋉', 'ɛ.⋉', '⋊.w', '⋊.ɛ', 'f.⋉', '⋊.tʃ', 'ʃ.⋉', 'ɑ.⋉', '⋊.oʊ', 'aɪ.⋉', '⋊.v', 'u.⋉', 'i.⋉', 'tʃ.⋉', '⋊.z', '⋊.ɑ', 'eɪ.⋉', '⋊.aʊ', '⋊.j', '⋊.d', '⋊.i', '⋊.aɪ', '⋊.f', '⋊.ɹ', '⋊.eɪ', 'k.⋉', 'm.⋉', 'l.⋉', 'θ.⋉', '⋊.θ', 'g.⋉'}
|X012s| in lexicon = 904


There are no gating trials that bear on $p(Y_{i+1}|X_i; X_{i+1} = ⋉)$, but a reasonable assumption is that there are plenty of good acoustic cues that any given segment $X_i$ is the end of the word (i.e. that $X_{i+1} = ⋉$) given the context of an isolated word recognition task, and that there are plenty of good acoustic cues that any given segment is NOT the end of the word.

In [175]:
if r:
    p3Y1X01 = condProbDistAsDicts(p3Y1X01)
    
    # add ⋉ to the outcomes of every existing conditioning outcome
    for x01 in p3Y1X01:
        p3Y1X01[x01].update({rightEdge:0.0})

    # create new conditioning events
    wordEndDiphones = {x + '.' + rightEdge for x in Xs}
    list(wordEndDiphones)[:5]

    # create their distribution over outcomes
    deltaDist = {y1:0.0 for y1 in Y1s}
    deltaDist.update({rightEdge:1.0})

    # add the new wordend conditioning events to the preview distribution
    p3Y1X01.update({wordEnd:deltaDist for wordEnd in wordEndDiphones})
    p3Y1X01['aʊ.s']['s']
    p3Y1X01['ɑ.⋉']

    # check that everything worked
    for x01 in p3Y1X01:
        assert rightEdge in p3Y1X01[x01]
    #     if rightEdge not in p3Y1X01[x01]:
    #         p3Y1X01[x01][rightEdge] = 0.0

    assert areNormalized(p3Y1X01)
    assert uniformOutcomes(p3Y1X01)

    channelDiphones = set(p3Y1X01.keys())

    unmodelableLexiconDiphones = {diph for diph in lexiconDiphones if diph not in channelDiphones}
    print(f'unmodelable lexicon diphones = \n{unmodelableLexiconDiphones}')
    assert all({diph in channelDiphones for diph in lexiconDiphones if ds2t(diph)[0] != leftEdge and ds2t(diph)[1] != rightEdge})
    
    #we'll worry about left-edge initial diphones later
    
    # let's trim the preview model's conditioning events
    p3Y1X01 = {x01:p3Y1X01[x01] for x01 in p3Y1X01 if x01 in lexiconDiphones}
    
    p3Y1X01 = condDistsAsProbDists(p3Y1X01)
    
    X01s_RE = set(p3Y1X01.keys())
    len(X01s_RE)
    
#     print(X01s_RE - X01s)

['s.⋉', 'w.⋉', 'oʊ.⋉', 'ɔɪ.⋉', 'd.⋉']

0.007009776793949458

{'aɪ': 0.0,
 'æ': 0.0,
 'ʃ': 0.0,
 'g': 0.0,
 't': 0.0,
 'v': 0.0,
 'dʒ': 0.0,
 'ɹ': 0.0,
 'aʊ': 0.0,
 'u': 0.0,
 'oʊ': 0.0,
 'ʒ': 0.0,
 'k': 0.0,
 'ð': 0.0,
 'θ': 0.0,
 'tʃ': 0.0,
 'eɪ': 0.0,
 'j': 0.0,
 'm': 0.0,
 'ɚ': 0.0,
 'b': 0.0,
 'p': 0.0,
 'n': 0.0,
 's': 0.0,
 'w': 0.0,
 'f': 0.0,
 'ʊ': 0.0,
 'i': 0.0,
 'ɔɪ': 0.0,
 'z': 0.0,
 'ɪ': 0.0,
 'ŋ': 0.0,
 'ʌ': 0.0,
 'ɑ': 0.0,
 'ɛ': 0.0,
 'h': 0.0,
 'l': 0.0,
 'd': 0.0,
 '⋉': 1.0}

unmodelable lexicon diphones = 
{'⋊.h', '⋊.ɚ', '⋊.ð', '⋊.ʃ', '⋊.s', '⋊.æ', '⋊.ɪ', '⋊.b', '⋊.m', '⋊.g', '⋊.k', '⋊.p', '⋊.t', '⋊.dʒ', '⋊.n', '⋊.ɔɪ', '⋊.l', '⋊.ʌ', '⋊.w', '⋊.ɛ', '⋊.tʃ', '⋊.oʊ', '⋊.v', '⋊.z', '⋊.ɑ', '⋊.aʊ', '⋊.j', '⋊.d', '⋊.i', '⋊.aɪ', '⋊.f', '⋊.ɹ', '⋊.eɪ', '⋊.θ'}


870

# `numpy` representations

In [44]:
Xmap = seqsToIndexMap(Xs)
XOHmap = seqsToOneHotMap(Xs)

In [35]:
X012map = seqsToIndexMap(X012s)
# X012OHs = seqMapToOneHots(X012map)
X012OHmap = seqsToOneHotMap(X012s)

In [56]:
Y1map = seqsToIndexMap(Y1s)

In [176]:
if r:
    X01REmap = seqsToIndexMap(X01s_RE)
    X01REOHs = seqMapToOneHots(X01REmap)
    X01REOHmap = seqsToOneHotMap(X01s_RE)
    
    Y1s_RE = outcomes(p3Y1X01)
    len(Y1s_RE)
    Y1s_RE_list = sorted(list(Y1s_RE))

    print(Y1s_RE - Y1s)

    Y1REmap = seqsToIndexMap(Y1s_RE)

    Y1REOHs = seqMapToOneHots(Y1REmap)
    Y1REOHmap = seqsToOneHotMap(Y1s_RE)
    OHY1REmap = oneHotToSeqMap(Y1s_RE)

39

{'⋉'}


If `r` is `True`, then to ensure uniformity of event spaces between the triphone channel distribution and the preview distribution, we'll add a $⋉$ outcome (with probability 0.0) to each conditional distribution in the triphone channel distribution.

In [188]:
if r:
    for x012 in p3Y1X012:
        p3Y1X012[x012].update({rightEdge:0.0})
        assert rightEdge in p3Y1X012[x012]
        assert p3Y1X012[x012][rightEdge] == 0.0

    outcomes(p3Y1X012) == Y1s
    outcomes(p3Y1X012) == Y1s_RE
    areNormalized(p3Y1X012)
    uniformOutcomes(p3Y1X012)

False

True

True

True

In [50]:
def dsToUniphoneIndices(ds, uniphoneToIndexMap):
    uniphoneSeq = ds2t(ds)
    return np.array([uniphoneToIndexMap[uniphone] for uniphone in uniphoneSeq])

def dsToUniphoneOHs(ds, uniphoneToOHmap):
    uniphoneSeq = ds2t(ds)
    return np.array([uniphoneToOHmap[uniphone] for uniphone in uniphoneSeq])

def dsToTriphoneSeq(ds):
    return dsToKfactorSequence(3, ds)

def dsToTriphoneIndices(ds, triphoneToIndexMap):
    triphoneSeq = dsToTriphoneSeq(ds)
    return np.array([triphoneToIndexMap[triphone] for triphone in triphoneSeq])

def dsToTriphoneOHs(ds, triphoneToOHmap):
    triphoneSeq = dsToTriphoneSeq(ds)
    return np.array([triphoneToOHmap[triphone] for triphone in triphoneSeq])

dsToUniphoneIndices('t.i.f.l', Xmap)
dsToUniphoneOHs('t.i.f.l', XOHmap)
dsToTriphoneSeq('t.i.f.l')
dsToTriphoneIndices('t.i.f.l', X012map)
dsToTriphoneOHs('t.i.f.l', X012OHmap)
dsToTriphoneOHs('t.i.f.l', X012OHmap).shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0].shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0][5528]
dsToTriphoneOHs('t.i.f.l', X012OHmap)[1][5352]

array([18,  9,  6, 12])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]])

('t.i.f', 'i.f.l')

array([2904, 1146])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

(2, 5760)

(5760,)

0.0

0.0

In [189]:
p3Y1X012_np = condDistFamilyToNP(p3Y1X012)
testNPcondDist(p3Y1X012_np, X012map, Y1REmap, p3Y1X012)
p3Y1X012_np.shape

(39, 5760)

In [190]:
p3Y1X01_np = condDistFamilyToNP(p3Y1X01)
testNPcondDist(p3Y1X01_np, X01REmap, Y1REmap, p3Y1X01)
p3Y1X01_np.shape

(39, 870)

In [19]:
from random import choice

In [20]:
random_source_wordform = choice(list(Ws))
random_source_wordform

'⋊.k.ʌ.n.s.ɪ.d.ɚ.z.⋉'

In [23]:
random_source_prefix = choice(list(Ps))
random_source_prefix

'⋊.t.ɹ.æ.n.z.l'

In [21]:
def randomPrefix(l, alphabet=Xs):
    return randomString(alphabet, l, hasLeftEdge=True)

In [22]:
random_channel_prefix2 = randomPrefix(len(ds2t(random_source_wordform))-1, alphabet=Y1s)
random_channel_prefix2

'⋊.z.u.b.eɪ.d.p.z.ʒ.ŋ'

In [25]:
# random_source_prefix = getRandomKey(pX0i)
random_source_prefix = choice(list(Ps))
while ds2t(random_source_prefix)[-1] == rightEdge:
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
while len(ds2t(random_source_prefix)) > len(ds2t(random_source_wordform)):
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
random_source_prefix
random_channel_prefix = randomPrefix(len(ds2t(random_source_prefix))-1, alphabet=Y1s)
random_channel_prefix

'⋊.w.ʌ.ʃ'

'⋊.ɑ.ɪ.aɪ'

# Calculation

In [26]:
def sourcePrefixToTriphones(x0k):
    xp_t = ds2t(x0k) #"x prefix"
    
#     xi = xp_t[-2] #just-completed segment
#     xk = xp_t[-1] #upcoming segment that we only get coarticulatory information about
    
#     xik_ds = t2ds((xi, xk))
#     preview_dist = p3Y1X01[xik_ds]
    
    x012s = dsToKfactorSequence(3, t2ds(xp_t))
    return x012s

random_triphoneSeq = sourcePrefixToTriphones(random_source_prefix)
random_triphoneSeq
threeFactorSequenceToDS(random_triphoneSeq)

('⋊.w.ʌ', 'w.ʌ.ʃ')

'⋊.w.ʌ.ʃ'

In [36]:
def sourcePrefixToTriphoneIndices(x0k):
    triphoneSequence = sourcePrefixToTriphones(x0k)
    return tuple(map(lambda x012: X012map[x012], triphoneSequence))

sourcePrefixToTriphoneIndices(random_source_prefix)

(5627, 3445)

In [66]:
blah = np.zeros((len(Y1s), 1))
blah[-1] = 1.0
blah

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.]])

In [193]:
def sourcePrefixToChannelMatrix_l(x0k, debug=False):
    triphoneOHs = dsToTriphoneOHs(x0k, X012OHmap)
    if debug:
        print('x0k = {0}'.format(x0k))
        print('|x0k| = {0}'.format(len(x0k)))
        print('triphoneIdxs = {0}'.format(sourcePrefixToTriphoneIndices(x0k)))
        print('triphoneOHs.shape = {0}'.format(triphoneOHs.shape))
        print('p3Y1X012_np.shape = {0}'.format(p3Y1X012_np.shape))
        print('result = p3Y1X012_np * triphoneOHs.T')
    result = np.matmul(p3Y1X012_np, triphoneOHs.T)
    return result
# sourcePrefixToChannelMatrix_l(random_source_prefix, True)

if r:
    def sourcePrefixToChannelMatrix(x0k):
        triphoneIndices = sourcePrefixToTriphoneIndices(x0k)
        C = np.array([[p3Y1X012_np[Y1REmap[y1], x012_idx] for x012_idx in triphoneIndices] for y1 in sorted(Y1s_RE)])
    #     C = np.array([[p3Y1X012_np[Y1map[y1], x012_idx] 
    #                    for x012_idx in triphoneIndices] 
    #                   for y1 in Y1s_t])
        if x0k == leftEdge or (len(ds2t(x0k)) == 2 and ds2t(x0k)[0] == leftEdge):
            C = np.zeros((len(Y1s_RE), 1))
    #         C = np.zeros((len(Y1s), 1))
            C[-1] = 1.0
            return C.reshape(39,1)
    #         return C.reshape(38,1)
        return C
else:
    def sourcePrefixToChannelMatrix(x0k):
        triphoneIndices = sourcePrefixToTriphoneIndices(x0k)
#         C = np.array([[p3Y1X012_np[Y1REmap[y1], x012_idx] for x012_idx in triphoneIndices] for y1 in sorted(Y1s_RE)])
        C = np.array([[p3Y1X012_np[Y1map[y1], x012_idx] 
                       for x012_idx in triphoneIndices] 
                      for y1 in Y1s_t])
        if x0k == leftEdge or (len(ds2t(x0k)) == 2 and ds2t(x0k)[0] == leftEdge):
#             C = np.zeros((len(Y1s_RE), 1))
            C = np.zeros((len(Y1s), 1))
            C[-1] = 1.0
#             return C.reshape(39,1)
            return C.reshape(38,1)
        return C


# sourcePrefixToChannelMatrix(random_source_prefix)

random_source_prefix
sourcePrefixToChannelMatrix_l(random_source_prefix).shape
print(sourcePrefixToChannelMatrix_l(random_source_prefix) == sourcePrefixToChannelMatrix(random_source_prefix))

'⋊.w.ʌ.ʃ'

(39, 2)

[[ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]]


In [195]:
if r:
    def sourcePrefixToPreviewVector(x0k):
        xp_t = ds2t(x0k) #"x prefix"

        if len(xp_t) < 2:
            raise Exception('|x0k| must be > 1.')
        if len(xp_t) == 2 and xp_t[0] == leftEdge:
    #         raise Exception("There's no gating data that bears on this calculation, nor is it that interesting.")
            uniformProb = 1.0 / len(Y1s_RE)
            preview_dist = uniformProb * np.ones((len(Y1s_RE), 1))#garbage
            return preview_dist.reshape(39,1)

        xi = xp_t[-2] #just-completed segment
        xk = xp_t[-1] #upcoming segment that we only get coarticulatory information about

        xik_ds = t2ds((xi, xk))
        preview_dist = p3Y1X01[xik_ds]
    #     assert Y1s_RE == set(preview_dist.keys()) #comment out once you are reasonably confident this is true by construction

        return np.array([preview_dist[y1] for y1 in sorted(Y1s_RE)])

    sourcePrefixToPreviewVector(random_source_prefix)

array([0.01110786, 0.01110786, 0.01110786, 0.04004677, 0.01052324,
       0.01110786, 0.01081555, 0.01110786, 0.27418883, 0.01110786,
       0.01110786, 0.0695703 , 0.01110786, 0.01081555, 0.04033908,
       0.01110786, 0.04004677, 0.01110786, 0.18649518, 0.01110786,
       0.01110786, 0.01110786, 0.01110786, 0.01081555, 0.01110786,
       0.01052324, 0.00818474, 0.01110786, 0.01110786, 0.01110786,
       0.01110786, 0.01110786, 0.01110786, 0.01052324, 0.01110786,
       0.01110786, 0.01081555, 0.01081555, 0.        ])

In [196]:
if r:
    # returns p(Y0K|x0k)
    def makeExtendedChannelMatrixByPrefix(prefix):
        # NB:
        # if len(prefix) == n (including leftEdge), 
        # then the extended channel matrix will have dimensions 39 x (n-1)

        p = prefix
        if prefix != leftEdge:# and not (len(ds2t(p)) == 2 and ds2t(p)[0] == leftEdge):
    #     if prefix != leftEdge and not (len(ds2t(p)) == 2 and ds2t(p)[0] == leftEdge):
            return np.hstack( (sourcePrefixToChannelMatrix(p) , sourcePrefixToPreviewVector(p).reshape(39,1)))
        else: #the extended channel matrix is garbage that should never be asked for
            l = len(ds2t(p))
            return np.zeros((39, l-1))

In [197]:
if f:
    print('Source sequences = wordforms and prefixes')
    source_seqs = prefixes_t #prefixes include full wordforms
else:
    print('Source sequences = just full wordforms')
    source_seqs = Ws_t

Source sequences = wordforms and prefixes


In [201]:
if r:
    xCMsByPrefixIndex = [makeExtendedChannelMatrixByPrefix(s)
                         for s in source_seqs]
    xCMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in xCMsByPrefixIndex[1:]]

    xCMsByPrefixIndex[3].shape

(39, 6)

In [200]:
CMsByPrefixIndex = [sourcePrefixToChannelMatrix_l(s)
                     for s in source_seqs]
CMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in CMsByPrefixIndex[1:]]

CMsByPrefixIndex[3].shape

(39, 5)

In [69]:
def wordformsOfLength(l, includingEdges = False):
    if includingEdges:
        return {w for w in Ws if len(ds2t(w)) == l}
    return {w for w in Ws if len(ds2t(w)) == l + 2}

In [73]:
wordformsOfLength(16)

{'⋊.t.ɛ.l.ɪ.k.ʌ.m.j.u.n.ɪ.k.eɪ.ʃ.ʌ.n.⋉'}

In [74]:
wordlengthsInclEdges = set(len(ds2t(w)) for w in Ws)
wordlengthsInclEdges
numWordsOfExactlyLength = {l:len(wordformsOfLength(l, True)) for l in wordlengthsInclEdges}
numWordsOfExactlyLength

{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

{3: 11,
 4: 144,
 5: 808,
 6: 1201,
 7: 1183,
 8: 974,
 9: 779,
 10: 555,
 11: 354,
 12: 211,
 13: 98,
 14: 51,
 15: 25,
 16: 5,
 17: 2,
 18: 1,
 19: 1}

In [75]:
wordlengthsNotIncludingEdges = {each-2 for each in wordlengthsInclEdges}
wordlengthsNotIncludingEdges

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}

In [76]:
def wordformsAtLeastLlong(l, includingEdges = False):
    if includingEdges:
        maxL = max(wordlengthsInclEdges)
        return union([wordformsOfLength(eachl, includingEdges) for eachl in range(l, maxL+1)])
    else:
        maxL = max(wordlengthsNotIncludingEdges)
        return union([wordformsOfLength(eachl, includingEdges) for eachl in range(l, maxL+1)])

In [77]:
lengthFreqs = {l:len(wordformsAtLeastLlong(l, True)) for l in wordlengthsInclEdges}
lengthFreqs

{3: 6403,
 4: 6392,
 5: 6248,
 6: 5440,
 7: 4239,
 8: 3056,
 9: 2082,
 10: 1303,
 11: 748,
 12: 394,
 13: 183,
 14: 85,
 15: 34,
 16: 9,
 17: 4,
 18: 2,
 19: 1}

In [112]:
# returns p(Y0i|x0f), padded if necessary
def makeChannelMatrixByWordformAndLength(wordform, key_length):
    x0f = wordform
    x0f_t = ds2t(x0f)
    x0f_length = len(x0f_t)
    if x0f_length == key_length:
        return sourcePrefixToChannelMatrix(x0f)
    elif x0f_length > key_length:
#         print('middle case')
        #trim the wordform to be a prefix of length = key_length
        x0k_t = x0f_t[:key_length]
        x0k = t2ds(x0k_t)
#         print('x0k: {0}'.format(x0k))
        return sourcePrefixToChannelMatrix(x0k)
    else:
        #grab the source 
        my_CM = sourcePrefixToChannelMatrix(x0f)
        goal_l = key_length
        #extend the channel matrix with padding
        return np.pad(my_CM, ((0,0), (0, goal_l - my_CM.shape[1] - 2)), 
                      'constant', constant_values=0.0)

In [203]:
if r:
    # returns p(Y0K|x0f)
    def makeExtendedChannelMatrixByWordformAndLength(wordform, key_length):
        x0f = wordform
        x0f_t = ds2t(x0f)
        x0f_length = len(x0f_t)
        if x0f_length == key_length:
            return makeExtendedChannelMatrixByPrefix(x0f)
        elif x0f_length > key_length:
    #         print('middle case')
            #trim the wordform to be a prefix of length = key_length
            x0k_t = x0f_t[:key_length]
            x0k = t2ds(x0k_t)
    #         print('x0k: {0}'.format(x0k))
            return makeExtendedChannelMatrixByPrefix(x0k)
        else:
            #grab the source 
            my_xCM = makeExtendedChannelMatrixByPrefix(x0f)
            goal_l = key_length
            return np.pad(my_xCM, ((0,0), (0, goal_l - my_xCM.shape[1] - 1)), 
                          'constant', constant_values=0.0)

In [81]:
wordlengthsInclEdges

{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

In [204]:
# ~17s on wittgenstein under load
cmsByLengthByWordformIndex = [np.array([makeChannelMatrixByWordformAndLength(w, l)
                                         for w in Ws_t])
                               for l in sorted(list(wordlengthsInclEdges))]
cmsByLengthByWordformIndex_torch = list(map(lambda cm: torch.from_numpy(cm).type(my_ft), cmsByLengthByWordformIndex))

In [205]:
if r:
    xCMsByLengthByWordformIndex = [np.array([makeExtendedChannelMatrixByWordformAndLength(w, l)
                                             for w in Ws_t])
                                   for l in sorted(list(wordlengthsInclEdges))]
    xCMsByLengthByWordformIndex_torch = list(map(lambda xCM: torch.from_numpy(xCM).type(my_ft), xCMsByLengthByWordformIndex))

# Export

We want to save 
 - `CMsByPrefixIndex`
 - `cmsByLengthByWordformIndex`
 
(and/or their extended analogues, if `r`) to disk, and when importing, we will need to know
 - the set/sequence of key strings (prefixes or just wordforms)

In [122]:
len(CMsByPrefixIndex)
# CMsByPrefixIndex.nbytes / 1e9

6403

In [207]:
len(cmsByLengthByWordformIndex)
cmsByLengthByWordformIndex[0].shape
cmsByLengthByWordformIndex[1].shape
cmsByLengthByWordformIndex[16].nbytes / 1e9

17

(6403, 39, 1)

(6403, 39, 2)

0.033961512

In [131]:
o

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_'

In [136]:
if r:
    pickle.dump(xCMsByPrefixIndex, open(o + 'xCMs_by_prefix_index.pickle', 'wb'))
else:
    pickle.dump(CMsByPrefixIndex, open(o + 'CMs_by_prefix_index.pickle', 'wb'))

In [137]:
listdir(path.dirname(o))

['pX0X1X2.npy',
 'p6Y0X01.json',
 'p3YX.json',
 'p3Y0X01.json',
 'p3Y01X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.npy',
 'Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb',
 'p6Y01X01.json',
 'p3Y1X01.json',
 'pYX.json',
 'Generating  uniform triphone lexicon dist.ipynb',
 'p6Y1X01.json',
 'Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'gate6 trials.csv',
 'f3_Y0Y1_X0X1.json',
 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01.ipynb',
 'p6YX.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 '.ipynb_checkpoints',
 'pY1X0X1X2.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle',
 'f6_Y0Y1_X0X1.json',
 'pX0X1X2.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'gate3 trials.csv']

In [144]:
if not r:
    CMsByPrefixIndex_in = pickle.load(open(o + 'CMs_by_prefix_index.pickle', 'rb'))
    len(CMsByPrefixIndex_in)

    assert all(np.array_equal(CMsByPrefixIndex_in[i], CMsByPrefixIndex[i]) for i in range(len(CMsByPrefixIndex)))

6403

In [146]:
if not r:
    CMsByPrefixIndex_in[3].shape
    CMsByPrefixIndex[3].shape

(38, 5)

(38, 5)

In [147]:
if r:
    pickle.dump(xCMsByLengthByWordformIndex, open(o + 'xCMs_by_length_by_prefix_index.pickle', 'wb'))
else:
    pickle.dump(cmsByLengthByWordformIndex, open(o + 'CMs_by_length_by_prefix_index.pickle', 'wb'))

In [148]:
listdir(path.dirname(o))

['pX0X1X2.npy',
 'p6Y0X01.json',
 'p3YX.json',
 'p3Y0X01.json',
 'p3Y01X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.npy',
 'Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb',
 'p6Y01X01.json',
 'p3Y1X01.json',
 'pYX.json',
 'Generating  uniform triphone lexicon dist.ipynb',
 'p6Y1X01.json',
 'Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_prefix_index.pickle',
 'gate6 trials.csv',
 'f3_Y0Y1_X0X1.json',
 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01.ipynb',
 'p6YX.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 '.ipynb_checkpoints',
 'pY1X0X1X2.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle',
 'f6_Y0Y1_X0X1.json',
 'pX0X1X2.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'g