In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#numpy-representations" data-toc-modified-id="numpy-representations-4"><span class="toc-item-num">4&nbsp;&nbsp;</span><code>numpy</code> representations</a></span></li><li><span><a href="#Calculation" data-toc-modified-id="Calculation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculation</a></span></li></ul></div>

# Overview

Given
 - a filepath to a triphone channel model $c$
 - a filepath $w$ to a `.json` file specifying a conditional distribution $p(W|V)$ on segmental wordforms given orthographic ones
 - an output filepath prefix $o$
 - an optional flag $f$ indicating whether to do calculations for both full wordforms and prefixes (`True`, default) or just full wordforms (`False`)
 - an optional flag $r$ indicating whether to include a 'preview' term

this notebook calculates a channel matrix for each source prefix (if $p$, otherwise just for full source wordforms) and writes these channel matrices to file (with prefix given by $o$), with each file corresponding to a block of source prefixes (if $p$, else full source wordforms) of the same length. Within a block, the ordering of source prefixes/wordforms is given by alphabetically sorting the relevant set of prefixes (full wordforms).

## Requirements

 - `numpy`
 - `pytorch`

## Usage

#FIXME

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
# Parameters

c = ''
c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

w = ''
w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

o = ''
o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_'

f = ''
f = 'True'

r = ''
r = 'True'

In [5]:
ensure_dir_exists(path.dirname(o))

In [6]:
if f == '':
    f = 'True'

if r == '':
    r = 'True'

In [7]:
if f == 'True':
    f = True
elif f == 'false':
    f = False
else:
    raise Exception(f"f must be either 'True' or 'False', got '{f}'")
    
if r == 'True':
    r = True
elif r == 'False':
    r = False
else:
    raise Exception(f"r must be either 'True' or 'False', got '{r}'")

# Imports / load data

In [8]:
from probdist import *

In [9]:
from string_utils import *

In [10]:
import numpy as np
import torch

In [11]:
pY1X012 = condDistsAsProbDists(importProbDist(c))

In [12]:
pW_V = condDistsAsProbDists(importProbDist(w))

assert uniformOutcomes(pY1X012)

In [64]:
#extract segmental wordforms from w
Ws = union(list(map(lambda d: set(conditions(d)), 
                    pW_V.values())))
Ws_t = tuple(sorted(list(Ws)))
print(f'|Wordforms| = {len(Ws)}')

#if f, extract prefixes from w
# if f:
Ps = union(map(getPrefixes, Ws))
prefixes = Ps
print(f'|Prefixes| = {len(Ps)}')
Ps_t = tuple(sorted(list(Ws)))
prefixes_t = Ps_t

Xs = lexiconToInventory(Ws)
    
#extract triphones from w
lexiconTriphones = lexiconTo3factors(Ws)
print(f'|triphones| in lexicon = {len(lexiconTriphones)}')

|Wordforms| = 6403
|Prefixes| = 21475
|triphones| in lexicon = 5760


In [30]:
#extract triphones from c
channelTriphones = set(pY1X012.keys())

print(f'|triphones| in channel model = {len(channelTriphones)}')

|triphones| in channel model = 5760


In [31]:
assert all({triph in channelTriphones for triph in lexiconTriphones})

In [32]:
X012s = channelTriphones
X012s_t = tuple(sorted(list(X012s)))

In [33]:
Y1s = outcomes(pY1X012)
Y1s_t = tuple(sorted(list(Y1s)))
print(f'|Y1s| = {len(Y1s)}')

|Y1s| = 38


In [34]:
leftEdge in Y1s
rightEdge in Y1s

False

False

# `numpy` representations

In [44]:
Xmap = seqsToIndexMap(Xs)
XOHmap = seqsToOneHotMap(Xs)

In [56]:
Y1map = seqsToIndexMap(Y1s)

In [35]:
X012map = seqsToIndexMap(X012s)
# X012OHs = seqMapToOneHots(X012map)
X012OHmap = seqsToOneHotMap(X012s)

In [50]:
def dsToUniphoneIndices(ds, uniphoneToIndexMap):
    uniphoneSeq = ds2t(ds)
    return np.array([uniphoneToIndexMap[uniphone] for uniphone in uniphoneSeq])

def dsToUniphoneOHs(ds, uniphoneToOHmap):
    uniphoneSeq = ds2t(ds)
    return np.array([uniphoneToOHmap[uniphone] for uniphone in uniphoneSeq])

def dsToTriphoneSeq(ds):
    return dsToKfactorSequence(3, ds)

def dsToTriphoneIndices(ds, triphoneToIndexMap):
    triphoneSeq = dsToTriphoneSeq(ds)
    return np.array([triphoneToIndexMap[triphone] for triphone in triphoneSeq])

def dsToTriphoneOHs(ds, triphoneToOHmap):
    triphoneSeq = dsToTriphoneSeq(ds)
    return np.array([triphoneToOHmap[triphone] for triphone in triphoneSeq])

dsToUniphoneIndices('t.i.f.l', Xmap)
dsToUniphoneOHs('t.i.f.l', XOHmap)
dsToTriphoneSeq('t.i.f.l')
dsToTriphoneIndices('t.i.f.l', X012map)
dsToTriphoneOHs('t.i.f.l', X012OHmap)
dsToTriphoneOHs('t.i.f.l', X012OHmap).shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0].shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0][5528]
dsToTriphoneOHs('t.i.f.l', X012OHmap)[1][5352]

array([18,  9,  6, 12])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]])

('t.i.f', 'i.f.l')

array([2904, 1146])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

(2, 5760)

(5760,)

0.0

0.0

In [54]:
p3Y1X012_np = condDistFamilyToNP(pY1X012)
p3Y1X012_np.shape

(38, 5760)

In [19]:
from random import choice

In [20]:
random_source_wordform = choice(list(Ws))
random_source_wordform

'⋊.k.ʌ.n.s.ɪ.d.ɚ.z.⋉'

In [23]:
random_source_prefix = choice(list(Ps))
random_source_prefix

'⋊.t.ɹ.æ.n.z.l'

In [21]:
def randomPrefix(l, alphabet=Xs):
    return randomString(alphabet, l, hasLeftEdge=True)

In [22]:
random_channel_prefix2 = randomPrefix(len(ds2t(random_source_wordform))-1, alphabet=Y1s)
random_channel_prefix2

'⋊.z.u.b.eɪ.d.p.z.ʒ.ŋ'

# Calculation

In [25]:
# random_source_prefix = getRandomKey(pX0i)
random_source_prefix = choice(list(Ps))
while ds2t(random_source_prefix)[-1] == rightEdge:
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
while len(ds2t(random_source_prefix)) > len(ds2t(random_source_wordform)):
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
random_source_prefix
random_channel_prefix = randomPrefix(len(ds2t(random_source_prefix))-1, alphabet=Y1s)
random_channel_prefix

'⋊.w.ʌ.ʃ'

'⋊.ɑ.ɪ.aɪ'

In [26]:
def sourcePrefixToTriphones(x0k):
    xp_t = ds2t(x0k) #"x prefix"
    
#     xi = xp_t[-2] #just-completed segment
#     xk = xp_t[-1] #upcoming segment that we only get coarticulatory information about
    
#     xik_ds = t2ds((xi, xk))
#     preview_dist = p3Y1X01[xik_ds]
    
    x012s = dsToKfactorSequence(3, t2ds(xp_t))
    return x012s

random_triphoneSeq = sourcePrefixToTriphones(random_source_prefix)
random_triphoneSeq
threeFactorSequenceToDS(random_triphoneSeq)

('⋊.w.ʌ', 'w.ʌ.ʃ')

'⋊.w.ʌ.ʃ'

In [36]:
def sourcePrefixToTriphoneIndices(x0k):
    triphoneSequence = sourcePrefixToTriphones(x0k)
    return tuple(map(lambda x012: X012map[x012], triphoneSequence))

sourcePrefixToTriphoneIndices(random_source_prefix)

(5627, 3445)

In [66]:
blah = np.zeros((len(Y1s), 1))
blah[-1] = 1.0
blah

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.]])

In [58]:
def sourcePrefixToChannelMatrix_l(x0k, debug=False):
    triphoneOHs = dsToTriphoneOHs(x0k, X012OHmap)
    if debug:
        print('x0k = {0}'.format(x0k))
        print('|x0k| = {0}'.format(len(x0k)))
        print('triphoneIdxs = {0}'.format(sourcePrefixToTriphoneIndices(x0k)))
        print('triphoneOHs.shape = {0}'.format(triphoneOHs.shape))
        print('p3Y1X012_np.shape = {0}'.format(p3Y1X012_np.shape))
        print('result = p3Y1X012_np * triphoneOHs.T')
    result = np.matmul(p3Y1X012_np, triphoneOHs.T)
    return result
# sourcePrefixToChannelMatrix_l(random_source_prefix, True)

def sourcePrefixToChannelMatrix(x0k):
    triphoneIndices = sourcePrefixToTriphoneIndices(x0k)
#     C = np.array([[p3Y1X012_np[Y1REmap[y1], x012_idx] for x012_idx in triphoneIndices] for y1 in sorted(Y1s_RE)])
    C = np.array([[p3Y1X012_np[Y1map[y1], x012_idx] 
                   for x012_idx in triphoneIndices] 
                  for y1 in Y1s_t])
    if x0k == leftEdge or (len(ds2t(x0k)) == 2 and ds2t(x0k)[0] == leftEdge):
#         C = np.zeros((len(Y1s_RE), 1))
        C = np.zeros((len(Y1s), 1))
        C[-1] = 1.0
#         return C.reshape(39,1)
        return C.reshape(38,1)
    return C

# sourcePrefixToChannelMatrix(random_source_prefix)

random_source_prefix
print(sourcePrefixToChannelMatrix_l(random_source_prefix) == sourcePrefixToChannelMatrix(random_source_prefix))

'⋊.w.ʌ.ʃ'

[[ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]]


In [59]:
# def sourcePrefixToPreviewVector(x0k):
#     xp_t = ds2t(x0k) #"x prefix"
    
#     if len(xp_t) < 2:
#         raise Exception('|x0k| must be > 1.')
#     if len(xp_t) == 2 and xp_t[0] == leftEdge:
# #         raise Exception("There's no gating data that bears on this calculation, nor is it that interesting.")
#         uniformProb = 1.0 / len(Y1s_RE)
#         preview_dist = uniformProb * np.ones((len(Y1s_RE), 1))#garbage
#         return preview_dist.reshape(39,1)
    
#     xi = xp_t[-2] #just-completed segment
#     xk = xp_t[-1] #upcoming segment that we only get coarticulatory information about
    
#     xik_ds = t2ds((xi, xk))
#     preview_dist = p3Y1X01[xik_ds]
# #     assert Y1s_RE == set(preview_dist.keys()) #comment out once you are reasonably confident this is true by construction
    
#     return np.array([preview_dist[y1] for y1 in sorted(Y1s_RE)])

# sourcePrefixToPreviewVector(random_source_prefix)

In [60]:
# # returns p(Y0K|x0k)
# def makeExtendedChannelMatrixByPrefix(prefix):
#     # NB:
#     # if len(prefix) == n (including leftEdge), 
#     # then the extended channel matrix will have dimensions 39 x (n-1)
    
#     p = prefix
#     if prefix != leftEdge:# and not (len(ds2t(p)) == 2 and ds2t(p)[0] == leftEdge):
# #     if prefix != leftEdge and not (len(ds2t(p)) == 2 and ds2t(p)[0] == leftEdge):
#         return np.hstack( (sourcePrefixToChannelMatrix(p) , sourcePrefixToPreviewVector(p).reshape(39,1)))
#     else: #the extended channel matrix is garbage that should never be asked for
#         l = len(ds2t(p))
#         return np.zeros((39, l-1))

In [61]:
# xCMsByPrefixIndex = [makeExtendedChannelMatrixByPrefix(p)
#                      for p in sorted(list(prefixes))]
# xCMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in xCMsByPrefixIndex[1:]]

In [65]:
CMsByPrefixIndex = [sourcePrefixToChannelMatrix_l(p)
                     for p in prefixes_t]
CMsByPrefixIndex_torch = [None] + [torch.from_numpy(each) for each in CMsByPrefixIndex[1:]]

In [69]:
def wordformsOfLength(l, includingEdges = False):
    if includingEdges:
        return {w for w in Ws if len(ds2t(w)) == l}
    return {w for w in Ws if len(ds2t(w)) == l + 2}

In [73]:
wordformsOfLength(16)

{'⋊.t.ɛ.l.ɪ.k.ʌ.m.j.u.n.ɪ.k.eɪ.ʃ.ʌ.n.⋉'}

In [74]:
wordlengthsInclEdges = set(len(ds2t(w)) for w in Ws)
wordlengthsInclEdges
numWordsOfExactlyLength = {l:len(wordformsOfLength(l, True)) for l in wordlengthsInclEdges}
numWordsOfExactlyLength

{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

{3: 11,
 4: 144,
 5: 808,
 6: 1201,
 7: 1183,
 8: 974,
 9: 779,
 10: 555,
 11: 354,
 12: 211,
 13: 98,
 14: 51,
 15: 25,
 16: 5,
 17: 2,
 18: 1,
 19: 1}

In [75]:
wordlengthsNotIncludingEdges = {each-2 for each in wordlengthsInclEdges}
wordlengthsNotIncludingEdges

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}

In [76]:
def wordformsAtLeastLlong(l, includingEdges = False):
    if includingEdges:
        maxL = max(wordlengthsInclEdges)
        return union([wordformsOfLength(eachl, includingEdges) for eachl in range(l, maxL+1)])
    if not includingEdges:
        maxL = max(wordlengthsNotIncludingEdges)
        return union([wordformsOfLength(eachl, includingEdges) for eachl in range(l, maxL+1)])

In [77]:
lengthFreqs = {l:len(wordformsAtLeastLlong(l, True)) for l in wordlengthsInclEdges}
lengthFreqs

{3: 6403,
 4: 6392,
 5: 6248,
 6: 5440,
 7: 4239,
 8: 3056,
 9: 2082,
 10: 1303,
 11: 748,
 12: 394,
 13: 183,
 14: 85,
 15: 34,
 16: 9,
 17: 4,
 18: 2,
 19: 1}

In [67]:
# returns p(Y0i|x0f), padded if necessary
def makeChannelMatrixByWordformAndLength(wordform, key_length):
    x0f = wordform
    x0f_t = ds2t(x0f)
    x0f_length = len(x0f_t)
    if x0f_length == key_length:
        return sourcePrefixToChannelMatrix_l(x0f)
    elif x0f_length > key_length:
#         print('middle case')
        #trim the wordform to be a prefix of length = key_length
        x0k_t = x0f_t[:key_length]
        x0k = t2ds(x0k_t)
#         print('x0k: {0}'.format(x0k))
        return sourcePrefixToChannelMatrix_l(x0k)
    else:
        #grab the source 
        my_CM = sourcePrefixToChannelMatrix_l(x0f)
        goal_l = key_length
        #extend the channel matrix with padding
        return np.pad(my_CM, ((0,0), (0, goal_l - my_CM.shape[1] - 1)), 
                      'constant', constant_values=0.0)

In [None]:
# # returns p(Y0K|x0f)
# def makeExtendedChannelMatrixByWordformAndLength(wordform, key_length):
#     x0f = wordform
#     x0f_t = ds2t(x0f)
#     x0f_length = len(x0f_t)
#     if x0f_length == key_length:
#         return makeExtendedChannelMatrixByPrefix(x0f)
#     elif x0f_length > key_length:
# #         print('middle case')
#         #trim the wordform to be a prefix of length = key_length
#         x0k_t = x0f_t[:key_length]
#         x0k = t2ds(x0k_t)
# #         print('x0k: {0}'.format(x0k))
#         return makeExtendedChannelMatrixByPrefix(x0k)
#     else:
#         #grab the source 
#         my_xCM = makeExtendedChannelMatrixByPrefix(x0f)
#         goal_l = key_length
#         return np.pad(my_xCM, ((0,0), (0, goal_l - my_xCM.shape[1] - 1)), 
#                       'constant', constant_values=0.0)

In [80]:
cmsByLengthByWordformIndex = [np.array([makeChannelMatrixByWordformAndLength(w, l)
                                         for w in Ws_t])
                               for l in sorted(list(wordlengthsInclEdges))]
cmsByLengthByWordformIndex_torch = list(map(lambda cm: torch.from_numpy(cm).type(my_ft), cmsByLengthByWordformIndex))

ValueError: could not broadcast input array from shape (38,2) into shape (38)

In [None]:
# xCMsByLengthByWordformIndex = [np.array([makeExtendedChannelMatrixByWordformAndLength(w, l)
#                                          for w in sorted(list(Ws))])
#                                for l in sorted(list(wordlengthsInclEdges))]
# xCMsByLengthByWordformIndex_torch = list(map(lambda xCM: torch.from_numpy(xCM).type(my_ft), xCMsByLengthByWordformIndex))