In [1]:
# Display the value of every top-level expression statement in a cell, not just
# the last one. Later cells rely on this to show several bare expressions each.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#numpy-representations" data-toc-modified-id="numpy-representations-4"><span class="toc-item-num">4&nbsp;&nbsp;</span><code>numpy</code> representations</a></span></li><li><span><a href="#Calculation" data-toc-modified-id="Calculation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculation</a></span></li></ul></div>

# Overview

Given
 - a filepath $c$ to a triphone channel model
 - a filepath $w$ to a `.json` file specifying a conditional distribution $p(W|V)$ on segmental wordforms given orthographic ones
 - an output filepath prefix $o$
 - an optional flag $p$ indicating whether to do calculations for both full wordforms and prefixes (`True`, default) or just full wordforms (`False`)
 - an optional flag $r$ indicating whether to include a 'preview' term (`True`, default) or not (`False`)

this notebook calculates a channel matrix for each source prefix (if $p$, otherwise just for full source wordforms) and writes these channel matrices to file (with prefix given by $o$), with each file corresponding to a block of source prefixes (if $p$, else full source wordforms) of the same length. Within a block, the ordering of source prefixes/wordforms is given by alphabetically sorting the relevant set of prefixes (full wordforms).

## Requirements

 - `numpy`
 - `pytorch` (imported as `torch`)

## Usage

#FIXME

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
# Parameters
#
# Each parameter is first given an empty-string placeholder and then overwritten
# with the value used for this run. All values are strings at this point; the
# flags p and r are defaulted and converted to booleans in the cells that follow.

# c: filepath to the triphone channel model.
c = ''
c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

# w: filepath to the .json file specifying the conditional distribution p(W|V).
# w = ''
w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

# o: output filepath prefix; its directory part is passed to ensure_dir_exists below.
o = ''
o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_'

# p: 'True' to handle prefixes as well as full wordforms; '' falls back to 'True',
#    any other value is treated as False.
p = ''
p = 'True'

# r: 'True' to include the 'preview' term; '' falls back to 'True',
#    any other value is treated as False.
r = ''
r = 'True'

In [6]:
# Hand the directory part of the output prefix `o` to ensure_dir_exists
# (presumably creating it if missing — the helper arrives via a star import).
# NOTE(review): path.dirname(o) is '' when `o` has no directory component —
# confirm ensure_dir_exists tolerates an empty path.
ensure_dir_exists(path.dirname(o))

In [7]:
# A flag left as the empty string falls back to its default, 'True';
# any other value is passed through unchanged.
p = 'True' if p == '' else p
r = 'True' if r == '' else r

In [12]:
# Convert the string-valued flags into real booleans.
#
# The string 'True' maps to True and every other string maps to False, as before.
# A flag that is *already* the boolean True is kept as True, so re-running this
# cell is harmless: the plain `p == 'True'` test used previously evaluated to
# False for the boolean True and silently flipped the flag on a second run.
p = (p is True) or (p == 'True')
r = (r is True) or (r == 'True')

# Imports / load data

In [9]:
from probdist import *

In [13]:
from string_utils import *

In [8]:
import numpy as np
import torch

In [10]:
# Load the triphone channel model from the file at `c`.
# Its keys are used below as the channel model's triphone inventory.
# NOTE(review): presumably each value is a distribution over channel symbols
# given that triphone — confirm against importProbDist / condDistsAsProbDists.
pY1X012 = condDistsAsProbDists(importProbDist(c))

In [11]:
# Load the conditional distribution p(W|V) from the file at `w`.
# Its values are scanned with conditions() below to collect the segmental wordforms.
pW_V = condDistsAsProbDists(importProbDist(w))

In [31]:
# Segmental wordforms: the union, over every distribution in pW_V, of the set
# that conditions() yields for that distribution.
Ws = union([set(conditions(d)) for d in pW_V.values()])
Ws_t = tuple(sorted(Ws))  # fixed, alphabetically sorted ordering of the wordforms
print(f'|Wordforms| = {len(Ws)}')

# If p, also collect every prefix of every wordform.
if p:
    Ps = union(map(getPrefixes, Ws))
    print(f'|Prefixes| = {len(Ps)}')
    # Sorted ordering of the *prefixes*. This was previously built from Ws, which
    # made Ps_t a copy of Ws_t and dropped every proper prefix.
    Ps_t = tuple(sorted(Ps))

# Triphones (3-factors) occurring in the lexicon's wordforms.
lex_triphs = lexiconTo3factors(Ws)
print(f'|triphones| in lexicon = {len(lex_triphs)}')

|Wordforms| = 6403
|Prefixes| = 21475
|triphones| in lexicon = 5760


In [33]:
# Triphone inventory of the channel model loaded from c: the keys of pY1X012.
channel_triphs = {x012 for x012 in pY1X012.keys()}

print(f'|triphones| in channel model = {len(channel_triphs)}')

|triphones| in channel model = 5760


In [34]:
# Every triphone that occurs in the lexicon must have an entry in the channel model.
# A generator lets all() stop at the first missing triphone, and the failure
# message (built only if the assertion fails) names up to ten offenders.
assert all(triph in channel_triphs for triph in lex_triphs), \
    'lexicon triphones missing from the channel model (first 10): {0}'.format(
        [triph for triph in lex_triphs if triph not in channel_triphs][:10])

# `numpy` representations

#FIXME

# Calculation

In [None]:
# Draw a random key of pW to use as a test source wordform, and a random channel
# prefix over the alphabet Y1s to go with it.
# NOTE(review): pW and Y1s are not assigned in any cell visible above — confirm
# they arrive via one of the star imports, otherwise this cell fails on a fresh kernel.
random_source_wordform = getRandomKey(pW)
random_source_wordform
# First argument is one less than the number of symbols ds2t yields for the
# wordform (presumably the desired channel-prefix length — confirm against randomPrefix).
random_channel_prefix2 = randomPrefix(len(ds2t(random_source_wordform))-1, alphabet=Y1s)
random_channel_prefix2

In [None]:
# Rejection-sample a random key of pX0i to use as a test source prefix. It must
#   (a) not end in rightEdge, and
#   (b) have no more symbols than random_source_wordform.
# Both conditions are tested in ONE loop. With two consecutive loops, a redraw
# made to satisfy (b) could produce a prefix ending in rightEdge again, and that
# would never be re-checked.
# NOTE(review): pX0i and Y1s are not assigned in any cell visible above — confirm
# they arrive via one of the star imports.
random_source_prefix = getRandomKey(pX0i)
while (ds2t(random_source_prefix)[-1] == rightEdge
       or len(ds2t(random_source_prefix)) > len(ds2t(random_source_wordform))):
    random_source_prefix = getRandomKey(pX0i)
random_source_prefix
# Matching random channel prefix; first argument is one less than the number of
# symbols in the source prefix.
random_channel_prefix = randomPrefix(len(ds2t(random_source_prefix))-1, alphabet=Y1s)
random_channel_prefix

In [None]:
def sourcePrefixToTriphones(x0k):
    """Return the sequence of 3-factors (triphones) of the source prefix x0k.

    x0k is converted with ds2t and back with t2ds before being handed to
    dsToKfactorSequence with k = 3; whatever that helper returns is returned
    unchanged.
    """
    return dsToKfactorSequence(3, t2ds(ds2t(x0k)))

# Smoke test on the random source prefix: show its triphone sequence, then the
# result of feeding that sequence to threeFactorSequenceToDS (presumably the
# reassembled string — confirm against the helper).
random_triphoneSeq = sourcePrefixToTriphones(random_source_prefix)
random_triphoneSeq
threeFactorSequenceToDS(random_triphoneSeq)

In [None]:
def sourcePrefixToTriphoneIndices(x0k):
    """Return a tuple holding the X012map entry of each triphone of x0k, in order.

    The triphones come from sourcePrefixToTriphones(x0k); a triphone that is not
    a key of X012map raises KeyError.
    """
    return tuple(X012map[x012] for x012 in sourcePrefixToTriphones(x0k))

# Smoke test: triphone indices of the random source prefix.
sourcePrefixToTriphoneIndices(random_source_prefix)

In [None]:
# Scratch check: a (len(Y1s_RE), 1) column of zeros with 1.0 in its last row —
# the same construction sourcePrefixToChannelMatrix uses for its left-edge case.
blah = np.zeros((len(Y1s_RE), 1))
blah[-1] = 1.0
blah

In [None]:
def sourcePrefixToChannelMatrix_l(x0k, debug=False):
    """Channel matrix for the source prefix x0k, computed as a matrix product.

    dsToTriphoneOHs(x0k, X012OHmap) supplies an array for the prefix (presumably
    one one-hot row per triphone — confirm against the helper); the result is
    np.matmul(p3Y1X012_np, <that array>.T).

    With debug=True the prefix, its length, its triphone indices and the shapes
    of both operands are printed before the product is computed.
    """
    oneHots = dsToTriphoneOHs(x0k, X012OHmap)
    if debug:
        print('x0k = {0}'.format(x0k))
        # NOTE(review): len() is taken of x0k itself, not of ds2t(x0k).
        print('|x0k| = {0}'.format(len(x0k)))
        print('triphoneIdxs = {0}'.format(sourcePrefixToTriphoneIndices(x0k)))
        print('triphoneOHs.shape = {0}'.format(oneHots.shape))
        print('p3Y1X012_np.shape = {0}'.format(p3Y1X012_np.shape))
        print('result = p3Y1X012_np * triphoneOHs.T')
    return np.matmul(p3Y1X012_np, oneHots.T)
# sourcePrefixToChannelMatrix_l(random_source_prefix, True)

def sourcePrefixToChannelMatrix(x0k):
    """Channel matrix for the source prefix x0k, built by direct indexing.

    Rows follow sorted(Y1s_RE); there is one column per triphone of x0k, and
    entry (row of y1, column j) is p3Y1X012_np[Y1REmap[y1], idx_j], where idx_j
    is the j-th value from sourcePrefixToTriphoneIndices(x0k).

    Edge case: if x0k is leftEdge itself, or has exactly two symbols of which the
    first is leftEdge, the result is a (len(Y1s_RE), 1) column that is 0
    everywhere except for 1.0 in the last row.

    The edge case is tested first, so no triphone lookup is attempted for these
    prefixes, and the column height comes from len(Y1s_RE) rather than a
    hard-coded 39.
    """
    xp_t = ds2t(x0k)
    if x0k == leftEdge or (len(xp_t) == 2 and xp_t[0] == leftEdge):
        C = np.zeros((len(Y1s_RE), 1))
        C[-1] = 1.0
        return C

    triphoneIndices = sourcePrefixToTriphoneIndices(x0k)
    return np.array([[p3Y1X012_np[Y1REmap[y1], x012_idx] for x012_idx in triphoneIndices]
                     for y1 in sorted(Y1s_RE)])

# sourcePrefixToChannelMatrix(random_source_prefix)

# Cross-check the matmul-based and the indexing-based implementations on the
# random source prefix. The result of `==` is printed as-is; for numpy arrays of
# equal shape that is an elementwise boolean array, not a single flag.
print(sourcePrefixToChannelMatrix_l(random_source_prefix) == sourcePrefixToChannelMatrix(random_source_prefix))

In [None]:
def sourcePrefixToPreviewVector(x0k):
    """Preview distribution over sorted(Y1s_RE) for the last two symbols of x0k.

    The final symbol of the prefix is the upcoming segment, about which only
    coarticulatory information is available; the one before it is the
    just-completed segment. The pair is looked up in p3Y1X01 and the resulting
    distribution is listed in sorted(Y1s_RE) order.

    Returns:
        - normal case: np.array([dist[y1] for y1 in sorted(Y1s_RE)]);
        - two-symbol prefix starting with leftEdge: a (len(Y1s_RE), 1) column
          holding the uniform probability 1 / len(Y1s_RE), used as a placeholder
          (the original code marks this value as garbage).
        NOTE(review): the two cases return differently shaped arrays; callers in
        this notebook reshape the result to a column, so the shapes are kept.

    Raises:
        ValueError: if x0k has fewer than two symbols. (Previously a bare
        Exception; ValueError is still caught by `except Exception`.)
    """
    xp_t = ds2t(x0k)  # "x prefix"

    if len(xp_t) < 2:
        raise ValueError('|x0k| must be > 1.')

    if len(xp_t) == 2 and xp_t[0] == leftEdge:
        # Uniform placeholder, sized from Y1s_RE rather than a hard-coded 39.
        n_y1 = len(Y1s_RE)
        return np.full((n_y1, 1), 1.0 / n_y1)

    xi = xp_t[-2]  # just-completed segment
    xk = xp_t[-1]  # upcoming segment that we only get coarticulatory information about
    preview_dist = p3Y1X01[t2ds((xi, xk))]

    return np.array([preview_dist[y1] for y1 in sorted(Y1s_RE)])

# Smoke test: preview vector of the random source prefix.
sourcePrefixToPreviewVector(random_source_prefix)

In [None]:
# returns p(Y0K|x0k)
def makeExtendedChannelMatrixByPrefix(prefix):
    """Extended channel matrix of a source prefix: channel matrix plus preview column.

    For every prefix other than the bare leftEdge, the result is
    sourcePrefixToChannelMatrix(prefix) with sourcePrefixToPreviewVector(prefix)
    appended as one extra column on the right.

    NB: if the prefix has n symbols (including leftEdge), the original notes
    state that the extended channel matrix has dimensions 39 x (n-1).

    For the bare leftEdge the result is an all-zero placeholder with
    len(ds2t(prefix)) - 1 columns; it is garbage that should never be asked for.

    The row count is taken from len(Y1s_RE) instead of a hard-coded 39, and the
    parameter is no longer aliased to a local named `p`, which shadowed the
    notebook-level flag of the same name.
    """
    n_y1 = len(Y1s_RE)
    if prefix != leftEdge:
        preview_column = sourcePrefixToPreviewVector(prefix).reshape(n_y1, 1)
        return np.hstack((sourcePrefixToChannelMatrix(prefix), preview_column))
    # Bare left edge: placeholder only.
    return np.zeros((n_y1, len(ds2t(prefix)) - 1))

In [None]:
# Extended channel matrix of every prefix, in sorted prefix order.
# NOTE(review): `prefixes` is not assigned in any cell visible above (the prefix
# set built earlier is named `Ps`) — confirm where it comes from.
xCMsByPrefixIndex = list(map(makeExtendedChannelMatrixByPrefix, sorted(prefixes)))
# torch versions of the same matrices. Slot 0 holds None instead of a tensor
# (presumably because the first sorted prefix is the bare left edge, whose
# matrix is only a placeholder — confirm).
xCMsByPrefixIndex_torch = [None, *(torch.from_numpy(xCM) for xCM in xCMsByPrefixIndex[1:])]