In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span><ul class="toc-item"><li><span><a href="#Load/extract-sanity-checking-data" data-toc-modified-id="Load/extract-sanity-checking-data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Load/extract sanity-checking data</a></span><ul class="toc-item"><li><span><a href="#Segmental-lexicon,-prefixes,-inventory,-and-triphones" data-toc-modified-id="Segmental-lexicon,-prefixes,-inventory,-and-triphones-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>Segmental lexicon, prefixes, inventory, and triphones</a></span></li><li><span><a href="#Triphone-channel-distribution-and-channel-alphabet" data-toc-modified-id="Triphone-channel-distribution-and-channel-alphabet-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>Triphone channel distribution and channel alphabet</a></span></li><li><span><a href="#Corpus-contexts" data-toc-modified-id="Corpus-contexts-3.1.3"><span class="toc-item-num">3.1.3&nbsp;&nbsp;</span>Corpus contexts</a></span></li><li><span><a href="#Conversion-to-one-hot-vectors-/-sequences-thereof" data-toc-modified-id="Conversion-to-one-hot-vectors-/-sequences-thereof-3.1.4"><span class="toc-item-num">3.1.4&nbsp;&nbsp;</span>Conversion to one-hot vectors / sequences 
thereof</a></span></li></ul></li><li><span><a href="#Load-segmental-sequence-channel-matrices-$p(Y_0^f|X_0^k)$" data-toc-modified-id="Load-segmental-sequence-channel-matrices-$p(Y_0^f|X_0^k)$-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Load segmental sequence channel matrices $p(Y_0^f|X_0^k)$</a></span></li><li><span><a href="#Load-contextual-distribution-on-segmental-wordforms-$p(W|C)$" data-toc-modified-id="Load-contextual-distribution-on-segmental-wordforms-$p(W|C)$-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Load contextual distribution on segmental wordforms $p(W|C)$</a></span></li><li><span><a href="#Load-lexicon-metadata" data-toc-modified-id="Load-lexicon-metadata-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Load lexicon metadata</a></span></li><li><span><a href="#Sanity-checking-calculations" data-toc-modified-id="Sanity-checking-calculations-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Sanity-checking calculations</a></span><ul class="toc-item"><li><span><a href="#$p(Y_0^f|X_0^k)$" data-toc-modified-id="$p(Y_0^f|X_0^k)$-3.5.1"><span class="toc-item-num">3.5.1&nbsp;&nbsp;</span>$p(Y_0^f|X_0^k)$</a></span></li><li><span><a href="#$p(X_0^f|C)$-(or-$p(X_0^k|C)$)" data-toc-modified-id="$p(X_0^f|C)$-(or-$p(X_0^k|C)$)-3.5.2"><span class="toc-item-num">3.5.2&nbsp;&nbsp;</span>$p(X_0^f|C)$ (or $p(X_0^k|C)$)</a></span></li><li><span><a href="#$p(Y_0^f)$" data-toc-modified-id="$p(Y_0^f)$-3.5.3"><span class="toc-item-num">3.5.3&nbsp;&nbsp;</span>$p(Y_0^f)$</a></span></li></ul></li></ul></li><li><span><a href="#Calculation" data-toc-modified-id="Calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Calculation</a></span></li></ul></div>

# Overview

Given a choice of parameters $\epsilon$ and $n$, and given
 - wordform channel matrices $p(Y_0^f|X_0^f)$
 - a contextual distribution on segmental wordforms $p(X_0^f|C)$
 - segmental lexicon metadata pre-calculating $k$-cousins/$k$-spheres up to $k=4$
 
Calculate

$$\hat{p}(\hat{X}_0^f = x_0^{'f}|X_0^f = x_0^{*f}, c) = \frac{1}{n} \sum\limits_{y_0^f \in S} p(\hat{X}_0^f = x_0^{'f}|y_0^f, c)$$
 where 
  - edit distance $d(x_0^{'f}, x_0^{*f}) \leq 4$
  - $S = $ a set of $n$ samples from $p(Y_0^f|x_0^{*f})$. In practice, $n \approx 50$ seems to yield estimates that are within $10^{-6}$ of the true value. 
  - $p(\hat{X}_0^f = x_0^{'f}|Y_0^f = y_0^f, c) = \frac{p(y_0^f|x_0^{'f})p(x_0^{'f}|c)}{p(y_0^f | c)}$
  - $p(y_0^f| c) = \sum\limits_{v', x_0^{''f}} p(y_0^f|x_0^{''f})p(x_0^{''f}|v')p(v'|c) = \sum\limits_{x_0^{''f}} p(y_0^f|x_0^{''f})p(x_0^{''f}|c)$

## Requirements

#FIXME

## Usage

#FIXME

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
repo_dir = getcwd(); repo_dir

'/mnt/cube/home/AD/emeinhar/wr'

In [5]:
# Parameters

#p(Y_0^f|X_0^k)
c = ''
c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_prefix_index.pickle'

#p(X_0^f|C)
w = ''
w = 'LD_Fisher_vocab_in_Buckeye_contexts/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_C.npy'

# LTR metadata directory
m = ''
m = 'LTR_Buckeye_aligned_w_GD_AmE_destressed'

# output filepath prefix for pW_WC
o = ''
o = 'LD_Fisher_vocab_in_Buckeye_contexts/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_WC'

# for sanity checking...
p = ''
p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

t = ''
t = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

s = ''
s = 'LD_Fisher_vocab_in_Buckeye_contexts/LM_filtered_buckeye_contexts.txt'


# calculation parameters
# Every parameter is first set to '' and then overridden with a string value;
# the cells below convert these strings to int / bool.

# number of channel samples drawn per source wordform (converted with int() below)
n = ''
n = '50'

# NOTE(review): presumably the k of the k-cousins / k-spheres lexicon metadata
# (edit-distance radius) -- it is not used in the visible cells; TODO confirm.
k = ''
k = '2'

r = ''
r = 'False' #if 'False', only calculate p(\hat{W}|W = w, c), i.e. don't calculate p(\hat{W}|P = p, c)

e = ''
e = 'True' #if r='False' and e='True', only calculate p(\hat{W} = w*| W = w*, c) ∀w ∈ W

In [6]:
# Directory that will hold the pW_WC output files.
output_dir = path.dirname(o)
# `path.dirname` returns '' when `o` has no directory component; in that case the
# output goes to the current directory and there is nothing to create
# (makedirs('') would raise).
if output_dir and not path.exists(output_dir):
    print('Making output path {0}'.format(output_dir))
    # exist_ok guards against the directory appearing between the check and the call.
    makedirs(output_dir, exist_ok=True)

In [7]:
# Convert the sample-count parameter to an int; a blank value means the default of 50.
n = 50 if n == '' else int(n)

In [8]:
# Convert `k` to an int; a blank value means the default of 2.
k = 2 if k == '' else int(k)

In [9]:
# Normalise `r` to a bool ('' defaults to False).
# A value that is already a bool is accepted unchanged so that re-running this
# cell after the conversion has happened does not raise.
if isinstance(r, bool):
    pass
elif r == '' or r == 'False':
    r = False
elif r == 'True':
    r = True
else:
    raise ValueError(f"r must be one of '', 'True', 'False', got '{r}' instead")

In [10]:
# Normalise `e` to a bool ('' defaults to True).
# A value that is already a bool is accepted unchanged so that re-running this
# cell after the conversion has happened does not raise.
if isinstance(e, bool):
    pass
elif e == '' or e == 'True':
    e = True
elif e == 'False':
    e = False
else:
    raise ValueError(f"e must be one of '', 'True', 'False', got '{e}'")

# `e` restricts the calculation within the r == False mode, so it cannot be
# combined with r == True.
if r and e:
    raise ValueError("e can only be True if r is False")

# Imports / load data

In [11]:
import pickle

In [12]:
import numpy as np
import torch
import sparse

In [13]:
from probdist import *

In [14]:
from tqdm import tqdm, tqdm_gui, tqdm_notebook

In [15]:
from joblib import Parallel, delayed

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    """Return the argument unchanged."""
    return x

def par(gen_expr):
    """Run `gen_expr` through joblib's `Parallel` with the notebook-wide settings.

    The job count, backend, verbosity and worker preference come from the
    module-level constants J, BACKEND, V and PREFER.
    Returns whatever the `Parallel` call returns.
    """
    runner = Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return runner(gen_expr)

In [16]:
# Pick the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    # `memory_cached` was renamed to `memory_reserved` in newer torch releases;
    # use the new name when it exists and fall back to the old one otherwise.
    _reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    # Report every visible GPU instead of assuming that exactly two are present.
    for gpu_index in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(gpu_index))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(gpu_index)/1024**3,1), 'GB')
        print('Cached:   ', round(_reserved_bytes(gpu_index)/1024**3,1), 'GB')

Using device: cpu



In [17]:
gpu = torch.device('cuda')
cpu = torch.device('cpu')

my_device = cpu

In [18]:
cuda_ft = torch.cuda.FloatTensor
cuda_dt = torch.cuda.DoubleTensor

ft = torch.FloatTensor
dt = torch.DoubleTensor

my_ft = ft
my_dt = dt

torch.set_default_tensor_type(my_ft)

## Load/extract sanity-checking data

We want to be able to make queries, spot checks, and sanity checks. That means we want to be able to reference
 1. the set of strings constituting segmental wordforms and prefixes
 2. the source and channel alphabets
 3. the channel distribution's conditioning triphones ∩ the triphones in the lexicon = the triphones in the lexicon
 4.  

Segmental wordforms were necessary for 
 - the lexicon metadata calculation (step 4b)
 - the contextual distribution on segmental wordforms (step 4c)
 - the definition of the segmental sequence channel matrices (step 4d)
 
What does each use as input? (Pass that to this notebook.)

Each notebook uses ...pW_V.json (or something slightly downstream of that)

### Segmental lexicon, prefixes, inventory, and triphones

In [19]:
pW_V = condDistsAsProbDists(importProbDist(p))

Extract 
 - `orthographic vocabulary`
 - `segmental vocabulary`

In [20]:
Vs = set(pW_V.keys())
Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                     pW_V).values())
len(Vs)
len(Ws)

6574

6403

In [21]:
wordlengthsInclEdges = set(len(ds2t(w)) for w in Ws)
wordlengthsInclEdges
numWordsOfExactlyLength = {l:len(wordformsOfLength(l, Ws, True)) for l in wordlengthsInclEdges}
numWordsOfExactlyLength

{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

{3: 11,
 4: 144,
 5: 808,
 6: 1201,
 7: 1183,
 8: 974,
 9: 779,
 10: 555,
 11: 354,
 12: 211,
 13: 98,
 14: 51,
 15: 25,
 16: 5,
 17: 2,
 18: 1,
 19: 1}

In [22]:
wordlengthsNotIncludingEdges = {each-2 for each in wordlengthsInclEdges}
wordlengthsNotIncludingEdges

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}

In [23]:
lengthFreqs = {l:len(wordformsAtLeastLlong(l, Ws, True)) for l in wordlengthsInclEdges}
lengthFreqs

{3: 6403,
 4: 6392,
 5: 6248,
 6: 5440,
 7: 4239,
 8: 3056,
 9: 2082,
 10: 1303,
 11: 748,
 12: 394,
 13: 183,
 14: 85,
 15: 34,
 16: 9,
 17: 4,
 18: 2,
 19: 1}

In [24]:
prefix_relation = set(union({(w,p) for p in getPrefixes(w)} for w in tqdm(Ws)))
len(prefix_relation)

100%|██████████| 6403/6403 [00:05<00:00, 1132.15it/s]


49429

Extract 
 - `segmental prefixes`

In [25]:
Ps = set(map(lambda pair: pair[1],
             prefix_relation))

Create 
 - `sorted` versions of the `segmental vocabulary` and `segmental prefixes`

In [26]:
Ws_t = tuple(sorted(list(Ws)))
Ps_t = tuple(sorted(list(Ps)))
num_wordforms = len(Ws_t)
num_prefixes = len(Ps_t)

num_wordforms
num_prefixes

6403

21475

Create mappings between `orthographic vocabulary` items and `segmental wordforms`.

In [27]:
v_to_Ws = mapValues(lambda dist: set(conditions(dist)),
                    pW_V)
V_W_relation = {(v,w) 
                for v in v_to_Ws 
                for w in v_to_Ws[v]}
w_to_Vs = {w:{v for v in Vs if (v,w) in V_W_relation}
           for w in Ws}

Extract
 - `segmental inventory`
 - `triphones` in the `segmental lexicon`

In [28]:
source_alphabet = lexiconToInventory(Ws)
Xs = source_alphabet
len(Xs)
Xs

40

{'aɪ',
 'aʊ',
 'b',
 'd',
 'dʒ',
 'eɪ',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'oʊ',
 'p',
 's',
 't',
 'tʃ',
 'u',
 'v',
 'w',
 'z',
 'æ',
 'ð',
 'ŋ',
 'ɑ',
 'ɔɪ',
 'ɚ',
 'ɛ',
 'ɪ',
 'ɹ',
 'ʃ',
 'ʊ',
 'ʌ',
 'ʒ',
 'θ',
 '⋉',
 '⋊'}

In [29]:
Xs_t = tuple(sorted(Xs))

In [30]:
lexicon_triphones = lexiconTo3factors(Ws)
len(lexicon_triphones)
list(lexicon_triphones)[:10]

5760

['d.u.k',
 'z.ʌ.⋉',
 'ɪ.n.æ',
 'dʒ.ɪ.m',
 'æ.g.p',
 'ɪ.p.ɑ',
 't.ɚ.ɪ',
 'z.ɪ.s',
 'ɑ.s.u',
 '⋊.ʌ.n']

In [31]:
X012s = lexicon_triphones
X012s_t = tuple(sorted(list(X012s)))

In [32]:
triphs_with_LE = {triph for triph in lexicon_triphones if leftEdge in triph}
len(triphs_with_LE)

431

In [33]:
triphs_with_RE = {triph for triph in lexicon_triphones if rightEdge in triph}
len(triphs_with_RE)

451

### Triphone channel distribution and channel alphabet

In [34]:
p3Y1X012 = condDistsAsProbDists(importProbDist(t))

assert uniformOutcomes(p3Y1X012)

In [35]:
channel_triphones = conditions(p3Y1X012)
len(channel_triphones)

assert all(triph in channel_triphones for triph in lexicon_triphones)

5760

In [36]:
channel_alphabet = outcomes(p3Y1X012)
len(channel_alphabet)
channel_alphabet

38

{'aɪ',
 'aʊ',
 'b',
 'd',
 'dʒ',
 'eɪ',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'oʊ',
 'p',
 's',
 't',
 'tʃ',
 'u',
 'v',
 'w',
 'z',
 'æ',
 'ð',
 'ŋ',
 'ɑ',
 'ɔɪ',
 'ɚ',
 'ɛ',
 'ɪ',
 'ɹ',
 'ʃ',
 'ʊ',
 'ʌ',
 'ʒ',
 'θ'}

In [37]:
Y1s = channel_alphabet
Y1s_t = tuple(sorted(Y1s))

In [38]:
in_lexicon_inventory_but_not_in_channel_inventory = source_alphabet - channel_alphabet
in_lexicon_inventory_but_not_in_channel_inventory

assert in_lexicon_inventory_but_not_in_channel_inventory == {leftEdge, rightEdge}

{'⋉', '⋊'}

### Corpus contexts

In [39]:
Cs = importSeqs(s)
len(Cs)

Cs_t = tuple(sorted(Cs))
Cs_t[123:129]

16443

('a spiritual',
 'a spiritual bookstore',
 'a strip',
 'a strip mall',
 'a thing',
 'a thing about')

### Conversion to one-hot vectors / sequences thereof

In [39]:
# construct what you need to convert to/from one-hot representations

# look at segment sequence channel matrix notebook

In [40]:
Xmap = seqsToIndexMap(Xs)
XOHmap = seqsToOneHotMap(Xs)

In [41]:
X012map = seqsToIndexMap(X012s)
# X012OHs = seqMapToOneHots(X012map)
X012OHmap = seqsToOneHotMap(X012s)

In [42]:
Y1map = seqsToIndexMap(Y1s)

In [43]:
def dsToUniphoneIndices(ds, uniphoneToIndexMap):
    """Map each symbol of the dotted string `ds` to its entry in `uniphoneToIndexMap`.

    Returns a numpy array with one entry per symbol of `ds2t(ds)`, in order.
    """
    indices = [uniphoneToIndexMap[symbol] for symbol in ds2t(ds)]
    return np.array(indices)

def dsToUniphoneOHs(ds, uniphoneToOHmap):
    """Map each symbol of the dotted string `ds` to its entry in `uniphoneToOHmap`.

    Returns a numpy array stacking one looked-up vector per symbol of
    `ds2t(ds)`, in order.
    """
    oneHots = [uniphoneToOHmap[symbol] for symbol in ds2t(ds)]
    return np.array(oneHots)

def dsToTriphoneSeq(ds):
    """Return the sequence of 3-factors (triphones) of the dotted string `ds`."""
    triphoneWidth = 3
    return dsToKfactorSequence(triphoneWidth, ds)

def dsToTriphoneIndices(ds, triphoneToIndexMap):
    """Map each triphone of the dotted string `ds` to its entry in `triphoneToIndexMap`.

    Returns a numpy array with one entry per triphone of `dsToTriphoneSeq(ds)`.
    """
    indices = [triphoneToIndexMap[triphone] for triphone in dsToTriphoneSeq(ds)]
    return np.array(indices)

def dsToTriphoneOHs(ds, triphoneToOHmap):
    """Map each triphone of the dotted string `ds` to its entry in `triphoneToOHmap`.

    Returns a numpy array stacking one looked-up vector per triphone of
    `dsToTriphoneSeq(ds)`, in order.
    """
    oneHots = [triphoneToOHmap[triphone] for triphone in dsToTriphoneSeq(ds)]
    return np.array(oneHots)

dsToUniphoneIndices('t.i.f.l', Xmap)
dsToUniphoneOHs('t.i.f.l', XOHmap)
dsToTriphoneSeq('t.i.f.l')
dsToTriphoneIndices('t.i.f.l', X012map)
dsToTriphoneOHs('t.i.f.l', X012OHmap)
dsToTriphoneOHs('t.i.f.l', X012OHmap).shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0].shape
dsToTriphoneOHs('t.i.f.l', X012OHmap)[0][5528]
dsToTriphoneOHs('t.i.f.l', X012OHmap)[1][5352]

array([18,  9,  6, 12])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]])

('t.i.f', 'i.f.l')

array([2904, 1146])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

(2, 5760)

(5760,)

0.0

0.0

In [78]:
# Y1s_RE = outcomes(p3Y1X01)
# len(Y1s_RE)
# Y1s_RE_list = sorted(list(Y1s_RE))

# print(Y1s_RE - Y1s)

# Y1REmap = seqsToIndexMap(Y1s_RE)

# Y1REOHs = seqMapToOneHots(Y1REmap)
# Y1REOHmap = seqsToOneHotMap(Y1s_RE)
Y1OHmap = seqsToOneHotMap(Y1s)
# OHY1REmap = oneHotToSeqMap(Y1s_RE)
OHY1map = oneHotToSeqMap(Y1s)

def ymap(y):
    """Look up the entry for channel symbol `y` in the module-level `Y1map`."""
    index = Y1map[y]
    return index

def y0kMap(y0k):
    """Convert the dotted channel string `y0k` to an array of `ymap` lookups.

    The first symbol of `y0k` is always skipped.
    # NOTE(review): presumably because callers pass strings that start with the
    # left word edge -- confirm against callers.
    """
    symbolsAfterFirst = ds2t(y0k)[1:]
    return np.array([ymap(symbol) for symbol in symbolsAfterFirst])

def channelSeqOHs2ds(y0k_OHs, addLeftEdge = False):
    """Decode a sequence of one-hot channel vectors back into a dotted string.

    Each element of `y0k_OHs` is decoded with `OHY1map`; when `addLeftEdge`
    is truthy the result is prefixed with the left-edge symbol and a '.'.
    """
    decodedSymbols = tuple(OHY1map(oneHot) for oneHot in tuple(y0k_OHs))
    dottedString = t2ds(decodedSymbols)
    if addLeftEdge:
        return leftEdge + '.' + dottedString
    return dottedString

def channelSeqds2OHs(y0k):
    """Encode the dotted channel string `y0k` as an array of one-hot vectors.

    A leading left-edge symbol is dropped because it has no entry in the
    channel alphabet; a string without a left edge is encoded in full
    (the same convention `y0kOHmap` uses). Each remaining symbol is looked
    up in the module-level `Y1OHmap`.
    """
    y0k_t = ds2t(y0k)
    # Strip the first symbol only when it really is the left edge. Previously the
    # first symbol was dropped unconditionally, silently losing a real segment
    # whenever the input had no left edge.
    y1k_t = y0k_t[1:] if y0k_t[0] == leftEdge else y0k_t
    return np.array([Y1OHmap[ yj ] for yj in y1k_t]) #shape should be (_, 38)

def y0kOHmap(y0k):
    """Encode the dotted channel string `y0k` as an array of `Y1OHmap` vectors.

    If the string starts with the left-edge symbol, that symbol is skipped;
    otherwise every symbol is encoded. The result therefore has one row per
    non-left-edge symbol.
    """
    symbols = ds2t(y0k)
    if symbols[0] == leftEdge:
        symbols = symbols[1:]
    return np.array([Y1OHmap[symbol] for symbol in symbols])


# list(Y1s_RE)[0]
list(Y1s)[0]
# OHY1REmap( Y1REOHmap[ list(Y1s_RE)[0] ] )
OHY1map( Y1OHmap[ list(Y1s)[0] ] )

channelSeqds2OHs(leftEdge + '.'+ 't.u.p')

channelSeqOHs2ds( channelSeqds2OHs(leftEdge + '.'+ 't.u.p') , True)

channelSeqOHs2ds(y0kOHmap(leftEdge + '.' + 't.u.p'), True)
channelSeqOHs2ds(y0kOHmap('t.u.p'))
len(ds2t(leftEdge + '.' + 't.u.p'))
len(ds2t('t.u.p'))
y0kOHmap('t.u.p').shape
y0kOHmap(leftEdge + '.' + 't.u.p').shape

'dʒ'

'dʒ'

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]])

'⋊.t.u.p'

'⋊.t.u.p'

't.u.p'

4

3

(3, 38)

(3, 38)

In [44]:
p3Y1X012_np = condDistFamilyToNP(p3Y1X012)
p3Y1X012_np.shape

(38, 5760)

In [45]:
from random import choice

In [46]:
random_source_wordform = choice(list(Ws))
random_source_wordform

random_source_prefix = choice(list(Ps))
random_source_prefix

'⋊.k.ɑ.l.⋉'

'⋊.d.ɪ.ɹ.ɛ.k.t.ɚ.⋉'

In [47]:
def randomPrefix(l, alphabet=Xs):
    """Return a random dotted string over `alphabet` that begins with the left edge.

    Delegates to `randomString(alphabet, l, hasLeftEdge=True)`.
    # NOTE(review): judging by the notebook outputs, `l` counts the symbols
    # after the left edge -- confirm against `randomString`.
    """
    prefix = randomString(alphabet, l, hasLeftEdge=True)
    return prefix

In [48]:
random_channel_prefix2 = randomPrefix(len(ds2t(random_source_wordform))-1, alphabet=Y1s)
random_channel_prefix2

'⋊.ʌ.ʒ.eɪ.j'

In [49]:
# random_source_prefix = getRandomKey(pX0i)
random_source_prefix = choice(list(Ps))
while ds2t(random_source_prefix)[-1] == rightEdge:
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
while len(ds2t(random_source_prefix)) > len(ds2t(random_source_wordform)):
#     random_source_prefix = getRandomKey(pX0i)
    random_source_prefix = choice(list(Ps))
random_source_prefix
random_channel_prefix = randomPrefix(len(ds2t(random_source_prefix))-1, alphabet=Y1s)
random_channel_prefix

'⋊.l.u.p'

'⋊.j.t.b'

In [50]:
def sourcePrefixToTriphones(x0k):
    """Return the sequence of 3-factors (triphones) of the source prefix `x0k`.

    The prefix is round-tripped through `ds2t` / `t2ds` before being split,
    exactly as in the original implementation.
    """
    roundTripped = t2ds(ds2t(x0k))
    return dsToKfactorSequence(3, roundTripped)

random_triphoneSeq = sourcePrefixToTriphones(random_source_prefix)
random_triphoneSeq
threeFactorSequenceToDS(random_triphoneSeq)

('⋊.l.u', 'l.u.p')

'⋊.l.u.p'

In [51]:
def sourcePrefixToTriphoneIndices(x0k):
    """Return a tuple of `X012map` entries, one per triphone of the source prefix `x0k`."""
    return tuple(X012map[triphone] for triphone in sourcePrefixToTriphones(x0k))

sourcePrefixToTriphoneIndices(random_source_prefix)

(5487, 1704)

## Load segmental sequence channel matrices $p(Y_0^f|X_0^k)$

In [53]:
pY0f_X0ks = pickle.load(open(c, 'rb'))
len(pY0f_X0ks)

17

In [54]:
pY0f_X0ks[0].shape
pY0f_X0ks[1].shape
pY0f_X0ks[2].shape
pY0f_X0ks[3].shape

(6403, 38, 1)

(6403, 38, 2)

(6403, 38, 3)

(6403, 38, 4)

In [55]:
pY0f_X0ks[0][0]

array([[0.65264711],
       [0.00700978],
       [0.02545656],
       [0.02545656],
       [0.00682531],
       [0.06235012],
       [0.00682531],
       [0.00700978],
       [0.00682531],
       [0.00700978],
       [0.0051651 ],
       [0.00682531],
       [0.00700978],
       [0.00700978],
       [0.00700978],
       [0.00700978],
       [0.00700978],
       [0.00682531],
       [0.00700978],
       [0.00700978],
       [0.00700978],
       [0.00682531],
       [0.00664084],
       [0.00682531],
       [0.00700978],
       [0.00682531],
       [0.00700978],
       [0.00700978],
       [0.00700978],
       [0.00682531],
       [0.00700978],
       [0.00700978],
       [0.00700978],
       [0.00682531],
       [0.00700978],
       [0.00700978],
       [0.00682531],
       [0.00700978]])

In [56]:
pY0f_X0ks[3][0]

array([[0.65264711, 0.02141693, 0.00537084, 0.01037752],
       [0.00700978, 0.00855777, 0.00537084, 0.0054456 ],
       [0.02545656, 0.00855777, 0.01023504, 0.0054456 ],
       [0.02545656, 0.58122414, 0.0052997 , 0.00537347],
       [0.00682531, 0.00821919, 0.00515835, 0.00523015],
       [0.06235012, 0.00855777, 0.00537084, 0.0054456 ],
       [0.00682531, 0.00833257, 0.0052295 , 0.0053023 ],
       [0.00700978, 0.02552273, 0.00537084, 0.0054456 ],
       [0.00682531, 0.00844442, 0.0052997 , 0.00537347],
       [0.00700978, 0.00855777, 0.79005246, 0.0054456 ],
       [0.0051651 , 0.00734595, 0.0046103 , 0.00467448],
       [0.00682531, 0.00844442, 0.0052997 , 0.00537347],
       [0.00700978, 0.00855777, 0.00537084, 0.0054456 ],
       [0.00700978, 0.00844442, 0.0052997 , 0.00537347],
       [0.00700978, 0.01630829, 0.00537084, 0.0054456 ],
       [0.00700978, 0.00855777, 0.00537084, 0.0054456 ],
       [0.00700978, 0.00844442, 0.0052997 , 0.00537347],
       [0.00682531, 0.00844442,

## Load contextual distribution on segmental wordforms $p(W|C)$

In [57]:
pW_C = np.load(w)
pW_C.shape
pW_C.dtype
pW_C.nbytes / 1e9

(6403, 16443)

dtype('float64')

0.842276232

## Load lexicon metadata

In [58]:
cousin_fn_map = {i:'{0}cousinsOf.npz'.format(i) 
                 for i in range(5)}
sphere_fn_map = {i:'{0}spheresOf.npz'.format(i) 
                 for i in range(5)}

In [59]:
cousin_fn_map

{0: '0cousinsOf.npz',
 1: '1cousinsOf.npz',
 2: '2cousinsOf.npz',
 3: '3cousinsOf.npz',
 4: '4cousinsOf.npz'}

In [60]:
sphere_fn_map

{0: '0spheresOf.npz',
 1: '1spheresOf.npz',
 2: '2spheresOf.npz',
 3: '3spheresOf.npz',
 4: '4spheresOf.npz'}

In [61]:
assert all(fn in listdir(m) for fn in cousin_fn_map.values())
assert all(fn in listdir(m) for fn in sphere_fn_map.values())

In [62]:
chdir(m)

In [63]:
cousin_mats = mapValues(sparse.load_npz, cousin_fn_map)

In [64]:
sphere_mats = mapValues(sparse.load_npz, sphere_fn_map)

In [65]:
chdir(repo_dir)

In [66]:
cousin_mats
mapValues(lambda m: m.nbytes / 1e9,
          cousin_mats)

{0: <COO: shape=(21475, 6403), dtype=uint8, nnz=49429, fill_value=0>,
 1: <COO: shape=(21475, 6403), dtype=uint8, nnz=590534, fill_value=0>,
 2: <COO: shape=(21475, 6403), dtype=uint8, nnz=4878514, fill_value=0>,
 3: <COO: shape=(21475, 6403), dtype=uint8, nnz=17429734, fill_value=0>,
 4: <COO: shape=(21475, 6403), dtype=uint8, nnz=26080551, fill_value=0>}

{0: 0.000840293,
 1: 0.010039078,
 2: 0.082934738,
 3: 0.296305478,
 4: 0.443369367}

In [67]:
sphere_mats
mapValues(lambda m: m.nbytes / 1e9,
          sphere_mats)

{0: <COO: shape=(21475, 6403), dtype=uint8, nnz=6403, fill_value=0>,
 1: <COO: shape=(21475, 6403), dtype=uint8, nnz=14910, fill_value=0>,
 2: <COO: shape=(21475, 6403), dtype=uint8, nnz=173130, fill_value=0>,
 3: <COO: shape=(21475, 6403), dtype=uint8, nnz=914686, fill_value=0>,
 4: <COO: shape=(21475, 6403), dtype=uint8, nnz=1476146, fill_value=0>}

{0: 0.000108851, 1: 0.00025347, 2: 0.00294321, 3: 0.015549662, 4: 0.025094482}

In [68]:
m

'LTR_Buckeye_aligned_w_GD_AmE_destressed'

In [69]:
segmental_wordforms = importSeqs(path.join(m, 'LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V_Transcriptions.txt'))
len(segmental_wordforms)

6403

In [70]:
list(segmental_wordforms)[0]

'h.i.t.ɚ.z'

In [71]:
Ws_t = tuple(sorted(list(map(padInputSequenceWithBoundaries,
                             segmental_wordforms))))
len(Ws_t)

6403

In [72]:
Ps = union(map(lambda w: getPrefixes(padInputSequenceWithBoundaries(w)), segmental_wordforms))
len(Ps)

21475

In [73]:
Ps_t = tuple(sorted(list(Ps)))
len(Ps_t)

21475

## Sanity-checking calculations

In [74]:
# FIXME update this cell once you add in cells for doing calculations the old-fashioned way 
# and verifying/demonstrating that the calculation cell works correctly

### $p(Y_0^f|X_0^k)$

In [105]:
len(pY0f_X0ks)
sorted(wordlengthsNotIncludingEdges)
assert len(pY0f_X0ks) == sorted(wordlengthsNotIncludingEdges)[-1]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [108]:
len(Ws)
len(Ps)

6403

21475

In [None]:
# for each_l in pY0f_X0ks:
    
#     assert each_l.shape

In [107]:
pY0f_X0ks[0].shape

(6403, 38, 1)

### $p(X_0^f|C)$ (or $p(X_0^k|C)$)

In [109]:
pW_C.shape
(len(Ws), len(Cs))
assert pW_C.shape[0] == len(Ws)
assert pW_C.shape[1] == len(Cs)

(6403, 16443)

(6403, 16443)

In [None]:
# def pX0f_C_calc(w, c):
    

### $p(Y_0^f)$

# Calculation

In [84]:
from random import choice

In [94]:
random_wordform = choice(Ws_t)
random_wordform_length = len(ds2t(random_wordform))
random_wordform_idx = Ws_t.index(random_wordform)
random_wordform, random_wordform_idx, random_wordform_length

('⋊.s.p.ɑ.n.s.ɚ.d.⋉', 3801, 9)

In [81]:
type(pW_C)
pW_C.shape
pW_C[:,0].shape

numpy.ndarray

(6403, 16443)

(6403,)

In [83]:
pX0f = pW_C[:,0]
pX0f_torch = torch.from_numpy(pX0f)

In [77]:
if r:
    # Placeholder: the tables for the r == True mode are not built here.
#     xCMsByLengthByWordformIndex_torch
    pass
else:
    # Wordform mode: reuse the loaded channel matrices and wrap each numpy
    # array as a torch tensor.
    CMsByLengthByWordformIndex = pY0f_X0ks
    CMsByLengthByWordformIndex_torch = [torch.from_numpy(channelMatrices)
                                        for channelMatrices in CMsByLengthByWordformIndex]

In [104]:
random_wordform
ds2t(random_wordform)
random_wordform_idx
random_wordform_length
random_wordform_CM = CMsByLengthByWordformIndex_torch[random_wordform_length-3][random_wordform_idx]
CMsByLengthByWordformIndex_torch[random_wordform_length-1].shape
random_wordform_CM

'⋊.s.p.ɑ.n.s.ɚ.d.⋉'

('⋊', 's', 'p', 'ɑ', 'n', 's', 'ɚ', 'd', '⋉')

3801

9

torch.Size([6403, 38, 9])

tensor([[0.0111, 0.0094, 0.1207, 0.0090, 0.0091, 0.0071, 0.0070],
        [0.0111, 0.0094, 0.0076, 0.0090, 0.0091, 0.0071, 0.0070],
        [0.0111, 0.0094, 0.0076, 0.0090, 0.0091, 0.0071, 0.0070],
        [0.0111, 0.0093, 0.0075, 0.0089, 0.0090, 0.0070, 0.5602],
        [0.0108, 0.0090, 0.0073, 0.0168, 0.0088, 0.0068, 0.0066],
        [0.0111, 0.0094, 0.0076, 0.0090, 0.0091, 0.0071, 0.0070],
        [0.0108, 0.0091, 0.0074, 0.0088, 0.0089, 0.0069, 0.0068],
        [0.0111, 0.0094, 0.0076, 0.0090, 0.0091, 0.0071, 0.0624],
        [0.0108, 0.0413, 0.0075, 0.0089, 0.0173, 0.0070, 0.0070],
        [0.0111, 0.0094, 0.0076, 0.0090, 0.0091, 0.0071, 0.0070],
        [0.0082, 0.0080, 0.0065, 0.0077, 0.0078, 0.0061, 0.0070],
        [0.0108, 0.0093, 0.0075, 0.0089, 0.0090, 0.0070, 0.0624],
        [0.0111, 0.0094, 0.0076, 0.0090, 0.0091, 0.0071, 0.0070],
        [0.0111, 0.0093, 0.0075, 0.0089, 0.0090, 0.0070, 0.0068],
        [0.0111, 0.0094, 0.0076, 0.6556, 0.0091, 0.0071, 0.0070],
        [0

In [78]:
def _sampleOneHotStack(probsBySymbolByPosition, numSamples):
    """Draw `numSamples` one-hot sequences from a (symbols, positions) channel matrix.

    Column i of the input is used as the categorical distribution over channel
    symbols at position i. Returns a tensor of shape
    (numSamples, positions, symbols) in torch's default dtype, matching the
    `torch.zeros` stack the original loop filled in.
    # NOTE(review): the result lives on the input's device rather than the
    # default device; these coincide on the CPU setup used in this notebook.
    """
    # Multinomial treats the last axis as the event axis, so transpose to
    # (positions, symbols): one distribution per position, all sampled in one call
    # instead of one Python-level draw per (sample, position) pair.
    positionwise = torch.distributions.Multinomial(1, probs=probsBySymbolByPosition.t())
    draws = positionwise.sample((numSamples,))
    return draws.to(torch.get_default_dtype())

if not r:
    def depthSampler2a_t(CM, m=1):
        """Sample `m` one-hot channel sequences from the channel matrix `CM`.

        `CM` is indexed (symbol, position); the result is indexed
        (sample, position, symbol).
        """
        return _sampleOneHotStack(CM, m)
else:
    def depthSampler2a_t(xCM, m=1):
        """Sample `m` one-hot channel sequences from the channel matrix `xCM`.

        `xCM` is indexed (symbol, position); the result is indexed
        (sample, position, symbol).
        """
        return _sampleOneHotStack(xCM, m)

In [None]:
#FIXME copy what you need from the segmental sequence notebook to interpret these samples

In [105]:
depthSampler2a_t(random_wordform_CM)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       

In [None]:
def _posteriorFromSamples(xhat0f_idx, Y_prime, my_Q_l, m):
    """Average the posterior of wordform `xhat0f_idx` over sampled channel sequences.

    Y_prime :: (m, l, symbols) one-hot samples; my_Q_l :: (wordforms, symbols, l)
    channel matrices. The prior over wordforms is the module-level `pX0f_torch`.
    For each sample y this evaluates p(y|x')p(x') / sum_x'' p(y|x'')p(x'') and
    returns the mean over the m samples.
    """
    # The sampler returns default-dtype (float32) tensors while the channel
    # matrices and the prior come from float64 numpy arrays; torch refuses to
    # mix them, so bring everything to the table's dtype.
    Y_prime = Y_prime.to(my_Q_l.dtype)
    prior = pX0f_torch.to(my_Q_l.dtype)

    # NORMALIZATION
    V_prime = torch.einsum('mli,kil->mkl', Y_prime, my_Q_l)  # :: (m,n,l) per-position likelihoods
    M_prime = torch.prod(V_prime, 2) # :: (m,n) likelihood of each sample under each wordform
    N_prime = torch.matmul(M_prime, prior) # :: (m,) <- prior probabilities of each of the m sampled channel prefixes
    Z_prime = 1.0 / N_prime # :: (m,)

    # NUMERATOR
    L_w = my_Q_l[xhat0f_idx]
    V_prime_w = torch.einsum('mij,ji->mi', Y_prime, L_w)
    O_w = torch.prod(V_prime_w, 1) # :: (m,) likelihoods of each of the m sampled channel prefixes
    U_w = prior[xhat0f_idx] * O_w ## :: (m,) joint probabilities of xhat0f with each of the m sampled channel prefixes

    return torch.dot(Z_prime, U_w) / m

if not r:
    def pXhat0fX0k_pxt(xhat0f_idx, x0k_CM, m = 50):
        """Monte Carlo estimate of p(Xhat = wordform `xhat0f_idx` | intended source, context).

        `x0k_CM` is the (symbols, positions) channel matrix of the intended
        source; `m` channel sequences are sampled from it with
        `depthSampler2a_t` and the posterior of `xhat0f_idx` is averaged over
        them. Returns a 0-dim tensor.
        """
        Y_prime = depthSampler2a_t(x0k_CM, m)
        num_positions = Y_prime.shape[1]

        # The table at list index j holds matrices with j + 1 position columns
        # (see the shapes displayed when the matrices are loaded), so samples
        # with `num_positions` columns need table num_positions - 1. The old
        # `shape[1] + 1 - 3` picked a table one column too short, which cannot
        # be contracted against the samples.
        my_Q_l = CMsByLengthByWordformIndex_torch[num_positions - 1]

        return _posteriorFromSamples(xhat0f_idx, Y_prime, my_Q_l, m)
else:
    def pXhat0fX0k_pxt(xhat0f_idx, x0k_xCM, m = 50):
        """Monte Carlo estimate of p(Xhat = wordform `xhat0f_idx` | intended source, context).

        Same computation as the r == False variant, but the per-length tables
        are read from `xCMsByLengthByWordformIndex_torch`.
        """
        Y_prime = depthSampler2a_t(x0k_xCM, m)
        l = Y_prime.shape[1] + 1

        # NOTE(review): `xCMsByLengthByWordformIndex_torch` is not defined in the
        # visible cells, so its layout (and whether `l - 3` is the right index)
        # cannot be checked here -- confirm before enabling r == True.
        my_Q_l = xCMsByLengthByWordformIndex_torch[l - 3]

        return _posteriorFromSamples(xhat0f_idx, Y_prime, my_Q_l, m)

    #     return torch.dot( 1.0 / torch.matmul(torch.prod(torch.einsum('mli,kil->mkl', Y_prime, my_Q_l), 2), pX0f_torch) , 
    #        pX0f_torch[xhat0f_idx] * torch.prod(torch.einsum('mij,ji->mi',Y_prime, L_w), 1) ) / m