In [1]:
# Display the value of every top-level expression in a cell, not just the last one.
# Several cells below rely on this to show more than one result.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Define-main-calculation" data-toc-modified-id="Define-main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Define main calculation</a></span></li><li><span><a href="#Calculate-distribution-as-dict/ProbDist" data-toc-modified-id="Calculate-distribution-as-dict/ProbDist-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculate distribution as <code>dict</code>/<code>ProbDist</code></a></span></li><li><span><a href="#Cast-to-matrix" data-toc-modified-id="Cast-to-matrix-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Cast to matrix</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Given
 - a 'postview' triphone channel model filepath $l$ defining $p_6(Y_{i-1} | X_{i-1}^i)$
 - a 'center' triphone channel model filepath $c$ defining $p_3(Y_i | X_{i-1}^i ; X_{i+1})$
 - a 'preview' triphone channel model filepath $r$ defining $p_3(Y_{i+1} | X_i; X_{i+1})$
 - an output filepath prefix $o$
 
this notebook produces a combined 'observation' channel model
$$p(Y_{i-1}, Y_i ; Y_{i+1} | X_{i-1}, X_i ; X_{i+1}) = p_6(Y_{i-1} | X_{i-1}^i) p_3(Y_i | X_{i-1}^i ; X_{i+1}) p_3(Y_{i+1} | X_i; X_{i+1})$$
and writes it as a `.npy` file (+ an associated matrix metadata `.json` file) according to `o`. (It also exports the outcomes, in the associated sorted order, to a file in the same output directory.)

If a `d` argument is passed with a value of `"True"`, then the notebook will also attempt to write the distribution to disk as a `.json` file. This is not currently done efficiently and is strongly discouraged.

## Requirements

 - `numpy`
 - `joblib` is not strictly required, but because the sample space of the observation distribution is very large, it tremendously accelerates calculation.

## Usage

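**FIXME:** usage instructions have not been written yet. In the meantime: set `l`, `c`, `r` and `o` (and optionally `d`) in the Parameters section below, then run all cells from top to bottom.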

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
# parameters
# All values are strings. The empty defaults are placeholders that must be
# replaced before the notebook can run; the commented lines show example values.

# l: filepath of the 'postview' channel model (loaded as postview_channel_model).
l = ''
# l = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json'

# c: filepath of the 'center' channel model (loaded as center_channel_model).
c = ''
# c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'

# r: filepath of the 'preview' channel model (loaded as preview_channel_model).
r = ''
# r = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json'

# o: output filepath prefix (no extension); suffixes are appended to it on export.
o = ''
# o = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2'

# d: the string 'True' additionally exports the full distribution as .json;
# any other value disables that export.
d = ''
# d = 'False'

In [4]:
# Directory that will receive every exported file.
# `path.dirname` returns '' when `o` has no directory component (a bare filename
# prefix). Fall back to the current directory in that case, so that neither
# `makedirs` here nor the later `listdir(output_dir)` is handed an empty path.
output_dir = path.dirname(o) or '.'
if not path.exists(output_dir):
    print("Creating output path '{0}'".format(output_dir))
    makedirs(output_dir)

In [5]:
# Normalise the `d` parameter to a bool. Only the string "True" enables the
# optional .json export. An already-converted `True` is also accepted so that
# re-running this cell does not silently flip the flag back to False.
d = (d is True) or (d == "True")

d

False

# Imports / load data

In [6]:
from joblib import Parallel, delayed

# Settings forwarded to joblib.Parallel by `par` below.
# J is passed as n_jobs; in joblib, -1 means "use all available CPUs".
J = -1
# BACKEND is passed as the joblib backend name.
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
# V is passed as the verbosity level of joblib's progress messages.
V = 10
# NOTE(review): joblib documents `prefer` as a soft hint for picking a default
# backend; with BACKEND given explicitly it presumably has no effect — confirm.
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    """Return the argument unchanged."""
    unchanged = x
    return unchanged

def par(gen_expr):
    """Execute the delayed calls produced by ``gen_expr`` with joblib.

    The executor is configured from the module-level settings ``J`` (n_jobs),
    ``BACKEND``, ``V`` (verbose) and ``PREFER``. Whatever the ``Parallel``
    call returns is handed back to the caller.
    """
    executor = Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return executor(gen_expr)

In [7]:
import numpy as np

In [8]:
# import csv
import json

from probdist import *
from boilerplate import *
from string_utils import *

In [9]:
# Load the three channel models from the filepaths given in the parameters cell
# (l -> postview, c -> center, r -> preview).
(postview_channel_model,
 center_channel_model,
 preview_channel_model) = [importProbDist(model_fp) for model_fp in (l, c, r)]

In [10]:
# Sanity check: each of the three loaded channel models must pass `areNormalized`.
assert all(areNormalized(model) for model in (postview_channel_model,
                                              center_channel_model,
                                              preview_channel_model))

In [11]:
# Sanity check: each of the three loaded channel models must pass `uniformOutcomes`.
assert all(uniformOutcomes(model) for model in (postview_channel_model,
                                                center_channel_model,
                                                preview_channel_model))

In [12]:
# Conditioning events of each model: the lateral (preview / postview) models are
# conditioned on diphones, the center model on triphones.
pre_diphones = conditions(preview_channel_model)
post_diphones = conditions(postview_channel_model)

stimuli_triphones = {triphone for triphone in conditions(center_channel_model)}
len(stimuli_triphones)

5761

In [13]:
# Both lateral models must be conditioned on exactly the same diphones.
assert post_diphones == pre_diphones

stimuli_diphones = {diphone for diphone in pre_diphones}
len(stimuli_diphones)

837

In [14]:
# Diphones that occur inside the center model's source triphones.
diphones_contained_in_stimuli_triphones = lexiconTo2factors(stimuli_triphones)
len(diphones_contained_in_stimuli_triphones)

# Of those, the ones for which the lateral models define no distribution.
triphone_diphs_undefined_in_lateral_dists = {
    diph
    for diph in diphones_contained_in_stimuli_triphones
    if diph not in stimuli_diphones
}
len(triphone_diphs_undefined_in_lateral_dists)
triphone_diphs_undefined_in_lateral_dists

# Diphones containing a word-edge symbol are excluded from the requirement.
triphone_diphs_undefined_in_lateral_dists = {
    diph
    for diph in triphone_diphs_undefined_in_lateral_dists
    if rightEdge not in diph and leftEdge not in diph
}
len(triphone_diphs_undefined_in_lateral_dists)

# (disabled) converse check:
# assert all(diph in diphones_contained_in_stimuli_triphones for diph in stimuli_diphones)
assert len(triphone_diphs_undefined_in_lateral_dists) == 0, f"Center channel distribution in \n\t{c}\n contains {len(triphone_diphs_undefined_in_lateral_dists)} diphones\n\t{triphone_diphs_undefined_in_lateral_dists} not defined in the preview and/or postview distributions in \n\t{l}\n\t{r}"

904

67

{'aɪ.⋉',
 'aʊ.⋉',
 'b.⋉',
 'd.⋉',
 'dʒ.⋉',
 'eɪ.⋉',
 'f.⋉',
 'g.⋉',
 'i.⋉',
 'k.⋉',
 'l.⋉',
 'm.⋉',
 'n.⋉',
 'oʊ.⋉',
 'p.⋉',
 's.⋉',
 't.⋉',
 'tʃ.⋉',
 'u.⋉',
 'v.⋉',
 'z.⋉',
 'æ.⋉',
 'ð.⋉',
 'ŋ.⋉',
 'ɑ.⋉',
 'ɔɪ.⋉',
 'ɚ.⋉',
 'ɛ.⋉',
 'ɹ.⋉',
 'ʃ.⋉',
 'ʌ.⋉',
 'ʒ.⋉',
 'θ.⋉',
 '⋊.aɪ',
 '⋊.aʊ',
 '⋊.b',
 '⋊.d',
 '⋊.dʒ',
 '⋊.eɪ',
 '⋊.f',
 '⋊.g',
 '⋊.h',
 '⋊.i',
 '⋊.j',
 '⋊.k',
 '⋊.l',
 '⋊.m',
 '⋊.n',
 '⋊.oʊ',
 '⋊.p',
 '⋊.s',
 '⋊.t',
 '⋊.tʃ',
 '⋊.v',
 '⋊.w',
 '⋊.z',
 '⋊.æ',
 '⋊.ð',
 '⋊.ɑ',
 '⋊.ɔɪ',
 '⋊.ɚ',
 '⋊.ɛ',
 '⋊.ɪ',
 '⋊.ɹ',
 '⋊.ʃ',
 '⋊.ʌ',
 '⋊.θ'}

0

In [15]:
# Outcome alphabets of the two lateral (preview / postview) channel models.
pre_channel_phones = set(outcomes(preview_channel_model))
# Bug fix: this previously re-read the *preview* model, which made the equality
# check below compare a set with itself and therefore never fail.
post_channel_phones = set(outcomes(postview_channel_model))

# The lateral models must share one outcome alphabet.
assert pre_channel_phones == post_channel_phones
lateral_channel_phones = pre_channel_phones

# Inventory derived from the center model's outcomes.
center_channel_phones = lexiconToInventory(set(outcomes(center_channel_model)))

# Ignoring edge symbols, the center model must use the same alphabet as the lateral models.
assert center_channel_phones - edgeSymbols == lateral_channel_phones

channel_alphabet = center_channel_phones - edgeSymbols
len(channel_alphabet)
channel_alphabet

38

{'aɪ',
 'aʊ',
 'b',
 'd',
 'dʒ',
 'eɪ',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'oʊ',
 'p',
 's',
 't',
 'tʃ',
 'u',
 'v',
 'w',
 'z',
 'æ',
 'ð',
 'ŋ',
 'ɑ',
 'ɔɪ',
 'ɚ',
 'ɛ',
 'ɪ',
 'ɹ',
 'ʃ',
 'ʊ',
 'ʌ',
 'ʒ',
 'θ'}

In [16]:
# Single-segment channel outcomes, plus a sorted tuple giving them a fixed order.
Y1s = channel_alphabet
Y1s_t = tuple(sorted(Y1s))
len(Y1s)

38

In [17]:
# All triphone and diphone strings over the channel alphabet
# (every length-3 / length-2 sequence from `sigmaK`, rendered with `t2ds`).
Y012s = {t2ds(segments) for segments in sigmaK(Y1s, 3)}
Y01s = {t2ds(segments) for segments in sigmaK(Y1s, 2)}

# Neither set may contain a word-edge symbol at this point.
assert all(leftEdge not in triph and rightEdge not in triph for triph in Y012s)
assert all(leftEdge not in diph and rightEdge not in diph for diph in Y01s)

# Add the edge-padded channel triphones:
#   leftEdge + diphone, diphone + rightEdge, and leftEdge + phone + rightEdge,
# each joined with '.'.
Y012s = union([
    Y012s,
    language_concatenation({leftEdge}, Y01s, lambda a, b: a + '.' + b),
    language_concatenation(Y01s, {rightEdge}, lambda a, b: a + '.' + b),
    language_concatenation(
        {leftEdge},
        language_concatenation(Y1s, {rightEdge}, lambda a, b: a + '.' + b),
        lambda a, b: a + '.' + b,
    ),
])

# A sorted tuple fixes the outcome order used for export.
Y012s_t = tuple(sorted(Y012s))
len(Y012s)
Y012s_t[0]

57798

'aɪ.aɪ.aɪ'

In [18]:
# Source (stimulus) triphones, plus a sorted tuple giving them a fixed order.
X012s = stimuli_triphones
X012s_t = tuple(sorted(X012s))
X012s_t[0]

'aɪ.b.z'

# Define main calculation

$C_i = (Y_{i-1}^{x_i}, Y_i^{x_i}, Y_{i+1}^{x_i})$

$p(y_{i-1}, y_i; y_{i+1} | x_{i-1}^i; x_{i+1}) = p_6(y_{i-1} | x_{i-1}^i) \, p_3(y_i | x_{i-1}^i ; x_{i+1}) \, p_3(y_{i+1} | x_i ; x_{i+1})$

In [19]:
# Wrap each channel model with `condDistsAsProbDists`; `pC1_X012` looks these up
# as table[condition][outcome].
p6Y0X01 = condDistsAsProbDists(postview_channel_model)   # left factor
p3Y1X012 = condDistsAsProbDists(center_channel_model)    # center factor
p3Y1X01 = condDistsAsProbDists(preview_channel_model)    # right factor

In [20]:
def pC1_X012(y012, x012):
    """Probability of channel triphone ``y012`` given source triphone ``x012``.

    Both arguments are passed through ``ds2t`` and the first three elements of
    each result are used as the segments ``(y0, y1, y2)`` and ``(x0, x1, x2)``.
    The return value is ``prod`` of three factors:

    * left:   ``p6Y0X01[x0.x1][y0]``
    * center: ``p3Y1X012[x012][y1]``
    * right:  ``p3Y1X01[x1.x2][y2]``

    Edge symbols bypass the lateral tables: if the source or the channel segment
    in a lateral position is the corresponding edge symbol, that factor is 1.0
    when both are the edge symbol and 0.0 otherwise.
    """
    source_segments = ds2t(x012)
    channel_segments = ds2t(y012)

    x0, x1, x2 = source_segments[0], source_segments[1], source_segments[2]
    left_context = t2ds((x0, x1))
    right_context = t2ds((x1, x2))

    y0, y1, y2 = channel_segments[0], channel_segments[1], channel_segments[2]

    # Left factor: an edge on either side decides the value without a table lookup.
    if x0 == leftEdge or y0 == leftEdge:
        left_term = 1.0 if (x0 == leftEdge and y0 == leftEdge) else 0.0
    else:
        left_term = p6Y0X01[left_context][y0]

    center_term = p3Y1X012[x012][y1]

    # Right factor: mirror image of the left one.
    if x2 == rightEdge or y2 == rightEdge:
        right_term = 1.0 if (x2 == rightEdge and y2 == rightEdge) else 0.0
    else:
        right_term = p3Y1X01[right_context][y2]

    return prod((left_term, center_term, right_term))

In [21]:
# Draw one source and one channel triphone for a spot check of `pC1_X012`.
random_source_triphone = choice(X012s_t)
random_source_triphone
random_channel_triphone = choice(Y012s_t)
random_channel_triphone

'ʌ.θ.i'

'ʒ.dʒ.aɪ'

In [22]:
# Spot check: probability of the randomly drawn channel triphone given the randomly drawn source triphone.
pC1_X012(random_channel_triphone, random_source_triphone)

6.679657108095648e-07

In [23]:
# Full conditional distribution over channel triphones for the randomly drawn source triphone.
pC1_random_x012 = {y012:pC1_X012(y012, random_source_triphone) for y012 in Y012s}

In [24]:
# Spot check: the distribution built for one source triphone should be normalized.
isNormalized(pC1_random_x012)

# NOTE(review): the value returned by `norm` is neither displayed nor stored here;
# what `norm` computes is not visible from this notebook — confirm this branch is useful.
if not isNormalized(pC1_random_x012):
    norm(pC1_random_x012)

True

# Calculate distribution as `dict`/`ProbDist`

In [25]:
def pY012_x012(x012):
    """Build the conditional distribution over channel triphones for ``x012``.

    Returns a dict mapping every member of ``Y012s`` to ``pC1_X012(y012, x012)``.
    """
    distribution = {}
    for y012 in Y012s:
        distribution[y012] = pC1_X012(y012, x012)
    return distribution

def pY012_x012_calc(x012):
    """Return ``(x012, distribution)`` for one source triphone.

    ``distribution`` maps every member of ``Y012s`` to ``pC1_X012(y012, x012)``.
    Returning the key alongside it lets a sequence of results be passed
    directly to ``dict``.
    """
    distribution = dict((y012, pC1_X012(y012, x012)) for y012 in Y012s)
    return (x012, distribution)

In [26]:
# Size of the full conditional table: number of channel outcomes, number of
# source conditions, and their product (the number of probabilities to compute).
len(Y012s)
len(X012s)
len(Y012s) * len(X012s)
"{0:,} = {0:.2E}".format(len(Y012s) * len(X012s))

57798

5761

332974278

'332,974,278 = 3.33E+08'

In [27]:
# Compute the conditional distribution for every source triphone via `par`;
# each call yields an (x012, dict) pair and the pairs are collected into one
# dict keyed by source triphone.
# ≈3.66m on old sidious (160 processes)
pC1_X012_dist = dict(par(delayed(pY012_x012_calc)(x012) for x012 in X012s))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 117 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 274 tasks      |

In [28]:
# Collect the source triphones whose conditional distribution fails the
# normalization check at tolerance 1e-8, and report the defect for each.
#takes ~10s on old sidious

not_normalized = {x012 for x012 in X012s if not isNormalized(pC1_X012_dist[x012], epsilon = 1e-8)}
len(not_normalized)

for x012 in not_normalized:
    print(f"p(C_1 | X012 = {x012}) has normalization defect {normalizationDefect(pC1_X012_dist[x012])}")

0

In [29]:
# Whole-family sanity checks on the combined distribution before converting it.
#takes ~2m on old sidious
assert areNormalized(pC1_X012_dist, epsilon = 1e-8)
assert uniformOutcomes(pC1_X012_dist)

In [30]:
# Convert the family of plain dicts with `condDistsAsProbDists`.
# NOTE(review): this rebinds `pC1_X012_dist` to its own converted value, so running
# the cell twice feeds already-converted data back in — confirm that is harmless.
#takes ~4m on old sidious
pC1_X012_dist = condDistsAsProbDists(pC1_X012_dist)

# Cast to matrix

In [31]:
# Convert the distribution family to a NumPy array with `condDistFamilyToNP`.
# The recorded run shows shape (57798, 5761), i.e. (len(Y012s), len(X012s)).
# takes 6m on old sidious
pC1_X012_np = condDistFamilyToNP(pC1_X012_dist)
pC1_X012_np.shape
# Array size in bytes, scaled to units of 1e9 bytes.
pC1_X012_np.nbytes / 1e9

(57798, 5761)

2.663794224

# Export

In [114]:
# Show the output filepath prefix that the exports below are written under.
o

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2'

In [115]:
# Write the matrix to exactly `o + '.npy'`, the path recorded in the metadata.
# np.save only appends '.npy' when the name lacks it, so passing bare `o` would
# write to a different file than the metadata names if `o` already ended in '.npy'.
np.save(o + '.npy', pC1_X012_np)

In [118]:
# Provenance record for the exported matrix: which input files the source
# triphones (X012s) and channel triphones (Y012s) come from, that both were
# sorted, their sizes, and where the sorted Y012s list is exported.
pC1_X012_md = {'X012s':{'from fp':c,
                        'changes':'sorted',
                        'size':len(X012s)},
               'Y012s':{'from fp':{'preview':r,
                                   'postview':l,
                                   'center':c},
                        'changes':'sorted',
                        'exported fp':o + 'Y012s' + '.txt',
                        'size':len(Y012s)}}
# Arguments: metadata output path, matrix path, the matrix, the record above,
# then two descriptive strings and an empty dict.
# NOTE(review): the meaning of the last three arguments is defined by
# `exportMatrixMetadata`, which is not visible here — confirm.
exportMatrixMetadata(o + '.npy' + '_metadata.json',
                     o + '.npy',
                     pC1_X012_np,
                     pC1_X012_md,
                     'Step 4d',
                     'Calculate observation distribution given channel models',
                     {})

Wrote metadata for 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2.npy
 to 
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2.npy_metadata.json


In [119]:
# Export the sorted channel triphones to the path recorded as 'exported fp' in the metadata.
exportSeqs(o + 'Y012s' + '.txt', Y012s_t)

In [100]:
# Optional export of the full distribution as .json, enabled only by the `d` parameter.
if d:
    exportProbDist(o + '.json', pC1_X012_dist)

In [103]:
# NOTE(review): the saved output of this cell ends in '.json', unlike the earlier
# display of `o`. The execution counts in this section are out of order, so it
# presumably comes from a stale run — confirm with Restart & Run All.
o

'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2.json'

In [120]:
# List the output directory to confirm the exported files are present.
listdir(output_dir)

['pX0X1X2.npy',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'p6Y0X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2.npy',
 'Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'p3YX.json',
 'p3Y0X01.json',
 'p3Y01X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.npy',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_prefix_index.pickle_metadata.json',
 'Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb',
 'p6Y01X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.npy_metadata.json',
 'p3Y1X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2.npy_metadata.json',
 'pYX.json',
 'Generating  uniform triphone lexicon dist.ipynb',
 'p6Y1X01.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X0X1X2Y012s.txt',
 'Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0