**Eric Meinhardt / emeinhardt@ucsd.edu**

In [26]:
# Display the value of every top-level expression statement in a cell,
# not just the value of the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Context-/-motivation" data-toc-modified-id="Context-/-motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Context / motivation</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#A-cautionary-note-on-the-primitive-datatypes-here" data-toc-modified-id="A-cautionary-note-on-the-primitive-datatypes-here-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>A cautionary note on the primitive datatypes here</a></span></li><li><span><a href="#Loading-the-source-distribution" data-toc-modified-id="Loading-the-source-distribution-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Loading the source distribution</a></span><ul class="toc-item"><li><span><a href="#The-source-distribution-itself-as-a-ProbDist" data-toc-modified-id="The-source-distribution-itself-as-a-ProbDist-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>The source distribution itself as a <code>ProbDist</code></a></span></li><li><span><a href="#The-outcomes,-the-source-alphabet,-and-the-source-alphabet-at-particular-indices" data-toc-modified-id="The-outcomes,-the-source-alphabet,-and-the-source-alphabet-at-particular-indices-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>The outcomes, the source alphabet, and the source alphabet at particular indices</a></span></li></ul></li><li><span><a href="#Loading-the-channel-distribution" data-toc-modified-id="Loading-the-channel-distribution-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Loading the channel distribution</a></span><ul class="toc-item"><li><span><a href="#Channel-distribution-conditioning-events" data-toc-modified-id="Channel-distribution-conditioning-events-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Channel distribution conditioning events</a></span></li><li><span><a href="#Channel-outcomes" data-toc-modified-id="Channel-outcomes-5.2"><span 
class="toc-item-num">5.2&nbsp;&nbsp;</span>Channel outcomes</a></span></li></ul></li></ul></div>

# Context / motivation

This notebook shows the structure of the demo channel distribution files: what they mean (without going into much detail about how they were made), what's in them, and how to work with them.

The demo files specify a channel distribution $p(Y_i | X_{i-1}, X_i, X_{i+1})$ and a uniform source distribution $p(X_0, X_1, X_2)$ over a set of attested source triphones from a natural speech corpus of American English.

The channel distribution has been smoothed and otherwise transformed from raw data; the exact details of the speech corpus and the generation of the channel distribution are detailed elsewhere...

# Imports

In [1]:
from boilerplate import *
from string_utils import *
from probdist import *

# A cautionary note on the primitive datatypes here 

The terms *symbol*, *(speech) segment*, *alphabet element*, and *phone* are used interchangeably here. Each one is a `string`, usually of length 1 and sometimes of length 2. Because segment length varies, a sequence of segments is represented either as a `dotted string` (a string where segments are separated by `.`) or as a tuple of segments.

In [83]:
"food"

#note the special word edge symbols
leftEdge 
rightEdge

# A word as a dotted string of segments, wrapped in the two word edge symbols
'⋊.f.u.d.⋉'
# ds2t: dotted string -> tuple of segments (see the output below)
ds2t('⋊.f.u.d.⋉') 
# t2ds: tuple of segments -> dotted string (see the output below)
t2ds(('⋊', 'f', 'u', 'd', '⋉')) 

'food'

'⋊'

'⋉'

'⋊.f.u.d.⋉'

('⋊', 'f', 'u', 'd', '⋉')

'⋊.f.u.d.⋉'

See the demo notebook and the modules `boilerplate` or `string_utils` for a bit more information on what's available for manipulating these representations.

# Loading the source distribution

...and setting up normalized (= sorted, uniquified) reference variables that are useful to have direct access to.

In [2]:
# Show the current working directory
%pwd

'/mnt/cube/home/AD/emeinhar/shanchan'

In [3]:
# List the contents of the current working directory
%ls

 boilerplate.py
'Channel code synthesis.ipynb'
 [0m[01;34mCM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.015625[0m[K/
'Library demos.ipynb'
 probdist.py
 [01;34m__pycache__[0m/
 README.md
 string_utils.py
 Untitled.ipynb


In [6]:
cd "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.015625/"

/mnt/cube/home/AD/emeinhar/shanchan/CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.015625


In [7]:
ls

LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json
LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.npy
LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.npy_metadata.json
LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json
LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.npy
LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.npy_metadata.json


## The source distribution itself as a `ProbDist`

In [10]:
# Load the source distribution over edge-wrapped triphones from its JSON file
pX0X1X2 = importProbDist('LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json')

In [14]:
# NOTE(review): presumably reports how far the total probability mass is from 1 — confirm against the probdist module
normalizationDefect(pX0X1X2)

1.1302070390684094e-13

In [15]:
# Wrap the imported distribution in a ProbDist: this does normalization and
# lets us use lots of other functions on it.
pX0X1X2 = ProbDist(pX0X1X2)

In [25]:
# Display the distribution: dotted-string outcomes mapped to their probabilities
pX0X1X2

'⋊.ɛ.l.p.⋉': 0.0001484340210776142
'⋊.f.ɚ.s.⋉': 0.0001484340210776142
'⋊.ɛ.p.ɹ.⋉': 0.0001484340210776142
'⋊.ɑ.p.ɹ.⋉': 0.0001484340210776142
'⋊.z.ɛ.n.⋉': 0.0001484340210776142
'⋊.s.ɚ.n.⋉': 0.0001484340210776142
'⋊.æ.k.ə.⋉': 0.0001484340210776142
'⋊.d.s.k.⋉': 0.0001484340210776142
'⋊.i.v.ə.⋉': 0.0001484340210776142
'⋊.ŋ.k.i.⋉': 0.0001484340210776142
'⋊.t.ə.s.⋉': 0.0001484340210776142
'⋊.n.ʊ.ɹ.⋉': 0.0001484340210776142
'⋊.f.g.æ.⋉': 0.0001484340210776142
'⋊.z.ɚ.t.⋉': 0.0001484340210776142
'⋊.aɪ.p.ə.⋉': 0.0001484340210776142
'⋊.ɚ.t.n.⋉': 0.0001484340210776142
'⋊.m.ɛ.n.⋉': 0.0001484340210776142
'⋊.n.eɪ.ʃ.⋉': 0.0001484340210776142
'⋊.i.n.v.⋉': 0.0001484340210776142
'⋊.ŋ.ɚ.z.⋉': 0.0001484340210776142
'⋊.b.ʌ.dʒ.⋉': 0.0001484340210776142
'⋊.eɪ.f.g.⋉': 0.0001484340210776142
'⋊.l.i.ə.⋉': 0.0001484340210776142
'⋊.k.eɪ.dʒ.⋉': 0.0001484340210776142
'⋊.i.oʊ.v.⋉': 0.0001484340210776142
'⋊.eɪ.s.ə.⋉': 0.0001484340210776142
'⋊.ʃ.oʊ.p.⋉': 0.0001484340210776142
'⋊.v.ɚ.dʒ.⋉': 0.0001484340210776142
'⋊.ɛ.v.ɹ.⋉

In [23]:
# Count the outcomes twice — as a set and as a tuple of the keys — to confirm
# that no outcome is listed more than once.
len(set(pX0X1X2.keys()))
len(tuple(pX0X1X2.keys()))

6737

6737

In [38]:
# `log` is base 2 here: log(8) evaluates to 3.0
log(8)

# Derive the support size from the distribution itself instead of hard-coding
# the number of outcomes (6737) in two places.
supportSize = len(tuple(pX0X1X2.keys()))

# Probability of each outcome under a uniform distribution over the support
1 / supportSize

# Entropy (in bits) of a uniform distribution over a support of that size
log(supportSize)

3.0

0.00014843402107763098

12.71789058398728

In [17]:
# Entropy of the source distribution. Its value matches log2 of the number of
# outcomes (6737) up to floating-point error,
# => pX0X1X2 is a uniform distribution over its support = its set of outcomes
H(pX0X1X2) 

12.717890583986936

## The outcomes, the source alphabet, and the source alphabet at particular indices

Every element of `X0X1X2s` is a triphone wrapped in a pair of word edge symbols; none of the three segments in the center is itself a word edge symbol:

In [40]:
# Source outcomes as a sorted tuple of dotted strings (a stable reference order)
X0X1X2s = tuple(sorted(pX0X1X2.keys()))
# NOTE: this displays the entire tuple, which is long
X0X1X2s

('⋊.aɪ.b.aɪ.⋉',
 '⋊.aɪ.b.d.⋉',
 '⋊.aɪ.b.z.⋉',
 '⋊.aɪ.b.ɑ.⋉',
 '⋊.aɪ.b.ə.⋉',
 '⋊.aɪ.b.ɚ.⋉',
 '⋊.aɪ.b.ɪ.⋉',
 '⋊.aɪ.b.ɹ.⋉',
 '⋊.aɪ.d.eɪ.⋉',
 '⋊.aɪ.d.i.⋉',
 '⋊.aɪ.d.k.⋉',
 '⋊.aɪ.d.l.⋉',
 '⋊.aɪ.d.s.⋉',
 '⋊.aɪ.d.z.⋉',
 '⋊.aɪ.d.ə.⋉',
 '⋊.aɪ.d.ɚ.⋉',
 '⋊.aɪ.d.ɛ.⋉',
 '⋊.aɪ.d.ɪ.⋉',
 '⋊.aɪ.d.ɹ.⋉',
 '⋊.aɪ.dʒ.d.⋉',
 '⋊.aɪ.dʒ.ɛ.⋉',
 '⋊.aɪ.dʒ.ɪ.⋉',
 '⋊.aɪ.f.l.⋉',
 '⋊.aɪ.f.oʊ.⋉',
 '⋊.aɪ.f.s.⋉',
 '⋊.aɪ.f.t.⋉',
 '⋊.aɪ.f.ə.⋉',
 '⋊.aɪ.f.ɚ.⋉',
 '⋊.aɪ.g.æ.⋉',
 '⋊.aɪ.g.ɚ.⋉',
 '⋊.aɪ.g.ɹ.⋉',
 '⋊.aɪ.h.ɪ.⋉',
 '⋊.aɪ.i.v.⋉',
 '⋊.aɪ.k.aɪ.⋉',
 '⋊.aɪ.k.i.⋉',
 '⋊.aɪ.k.j.⋉',
 '⋊.aɪ.k.l.⋉',
 '⋊.aɪ.k.oʊ.⋉',
 '⋊.aɪ.k.s.⋉',
 '⋊.aɪ.k.t.⋉',
 '⋊.aɪ.k.w.⋉',
 '⋊.aɪ.k.ɑ.⋉',
 '⋊.aɪ.k.ə.⋉',
 '⋊.aɪ.k.ɚ.⋉',
 '⋊.aɪ.k.ɪ.⋉',
 '⋊.aɪ.k.ɹ.⋉',
 '⋊.aɪ.l.aɪ.⋉',
 '⋊.aɪ.l.d.⋉',
 '⋊.aɪ.l.i.⋉',
 '⋊.aɪ.l.s.⋉',
 '⋊.aɪ.l.u.⋉',
 '⋊.aɪ.l.z.⋉',
 '⋊.aɪ.l.æ.⋉',
 '⋊.aɪ.l.ɑ.⋉',
 '⋊.aɪ.l.ə.⋉',
 '⋊.aɪ.l.ɚ.⋉',
 '⋊.aɪ.l.ɪ.⋉',
 '⋊.aɪ.m.d.⋉',
 '⋊.aɪ.m.i.⋉',
 '⋊.aɪ.m.l.⋉',
 '⋊.aɪ.m.s.⋉',
 '⋊.aɪ.m.z.⋉',
 '⋊.aɪ.m.ə.⋉',
 '⋊.aɪ.m.ɚ.⋉',
 '⋊.aɪ.m.ɪ.⋉',
 '⋊.aɪ.n.aɪ.⋉',


In [73]:
# Number of source outcomes
len(X0X1X2s)

6737

In [64]:
# Source alphabet: every segment (word edge symbols included) that occurs in
# at least one source sequence.
Xs = union([set(ds2t(dottedString)) for dottedString in X0X1X2s])
Xs
len(Xs)

# The same alphabet with the two word edge symbols removed.
Xs_noEdges = Xs.difference({leftEdge, rightEdge})
len(Xs_noEdges)

# Replace both sets by sorted tuples so that they have a fixed reference order.
Xs, Xs_noEdges = tuple(sorted(Xs)), tuple(sorted(Xs_noEdges))

{'aɪ',
 'aʊ',
 'b',
 'd',
 'dʒ',
 'eɪ',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'oʊ',
 'p',
 's',
 't',
 'tʃ',
 'u',
 'v',
 'w',
 'z',
 'æ',
 'ð',
 'ŋ',
 'ɑ',
 'ɔɪ',
 'ə',
 'ɚ',
 'ɛ',
 'ɪ',
 'ɹ',
 'ʃ',
 'ʊ',
 'ʌ',
 'ʒ',
 'θ',
 '⋉',
 '⋊'}

41

39

In [67]:
# For each of the three central positions of the edge-wrapped source sequences,
# collect the sorted tuple of segments attested at that position.
# (Element 0 of ds2t(...) is the left word edge symbol, hence the +1 offset.)
X_is = {position: tuple(sorted({ds2t(dottedString)[position + 1]
                                for dottedString in X0X1X2s}))
        for position in range(3)}

# Alphabet size at each position
len(X_is[0])
len(X_is[1])
len(X_is[2])

# Compare the positional alphabets with the edge-free source alphabet and with
# each other
Xs_noEdges == X_is[0]
X_is[0] == X_is[1]
X_is[0] == X_is[2]
X_is[1] == X_is[2]

39

39

39

True

True

True

True

# Loading the channel distribution

A conditional distribution is essentially a dictionary of dictionaries (or `ProbDist`s), where the outer keys are conditioning outcomes.

In [68]:
# Load the channel (conditional) distribution from its JSON file
pY1X012 = importProbDist('LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json')

In [75]:
# Display the channel distribution: outer keys are dotted-string triphones,
# inner keys are segments mapped to probabilities
pY1X012

{'ə.l.eɪ': {'ŋ': 0.02190957524616969,
  'g': 0.025523882740182014,
  'k': 0.025185803141211566,
  'tʃ': 0.025523882740182014,
  'ɔɪ': 0.025523882740182014,
  'ʊ': 0.025523882740182014,
  'ɛ': 0.025523882740182014,
  's': 0.025185803141211566,
  'p': 0.025185803141211566,
  'm': 0.025185803141211566,
  'θ': 0.025185803141211566,
  'w': 0.024843123185323605,
  'v': 0.025185803141211566,
  'z': 0.024852201615440384,
  'eɪ': 0.04864010296868689,
  'æ': 0.025523882740182014,
  'ɪ': 0.025523882740182014,
  'ɚ': 0.025185803141211566,
  'l': 0.01014376200291107,
  'u': 0.025523882740182014,
  'h': 0.025185803141211566,
  'b': 0.0638768396484544,
  'dʒ': 0.024514060667321782,
  'i': 0.025523882740182014,
  'aʊ': 0.025523882740182014,
  'd': 0.025185803141211566,
  'ɑ': 0.025523882740182014,
  'f': 0.024852201615440384,
  't': 0.025523882740182014,
  'ð': 0.024514060667321782,
  'ʒ': 0.024852201615440384,
  'oʊ': 0.025523882740182014,
  'ʃ': 0.024514060667321782,
  'n': 0.025523882740182014,
  '

In [74]:
# Number of source sequences vs. number of conditioning events in the channel
# distribution: the two counts differ.
# NOTE(review): presumably because the channel is also defined for triphones
# that contain a word edge symbol — confirm against how the files were built.
len(X0X1X2s)
len(pY1X012)

6737

7757

In [77]:
# NOTE(review): presumably checks that every conditional distribution is normalized — confirm against the probdist module
areNormalized(pY1X012)

True

In [78]:
# NOTE(review): presumably wraps each inner conditional distribution as a ProbDist — confirm against the probdist module
pY1X012 = condDistsAsProbDists(pY1X012)

## Channel distribution conditioning events

In [94]:
# Conditioning events of the channel distribution, as a sorted tuple of dotted strings
X012s = tuple(sorted(pY1X012.keys()))
len(X012s)
X012s 
# note the presence of triples where one of the elements is a word edge symbol

7757

('aɪ.aɪ.⋉',
 'aɪ.b.aɪ',
 'aɪ.b.d',
 'aɪ.b.z',
 'aɪ.b.ɑ',
 'aɪ.b.ə',
 'aɪ.b.ɚ',
 'aɪ.b.ɪ',
 'aɪ.b.ɹ',
 'aɪ.b.⋉',
 'aɪ.d.eɪ',
 'aɪ.d.i',
 'aɪ.d.k',
 'aɪ.d.l',
 'aɪ.d.s',
 'aɪ.d.z',
 'aɪ.d.ə',
 'aɪ.d.ɚ',
 'aɪ.d.ɛ',
 'aɪ.d.ɪ',
 'aɪ.d.ɹ',
 'aɪ.d.⋉',
 'aɪ.dʒ.d',
 'aɪ.dʒ.ɛ',
 'aɪ.dʒ.ɪ',
 'aɪ.eɪ.⋉',
 'aɪ.f.l',
 'aɪ.f.oʊ',
 'aɪ.f.s',
 'aɪ.f.t',
 'aɪ.f.ə',
 'aɪ.f.ɚ',
 'aɪ.f.⋉',
 'aɪ.g.æ',
 'aɪ.g.ɚ',
 'aɪ.g.ɹ',
 'aɪ.h.ɪ',
 'aɪ.i.v',
 'aɪ.i.⋉',
 'aɪ.k.aɪ',
 'aɪ.k.i',
 'aɪ.k.j',
 'aɪ.k.l',
 'aɪ.k.oʊ',
 'aɪ.k.s',
 'aɪ.k.t',
 'aɪ.k.w',
 'aɪ.k.ɑ',
 'aɪ.k.ə',
 'aɪ.k.ɚ',
 'aɪ.k.ɪ',
 'aɪ.k.ɹ',
 'aɪ.k.⋉',
 'aɪ.l.aɪ',
 'aɪ.l.d',
 'aɪ.l.i',
 'aɪ.l.s',
 'aɪ.l.u',
 'aɪ.l.z',
 'aɪ.l.æ',
 'aɪ.l.ɑ',
 'aɪ.l.ə',
 'aɪ.l.ɚ',
 'aɪ.l.ɪ',
 'aɪ.l.⋉',
 'aɪ.m.d',
 'aɪ.m.i',
 'aɪ.m.l',
 'aɪ.m.s',
 'aɪ.m.z',
 'aɪ.m.ə',
 'aɪ.m.ɚ',
 'aɪ.m.ɪ',
 'aɪ.m.⋉',
 'aɪ.n.aɪ',
 'aɪ.n.b',
 'aɪ.n.d',
 'aɪ.n.i',
 'aɪ.n.m',
 'aɪ.n.oʊ',
 'aɪ.n.s',
 'aɪ.n.t',
 'aɪ.n.u',
 'aɪ.n.z',
 'aɪ.n.æ',
 'aɪ.n.ə',
 'aɪ.n.ɚ',
 'aɪ.n.ɪ',
 'aɪ.n.ʌ',
 'aɪ.n.θ

Unlike `X0X1X2s`, `X012s` does not wrap every triphone in a pair of word edge symbols; instead, it includes some triphones where only one of the elements is a word edge symbol.

In [85]:
# Check that all conditional distributions have the same set of outcomes
uniformOutcomes(pY1X012)

True

## Channel outcomes

In [89]:
# Channel outcomes as a sorted tuple (a stable reference order), followed by
# how many there are
Y1s = tuple(sorted(outcomes(pY1X012)))
Y1s
len(Y1s)

('aɪ',
 'aʊ',
 'b',
 'd',
 'dʒ',
 'eɪ',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'oʊ',
 'p',
 's',
 't',
 'tʃ',
 'u',
 'v',
 'w',
 'z',
 'æ',
 'ð',
 'ŋ',
 'ɑ',
 'ɔɪ',
 'ɚ',
 'ɛ',
 'ɪ',
 'ɹ',
 'ʃ',
 'ʊ',
 'ʌ',
 'ʒ',
 'θ',
 '⋉')

39

In [93]:
# Compare the edge-free source alphabet with the channel outcomes: first as
# ordered tuples, then as set differences in each direction.
Xs_noEdges == Y1s
# Source segments that never occur as a channel outcome
set(Xs_noEdges).difference(Y1s)
# Channel outcomes that are not (non-edge) source segments
set(Y1s).difference(Xs_noEdges)

False

{'ə'}

{'⋉'}