In [1]:
# Display the value of *every* top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Overview-and-requirements" data-toc-modified-id="Overview-and-requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview and requirements</a></div><div class="lev1 toc-item"><a href="#Boilerplate" data-toc-modified-id="Boilerplate-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Boilerplate</a></div><div class="lev1 toc-item"><a href="#Choose-which-channel-model-to-examine" data-toc-modified-id="Choose-which-channel-model-to-examine-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Choose which channel model to examine</a></div><div class="lev1 toc-item"><a href="#Import-data" data-toc-modified-id="Import-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Import data</a></div><div class="lev2 toc-item"><a href="#f_3(Y_0,-Y_1-|-X_0;-X_1)-and-f_6(Y_0,-Y_1-|-X_0,-X_1;)" data-toc-modified-id="f_3(Y_0,-Y_1-|-X_0;-X_1)-and-f_6(Y_0,-Y_1-|-X_0,-X_1;)-41"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>$f_3(Y_0, Y_1 | X_0; X_1)$ and $f_6(Y_0, Y_1 | X_0, X_1;)$</a></div><div class="lev2 toc-item"><a href="#p_3(Y_0,-Y_1-|-X_0;-X_1)-and-p_6(Y_0,-Y_1-|-X_0,-X_1;)" data-toc-modified-id="p_3(Y_0,-Y_1-|-X_0;-X_1)-and-p_6(Y_0,-Y_1-|-X_0,-X_1;)-42"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>$p_3(Y_0, Y_1 | X_0; X_1)$ and $p_6(Y_0, Y_1 | X_0, X_1;)$</a></div><div class="lev2 toc-item"><a href="#p_3(Y_0|X_0;),--p_6(Y_1|X_1;),-and-p(Y|X)" data-toc-modified-id="p_3(Y_0|X_0;),--p_6(Y_1|X_1;),-and-p(Y|X)-43"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>$p_3(Y_0|X_0;)$,  $p_6(Y_1|X_1;)$, and $p(Y|X)$</a></div><div class="lev2 toc-item"><a href="#p_3(Y_1-|-X_0;-X_1)" data-toc-modified-id="p_3(Y_1-|-X_0;-X_1)-44"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>$p_3(Y_1 | X_0; X_1)$</a></div><div class="lev2 toc-item"><a href="#p(Y_1|X_0,-X_1;-X_2)" data-toc-modified-id="p(Y_1|X_0,-X_1;-X_2)-45"><span class="toc-item-num">4.5&nbsp;&nbsp;</span>$p(Y_1|X_0, X_1; X_2)$</a></div><div class="lev1 toc-item"><a 
href="#Identify-sequence-sets" data-toc-modified-id="Identify-sequence-sets-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Identify sequence sets</a></div><div class="lev1 toc-item"><a href="#Analysis---observation-counts-f_3(Y_0,-Y_1-|-X_0;-X_1)-and-f_6(Y_0,-Y_1-|-X_0,-X_1;)" data-toc-modified-id="Analysis---observation-counts-f_3(Y_0,-Y_1-|-X_0;-X_1)-and-f_6(Y_0,-Y_1-|-X_0,-X_1;)-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Analysis - observation counts $f_3(Y_0, Y_1 | X_0; X_1)$ and $f_6(Y_0, Y_1 | X_0, X_1;)$</a></div><div class="lev1 toc-item"><a href="#Analysis---Accurate-Identification" data-toc-modified-id="Analysis---Accurate-Identification-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Analysis - Accurate Identification</a></div><div class="lev1 toc-item"><a href="#Triphone-Annihilations" data-toc-modified-id="Triphone-Annihilations-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Triphone Annihilations</a></div>

# Overview and requirements

**Notebook author:** emeinhardt@ucsd.edu

At a high level, I am running Python 3.6.5, Jupyter 5.5.0, and otherwise Anaconda 5.2. **Plots make use of the *plotnine package* (see http://plotnine.readthedocs.io/)**

This notebook is for analyzing the outputs of `Processing Notebook 3a - Producing channel distributions` corresponding to
 - $p_3(Y_0, Y_1 | X_0; X_1)$ and the associated unnormalized (but possibly smoothed) frequency count distribution $f_3(Y_0, Y_1 | X_0; X_1)$
 - $p_6(Y_0, Y_1 | X_0, X_1;)$ and the associated unnormalized (but possibly smoothed) frequency count distribution $f_6(Y_0, Y_1 | X_0, X_1;)$
 - $p_3(Y_0|X_0;)$
 - $p_6(Y_1|X_1;)$
 - $p(Y|X)$
 - $p_3(Y_1 | X_0; X_1)$
 - $p(Y_1|X_0, X_1; X_2)$

# Boilerplate

In [11]:
from collections import Counter

from math import log2, pow, isclose

def log(x):
    """Base-2 logarithm that maps an input of zero to 0.0 instead of raising.

    Any non-zero ``x`` is handed straight to ``math.log2``.
    """
    return log2(x) if x != 0.0 else 0.0

from itertools import product

#I've chosen to represent sequentially-organized joint events 
# (e.g. the production of a sequence of speech segments) 
# as *tuples* of atomic outcomes. Because English contains
# diphthongs and affricates (and prosodically annotated strings
# contain stress information as a digit), this makes iterating
# through diphones (and wordforms) easier and less bug-prone.
#
# Because the dotted format is useful for other purposes 
# (e.g. readability), the functions below turn strings like 
# the diphone 
#   ('aɪ', 'ŋ') 
# into the less cluttered
#   'aɪ.ŋ'
def tupleToDottedString(pair):
    """Join a tuple of segment strings with '.', e.g. ('aɪ', 'ŋ') -> 'aɪ.ŋ'."""
    return '.'.join(pair)


def dottedStringToTuple(s):
    """Split a dotted string on '.' into a tuple, e.g. 'aɪ.ŋ' -> ('aɪ', 'ŋ')."""
    return tuple(s.split('.'))

import pandas as pd
from plotnine import *

In [12]:
import random
def getRandomKey(a_dict, printKey = False):
    """Return one key of ``a_dict`` picked with ``random.choice``.

    When ``printKey`` is true, the chosen key is also printed.
    """
    candidates = list(a_dict.keys())
    chosen = random.choice(candidates)
    if printKey:
        print('Random key: {0}'.format(chosen))
    return chosen
def testRandomKey(a_dict, printKey = True, printVal = True):
    """Pick a random key of ``a_dict`` and return it together with its value.

    Optionally prints the key and/or the value. The result is a dict with the
    entries 'key' and 'val'.
    """
    key = getRandomKey(a_dict)
    val = a_dict[key]
    if printKey:
        print('Random key: {0}'.format(key))
    if printVal:
        print('value ⟶ {0}'.format(val))
    return {'key': key, 'val': val}

# Choose which channel model to examine

In [4]:
# Alignment variant of the channel model to examine; leave exactly one line uncommented.
# which_alignment = 'unaligned'
which_alignment = 'Hammond-aligned'
# which_alignment = 'IPhOD-aligned'

In [5]:
# Stress variant of the channel model to examine; leave exactly one line uncommented.
which_stress = 'destressed'
# which_stress = 'stressed'

In [6]:
# Pseudocount setting of the channel model to examine; leave exactly one line uncommented.
# pseudocount = 0
pseudocount = 0.01
# pseudocount = 1

# Label fragment for the chosen setting: 'pseudocount' followed by str(pseudocount).
which_pseudocount = 'pseudocount' + str(pseudocount)

In [7]:
# Full label of the selected variant: alignment, stress and pseudocount joined by '_'.
which = '_'.join([which_alignment, which_stress, which_pseudocount])
which

# The same label without the pseudocount part.
whichNoCount = '_'.join([which_alignment, which_stress])
whichNoCount

'Hammond-aligned_destressed_pseudocount1'

'Hammond-aligned_destressed'

# Import data

In [8]:
%pwd

'/Users/ericmeinhardt/Downloads/c2-jnA'

In [10]:
import csv, json, codecs

##  $f_3(Y_0, Y_1 | X_0; X_1)$ and $f_6(Y_0, Y_1 | X_0, X_1;)$

##  $p_3(Y_0, Y_1 | X_0; X_1)$ and $p_6(Y_0, Y_1 | X_0, X_1;)$

## $p_3(Y_0|X_0;)$,  $p_6(Y_1|X_1;)$, and $p(Y|X)$

## $p_3(Y_1 | X_0; X_1)$

## $p(Y_1|X_0, X_1; X_2)$

In [13]:
# NOTE(review): placeholder — nothing happens here when pseudocount is 0; fill in or remove.
if pseudocount == 0:
    pass

# Identify sequence sets

# Analysis - observation counts $f_3(Y_0, Y_1 | X_0; X_1)$ and $f_6(Y_0, Y_1 | X_0, X_1;)$

In [None]:
def countResponses(stim_diph, distSet):
    """Return the sum of the values stored in ``distSet[stim_diph]``.

    ``distSet`` maps a stimulus diphone to a mapping whose values are added up.
    """
    return sum(distSet[stim_diph].values())

# Spot check on one diphone: prints a directly summed total, then shows countResponses for it.
# NOTE(review): myDiphoneInSeg, my_response_dist and gate3_dists are not defined in any
# visible cell — presumably they come from the (empty) import sections above; confirm.
print('test diphone: {0}'.format(myDiphoneInSeg))
print(sum(my_response_dist.values()))
countResponses(myDiphoneInSeg, gate3_dists)

In [None]:
# One row per stimulus diphone with its response total under gate3_dists.
gate3ResponseCountsDF = pd.DataFrame(
    data=[
        {'stimulus diphone': stim_diph,
         'responses': countResponses(stim_diph, gate3_dists)}
        for stim_diph in stimuli_diphones
    ]
)
gate3ResponseCountsDF.head()

In [None]:
# One row per stimulus diphone with its response total under gate6_dists.
gate6ResponseCountsDF = pd.DataFrame(
    data=[
        {'stimulus diphone': stim_diph,
         'responses': countResponses(stim_diph, gate6_dists)}
        for stim_diph in stimuli_diphones
    ]
)
gate6ResponseCountsDF.head()

In [None]:
# Histogram of the per-diphone response totals in gate3ResponseCountsDF (bin width 5).
(
    ggplot(gate3ResponseCountsDF, aes(x='responses'))
    + geom_histogram(binwidth=5)
)

In [None]:
# Histogram of the per-diphone response totals in gate6ResponseCountsDF (bin width 5).
(
    ggplot(gate6ResponseCountsDF, aes(x='responses'))
    + geom_histogram(binwidth=5)
)

**FIXME** Important follow-up question (the answer is worth keeping track of):
 - Q: *Which* diphone types have 20, 40, and 80 responses?
  - A: Any diphone containing a segment type whose responses are pooled with those of another segment type:
      - E.g. responses for any diphone containing a vowel consist of responses from the diphone gating data for both the diphone containing an unstressed version of that vowel **and** the diphone containing a stressed version.
      - E.g. responses for any diphone containing a 't' consist of responses from the diphone gating data for both 't' and taps...

# Analysis - Accurate Identification

Over all uniphone (segment) types $\phi$, what is the distribution over $p(Y = \phi|X = \phi)$?

In [14]:
# correctResp = lambda stim_seg: stim_seg if stim_seg != 'ə' else 'ʌ'

def correctResps(stim_seg):
    """Return the set of response segments that count as correct for ``stim_seg``.

    Stress is stripped with ``removeStress`` first. Three segments accept more
    than one response; every other segment is matched only by itself.
    """
    base_seg = removeStress(stim_seg)
    equivalents = {
        'ə': {'ʌ', 'ə'},
        'l̩': {'l̩', 'l'},
        'ɾ': {'ɾ', 't', 'd'},
    }
    return equivalents.get(base_seg, {base_seg})

# p(Y = \dot{x}|X = \dot{x})
def pCorrect(stim_seg):
    """Sum the entries of ``uniphone_dist[stim_seg]`` for the responses accepted as correct.

    Accepted responses (see ``correctResps``) that are absent from the
    distribution contribute nothing to the sum.
    """
    accepted = correctResps(stim_seg)
    resp_dist = uniphone_dist[stim_seg]
    return sum(resp_dist[resp] for resp in accepted if resp in resp_dist)

def p0Correct(stim_seg):
    """Sum ``p_uniphone_0(resp, stim_seg)`` over every response accepted as correct for ``stim_seg``."""
    return sum(p_uniphone_0(resp, stim_seg) for resp in correctResps(stim_seg))

def p1Correct(stim_seg):
    """Sum ``p_uniphone_1(resp, stim_seg)`` over every response accepted as correct for ``stim_seg``."""
    return sum(p_uniphone_1(resp, stim_seg) for resp in correctResps(stim_seg))

# pCorrect for every stimulus segment type.
# NOTE(review): the recorded output of this cell is "NameError: name 'stimuli_uniphones'
# is not defined" — stimuli_uniphones must be defined before this cell can run.
probs = [pCorrect(eachSegType) for eachSegType in stimuli_uniphones]
probs

def h(p):
    """Return the negated ``log`` of ``p``; a ``p`` equal to zero is returned unchanged."""
    if p == 0.0:
        return p
    return -log(p)

# h applied to each entry of probs, in the same order.
infs = list(map(h, probs))
infs

NameError: name 'stimuli_uniphones' is not defined

In [None]:
def _uniphoneAccRow(seg):
    """Build one row of the per-segment accuracy table.

    Each of pCorrect / p0Correct / p1Correct (and h of each) is evaluated once
    and reused, instead of being recomputed for every column that needs it.
    """
    p, p0, p1 = pCorrect(seg), p0Correct(seg), p1Correct(seg)
    h_p, h_p0, h_p1 = h(p), h(p0), h(p1)
    return {
        'segment x': seg,
        'p(Y = x|X = x)': p,
        'h(p(Y = x|X = x))': h_p,
        'p(Y_0 = x|X_0 = x)': p0,
        'h(p(Y_0 = x|X_0 = x))': h_p0,
        'p(Y_1 = x|X_1 = x)': p1,
        'h(p(Y_1 = x|X_1 = x))': h_p1,
        'h(p(Y_1 = x|X_1 = x)) - h(p(Y_0 = x|X_0 = x))': h_p1 - h_p0,
    }


# One row per stimulus segment type; column names and order are unchanged.
uniphoneAccDF = pd.DataFrame(data=[_uniphoneAccRow(seg) for seg in stimuli_uniphones])
uniphoneAccDF

In [None]:
# Scatter of p(Y = x|X = x) per segment, with x-axis labels rotated by 90 degrees.
(
    ggplot(uniphoneAccDF, aes(x="segment x", y="p(Y = x|X = x)"))
    + geom_point()
    + theme(axis_text_x=element_text(rotation=90))
)

In [None]:
# Scatter of h(p(Y = x|X = x)) per segment, with x-axis labels rotated by 90 degrees.
(
    ggplot(uniphoneAccDF, aes(x="segment x", y='h(p(Y = x|X = x))'))
    + geom_point()
    + theme(axis_text_x=element_text(rotation=90))
)

In [None]:
# Segment types whose pCorrect value is at most 0.5, followed by how many there are.
low_acc_segs = [stim_uniphone for stim_uniphone in stimuli_uniphones if pCorrect(stim_uniphone) <= 0.5]
len(low_acc_segs)
low_acc_segs

What explains these outliers? Given the ubiquity of vowel reduction, schwa's presence is not that surprising, but why the other vowels? What's going on with the consonants?

Before investigating any of the outliers in detail, let's first glance at the separate uniphone distributions for first uniphones at gate 3 and for second uniphones at gate 6, and then at the distribution over the difference in accuracy between the two positions; this should point out segments whose accurate identifiability is dramatically different between the two positions within a diphone.

In [None]:
# Scatter of h(p(Y_0 = x|X_0 = x)) per segment, with x-axis labels rotated by 90 degrees.
(
    ggplot(uniphoneAccDF, aes(x="segment x", y="h(p(Y_0 = x|X_0 = x))"))
    + geom_point()
    + theme(axis_text_x=element_text(rotation=90))
)

In [None]:
# Scatter of h(p(Y_1 = x|X_1 = x)) per segment, with x-axis labels rotated by 90 degrees.
(
    ggplot(uniphoneAccDF, aes(x="segment x", y="h(p(Y_1 = x|X_1 = x))"))
    + geom_point()
    + theme(axis_text_x=element_text(rotation=90))
)

In [None]:
# Largest difference h(Y_1) - h(Y_0) first; ties broken by overall h, also descending.
uniphoneAccDF.sort_values(
    by=[
        'h(p(Y_1 = x|X_1 = x)) - h(p(Y_0 = x|X_0 = x))',
        'h(p(Y = x|X = x))',
    ],
    ascending=False,
)

In [None]:
# Scatter of the per-segment difference h(p(Y_1 = x|X_1 = x)) - h(p(Y_0 = x|X_0 = x)).
# (A stray trailing `b` after the closing parenthesis made this cell a SyntaxError.)
ggplot(uniphoneAccDF, aes(x="segment x", y='h(p(Y_1 = x|X_1 = x)) - h(p(Y_0 = x|X_0 = x))')) + geom_point() + theme(axis_text_x = element_text(rotation = 90))

Dots above 0 indicate segment types whose surprisal as a second segment is higher than their surprisal as a first segment.

In [None]:
# Descending by overall h, then by h at position 0, then by h at position 1.
uniphoneAccDF.sort_values(
    by=[
        'h(p(Y = x|X = x))',
        'h(p(Y_0 = x|X_0 = x))',
        'h(p(Y_1 = x|X_1 = x))',
    ],
    ascending=False,
)

In [None]:
# Descending by h at position 0, then by overall h, then by h at position 1.
uniphoneAccDF.sort_values(
    by=[
        'h(p(Y_0 = x|X_0 = x))',
        'h(p(Y = x|X = x))',
        'h(p(Y_1 = x|X_1 = x))',
    ],
    ascending=False,
)

In [None]:
# Descending by h at position 1, then by overall h, then by h at position 0.
uniphoneAccDF.sort_values(
    by=[
        'h(p(Y_1 = x|X_1 = x))',
        'h(p(Y = x|X = x))',
        'h(p(Y_0 = x|X_0 = x))',
    ],
    ascending=False,
)

**FIXME** Import the Phonological CorpusTools feature matrices and break down $h(p(seg|seg))$ into $h(p(f_0, f_1, f_2...|f_0, f_1, f_2...))$, i.e. into the phonological features $f_0, f_1, f_2...$ of $seg$, then identify the features most/least likely to be successfully transmitted.

# Triphone Annihilations

In [None]:
def annihilation(y1, x012, verbose = False):
    """Report when exactly one of two diphone-conditioned terms for ``y1`` is zero.

    ``x012`` is a dotted triphone string 'x0.x1.x2'. The two terms are
    ``p6_y1(y1, 'x0.x1')`` and ``p3_y0(y1, 'x1.x2')``.

    Returns the pair of terms when exactly one of them equals 0.0, otherwise
    ``None``. ``None`` is also returned, before either term is computed, when
    the triphone contains ``leftEdge`` or ``rightEdge``. With ``verbose`` set,
    a detected annihilation is printed as well.

    Raises AssertionError if ``y1`` is not in ``Ys`` or ``x012`` is not in
    ``X_triphs``.
    """
    assert y1 in Ys, '{0} not a valid output segment'.format(y1)
    assert x012 in X_triphs, '{0} not a valid input triphone'.format(x012)

    segs = dottedStringToTuple(x012)
    first_diph = tupleToDottedString((segs[0], segs[1]))
    second_diph = tupleToDottedString((segs[1], segs[2]))

    # Triphones containing an edge symbol are never reported.
    if leftEdge in segs or rightEdge in segs:
        return None

    key_term = (p6_y1(y1, first_diph), p3_y0(y1, second_diph))

    # For a pair, "some zero but not all zero" means exactly one zero.
    zero_count = sum(1 for subterm in key_term if subterm == 0.0)
    if zero_count != 1:
        return None

    if verbose:
        print('Annihilation:')
        print('Xs = {0}'.format(x012))
        print('y = {0}'.format(y1))
        print('p(y = {0}|x01 = {1}) = {2}'.format(y1, first_diph, key_term[0]))
        print('p(y = {0}|x12 = {1}) = {2}'.format(y1, second_diph, key_term[1]))
    return key_term

In [None]:
# annihilations = [((y, x012), annihilation(y, x012)) for (y, x012) in product(Ys, X_triphs) if annihilation(y, x012) != None]
# len(annihilations)
# len(product(Ys, X_triphs))