In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Basic-representations" data-toc-modified-id="Basic-representations-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Basic representations</a></span></li><li><span><a href="#Type-based-within-word-model" data-toc-modified-id="Type-based-within-word-model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Type-based within-word model</a></span></li><li><span><a href="#Token-based-within-word-model" data-toc-modified-id="Token-based-within-word-model-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Token-based within-word model</a></span></li><li><span><a href="#Token-based-across-words-model" data-toc-modified-id="Token-based-across-words-model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Token-based across-words model</a></span></li>

# Motivation

The goal of this notebook is to create and document code for generating one or more $n$-phone models for American English based on the NXT-annotated subset of the switchboard corpus

# Imports

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
from probdist import *
from string_utils import *

In [5]:
from funcy import *

In [6]:
from tqdm import tqdm

from joblib import Parallel, delayed

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [7]:
import numpy as np

In [8]:
# Parameters

p = ''
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.pW_V.json'
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'

p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.json' 
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json'

u = ''
# u = 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'
u = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'

o = ''
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered'
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered'
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim'

o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_X0X1X2'

# g = ''
# # g = 'False'

In [None]:
p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.pW_V.json'
u = 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'
o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered'

In [8]:
p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.json' 
u = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'
o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered'

In [9]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         95G        3.7G        1.9M         26G         28G
Swap:          2.0G         12M        2.0G


In [10]:
if 'pW_V' in p:
    pW_V = condDistsAsProbDists(importProbDist(p))
elif 'pX0X1X2' in p:
    pW = ProbDist(importProbDist(p))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

In [11]:
if 'pV' in u:
    pV = ProbDist(importProbDist(u))
elif 'pX0X1X2' in u:
    pV = pW
else:
    raise Exception(f"Unknown type of 'u' parameter = {u}")

In [12]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         95G        3.7G        1.9M         26G         28G
Swap:          2.0G         12M        2.0G


In [13]:
my_dtype = np.int64

# Basic representations

In [14]:
if 'pW_V' in p:
    # Vs = set(pW_V.keys())
    Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                         pW_V).values())
elif 'pX0X1X2' in p:
    Ws = set(conditions(pW))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

# len(Vs)
len(Ws)

14779

In [15]:
Ws_t = tuple(sorted(list(Ws)))

In [16]:
Ws_t[:5]

('⋊.aɪ.aɪ.⋉.⋉',
 '⋊.aɪ.b.i.ɛ.m.z.⋉.⋉',
 '⋊.aɪ.b.i.ɛ.m.⋉.⋉',
 '⋊.aɪ.b.i.ɛ.s.⋉.⋉',
 '⋊.aɪ.d.i.ə.l.i.⋉.⋉')

In [17]:
Ws_tt = lmap(ds2t, Ws_t)
# Ps_tt = lmap(ds2t, Ps_t)

In [18]:
Ws_tt[:5]

[('⋊', 'aɪ', 'aɪ', '⋉', '⋉'),
 ('⋊', 'aɪ', 'b', 'i', 'ɛ', 'm', 'z', '⋉', '⋉'),
 ('⋊', 'aɪ', 'b', 'i', 'ɛ', 'm', '⋉', '⋉'),
 ('⋊', 'aɪ', 'b', 'i', 'ɛ', 's', '⋉', '⋉'),
 ('⋊', 'aɪ', 'd', 'i', 'ə', 'l', 'i', '⋉', '⋉')]

In [19]:
if 'pW_V' in p:
#     Vs = set(pV.keys())
    my_Vs = set(pW_V.keys())
    
    missing_from_prior = {v for v in my_Vs if v not in pV}
    len(missing_from_prior)
    assert len(missing_from_prior) == 0
    
    missing_from_conditions = {v for v in pV if v not in pW_V}
    len(missing_from_conditions)
    
    pV_trim = ProbDist({v:pV[v] for v in my_Vs})
    assert all(v in pW_V for v in pV_trim)
    
    pW = MarginalProbDist(pW_V, pV_trim)
#     pW = ProbDist({w:sum(pV_trim[v] * pW_V[v][w]
#                      for v in pV_trim)
#                    for w in Ws_t})

0

30885

In [20]:
Vs_t = tuple(my_Vs)
Vs_t[:5]

('youngsters', 'began', 'bees', 'tempering', 'attic')

In [21]:
pW_np = distToNP(pW)
pW_np.shape

(14779,)

# Type-based within-word model

The goal of this section is to produce an $n$-phone model of
 - **within-word** phonotactics
 
where 
 - **each word type** is weighted equally.

To use an $n$-gram model estimation tool like SRILM or kenlm, we need to create a 'corpus' with one sentence per word-type, where each sentence is the (space-separated) transcription of the word type. This is the model most comparable to Dautriche et al., assuming the information about how they generated their $n-$phone model in their earlier preprint perservered into their *Cognition* paper.

I will (somewhat arbitrarily) elect to use *segmental* word-types.

In [22]:
len(Ws_t)
assert set(Ws_t) == Ws
Ws_t[:5]

14779

('⋊.aɪ.aɪ.⋉.⋉',
 '⋊.aɪ.b.i.ɛ.m.z.⋉.⋉',
 '⋊.aɪ.b.i.ɛ.m.⋉.⋉',
 '⋊.aɪ.b.i.ɛ.s.⋉.⋉',
 '⋊.aɪ.d.i.ə.l.i.⋉.⋉')

In [23]:
w0 = Ws_t[0]

In [24]:
w0

'⋊.aɪ.aɪ.⋉.⋉'

In [25]:
trimBoundariesFromSequence(w0)

'aɪ.aɪ'

In [26]:
trimBoundariesFromSequence(w0).replace('.', ' ')

'aɪ aɪ'

In [27]:
ds2sentence = lambda ds: ds.replace('.', ' ')

In [28]:
Ws_ts = lmap(lambda w: ds2sentence(trimBoundariesFromSequence(w)), Ws_t)

In [29]:
Ws_ts[:5]

['aɪ aɪ', 'aɪ b i ɛ m z', 'aɪ b i ɛ m', 'aɪ b i ɛ s', 'aɪ d i ə l i']

In [30]:
o

'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered'

In [31]:
my_type_corpus_fn = o + '_word_type_corpus.txt'
my_type_corpus_fn

'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_corpus.txt'

In [32]:
exportSeqs(my_type_corpus_fn, Ws_ts)

In [33]:
!ngram-count -help

Usage of command "ngram-count"
 -version:                 print version information
 -order:                   max ngram order
		Default value: 3
 -varprune:                pruning threshold for variable order ngrams
		Default value: 0
 -debug:                   debugging level for LM
		Default value: 0
 -recompute:               recompute lower-order counts by summation
 -sort:                    sort ngrams output
 -write-order:             output ngram counts order
		Default value: 0
 -tag:                     file tag to use in messages
 -text:                    text file to read
 -text-has-weights:        text file contains count weights
 -no-sos:                  don't insert start-of-sentence tokens
 -no-eos:                  don't insert end-of-sentence tokens
 -read:                    counts file to read
 -intersect:               intersect counts with this file
 -read-with-mincounts:     apply minimum counts when reading counts file
 -read-google:       

In [34]:
%%bash
ngram-count -order 5 -text LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_corpus.txt -addsmooth 0.01 -lm "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_5gram_model.arpa"
# ngram-count -order 5 -text LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_corpus.txt -addsmooth 0.01 -lm "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_5gram_model.arpa"

cat "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_5gram_model.arpa" | head -n 1000
# cat "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_5gram_model.arpa" | head -n 1000


\data\
ngram 1=41
ngram 2=1069
ngram 3=6162
ngram 4=11980
ngram 5=11294

\1-grams:
-0.8598456	</s>
-99	<s>	-3.344613
-1.876894	aɪ	-3.025825
-2.453138	aʊ	-2.739682
-1.759276	b	-3.230759
-1.419362	d	-3.620923
-2.162018	dʒ	-3.03988
-1.705824	eɪ	-3.194059
-1.840841	f	-3.388049
-2.011202	g	-3.035421
-2.202762	h	-3.275283
-1.52312	i	-3.663747
-2.233604	j	-3.202615
-1.390602	k	-3.485464
-1.347616	l	-3.559835
-1.594601	m	-3.275356
-1.240544	n	-3.667866
-1.750735	oʊ	-3.151592
-1.581866	p	-3.571213
-1.257095	s	-3.802353
-1.224465	t	-3.567616
-2.300318	tʃ	-2.920782
-2.011619	u	-3.189631
-1.9216	v	-3.256412
-2.076694	w	-3.387295
-1.519079	z	-3.379411
-1.69163	æ	-3.649051
-2.922247	ð	-2.370153
-1.838596	ŋ	-3.351165
-1.677886	ɑ	-3.524691
-2.89592	ɔɪ	-2.24917
-1.363628	ə	-3.852805
-1.657314	ɚ	-3.490672
-1.59893	ɛ	-3.741518
-1.093631	ɪ	-4.372005
-1.319542	ɹ	-4.205064
-1.979881	ʃ	-3.244264
-2.609519	ʊ	-2.645634
-1.9877	ʌ	-3.331384
-3.184331	ʒ	-2.153266
-2.546603	θ	-2.488304

\2-grams:
-2.272004	<s> aɪ

In [37]:
%%bash

ngram -order 5 -lm "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_5gram_model.arpa" -ppl "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_corpus.txt" -debug 1 > "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_corpus.log10pW"
# ngram -order 5 -lm "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_5gram_model.arpa" -ppl "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_corpus.txt" -debug 1 > "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_corpus.log10pW"

reading 41 1-grams
reading 1069 2-grams
reading 6162 3-grams
reading 11980 4-grams
reading 11294 5-grams


In [38]:
!cat -n LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_corpus.log10pW | head -n 20
# !cat -n LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_corpus.log10pW | head -n 20

     1	aɪ aɪ
     2	1 sentences, 2 words, 0 OOVs
     3	0 zeroprobs, logprob= -7.065212 ppl= 226.5012 ppl1= 3408.833
     4	
     5	aɪ b i ɛ m z
     6	1 sentences, 6 words, 0 OOVs
     7	0 zeroprobs, logprob= -6.20959 ppl= 7.710531 ppl1= 10.83756
     8	
     9	aɪ b i ɛ m
    10	1 sentences, 5 words, 0 OOVs
    11	0 zeroprobs, logprob= -4.649106 ppl= 5.954579 ppl1= 8.507879
    12	
    13	aɪ b i ɛ s
    14	1 sentences, 5 words, 0 OOVs
    15	0 zeroprobs, logprob= -4.267175 ppl= 5.142756 ppl1= 7.135675
    16	
    17	aɪ d i ə l i
    18	1 sentences, 6 words, 0 OOVs
    19	0 zeroprobs, logprob= -5.092857 ppl= 5.34012 ppl1= 7.060078
    20	
cat: write error: Broken pipe


In [39]:
scored_lines = importSeqs("LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_corpus.log10pW", list)
# scored_lines = importSeqs("LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_corpus.log10pW", list)

In [40]:
chunked_lines = lchunks(4, scored_lines)
chunked_lines[0]
chunked_lines[1]
' '
chunked_lines[-3]
chunked_lines[-2]
chunked_lines[-1]

['aɪ aɪ',
 '1 sentences, 2 words, 0 OOVs',
 '0 zeroprobs, logprob= -7.065212 ppl= 226.5012 ppl1= 3408.833',
 '']

['aɪ b i ɛ m z',
 '1 sentences, 6 words, 0 OOVs',
 '0 zeroprobs, logprob= -6.20959 ppl= 7.710531 ppl1= 10.83756',
 '']

' '

['θ ʌ n d ɚ s t oʊ ɹ m z',
 '1 sentences, 11 words, 0 OOVs',
 '0 zeroprobs, logprob= -8.760152 ppl= 5.370475 ppl1= 6.257163',
 '']

['θ ʌ n d ɚ',
 '1 sentences, 5 words, 0 OOVs',
 '0 zeroprobs, logprob= -4.811982 ppl= 6.338653 ppl1= 9.17057',
 '']

['file LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_corpus.txt: 14779 sentences, 92247 words, 0 OOVs',
 '0 zeroprobs, logprob= -84405.6 ppl= 6.146753 ppl1= 8.222344']

In [41]:
chunked_lines[:-1][-1]

['θ ʌ n d ɚ',
 '1 sentences, 5 words, 0 OOVs',
 '0 zeroprobs, logprob= -4.811982 ppl= 6.338653 ppl1= 9.17057',
 '']

In [42]:
chunked_lines = chunked_lines[:-1]

In [43]:
chunked_lines[0]
' '
chunked_lines[0][2]
chunked_lines[0][2].split(' ')

['aɪ aɪ',
 '1 sentences, 2 words, 0 OOVs',
 '0 zeroprobs, logprob= -7.065212 ppl= 226.5012 ppl1= 3408.833',
 '']

' '

'0 zeroprobs, logprob= -7.065212 ppl= 226.5012 ppl1= 3408.833'

['0',
 'zeroprobs,',
 'logprob=',
 '-7.065212',
 'ppl=',
 '226.5012',
 'ppl1=',
 '3408.833']

In [44]:
10 ** -7.094849
np.log10(10 ** -7.094849)
(10 ** -7.094849) ** -(1 / 8) #ppl1 = excluding EOS
(10 ** -7.094849) ** -(1 / 9) #ppl = including (automatically inserted) EOS
# (10 ** -7.094849) ** -(1 / 10)

8.038055490376993e-08

-7.094849

7.706481230463152

6.142095369810131

In [45]:
padInputSequenceWithBoundaries('f.i.f') + '.' + rightEdge

'⋊.f.i.f.⋉.⋉'

In [46]:
def parse_chunk(chunk):
    spaced_word = chunk[0]
    
    word_length_no_edges = len(spaced_word.split(' '))
    word_length_with_edges = word_length_no_edges + 2
    
    score_info = chunk[2]
    score_info_split = score_info.split(' ')
    log10prob = float(score_info_split[3])
    ppl = float(score_info_split[5])
    ppl1 = float(score_info_split[7])
    
    prob = np.power(10, log10prob)
    surprisal = -1.0 * np.log2(prob)
    parse = {'w':padInputSequenceWithBoundaries(spaced_word.replace(' ', '.')) + '.' + rightEdge,
             'spaced_word':spaced_word,
             'word_length_no_edges':word_length_no_edges,
             'word_length_with_edges':word_length_with_edges,
             'log10prob':log10prob,
             'ppl':ppl,
             'ppl1':ppl1,
             'prob':prob,
             'surprisal':surprisal}
    return parse

parse_chunk(chunked_lines[0])

{'w': '⋊.aɪ.aɪ.⋉.⋉',
 'spaced_word': 'aɪ aɪ',
 'word_length_no_edges': 2,
 'word_length_with_edges': 4,
 'log10prob': -7.065212,
 'ppl': 226.5012,
 'ppl1': 3408.833,
 'prob': 8.605735623380343e-08,
 'surprisal': 23.47012623913533}

In [47]:
parsed_chunks = lmap(parse_chunk,
                     chunked_lines)
parsed_chunks[0]
parsed_chunks[-1]

{'w': '⋊.aɪ.aɪ.⋉.⋉',
 'spaced_word': 'aɪ aɪ',
 'word_length_no_edges': 2,
 'word_length_with_edges': 4,
 'log10prob': -7.065212,
 'ppl': 226.5012,
 'ppl1': 3408.833,
 'prob': 8.605735623380343e-08,
 'surprisal': 23.47012623913533}

{'w': '⋊.θ.ʌ.n.d.ɚ.⋉.⋉',
 'spaced_word': 'θ ʌ n d ɚ',
 'word_length_no_edges': 5,
 'word_length_with_edges': 7,
 'log10prob': -4.811982,
 'ppl': 6.338653,
 'ppl1': 9.17057,
 'prob': 1.541764352410409e-05,
 'surprisal': 15.98505819789228}

In [48]:
len(lpluck('w', parsed_chunks))
len(set(lpluck('w', parsed_chunks)))
assert len(lpluck('w', parsed_chunks)) == len(set(lpluck('w', parsed_chunks)))

14779

14779

In [49]:
W_surprisal_relation = lmap(partial(project, keys=['w', 'surprisal']),
                            parsed_chunks)

In [50]:
W_surprisal_relation[0]

{'w': '⋊.aɪ.aɪ.⋉.⋉', 'surprisal': 23.47012623913533}

In [51]:
saveDictList_as_TSV('LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_word_type_pW_nphone.tsv',
                    W_surprisal_relation,
                    ['w', 'surprisal'])
# saveDictList_as_TSV('LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_word_type_pW_nphone.tsv',
#                     W_surprisal_relation,
#                     ['w', 'surprisal'])

In [36]:
#FIXME continue...

# Token-based within-word model

The goal of this section is to produce a more broadly/ecologically representative $n$-phone model of
 - **within-word** phonotactics
 
where
 - **each word-token** in the corpus (that has a transcription!) is weighted equally.

To use an $n$-gram model estimation tool like SRILM or kenlm, we need to create a 'corpus' with one sentence per word token, where each sentence is the (space-separated) transcription of the word token.

In [None]:
#need every word token in the corpus

# Token-based across-words model

The goal of this section is to produce an even more broadly/ecologically representative $n$-phone model of
 - **across-word** phonotactics

To use an $n$-gram model estimation tool like SRILM or kenlm, we need to create a 'corpus' with one sentence per utterance, where each sentence is the (space-separated) transcriptions of the word tokens in the utterance.

In [None]:
#need every utterance in the corpus