In [2]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-packages-and-data" data-toc-modified-id="Import-packages-and-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import packages and data</a></span><ul class="toc-item"><li><span><a href="#Parameters" data-toc-modified-id="Parameters-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Load,-prep,-and-vet-data" data-toc-modified-id="Load,-prep,-and-vet-data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load, prep, and vet data</a></span></li></ul></li><li><span><a href="#Add-probability-annotations" data-toc-modified-id="Add-probability-annotations-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Add probability annotations</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

In [3]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            62G        4.9G         24G         12M         33G         57G
Swap:          2.0G        3.0M        2.0G


# Import packages and data

In [4]:
from collections import OrderedDict

In [5]:
import os
import csv
import json

In [6]:
from os import path

In [7]:
from funcy import *

In [8]:
from boilerplate import *
from probdist import *
from string_utils import *

In [9]:
repo_dir = os.getcwd(); repo_dir

'/mnt/cube/home/AD/emeinhar/wr'

In [10]:
import numpy as np
import torch

In [11]:
import tiledb

In [12]:
from tqdm import tqdm, tqdm_gui, tqdm_notebook

In [13]:
from joblib import Parallel, delayed

J = -1
# J = 16
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [14]:
import re

In [15]:
def corpus_contexts_name(fp):
    if 'buckeye' in fp or 'Buckeye' in fp:
        return 'Buckeye'
    elif 'nxt_swbd' in fp or 'NXT_swbd' in fp:
        return 'NXT_swbd'
    else:
        raise Exception(f"corpus context in fp {fp} is neither buckeye nor nxt_swbd")
        
def get_contexts_direction(fp):
    if 'preceding' in fp:
        return 'preceding'
    elif 'following' in fp:
        return 'following'
    elif '1gram' in fp or '(empty)' in fp:
        return '(NA)'
    else:
        raise Exception(f"corpus context direction in fp {fp} is neither 'preceding' nor 'following' nor '(NA)' (= unigram)")
        
def get_contexts_order(fp):
#     direction = get_contexts_direction(fp)
    ngram_tokens = re.findall(r"[0-5]gram", fp)
    if len(ngram_tokens) < 1:
        raise Exception(f"No instance of substring matching '[0-5]gram' in {fp}; order could not be extracted.")
    orders = [token[0] for token in ngram_tokens]
    unique_orders = set(orders)
    if len(unique_orders) != 1:
        raise Exception(f"More than one 'n' for all substring tokens matching '[0-5]gram' in {fp}; no unique order could be extracted.")
    return list(unique_orders)[0]

## Parameters

In [16]:
a = ''
# a = 'buckeye_word_analysis_relation_filtered_annotated.json'
a = 'nxt_swbd_word_analysis_relation_filtered_annotated.json'
# a = 'buckeye_word_analysis_relation_filtered.json'
# a = 'nxt_swbd_word_analysis_relation_filtered.json'


p = ''
p = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_pc0.001_l1.0_pW_WC_eq'
# p = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pW_WC_e'
# p = 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_w_GD_AmE_destressed_pW_WC_e'

m = ''
m = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy'
# m = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_2gram_model.pW_C.npy'
# m = 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_3gram_model.pW_C.npy'

w = ''
w = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

c = ''
c = 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LM_filtered_nxt_swbd_contexts_preceding_3_filtered.txt'
# c = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LM_filtered_nxt_swbd_contexts_following_1_filtered.txt'
# c = 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LM_filtered_buckeye_contexts_preceding_2_filtered.txt'

# o = a
# o = 'nxt_swbd_word_analysis_relation_filtered_annotated.json'
# o = 'buckeye_word_analysis_relation_filtered_annotated.json'

In [17]:
unigram_model_arpa_fp = 'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.arpa'
unigram_model_arpa_fp
unigram_model_json_fp = 'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.pV.json'
unigram_model_json_fp
# unigram_model_np_fp = 'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.pV.npy'
# unigram_model_np_fp

'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.arpa'

'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.pV.json'

In [18]:
for each in (a,p,w,c):
    if not path.exists(each):
        raise Exception(f"Argument path could not be found:\n\t{each}")

## Load, prep, and vet data

In [19]:
word_analysis_relation_fn = a

In [20]:
o = a

In [21]:
corpus_name = corpus_contexts_name(word_analysis_relation_fn)
corpus_name

'NXT_swbd'

In [22]:
pW_WC_dir = p

In [23]:
pW_C_fp = m

In [24]:
pW_V_fp = w

In [25]:
C_fp = c

In [26]:
order_from_c = int(get_contexts_order(c))
order_from_c
context_size = order_from_c - 1
context_size

4

3

In [27]:
direction_from_c = get_contexts_direction(c)
direction_from_c

'preceding'

In [28]:
context_field = str_join('_', [direction_from_c, str(context_size), 'wordforms'])
context_field

'preceding_3_wordforms'

In [29]:
word_analysis_relation = importDict(word_analysis_relation_fn)
len(word_analysis_relation)
word_analysis_relation[0]

113237

{'utt_id': 't1_0',
 'utt_start': 0.8,
 'utt_end': 8.851,
 'utt_duration': 8.051,
 'utt_orth': 'okay uh first um i need to know uh how do you feel about uh about sending um an elderly uh',
 'utt_speech': 'okay uh first um i need to know uh how do you feel about uh about sending um an elderly uh',
 'conversation_id': '2005',
 'speaker': 'A',
 'sex': 'M',
 'dob': 1961,
 'dialect': 'NORTH',
 'speaker_id': 'spkr1169',
 'interlocutor_id': 'spkr1133',
 'topic': 'CARE OF THE ELDERLY',
 'turn_id': 't1',
 'id': 'ms1A_pw6',
 'distance_from_left_edge_of_turn': 0,
 'distance_from_right_edge_of_turn': 0,
 'num_phonwords': 21,
 'num_syllables': 27,
 'start': 2.83775,
 'end': 3.0,
 'duration': 0.16224999999999978,
 'speech_rate': 3.3536206682399703,
 'orth': 'need',
 'speech': 'need',
 'has_extreme_speech_rate': False,
 'type': 'phonword',
 'tag': 'phonword',
 'msstate': 'sw2005A-ms98-a-0001',
 'stress': 'p',
 'child_type': 'syllable',
 'missing_syllable_info': False,
 'child_fn': 'sw2005.A.syllables.

In [30]:
pW_V = importDict(pW_V_fp)
len(pW_V)
Vs = set(pW_V.keys())
list(Vs)[:5]

13245

['camera', 'denial', 'respond', 'transaction', 'relate']

In [31]:
Ws = set(join(set(walk_values(lambda pW_v: frozenset(pW_v.keys()),
                       pW_V).values())))
type(Ws)
len(Ws)

Ws_t = tuple(sorted(Ws))
n_W = len(Ws_t)
n_W

Ws_t[:5]

set

12817

12817

('⋊.aɪ.aɪ.⋉.⋉',
 '⋊.aɪ.d.i.ə.l.i.⋉.⋉',
 '⋊.aɪ.d.i.ə.l.z.⋉.⋉',
 '⋊.aɪ.d.i.ə.l.ɪ.s.t.ɪ.k.⋉.⋉',
 '⋊.aɪ.d.i.ə.l.⋉.⋉')

In [32]:
singleRightPad = rightEdge; singleRightPad
doubleRightPad = str_join('.', [rightEdge, rightEdge]); doubleRightPad

assert all(leftEdge in w and rightEdge in w for w in Ws), f"Segmental wordforms from arg w must be padded!"

rightPadding = doubleRightPad if Ws_t[0][-3:] == doubleRightPad else singleRightPad
rightPadding

extraPadding = '.' + singleRightPad if rightPadding == doubleRightPad else ''
extraPadding

'⋉'

'⋉.⋉'

'⋉.⋉'

'.⋉'

In [33]:
transcription_field = 'transcription' if corpus_name == 'NXT_swbd' else 'phonemes'
corpus_name
transcription_field

'NXT_swbd'

'transcription'

In [34]:
len(word_analysis_relation)
noTr = lfilter(lambda rel: transcription_field not in rel, word_analysis_relation)
len(noTr)

113237

2

In [35]:
processed_orth_field = 'speech' if corpus_name == 'NXT_swbd' else 'orthographic_wordform'
corpus_name
processed_orth_field

'NXT_swbd'

'speech'

In [36]:
transcription_available = lambda rel: rel[processed_orth_field] in Vs
noTr_but_could_add_one = lfilter(transcription_available, noTr)
len(noTr_but_could_add_one)

noTr_cant_find_one = lfilter(lambda rel: not transcription_available(rel), noTr)
len(noTr_cant_find_one)
# lpluck('orth', noTr_cant_find_one)
lpluck(processed_orth_field, noTr_cant_find_one)

0

2

['marquee', 'scrimping']

In [37]:
if len(noTr_but_could_add_one) > 0:
    print(f'Adding transcription to {len(noTr_but_could_add_one)} relations')
    for rel in word_analysis_relation:
        if transcription_field not in rel and rel[processed_orth_field] in Vs:
            v = rel[processed_orth_field]
            my_Ws = tuple(sorted(pW_V[v].keys()))
            print(f"For v = {v}, choosing first transcription among {my_Ws}")
            rel[transcription_field] = my_Ws[0]
            rel['phones_length'] = len(ds2t(rel[transcription_field]))
            rel['missing_transcription?'] = True
            rel['transcription_added?'] = True
        elif transcription_field not in rel and rel[processed_orth_field] not in Vs:
            rel[transcription_field] = ''
            rel['missing_transcription?'] = True
            rel['transcription_added?'] = False
        else:
            pass


In [38]:
# list(noTr)[0]

In [39]:
segWord_field = 'segWord'

In [40]:
if not all(segWord_field in rel for rel in word_analysis_relation):
    print('Adding padded transcription field to every relation in word_analysis_relation...')
    for rel in word_analysis_relation:
        rel[segWord_field] = padInputSequenceWithBoundaries(rel[transcription_field]) + extraPadding

segWords_in_relation_unpadded = lpluck(transcription_field, word_analysis_relation)
segWords_in_relation_l = lpluck(segWord_field, word_analysis_relation)
len(segWords_in_relation_l)
segWords_in_relation = set(segWords_in_relation_l)
len(segWords_in_relation)
segWords_in_relation_t = tuple(sorted(segWords_in_relation))
len(segWords_in_relation_t)

Adding padded transcription field to every relation in word_analysis_relation...


KeyError: 'transcription'

In [40]:
segWords_missing_from_model = segWords_in_relation - Ws
len(segWords_missing_from_model)
segWords_missing_from_relation = Ws - segWords_in_relation
len(segWords_missing_from_relation)

910

1759

In [41]:
Cs_t = importSeqs(C_fp, tuple)
assert Cs_t == tuple(sorted(Cs_t))

Cs = set(Cs_t)
assert len(Cs) == len(Cs_t)

n_C = len(Cs_t)
n_C

Cs_t[:5]

15404

('<rem> a', '<rem> about', '<rem> abuse', '<rem> acting', '<rem> actually')

In [42]:
context_lengths = lmap(lambda c: len(c.split(' ')), Cs_t)
context_length_set = set(context_lengths)
context_length_range = tuple(sorted(context_length_set))
context_length_range

max_context_length = max(context_length_range)
min_context_length = min(context_length_range)
missing_context_lengths = tuple([l for l in range(1, max_context_length) if l not in context_length_set])
print(f"Num missing context lengths = {len(missing_context_lengths)}")
print(f"Missing contexts = {missing_context_lengths}")

(1, 2)

Num missing context lengths = 0
Missing contexts = ()


In [43]:
assert context_size == max_context_length, f"Context size {context_size} from arg c doesn't match max context length in the file {max_context_length}"

In [44]:
contexts_in_relation_l = lpluck(context_field, word_analysis_relation)
len(contexts_in_relation_l)
contexts_in_relation = set(contexts_in_relation_l)
len(contexts_in_relation)
contexts_in_relation_t = tuple(sorted(contexts_in_relation))
len(contexts_in_relation_t)

44127

15786

15786

In [45]:
n_C

15404

In [46]:
contexts_missing_from_model = contexts_in_relation - Cs
len(contexts_missing_from_model)
contexts_missing_from_relation = Cs - contexts_in_relation
len(contexts_missing_from_relation)

382

0

In [47]:
list(contexts_missing_from_model)[:5]

['', "uh doty's", 'yknow four', 'yknow terrible', 'fertilization i']

In [48]:
array_name = pW_WC_dir
array_name

'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_w_GD_AmE_destressed_pW_WC_e'

In [49]:
tiledb.object_type(array_name)

'array'

In [50]:
with tiledb.DenseArray(array_name, mode='r') as A:
    A.shape
    pW_WC_shape = A.shape
#     pW_WC = A[:]

(6404, 15404)

In [51]:
assert pW_WC_shape[0] == n_W, f"segWord dimension {pW_WC_shape[0]} of pW_WC array does not match dimension of W = {n_W}"
assert pW_WC_shape[1] == n_C, f"context dimension {pW_WC_shape[1]} of pW_WC array does not match dimension of C = {n_C}"

In [52]:
with tiledb.DenseArray(array_name, mode='r') as A:
#     A.shape
#     pW_WC_shape = A.shape
    pW_WC = A[:]['pW_WC_e']

In [53]:
type(pW_WC)
toHuman(pW_WC.nbytes)

numpy.ndarray

'376.31MB'

In [54]:
pW_WC.dtype
pW_WC.shape
n_W, n_C

dtype('float32')

(6404, 15404)

(6404, 15404)

In [55]:
has_inf_mask = pW_WC == np.inf
has_infs = has_inf_mask.any()
assert not has_infs
del has_inf_mask

In [56]:
has_nan_mask = pW_WC == np.NaN
has_nans = has_nan_mask.any()
assert not has_nans
del has_nan_mask

In [57]:
gtOne_mask = pW_WC > 1.0
has_gtOnes = gtOne_mask.any()
if has_gtOnes:
    deviating_probs = pW_WC[gtOne_mask]
    deviations_from_1 = deviating_probs - np.ones(deviating_probs.shape)
    print(f"Deviations from 1 = \n\t{deviations_from_1}")
    print(f"Deviating probabilities = \n\t{deviating_probs}")
    assert np.allclose(deviations_from_1, np.zeros(deviating_probs.shape), atol=1e-06)
    pW_WC[gtOne_mask] = np.ones(deviating_probs.shape)
# assert not has_gtOnes
del gtOne_mask

Deviations from 1 = 
	[1.1920929e-07 1.1920929e-07 1.1920929e-07 1.1920929e-07 1.1920929e-07
 1.1920929e-07 1.1920929e-07 1.1920929e-07 1.1920929e-07 1.1920929e-07
 1.1920929e-07 1.1920929e-07 1.1920929e-07]
Deviating probabilities = 
	[1.0000001 1.0000001 1.0000001 1.0000001 1.0000001 1.0000001 1.0000001
 1.0000001 1.0000001 1.0000001 1.0000001 1.0000001 1.0000001]


  """Entry point for launching an IPython kernel.


In [58]:
ltZero_mask = pW_WC < 0.0
has_ltZeros = ltZero_mask.any()
assert not has_ltZeros
del ltZero_mask

  """Entry point for launching an IPython kernel.


In [59]:
pW_C = np.load(pW_C_fp)
pW_C.dtype
pW_C.shape
toHuman(pW_C.nbytes)
n_W, n_C

dtype('float64')

(6404, 15404)

'752.62MB'

(6404, 15404)

In [60]:
assert pW_C.shape == pW_WC.shape, f"pW_C.shape != pW_WC.shape: {pW_C.shape} vs. {pW_WC.shape}, \n\t:pW_C_fp = {pW_C_fp}\n\t:pW_WC_dir = {pW_WC_dir}"

In [61]:
unigram_model_arpa_fp
unigram_model_json_fp

'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.arpa'

'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.pV.json'

In [62]:
pV_unigram = importProbDist(unigram_model_json_fp)

In [63]:
V_Fisher = set(pV_unigram.keys())
len(V_Fisher)

V_t_Fisher = tuple(sorted(V_Fisher))
V_t_Fisher[:5]

44066

("'and", "'berserkly'", "'bout", "'burb", "'burban")

In [64]:
no_unigram_score = {v for v in Vs if v not in V_Fisher}
len(no_unigram_score)
assert len(no_unigram_score) == 0, f"Vs without a unigram score = \n\t{no_unigram_score}"

0

In [65]:
random_v = choice(tuple(Vs)); random_v

'counseling'

In [66]:
pV_unigram[random_v]

5.652219482674406e-06

In [67]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            62G        6.1G        464M         67M         56G         56G
Swap:          2.0G        1.2M        2.0G


# Add probability annotations

In [68]:
lm_str_p_unigram = 'p' + '(W = w*)'
lm_str_h_unigram = 'h' + '(W = w*)'

In [69]:
order_from_c
context_size
direction_from_c

3

2

'preceding'

In [70]:
ctxt_str_map = {'preceding':f'C = w_-{context_size}^-1',
                'following':f'C = w_+1^+{context_size}'}
ctxt_str = ctxt_str_map[direction_from_c]
ctxt_str

'C = w_-2^-1'

In [71]:
source_str = 'W = w*'
target_str = "W' = w*"

In [72]:
no_noise_body_str = f"{source_str} | {ctxt_str}"
no_noise_body_str

noise_body_str = f"{target_str} | {source_str}, {ctxt_str}"
noise_body_str

'W = w* | C = w_-2^-1'

"W' = w* | W = w*, C = w_-2^-1"

In [73]:
lm_str_p = 'p' + '(' + no_noise_body_str + ')'
lm_str_p
lm_str_h = 'h' + '(' + no_noise_body_str + ')'
lm_str_h
lm_str_h_bar = 'h-bar' + '(' + no_noise_body_str + ')'
lm_str_h_bar

'p(W = w* | C = w_-2^-1)'

'h(W = w* | C = w_-2^-1)'

'h-bar(W = w* | C = w_-2^-1)'

In [74]:
post_str_p = 'p' + '(' + noise_body_str + ')'
post_str_p
post_str_h = 'h' + '(' + noise_body_str + ')'
post_str_h
post_str_h_bar = 'h-bar' + '(' + noise_body_str + ')'
post_str_h_bar

"p(W' = w* | W = w*, C = w_-2^-1)"

"h(W' = w* | W = w*, C = w_-2^-1)"

"h-bar(W' = w* | W = w*, C = w_-2^-1)"

In [75]:
# ctxt_str_map = {'preceding':f'p(w*|w_-{context_size}^-1)',
#                 'following':f'p(w*|w_+1^+{context_size})'}
# p_annotation_str = ctxt_str_map[direction_from_c]
# p_annotation_str
# h_annotation_str = 'h' + p_annotation_str[1:]
# h_annotation_str

In [76]:
def idx(element, collection):
    return collection.index(element)

In [80]:
orth_field = 'orthographic_wordform' if 'buckeye' in a else 'speech'

In [81]:
def annotate(rel):
    my_c = rel[context_field]
    my_w = rel[segWord_field]
    modelable_c = my_c in Cs
    modelable_w = my_w in Ws
    
    my_v = rel[orth_field]
    if my_v not in pV_unigram:
        rel[lm_str_p_unigram] = 'bad_v'
        rel[lm_str_h_unigram] = 'bad_v'
    else:
        rel[lm_str_p_unigram] = pV_unigram[my_v]
        rel[lm_str_h_unigram] = -1.0 * float(np.log2(pV_unigram[my_v]))
    
    result = []
    if not modelable_c:
        result = ['bad_c']
    if not modelable_w:
        result += ['bad_w']
    
    if len(result) > 0:
        result = str_join(',', result)
        rel[lm_str_p] = result
        rel[lm_str_h] = result
        rel[lm_str_h_bar] = result
        rel[post_str_p] = result
        rel[post_str_h] = result
        rel[post_str_h_bar] = result
        return rel
    
    c_idx = idx(my_c, Cs_t)
    w_idx = idx(my_w, Ws_t)
    
    lm_result = pW_C[w_idx, c_idx]
#     lm_bar_result = #FIXME
    post_result = pW_WC[w_idx, c_idx]
#     post_bar_result = #FIXME
    
    rel[lm_str_p] = lm_result
    rel[lm_str_h] = -1.0 * float(np.log2(lm_result))
#     rel[lm_str_h_bar] = #FIXME
    rel[post_str_p] = float(post_result)
    rel[post_str_h] = -1.0 * float(np.log2(post_result))
#     rels[post_str_h_bar] = #FIXME
    return rel

In [82]:
partially_updated_word_analysis_relation = lmap(annotate, word_analysis_relation)

In [83]:
from statistics import mean

In [84]:
allSegWords = lpluck('segWord', partially_updated_word_analysis_relation)
len(allSegWords)
allSegWords = set(allSegWords)
len(allSegWords)

44127

5555

In [85]:
segWord_to_lm_probs = dict()
segWord_to_post_probs = dict()

def avg_prob_measure_update(rel):
    #NB: STATEFUL AF!
    my_w = rel['segWord']
    if my_w in segWord_to_lm_probs and type(rel[lm_str_h]) != type("foo"):
        my_update = {rel[lm_str_h]}
        segWord_to_lm_probs[my_w] = merge(segWord_to_lm_probs[my_w], my_update)
    elif my_w not in segWord_to_lm_probs and type(rel[lm_str_h]) != type("foo"):
        segWord_to_lm_probs[my_w] = {rel[lm_str_h]}
#     else:
#         print('')
    
    if my_w in segWord_to_post_probs and type(rel[post_str_h]) != type("foo"):
        my_update = {rel[post_str_h]}
        segWord_to_post_probs[my_w] = merge(segWord_to_post_probs[my_w], my_update)
    elif my_w not in segWord_to_post_probs and type(rel[post_str_h]) != type("foo"):
        segWord_to_post_probs[my_w] = {rel[post_str_h]}
#     else:
#         print('')

for rel in tqdm(partially_updated_word_analysis_relation):
    avg_prob_measure_update(rel)

100%|██████████| 44127/44127 [00:00<00:00, 211211.68it/s]


In [86]:
avg_prob_measure_lookup = dict()

def avg_prob_measure_lookup_update(segWord):
    #NB: STATEFUL AF!
    my_w = segWord
    if my_w in avg_prob_measure_lookup:
        raise Exception('This function should only be called once for each segWord.')
    
    lm_probs = segWord_to_lm_probs.get(my_w, set())
#     lm_probs = segWord_to_lm_probs[my_w]
    post_probs = segWord_to_post_probs.get(my_w, set())
#     post_probs = segWord_to_post_probs[my_w]
    
    n_contexts_with_w_and_lm_prob = len(lm_probs)
    n_contexts_with_w_and_post_prob = len(post_probs)
    
    lm_bar_result = sum(lm_probs) / n_contexts_with_w_and_lm_prob if n_contexts_with_w_and_lm_prob != 0 else 'bad_w'
    post_bar_result = sum(post_probs) / n_contexts_with_w_and_post_prob if n_contexts_with_w_and_post_prob != 0 else 'bad_w'
    
    total_result = {lm_str_h_bar:lm_bar_result,
                    post_str_h_bar:post_bar_result}
    avg_prob_measure_lookup[my_w] = total_result
    
for segWord in tqdm(allSegWords):
    avg_prob_measure_lookup_update(segWord)

100%|██████████| 5555/5555 [00:00<00:00, 363240.86it/s]


In [87]:
def annotate_avg_prob_measures(rel):
    my_w = rel[segWord_field]
    
    my_results = avg_prob_measure_lookup.get(my_w, 'bad_w')
    
    if my_results == 'bad_w':
        rel[lm_str_h_bar] = my_results
        rel[post_str_h_bar] = my_results
        
    rel[lm_str_h_bar] = my_results[lm_str_h_bar]
    rel[post_str_h_bar] = my_results[post_str_h_bar]
    
    return rel
    
    
# straightforward, but extremely inefficient code below...
    
# def get_rels_with(key_segword, rels):
#     return lfilter(lambda rel:rel['segWord'] == key_segword, rels)

# def calc_avg_prob_measures(segWord, rels):
#     my_w = segWord
#     modelable_w = my_w in Ws
    
#     result = []
#     if not modelable_w:
#         result += ['bad_w']
    
#     if len(result) > 0:
#         result = str_join(',', result)
#         return result
    
# #     c_idx = idx(my_c, Cs_t)
# #     w_idx = idx(my_w, Ws_t)
#     all_rels_with_w = get_rels_with(my_w, rels)
#     has_a_calculated_lm_prob = lfilter(lambda r: type(r[lm_str_h]) != type("foo"), 
#                                        all_rels_with_w)
#     lm_probs = lpluck(lm_str_h, has_a_calculated_lm_prob)
#     has_a_calculated_post_prob = lfilter(lambda r: type(r[post_str_h]) != type("foo"),
#                                          all_rels_with_w)
#     post_probs = lpluck(post_str_h, has_a_calculated_post_prob)
#     n_contexts_with_w_and_lm_prob = len(has_a_calculated_lm_prob)
#     n_contexts_with_w_and_post_prob = len(has_a_calculated_post_prob)
    
# #     lm_result = pW_C[w_idx, c_idx]
#     lm_bar_result = sum(lm_probs) / n_contexts_with_w_and_lm_prob if n_contexts_with_w_and_lm_prob != 0 else 'bad_w'
# #     post_result = pW_WC[w_idx, c_idx]
#     post_bar_result = sum(post_probs) / n_contexts_with_w_and_post_prob if n_contexts_with_w_and_post_prob != 0 else 'bad_w'
    
# ##     rel[lm_str_p] = lm_result
# ##     rel[lm_str_h] = -1.0 * float(np.log2(lm_result))
# #     rel[lm_str_h_bar] = lm_bar_result
# ##     rel[post_str_p] = float(post_result)
# #     rel[post_str_h] = -1.0 * float(np.log2(post_result))
# #     rels[post_str_h_bar] = post_bar_result
# #     return lm_bar_result, post_bar_result
#     return {lm_str_h_bar:lm_bar_result,
#             post_str_h_bar:post_bar_result}

# def calc_avg_prob_measures_helper(segWord, rels):
#     return {segWord:calc_avg_prob_measures(segWord, rels)}

# # avg_prob_measure_lookup = dict(par(delayed(calc_avg_prob_measures_helper)(sw, partially_updated_word_analysis_relation)
# #                               for sw in allSegWords))

# #31 segwords/s on kotoba = waaaaaaaay too slow
# # avg_prob_measure_lookup = dict([calc_avg_prob_measures_helper(sw, partially_updated_word_analysis_relation)
# #                                 for sw in tqdm(allSegWords)])
# # avg_prob_measure_lookup = dict(lmap(partial(calc_avg_prob_measures_helper, rels=partially_updated_word_analysis_relation),
# #                                allSegWords))

# #inefficient to map over all rels because of unused subcomputations
# def annotate_avg_prob_measures(rel, rels):
# #     my_c = rel[context_field]
#     my_w = rel[segWord_field]
# #     modelable_c = my_c in Cs
#     modelable_w = my_w in Ws
    
#     result = []
# #     if not modelable_c:
# #         result = ['bad_c']
#     if not modelable_w:
#         result += ['bad_w']
    
#     if len(result) > 0:
#         result = str_join(',', result)
# #         rel[lm_str_p] = result
# #         rel[lm_str_h] = result
#         rel[lm_str_h_bar] = result
# #         rel[post_str_p] = result
# #         rel[post_str_h] = result
#         rel[post_str_h_bar] = result
#         return rel
    
# #     c_idx = idx(my_c, Cs_t)
# #     w_idx = idx(my_w, Ws_t)
#     all_rels_with_w = get_rels_with(my_w, rels)
#     has_a_calculated_lm_prob = lfilter(lambda r: type(r[lm_str_h]) != type("foo"), 
#                                        all_rels_with_w)
#     lm_probs = lpluck(lm_str_h, has_a_calculated_lm_prob)
#     has_a_calculated_post_prob = lfilter(lambda r: type(r[post_str_h]) != type("foo"),
#                                          all_rels_with_w)
#     post_probs = lpluck(post_str_h, has_a_calculated_post_prob)
#     n_contexts_with_w_and_lm_prob = len(has_a_calculated_lm_prob)
#     n_contexts_with_w_and_post_prob = len(has_a_calculated_post_prob)
    
    
# #     lm_result = pW_C[w_idx, c_idx]
#     lm_bar_result = sum(lm_probs) / n_contexts_with_w_and_lm_prob if n_contexts_with_w_and_lm_prob != 0 else 'bad_w'
# #     post_result = pW_WC[w_idx, c_idx]
#     post_bar_result = sum(post_probs) / n_contexts_with_w_and_post_prob if n_contexts_with_w_and_post_prob != 0 else 'bad_w'
    
# #     rel[lm_str_p] = lm_result
# #     rel[lm_str_h] = -1.0 * float(np.log2(lm_result))
#     rel[lm_str_h_bar] = lm_bar_result
# #     rel[post_str_p] = float(post_result)
# #     rel[post_str_h] = -1.0 * float(np.log2(post_result))
#     rels[post_str_h_bar] = post_bar_result
#     return rel

In [88]:
updated_word_analysis_relation = lmap(annotate_avg_prob_measures, 
                                      partially_updated_word_analysis_relation)
# updated_word_analysis_relation = lmap(partial(annotate_avg_prob_measures, rels=partially_updated_word_analysis_relation), 
#                                       partially_updated_word_analysis_relation)

In [89]:
updated_word_analysis_relation[0]

{'orthographic_wordform': 'set',
 'orthographic_wordform_length': 3,
 'preceding_4_wordforms': '<rem> i since i',
 'preceding_3_wordforms': 'i since i',
 'preceding_2_wordforms': 'since i',
 'preceding_1_wordforms': 'i',
 'following_1_wordforms': 'it',
 'following_2_wordforms': "it it's",
 'following_3_wordforms': "it it's okay",
 'following_4_wordforms': "it it's okay",
 'preceding_wordforms': 'because it slipped <rem> i since i',
 'following_wordforms': "it it's okay",
 'bidirectional_context': ['because it slipped <rem> i since i',
  "it it's okay"],
 'POS': 'VBD',
 'isAdj': False,
 'isAdv': False,
 'isN': False,
 'isV': True,
 'phonemes': 's.ɛ.t',
 'phones': 's.ɛ.ɾ',
 'phonemes_length': 3,
 'phones_length': 3,
 'hasSyllabicSegsInPhones': True,
 'hasAdjacentPauseOrDisfluency': False,
 'hasAdjacentFilledPause': False,
 'hasClitic': False,
 'syllables': 1,
 'beg': 45.022068,
 'end': 45.243621,
 'duration': 0.2215530000000001,
 'misalgined': False,
 'track_name': 's0501a',
 'speaker_na

In [90]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            62G        6.1G        515M         67M         56G         56G
Swap:          2.0G        1.2M        2.0G


In [91]:
len(updated_word_analysis_relation)
has_no_result_post_str_p = lfilter(lambda rel: type(rel[post_str_p]) == str,
                                   updated_word_analysis_relation)
len(has_no_result_post_str_p)

44127

9781

In [92]:
len(updated_word_analysis_relation)
has_no_result_lm_str_p_unigram = lfilter(lambda rel: type(rel[lm_str_p_unigram]) == str,
                                         updated_word_analysis_relation)
len(has_no_result_lm_str_p_unigram)

44127

368

In [None]:
%pwd

In [None]:
o

# Export

In [None]:
exportDict(o, updated_word_analysis_relation)