In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-packages-and-data" data-toc-modified-id="Import-packages-and-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import packages and data</a></span><ul class="toc-item"><li><span><a href="#Parameters" data-toc-modified-id="Parameters-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Load,-prep,-and-vet-data" data-toc-modified-id="Load,-prep,-and-vet-data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load, prep, and vet data</a></span></li></ul></li><li><span><a href="#Add-probability-annotations" data-toc-modified-id="Add-probability-annotations-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Add probability annotations</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

In [2]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            62G        4.9G         24G         12M         33G         57G
Swap:          2.0G        3.0M        2.0G


# Import packages and data

In [3]:
from collections import OrderedDict

In [4]:
import os
import csv
import json

In [5]:
from os import path

In [6]:
from funcy import *

In [7]:
from boilerplate import *
from probdist import *
from string_utils import *

In [8]:
repo_dir = os.getcwd(); repo_dir

'/mnt/cube/home/AD/emeinhar/wr'

In [9]:
import numpy as np
import torch

In [10]:
import sparse

In [11]:
import tiledb

In [12]:
from tqdm import tqdm, tqdm_gui, tqdm_notebook

In [13]:
from joblib import Parallel, delayed

J = -1
# J = 16
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [14]:
import re

In [15]:
def corpus_contexts_name(fp):
    if 'buckeye' in fp or 'Buckeye' in fp:
        return 'Buckeye'
    elif 'nxt_swbd' in fp or 'NXT_swbd' in fp:
        return 'NXT_swbd'
    else:
        raise Exception(f"corpus context in fp {fp} is neither buckeye nor nxt_swbd")
        
def get_contexts_direction(fp):
    if 'preceding' in fp:
        return 'preceding'
    elif 'following' in fp:
        return 'following'
    elif '1gram' in fp or '(empty)' in fp:
        return '(NA)'
    else:
        raise Exception(f"corpus context direction in fp {fp} is neither 'preceding' nor 'following' nor '(NA)' (= unigram)")
        
def get_contexts_order(fp):
#     direction = get_contexts_direction(fp)
    ngram_tokens = re.findall(r"[0-5]gram", fp)
    if len(ngram_tokens) < 1:
        raise Exception(f"No instance of substring matching '[0-5]gram' in {fp}; order could not be extracted.")
    orders = [token[0] for token in ngram_tokens]
    unique_orders = set(orders)
    if len(unique_orders) != 1:
        raise Exception(f"More than one 'n' for all substring tokens matching '[0-5]gram' in {fp}; no unique order could be extracted.")
    return list(unique_orders)[0]

def get_pseudocount(fp):
    pc_tokens = re.findall(r"pc0.[01]*", fp)
    if len(pc_tokens) < 1:
        raise Exception(f"No instance of substring matching 'pc0.[01]*' in {fp}; pseudocount could not be extracted.")
    pcs = [token.split('pc')[1] for token in pc_tokens]
    unique_pcs = set(pcs)
    if len(unique_pcs) != 1:
        raise Exception(f"More than one pseudocount for all substring tokens matching 'pc0.[01]*' in {fp}; no unique pseudocount could be extracted.")
    return list(unique_pcs)[0]

def get_lambda(fp):
    l_tokens = re.findall(r"l[01][.][0-9]*", fp)
    if len(l_tokens) < 1:
        raise Exception(f"No instance of substring matching 'l[01][.][0-9]*' in {fp}; lambda could not be extracted.")
    ls = [token.split('l')[1] for token in l_tokens]
    unique_ls = set(ls)
    if len(unique_ls) != 1:
        raise Exception(f"More than one lambda for all substring tokens matching 'l[01][.][0-9]*' in {fp}; no unique lambda could be extracted.")
    return list(unique_ls)[0]

## Parameters

In [16]:
a = ''
# a = 'nxt_swbd_word_analysis_relation_filtered_annotated.json'
# a = 'buckeye_word_analysis_relation_filtered_annotated.json'
# a = 'buckeye_word_analysis_relation_filtered.json'
# a = 'nxt_swbd_word_analysis_relation_filtered.json'


p = ''
# p = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_pc0.001_l1.0_pW_WC_eq'
# p = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pW_WC_e'
# p = 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_w_GD_AmE_destressed_pW_WC_e'

m = ''
# m = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy'
# m = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_2gram_model.pW_C.npy'
# m = 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_3gram_model.pW_C.npy'

w = ''
# w = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

c = ''
# c = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LM_filtered_nxt_swbd_contexts_following_3_filtered.txt'
# c = 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LM_filtered_nxt_swbd_contexts_following_1_filtered.txt'
# c = 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LM_filtered_buckeye_contexts_preceding_2_filtered.txt'

# o = a
# o = 'nxt_swbd_word_analysis_relation_filtered_annotated.json'
# o = 'buckeye_word_analysis_relation_filtered_annotated.json'

In [17]:
# a = 'nxt_swbd_word_analysis_relation_filtered_annotated.json'
# p = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_pc0.001_l1.0_pW_WC_eq'
# m = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered(empty)_(NA)_contexts.pW_C.npy'
# w = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.json'
# c = ''

In [22]:
# a = 'buckeye_word_analysis_relation_filtered_annotated.json'
# p = 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_pc0.001_l1.0_pW_WC_eq'
# m = 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_2gram_model.pW_C.npy'
# w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'
# c = 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LM_filtered_buckeye_contexts_following_1_filtered.txt'

In [23]:
arg_bundle = {'a':a,
              'p':p,
              'm':m,
              'w':w,
              'c':c}

In [24]:
# unigram_model_arpa_fp = 'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.arpa'
# unigram_model_arpa_fp
# unigram_model_json_fp = 'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.pV.json'
# unigram_model_json_fp
# # unigram_model_np_fp = 'LM_Fisher/LD_Fisher_vocab_add1_unigram_model.pV.npy'
# # unigram_model_np_fp

In [25]:
for each in (a,p,w,c):
    if each != c or c != '':
        if not path.exists(each):
            raise Exception(f"Argument path could not be found:\n\t{each}")
    else:
        if each == c and c == '' and '(empty)' not in p:
            raise Exception(f"Can only pass empty context arg for unigram models...")

## Load, prep, and vet data

In [26]:
word_analysis_relation_fn = a

In [27]:
o = a

In [28]:
corpus_name = corpus_contexts_name(word_analysis_relation_fn)
corpus_name

'Buckeye'

In [29]:
pW_WC_dir = p

In [30]:
p

'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_pc0.001_l1.0_pW_WC_eq'

In [31]:
my_lambda = get_lambda(p)
my_lambda
assert my_lambda in lmap(str, [1.0, 0.5, 0.25, 0.125])
my_alpha = get_pseudocount(p)
my_alpha
assert my_alpha in lmap(str, [0.0, 0.001, 0.01, 0.1])

'1.0'

'0.001'

In [32]:
pW_C_fp = m

In [33]:
pW_V_fp = w

In [34]:
C_fp = c

In [35]:
if C_fp != '':
    order_from_c = int(get_contexts_order(c))
    order_from_c
    context_size = order_from_c - 1
    context_size
else:
    order_from_c = 1
    context_size = 0

2

1

In [36]:
if C_fp != '':
    direction_from_c = get_contexts_direction(c)
    direction_from_c
else:
    direction_from_c = '(NA)'

'following'

In [37]:
if C_fp != '':
    context_field = str_join('_', [direction_from_c, str(context_size), 'wordforms'])
    context_field
else:
    context_field = None

'following_1_wordforms'

In [38]:
word_analysis_relation = importDict(word_analysis_relation_fn)
len(word_analysis_relation)
word_analysis_relation[0]

44127

{'orthographic_wordform': 'set',
 'orthographic_wordform_length': 3,
 'preceding_4_wordforms': '<rem> i since i',
 'preceding_3_wordforms': 'i since i',
 'preceding_2_wordforms': 'since i',
 'preceding_1_wordforms': 'i',
 'following_1_wordforms': 'it',
 'following_2_wordforms': "it it's",
 'following_3_wordforms': "it it's okay",
 'following_4_wordforms': "it it's okay <s>",
 'preceding_wordforms': '<s> because it slipped <rem> i since i',
 'following_wordforms': "it it's okay <s>",
 'bidirectional_context': ['<s> because it slipped <rem> i since i',
  "it it's okay <s>"],
 'POS': 'VBD',
 'isAdj': False,
 'isAdv': False,
 'isN': False,
 'isV': True,
 'phonemes': 's.ɛ.t',
 'phones': 's.ɛ.ɾ',
 'phonemes_length': 3,
 'phones_length': 3,
 'hasSyllabicSegsInPhones': True,
 'hasAdjacentPauseOrDisfluency': False,
 'hasAdjacentFilledPause': False,
 'hasClitic': False,
 'syllables': 1,
 'beg': 45.022068,
 'end': 45.243621,
 'duration': 0.2215530000000001,
 'misalgined': False,
 'track_name': 's

In [39]:
pW_V = importDict(pW_V_fp)
len(pW_V)
Vs = set(pW_V.keys())
list(Vs)[:5]

6575

['front', 'dream', 'nerds', 'faintest', 'centralized']

In [40]:
Ws = set(join(set(walk_values(lambda pW_v: frozenset(pW_v.keys()),
                       pW_V).values())))
type(Ws)
len(Ws)

Ws_t = tuple(sorted(Ws))
n_W = len(Ws_t)
n_W

Ws_t[:5]

set

6404

6404

('⋊.aɪ.d.i.ɑ.l.ʌ.dʒ.i.⋉.⋉',
 '⋊.aɪ.d.i.ʌ.l.i.⋉.⋉',
 '⋊.aɪ.d.i.ʌ.l.ɪ.s.t.ɪ.k.⋉.⋉',
 '⋊.aɪ.d.i.ʌ.z.⋉.⋉',
 '⋊.aɪ.d.i.ʌ.⋉.⋉')

In [42]:
# pW_V['camera']

In [43]:
for each_v in pW_V:
    assert len(set(pW_V[each_v].keys())) == 1, f"Each orthographic word is assumed to map to a single unique segmental wordform: '{v}' maps to '[{set(pW_V[each_v].keys())}]'"

In [44]:
V_to_W = {v:list(pW_V[v].keys())[0]
          for v in pW_V}

In [45]:
processed_orth_field = 'speech' if corpus_name == 'NXT_swbd' else 'orthographic_wordform'
corpus_name
processed_orth_field

'Buckeye'

'orthographic_wordform'

In [46]:
segWord_field = 'segWord'

In [47]:
for rel in tqdm(word_analysis_relation):
    my_v = rel[processed_orth_field]
    if my_v not in Vs:
        print(f"{my_v} not in LM...")
        rel[segWord_field] = None
    else:
        rel[segWord_field] = V_to_W[my_v]

 52%|█████▏    | 23064/44127 [00:00<00:00, 215857.08it/s]

married not in LM...
stergle not in LM...
wexner not in LM...
different not in LM...
pencil not in LM...
cartoony not in LM...
parents not in LM...
medical not in LM...
primary not in LM...
cultural not in LM...
people not in LM...
airport not in LM...
gallery not in LM...
people not in LM...
people not in LM...
people not in LM...
gallery not in LM...
wexner not in LM...
couple not in LM...
couple not in LM...
people not in LM...
people not in LM...
people not in LM...
interesting not in LM...
political not in LM...
able not in LM...
little not in LM...
gorilla not in LM...
lecturers not in LM...
different not in LM...
people not in LM...
history not in LM...
able not in LM...
prepared not in LM...
different not in LM...
people not in LM...
different not in LM...
people not in LM...
tolerant not in LM...
people not in LM...
racists not in LM...
sure not in LM...
sure not in LM...
circles not in LM...
circles not in LM...
sprouts not in LM...
circles not in LM...
sure not in LM...
awar

100%|██████████| 44127/44127 [00:00<00:00, 176727.95it/s]


married not in LM...
medical not in LM...
medicare not in LM...
people not in LM...
people not in LM...
people not in LM...
parents not in LM...
little not in LM...
parents not in LM...
local not in LM...
differently not in LM...
areas not in LM...
grocery not in LM...
people not in LM...
sensational not in LM...
couple not in LM...
people not in LM...
affair not in LM...
people not in LM...
little not in LM...
away not in LM...
people not in LM...
barrage not in LM...
somewhere not in LM...
local not in LM...
channels not in LM...
somewhere not in LM...
different not in LM...
carried not in LM...
people not in LM...
personal not in LM...
personal not in LM...
insurance not in LM...
insurance not in LM...
regrettable not in LM...
level not in LM...
military not in LM...
people not in LM...
marriages not in LM...
summarized not in LM...
furor not in LM...
regrettable not in LM...
anywhere not in LM...
personal not in LM...
interesting not in LM...
primaries not in LM...
interesting not




In [48]:
len(word_analysis_relation)
no_seg_word_in_model = lfilter(lambda rel: segWord_field not in rel,
                               word_analysis_relation)
len(no_seg_word_in_model)

44127

0

In [49]:
# singleRightPad = rightEdge; singleRightPad
# doubleRightPad = str_join('.', [rightEdge, rightEdge]); doubleRightPad

# assert all(leftEdge in w and rightEdge in w for w in Ws), f"Segmental wordforms from arg w must be padded!"

# rightPadding = doubleRightPad if Ws_t[0][-3:] == doubleRightPad else singleRightPad
# rightPadding

# extraPadding = '.' + singleRightPad if rightPadding == doubleRightPad else ''
# extraPadding

In [50]:
# transcription_field = 'transcription' if corpus_name == 'NXT_swbd' else 'phonemes'
# corpus_name
# transcription_field

In [51]:
# len(word_analysis_relation)
# noTr = lfilter(lambda rel: transcription_field not in rel, word_analysis_relation)
# len(noTr)

In [52]:
# processed_orth_field = 'speech' if corpus_name == 'NXT_swbd' else 'orthographic_wordform'
# corpus_name
# processed_orth_field

In [53]:
# transcription_available = lambda rel: rel[processed_orth_field] in Vs
# noTr_but_could_add_one = lfilter(transcription_available, noTr)
# len(noTr_but_could_add_one)

# noTr_cant_find_one = lfilter(lambda rel: not transcription_available(rel), noTr)
# len(noTr_cant_find_one)
# # lpluck('orth', noTr_cant_find_one)
# lpluck(processed_orth_field, noTr_cant_find_one)

In [54]:
# if len(noTr_but_could_add_one) > 0:
#     print(f'Adding transcription to {len(noTr_but_could_add_one)} relations')
#     for rel in word_analysis_relation:
#         if transcription_field not in rel and rel[processed_orth_field] in Vs:
#             v = rel[processed_orth_field]
#             my_Ws = tuple(sorted(pW_V[v].keys()))
#             print(f"For v = {v}, choosing first transcription among {my_Ws}")
#             rel[transcription_field] = my_Ws[0]
#             rel['phones_length'] = len(ds2t(rel[transcription_field]))
#             rel['missing_transcription?'] = True
#             rel['transcription_added?'] = True
#         elif transcription_field not in rel and rel[processed_orth_field] not in Vs:
#             rel[transcription_field] = ''
#             rel['missing_transcription?'] = True
#             rel['transcription_added?'] = False
#         else:
#             pass


In [55]:
# list(noTr)[0]

In [56]:
# segWord_field = 'segWord'

In [57]:
# if not all(segWord_field in rel for rel in word_analysis_relation):
#     print('Adding padded transcription field to every relation in word_analysis_relation...')
#     for rel in word_analysis_relation:
#         rel[segWord_field] = padInputSequenceWithBoundaries(rel[transcription_field]) + extraPadding

# segWords_in_relation_unpadded = lpluck(transcription_field, word_analysis_relation)
# segWords_in_relation_l = lpluck(segWord_field, word_analysis_relation)
# len(segWords_in_relation_l)
# segWords_in_relation = set(segWords_in_relation_l)
# len(segWords_in_relation)
# segWords_in_relation_t = tuple(sorted(segWords_in_relation))
# len(segWords_in_relation_t)

In [58]:
# segWords_missing_from_model = segWords_in_relation - Ws
# len(segWords_missing_from_model)
# segWords_missing_from_relation = Ws - segWords_in_relation
# len(segWords_missing_from_relation)

In [59]:
if C_fp != '':
    Cs_t = importSeqs(C_fp, tuple)
    assert Cs_t == tuple(sorted(Cs_t))

    Cs = set(Cs_t)
    assert len(Cs) == len(Cs_t)

    n_C = len(Cs_t)
    n_C

    Cs_t[:5]
else:
    Cs_t = tuple([''])
    Cs = {''}
    n_C = 1

3213

('a', "a's", 'abercrombie', 'ability', 'able')

In [60]:
context_lengths = lmap(lambda c: len(c.split(' ')), Cs_t)
context_length_set = set(context_lengths)
context_length_range = tuple(sorted(context_length_set))
context_length_range

max_context_length = max(context_length_range)
min_context_length = min(context_length_range)
missing_context_lengths = tuple([l for l in range(1, max_context_length) if l not in context_length_set])
print(f"Num missing context lengths = {len(missing_context_lengths)}")
print(f"Missing contexts = {missing_context_lengths}")

(1,)

Num missing context lengths = 0
Missing contexts = ()


In [61]:
if c != '':
    assert context_size == max_context_length, f"Context size {context_size} from arg c doesn't match max context length in the file {max_context_length}"

In [62]:
if c != '':
    contexts_in_relation_l = lpluck(context_field, word_analysis_relation)
    len(contexts_in_relation_l)
    contexts_in_relation = set(contexts_in_relation_l)
    len(contexts_in_relation)
    contexts_in_relation_t = tuple(sorted(contexts_in_relation))
    len(contexts_in_relation_t)

44127

3306

3306

In [63]:
n_C

3213

In [64]:
if c != '':
    contexts_missing_from_model = contexts_in_relation - Cs
    len(contexts_missing_from_model)
    contexts_missing_from_relation = Cs - contexts_in_relation
    len(contexts_missing_from_relation)

93

0

In [65]:
if c != '':
    list(contexts_missing_from_model)[:5]

["amy's", 'hypo', 'femalee', 'fatherly', 'machineries']

In [66]:
array_name = pW_WC_dir
array_name

'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_pc0.001_l1.0_pW_WC_eq'

In [67]:
tiledb.object_type(array_name)

'array'

In [68]:
if '_eq' in array_name:
    array_type = tiledb.SparseArray
else:
    array_type = tiledb.DenseArray

with array_type(array_name, mode='r') as A:
    A.shape
    pW_WC_shape = A.shape
#     pW_WC = A[:]

(6404, 3213)

In [69]:
pW_WC_shape

(6404, 3213)

In [70]:
len(Cs_t)

3213

In [71]:
assert pW_WC_shape[0] == n_W, f"segWord dimension {pW_WC_shape[0]} of pW_WC array does not match dimension of W = {n_W}"
assert pW_WC_shape[1] == n_C, f"context dimension {pW_WC_shape[1]} of pW_WC array does not match dimension of C = {n_C}"

In [72]:
# rand_col_idx = choice(np.arange(pW_WC_shape[1]))
# rand_col_idx

In [73]:
# with array_type(array_name, mode='r') as A:
#     whole_mat = A[:]
# #     rand_row.shape
#     type(whole_mat)

In [74]:
# whole_mat_sparse = sparse.COO(coords=[lmap(first, whole_mat['coords']),
#                                       lmap(second, whole_mat['coords'])],
#                               data=whole_mat['pW_WC_eq'], shape=pW_WC_shape)

In [75]:
# with array_type(array_name, mode='r') as A:
#     rand_col = A[:, rand_col_idx]

In [76]:
# rand_col
# rand_col['pW_WC_eq']
# Ws_t[rand_col['coords'][0][0]]
# Cs_t[rand_col['coords'][0][1]]

In [77]:
# whole_mat_sparse[12665, 28081]

In [78]:
# rand_col['coords']
# lmap(first, rand_col['coords'])
# lmap(second, rand_col['coords'])
# [lmap(first, rand_col['coords']),
#  lmap(second, rand_col['coords'])]

In [79]:
# sparse.COO(coords=[lmap(first, rand_col['coords']),
#                    lmap(second, rand_col['coords'])],
#            data=rand_col['pW_WC_eq'])
# sparse.COO(coords=[lmap(first, rand_col['coords']),
#                    lmap(second, rand_col['coords'])],
#            data=rand_col['pW_WC_eq']).todense()

In [80]:
if array_type == tiledb.DenseArray:
    with array_type(array_name, mode='r') as A:
    #     A.shape
    #     pW_WC_shape = A.shape
        pW_WC = A[:]['pW_WC_e']
else:
    with array_type(array_name, mode='r') as A:
    #     A.shape
    #     pW_WC_shape = A.shape
#     pW_WC = A[:]['pW_WC_eq']
        whole_mat = A[:]
        whole_mat_sparse = sparse.COO(coords=[lmap(first, whole_mat['coords']),
                                          lmap(second, whole_mat['coords'])],
                                      data=whole_mat['pW_WC_eq'], 
                                      shape=pW_WC_shape,
                                      fill_value=np.nan)
        pW_WC = whole_mat_sparse.todense()

In [81]:
type(pW_WC)
toHuman(pW_WC.nbytes)

numpy.ndarray

'78.49MB'

In [82]:
pW_WC.dtype
pW_WC.shape
n_W, n_C

assert pW_WC.shape == (n_W, n_C)

dtype('float32')

(6404, 3213)

(6404, 3213)

In [83]:
has_inf_mask = pW_WC == np.inf
sum(has_inf_mask)
has_infs = has_inf_mask.any()
has_infs
# assert not has_infs
del has_inf_mask

array([0, 0, 0, ..., 0, 0, 0])

False

In [84]:
has_nan_mask = pW_WC == np.NaN
has_nans = has_nan_mask.any()
has_nans
assert not has_nans
del has_nan_mask

False

In [85]:
# np.array([p - 1])
# np.allclose(np.array([p - 1]), np.zeros(np.array([p - 1]).shape), atol=1e-06)

In [86]:
gtOne_mask = pW_WC > 1.0
has_gtOnes = gtOne_mask.any()
if has_gtOnes:
    deviating_probs = pW_WC[gtOne_mask]
    deviations_from_1 = deviating_probs - np.ones(deviating_probs.shape)
    print(f"Deviations from 1 = \n\t{deviations_from_1}")
    print(f"Deviating probabilities = \n\t{deviating_probs}")
#     assert np.allclose(deviations_from_1, np.zeros(deviating_probs.shape), atol=1e-06)
    pW_WC[gtOne_mask] = np.ones(deviating_probs.shape)
# assert not has_gtOnes
del gtOne_mask

  """Entry point for launching an IPython kernel.


In [87]:
ltZero_mask = pW_WC < 0.0
has_ltZeros = ltZero_mask.any()
assert not has_ltZeros
del ltZero_mask

  """Entry point for launching an IPython kernel.


In [88]:
pW_C = np.load(pW_C_fp)
pW_C.dtype
pW_C.shape
toHuman(pW_C.nbytes)
n_W, n_C

dtype('float64')

(6404, 3213)

'156.98MB'

(6404, 3213)

In [89]:
assert pW_C.shape == pW_WC.shape, f"pW_C.shape != pW_WC.shape: {pW_C.shape} vs. {pW_WC.shape}:\n\tpW_C_fp = {pW_C_fp}\n\tpW_WC_dir = {pW_WC_dir}"

In [None]:
# unigram_model_arpa_fp
# unigram_model_json_fp

In [None]:
# pV_unigram = importProbDist(unigram_model_json_fp)

In [None]:
# V_Fisher = set(pV_unigram.keys())
# len(V_Fisher)

# V_t_Fisher = tuple(sorted(V_Fisher))
# V_t_Fisher[:5]

In [None]:
# no_unigram_score = {v for v in Vs if v not in V_Fisher}
# len(no_unigram_score)
# assert len(no_unigram_score) == 0, f"Vs without a unigram score = \n\t{no_unigram_score}"

In [None]:
# random_v = choice(tuple(Vs)); random_v

In [None]:
# pV_unigram[random_v]

In [None]:
# !free -h

# Add probability annotations

In [90]:
lm_str_p_unigram = 'p' + '(W = w*)'
lm_str_h_unigram = 'h' + '(W = w*)'

In [91]:
order_from_c
context_size
direction_from_c

2

1

'following'

In [92]:
ctxt_str_map = {'preceding':f'C = w_-{context_size}^-1',
                'following':f'C = w_+1^+{context_size}',
                '(NA)':''}
ctxt_str = ctxt_str_map[direction_from_c]
ctxt_str

'C = w_+1^+1'

In [93]:
source_str = 'W = w*'
target_str = "W' = w*"

In [94]:
param_str = f' ; 𝛼={my_alpha}, λ={my_lambda}'
param_str

' ; 𝛼=0.001, λ=1.0'

In [95]:
if ctxt_str != '':
    no_noise_body_str = f"{source_str} | {ctxt_str}"
    no_noise_body_str
else:
    no_noise_body_str = f"{source_str}"
    no_noise_body_str

if ctxt_str != '':
    noise_body_str = f"{target_str} | {source_str}, {ctxt_str}" + param_str
    noise_body_str
else:
    noise_body_str = f"{target_str} | {source_str}" + param_str
    noise_body_str

'W = w* | C = w_+1^+1'

"W' = w* | W = w*, C = w_+1^+1 ; 𝛼=0.001, λ=1.0"

In [96]:
lm_str_p = 'p' + '(' + no_noise_body_str + ')'
lm_str_p
lm_str_h = 'h' + '(' + no_noise_body_str + ')'
lm_str_h
lm_str_h_bar = 'h-bar' + '(' + no_noise_body_str + ')'
lm_str_h_bar

'p(W = w* | C = w_+1^+1)'

'h(W = w* | C = w_+1^+1)'

'h-bar(W = w* | C = w_+1^+1)'

In [97]:
post_str_p = 'p' + '(' + noise_body_str + ')'
post_str_p
post_str_h = 'h' + '(' + noise_body_str + ')'
post_str_h
post_str_h_bar = 'h-bar' + '(' + noise_body_str + ')'
post_str_h_bar

"p(W' = w* | W = w*, C = w_+1^+1 ; 𝛼=0.001, λ=1.0)"

"h(W' = w* | W = w*, C = w_+1^+1 ; 𝛼=0.001, λ=1.0)"

"h-bar(W' = w* | W = w*, C = w_+1^+1 ; 𝛼=0.001, λ=1.0)"

In [98]:
# ctxt_str_map = {'preceding':f'p(w*|w_-{context_size}^-1)',
#                 'following':f'p(w*|w_+1^+{context_size})'}
# p_annotation_str = ctxt_str_map[direction_from_c]
# p_annotation_str
# h_annotation_str = 'h' + p_annotation_str[1:]
# h_annotation_str

In [99]:
def idx(element, collection):
    return collection.index(element)

In [100]:
orth_field = 'orthographic_wordform' if 'buckeye' in a else 'speech'

In [101]:
def annotate(rel):
    if C_fp != '':
        my_c = rel[context_field]
    else:
        my_c = ''
    
    if segWord_field not in rel:
        return rel
    
    my_w = rel[segWord_field]
    
    modelable_c = my_c in Cs
    modelable_w = my_w in Ws
    
#     my_v = rel[orth_field]
#     if my_v not in pV_unigram:
#         rel[lm_str_p_unigram] = 'bad_v'
#         rel[lm_str_h_unigram] = 'bad_v'
#     else:
#         rel[lm_str_p_unigram] = pV_unigram[my_v]
#         rel[lm_str_h_unigram] = -1.0 * float(np.log2(pV_unigram[my_v]))
    
    result = []
    if not modelable_c:
        result = ['bad_c']
    if not modelable_w:
        result += ['bad_w']
    
    if len(result) > 0:
        result = str_join(',', result)
        rel[lm_str_p] = result
        rel[lm_str_h] = result
        rel[lm_str_h_bar] = result
        rel[post_str_p] = result
        rel[post_str_h] = result
        rel[post_str_h_bar] = result
        return rel
    
    c_idx = idx(my_c, Cs_t)
    w_idx = idx(my_w, Ws_t)
    
    lm_result = pW_C[w_idx, c_idx]
#     lm_bar_result = #FIXME
    post_result = pW_WC[w_idx, c_idx]
#     post_bar_result = #FIXME
    
    if lm_result == np.inf:
        lm_result = float(np.nan)
        rel[lm_str_p] = lm_result
        rel[lm_str_h] = lm_result
    elif lm_result < 0.0:
        raise Exception(f"lm result < 0!\nctxt:{my_c}\nmy_w:{my_w}\nlm result:{lm_result}\n\nNB args:\n{arg_bundle}")
    elif lm_result > 1.0 and not np.allclose(np.array([lm_result - 1]), np.zeros(np.array([lm_result - 1]).shape), atol=1e-06):
#         lm_deviation = np.array([lm_result - 1])
#         np.allclose(lm_deviation, np.zeros(lm_deviation.shape), atol=1e-06)
        raise Exception(f"lm result > 1!\nctxt:{my_c}\nmy_w:{my_w}\nlm result:{lm_result}\nNB args:\n{arg_bundle}")
    else:
        rel[lm_str_p] = lm_result
        rel[lm_str_h] = -1.0 * float(np.log2(lm_result))
#         rel[lm_str_h_bar] = #FIXME

    if post_result == np.inf:
        post_result = float(np.nan)
        rel[post_str_p] = post_result
        rel[post_str_h] = post_result
    elif post_result < 0.0:
        raise Exception(f"post result < 0!\nctxt:{my_c}\nmy_w:{my_w}\nlm result:{lm_result}\npost_result{post_result}\n\nNB args:\n{arg_bundle}")
    elif post_result > 1.0 and not np.allclose(np.array([post_result - 1]), np.zeros(np.array([post_result - 1]).shape), atol=1e-06):
#         post_deviation = np.array([post_result - 1])
#         np.allclose(post_deviation, np.zeros(post_deviation.shape), atol=1e-06)
        raise Exception(f"post result > 1!\nctxt:{my_c}\nmy_w:{my_w}\nlm result:{lm_result}\npost_result{post_result}\nNB args:\n{arg_bundle}")
    else:
        rel[post_str_p] = float(post_result)
        rel[post_str_h] = -1.0 * float(np.log2(post_result))
#         rels[post_str_h_bar] = #FIXME
    
    return rel

In [102]:
partially_updated_word_analysis_relation = lmap(annotate, word_analysis_relation)

In [103]:
from statistics import mean

In [104]:
allSegWords = lpluck('segWord', partially_updated_word_analysis_relation)
len(allSegWords)
allSegWords = set(allSegWords)
len(allSegWords)
if None in allSegWords:
    print("Removing 'None'...")
    allSegWords.remove(None)
    len(allSegWords)

44127

4671

Removing 'None'...


4670

In [105]:
segWord_to_lm_probs = dict()
segWord_to_post_probs = dict()

def avg_prob_measure_update(rel):
    #NB: STATEFUL AF!
    my_w = rel['segWord']
    if my_w in segWord_to_lm_probs and type(rel[lm_str_h]) != type("foo"):
        my_update = {rel[lm_str_h]}
        segWord_to_lm_probs[my_w] = merge(segWord_to_lm_probs[my_w], my_update)
    elif my_w not in segWord_to_lm_probs and type(rel[lm_str_h]) != type("foo"):
        segWord_to_lm_probs[my_w] = {rel[lm_str_h]}
#     else:
#         print('')
    
    if my_w in segWord_to_post_probs and type(rel[post_str_h]) != type("foo"):
        my_update = {rel[post_str_h]}
        segWord_to_post_probs[my_w] = merge(segWord_to_post_probs[my_w], my_update)
    elif my_w not in segWord_to_post_probs and type(rel[post_str_h]) != type("foo"):
        segWord_to_post_probs[my_w] = {rel[post_str_h]}
#     else:
#         print('')

for rel in tqdm(partially_updated_word_analysis_relation):
    avg_prob_measure_update(rel)

100%|██████████| 44127/44127 [00:00<00:00, 221085.89it/s]


In [106]:
avg_prob_measure_lookup = dict()

def avg_prob_measure_lookup_update(segWord):
    #NB: STATEFUL AF!
    my_w = segWord
    if my_w in avg_prob_measure_lookup:
        raise Exception('This function should only be called once for each segWord.')
    
    lm_probs = segWord_to_lm_probs.get(my_w, set())
#     lm_probs = segWord_to_lm_probs[my_w]
    post_probs = segWord_to_post_probs.get(my_w, set())
#     post_probs = segWord_to_post_probs[my_w]
    
    n_contexts_with_w_and_lm_prob = len(lm_probs)
    n_contexts_with_w_and_post_prob = len(post_probs)
    
    lm_bar_result = sum(lm_probs) / n_contexts_with_w_and_lm_prob if n_contexts_with_w_and_lm_prob != 0 else 'bad_w'
    post_bar_result = sum(post_probs) / n_contexts_with_w_and_post_prob if n_contexts_with_w_and_post_prob != 0 else 'bad_w'
    
    total_result = {lm_str_h_bar:lm_bar_result,
                    post_str_h_bar:post_bar_result}
    avg_prob_measure_lookup[my_w] = total_result
    
for segWord in tqdm(allSegWords):
    avg_prob_measure_lookup_update(segWord)

100%|██████████| 4670/4670 [00:00<00:00, 382364.76it/s]


In [107]:
def annotate_avg_prob_measures(rel):
    my_w = rel[segWord_field]
    
    my_results = avg_prob_measure_lookup.get(my_w, 'bad_w')
    
    if my_results == 'bad_w':
        rel[lm_str_h_bar] = my_results
        rel[post_str_h_bar] = my_results
        return rel
    elif my_results == np.nan:
        rel[lm_str_h_bar] = my_results
        rel[post_str_h_bar] = my_results
        return rel
        
    rel[lm_str_h_bar] = my_results[lm_str_h_bar]
    rel[post_str_h_bar] = my_results[post_str_h_bar]
    
    return rel
    
    
# straightforward, but extremely inefficient code below...
    
# def get_rels_with(key_segword, rels):
#     return lfilter(lambda rel:rel['segWord'] == key_segword, rels)

# def calc_avg_prob_measures(segWord, rels):
#     my_w = segWord
#     modelable_w = my_w in Ws
    
#     result = []
#     if not modelable_w:
#         result += ['bad_w']
    
#     if len(result) > 0:
#         result = str_join(',', result)
#         return result
    
# #     c_idx = idx(my_c, Cs_t)
# #     w_idx = idx(my_w, Ws_t)
#     all_rels_with_w = get_rels_with(my_w, rels)
#     has_a_calculated_lm_prob = lfilter(lambda r: type(r[lm_str_h]) != type("foo"), 
#                                        all_rels_with_w)
#     lm_probs = lpluck(lm_str_h, has_a_calculated_lm_prob)
#     has_a_calculated_post_prob = lfilter(lambda r: type(r[post_str_h]) != type("foo"),
#                                          all_rels_with_w)
#     post_probs = lpluck(post_str_h, has_a_calculated_post_prob)
#     n_contexts_with_w_and_lm_prob = len(has_a_calculated_lm_prob)
#     n_contexts_with_w_and_post_prob = len(has_a_calculated_post_prob)
    
# #     lm_result = pW_C[w_idx, c_idx]
#     lm_bar_result = sum(lm_probs) / n_contexts_with_w_and_lm_prob if n_contexts_with_w_and_lm_prob != 0 else 'bad_w'
# #     post_result = pW_WC[w_idx, c_idx]
#     post_bar_result = sum(post_probs) / n_contexts_with_w_and_post_prob if n_contexts_with_w_and_post_prob != 0 else 'bad_w'
    
# ##     rel[lm_str_p] = lm_result
# ##     rel[lm_str_h] = -1.0 * float(np.log2(lm_result))
# #     rel[lm_str_h_bar] = lm_bar_result
# ##     rel[post_str_p] = float(post_result)
# #     rel[post_str_h] = -1.0 * float(np.log2(post_result))
# #     rels[post_str_h_bar] = post_bar_result
# #     return lm_bar_result, post_bar_result
#     return {lm_str_h_bar:lm_bar_result,
#             post_str_h_bar:post_bar_result}

# def calc_avg_prob_measures_helper(segWord, rels):
#     return {segWord:calc_avg_prob_measures(segWord, rels)}

# # avg_prob_measure_lookup = dict(par(delayed(calc_avg_prob_measures_helper)(sw, partially_updated_word_analysis_relation)
# #                               for sw in allSegWords))

# #31 segwords/s on kotoba = waaaaaaaay too slow
# # avg_prob_measure_lookup = dict([calc_avg_prob_measures_helper(sw, partially_updated_word_analysis_relation)
# #                                 for sw in tqdm(allSegWords)])
# # avg_prob_measure_lookup = dict(lmap(partial(calc_avg_prob_measures_helper, rels=partially_updated_word_analysis_relation),
# #                                allSegWords))

# #inefficient to map over all rels because of unused subcomputations
# def annotate_avg_prob_measures(rel, rels):
# #     my_c = rel[context_field]
#     my_w = rel[segWord_field]
# #     modelable_c = my_c in Cs
#     modelable_w = my_w in Ws
    
#     result = []
# #     if not modelable_c:
# #         result = ['bad_c']
#     if not modelable_w:
#         result += ['bad_w']
    
#     if len(result) > 0:
#         result = str_join(',', result)
# #         rel[lm_str_p] = result
# #         rel[lm_str_h] = result
#         rel[lm_str_h_bar] = result
# #         rel[post_str_p] = result
# #         rel[post_str_h] = result
#         rel[post_str_h_bar] = result
#         return rel
    
# #     c_idx = idx(my_c, Cs_t)
# #     w_idx = idx(my_w, Ws_t)
#     all_rels_with_w = get_rels_with(my_w, rels)
#     has_a_calculated_lm_prob = lfilter(lambda r: type(r[lm_str_h]) != type("foo"), 
#                                        all_rels_with_w)
#     lm_probs = lpluck(lm_str_h, has_a_calculated_lm_prob)
#     has_a_calculated_post_prob = lfilter(lambda r: type(r[post_str_h]) != type("foo"),
#                                          all_rels_with_w)
#     post_probs = lpluck(post_str_h, has_a_calculated_post_prob)
#     n_contexts_with_w_and_lm_prob = len(has_a_calculated_lm_prob)
#     n_contexts_with_w_and_post_prob = len(has_a_calculated_post_prob)
    
    
# #     lm_result = pW_C[w_idx, c_idx]
#     lm_bar_result = sum(lm_probs) / n_contexts_with_w_and_lm_prob if n_contexts_with_w_and_lm_prob != 0 else 'bad_w'
# #     post_result = pW_WC[w_idx, c_idx]
#     post_bar_result = sum(post_probs) / n_contexts_with_w_and_post_prob if n_contexts_with_w_and_post_prob != 0 else 'bad_w'
    
# #     rel[lm_str_p] = lm_result
# #     rel[lm_str_h] = -1.0 * float(np.log2(lm_result))
#     rel[lm_str_h_bar] = lm_bar_result
# #     rel[post_str_p] = float(post_result)
# #     rel[post_str_h] = -1.0 * float(np.log2(post_result))
#     rels[post_str_h_bar] = post_bar_result
#     return rel

In [108]:
updated_word_analysis_relation = lmap(annotate_avg_prob_measures, 
                                      partially_updated_word_analysis_relation)
# updated_word_analysis_relation = lmap(partial(annotate_avg_prob_measures, rels=partially_updated_word_analysis_relation), 
#                                       partially_updated_word_analysis_relation)

In [109]:
updated_word_analysis_relation[0]

{'orthographic_wordform': 'set',
 'orthographic_wordform_length': 3,
 'preceding_4_wordforms': '<rem> i since i',
 'preceding_3_wordforms': 'i since i',
 'preceding_2_wordforms': 'since i',
 'preceding_1_wordforms': 'i',
 'following_1_wordforms': 'it',
 'following_2_wordforms': "it it's",
 'following_3_wordforms': "it it's okay",
 'following_4_wordforms': "it it's okay <s>",
 'preceding_wordforms': '<s> because it slipped <rem> i since i',
 'following_wordforms': "it it's okay <s>",
 'bidirectional_context': ['<s> because it slipped <rem> i since i',
  "it it's okay <s>"],
 'POS': 'VBD',
 'isAdj': False,
 'isAdv': False,
 'isN': False,
 'isV': True,
 'phonemes': 's.ɛ.t',
 'phones': 's.ɛ.ɾ',
 'phonemes_length': 3,
 'phones_length': 3,
 'hasSyllabicSegsInPhones': True,
 'hasAdjacentPauseOrDisfluency': False,
 'hasAdjacentFilledPause': False,
 'hasClitic': False,
 'syllables': 1,
 'beg': 45.022068,
 'end': 45.243621,
 'duration': 0.2215530000000001,
 'misalgined': False,
 'track_name': 's

In [110]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            62G        5.4G         23G         12M         33G         56G
Swap:          2.0G        3.0M        2.0G


In [111]:
len(updated_word_analysis_relation)
has_no_result_post_str_p = lfilter(lambda rel: type(rel[post_str_p]) == str,
                                   updated_word_analysis_relation)
len(has_no_result_post_str_p)

44127

4660

In [None]:
# len(updated_word_analysis_relation)
# has_no_result_lm_str_p_unigram = lfilter(lambda rel: type(rel[lm_str_p_unigram]) == str,
#                                          updated_word_analysis_relation)
# len(has_no_result_lm_str_p_unigram)

In [None]:
%pwd

In [None]:
o

# Export

In [None]:
exportDict(o, updated_word_analysis_relation)