In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Import-libraries-and-data" data-toc-modified-id="Import-libraries-and-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import libraries and data</a></span></li><li><span><a href="#Basic-representations" data-toc-modified-id="Basic-representations-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Basic representations</a></span></li><li><span><a href="#Edit-distance-calculation-&amp;-sandbox" data-toc-modified-id="Edit-distance-calculation-&amp;-sandbox-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Edit distance calculation &amp; sandbox</a></span></li><li><span><a href="#Actual-computation" data-toc-modified-id="Actual-computation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Actual computation</a></span></li><li><span><a href="#Convert-distances-to-a-numpy-matrix" data-toc-modified-id="Convert-distances-to-a-numpy-matrix-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Convert distances to a numpy matrix</a></span></li><li><span><a href="#Convert-distances-to-a-dictionary" data-toc-modified-id="Convert-distances-to-a-dictionary-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Convert distances to a dictionary</a></span></li><li><span><a href="#Calculate-neighborhood-sizes-and-densities" data-toc-modified-id="Calculate-neighborhood-sizes-and-densities-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Calculate neighborhood sizes and densities</a></span><ul class="toc-item"><li><span><a href="#Identify-neighbors" data-toc-modified-id="Identify-neighbors-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>Identify neighbors</a></span></li><li><span><a href="#Calculate-neighborhood-size" data-toc-modified-id="Calculate-neighborhood-size-8.2"><span class="toc-item-num">8.2&nbsp;&nbsp;</span>Calculate neighborhood size</a></span></li><li><span><a href="#Weight-by-probability" data-toc-modified-id="Weight-by-probability-8.3"><span class="toc-item-num">8.3&nbsp;&nbsp;</span>Weight by probability</a></span></li></ul></li><li><span><a href="#Export" data-toc-modified-id="Export-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Export</a></span><ul class="toc-item"><li><span><a href="#Levenshtein-distances" data-toc-modified-id="Levenshtein-distances-9.1"><span class="toc-item-num">9.1&nbsp;&nbsp;</span>Levenshtein distances</a></span></li><li><span><a href="#Neighbors" data-toc-modified-id="Neighbors-9.2"><span class="toc-item-num">9.2&nbsp;&nbsp;</span>Neighbors</a></span></li><li><span><a href="#Neighborhood-size" data-toc-modified-id="Neighborhood-size-9.3"><span class="toc-item-num">9.3&nbsp;&nbsp;</span>Neighborhood size</a></span></li><li><span><a href="#Neighborhood-density" data-toc-modified-id="Neighborhood-density-9.4"><span class="toc-item-num">9.4&nbsp;&nbsp;</span>Neighborhood density</a></span></li></ul></li></ul></div>

# Motivation

This notebook calculates 
 - the set of all pairwise Levenshtein distances between pairs of segmental wordforms.
 - unigram probability weighted neighborhood distance.

# Import libraries and data

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
from probdist import *
from string_utils import *

In [5]:
from funcy import *

In [6]:
from tqdm import tqdm

from joblib import Parallel, delayed

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [7]:
# import toolz

In [8]:
# import dask.multiprocessing
# dask.config.set(scheduler='processes')

In [9]:
# # import dask
# from dask.distributed import Client
# client = Client('172.21.47.67:8786')

In [10]:
import editdistance as lev

In [11]:
import numpy as np

In [12]:
# import sparse

In [14]:
# Parameters

p = ''
p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.pW_V.json'
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'

# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.json' 
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json'

u = ''
u = 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'
# u = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'

o = ''
o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered'
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered'
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim'

# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_X0X1X2'

# g = ''
# # g = 'False'

In [15]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        9.8G        142G        1.7M         36G        177G
Swap:          2.0G        110M        1.9G


In [16]:
if 'pW_V' in p:
    pW_V = condDistsAsProbDists(importProbDist(p))
elif 'pX0X1X2' in p:
    pW = ProbDist(importProbDist(p))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

In [17]:
if 'pV' in u:
    pV = ProbDist(importProbDist(u))
elif 'pX0X1X2' in u:
    pV = pW
else:
    raise Exception(f"Unknown type of 'u' parameter = {u}")

In [18]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        9.8G        142G        1.7M         36G        177G
Swap:          2.0G        110M        1.9G


In [19]:
# testing = False
# benchmark = False

In [20]:
my_dtype = np.int64

# Basic representations

In [21]:
if 'pW_V' in p:
    # Vs = set(pW_V.keys())
    Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                         pW_V).values())
elif 'pX0X1X2' in p:
    Ws = set(conditions(pW))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

# len(Vs)
len(Ws)

6823

In [22]:
Ws_t = tuple(sorted(list(Ws)))

In [23]:
# #≈200s on CMU on solomonoff
# Ps = union(list(par(delayed(getPrefixes)(w) for w in Ws)))
# # Ps = union(par(delayed(getPrefixes)(w) for w in Ws))
# # Ps = union([getPrefixes(w) for w in Ws])
# Ps_t = tuple(sorted(list(Ps)))
# len(Ps_t)

In [24]:
Ws_tt = lmap(ds2t, Ws_t)
# Ps_tt = lmap(ds2t, Ps_t)

Dask data structures:

In [25]:
# import dask.bag as db

In [26]:
# Ws_ttb = db.from_sequence(Ws_tt)

In [27]:
# Ps_ttb = db.from_sequence(Ps_tt)

In [28]:
if 'pW_V' in p:
#     Vs = set(pV.keys())
    my_Vs = set(pW_V.keys())
    
    missing_from_prior = {v for v in my_Vs if v not in pV}
    len(missing_from_prior)
    assert len(missing_from_prior) == 0
    
    missing_from_conditions = {v for v in pV if v not in pW_V}
    len(missing_from_conditions)
    
    pV_trim = ProbDist({v:pV[v] for v in my_Vs})
    assert all(v in pW_V for v in pV_trim)
    
    pW = MarginalProbDist(pW_V, pV_trim)
#     pW = ProbDist({w:sum(pV_trim[v] * pW_V[v][w]
#                      for v in pV_trim)
#                    for w in Ws_t})

0

37562

In [29]:
pW_np = distToNP(pW)
pW_np.shape

(6823,)

# Edit distance calculation & sandbox

In [30]:
def lev_helper_wrapper(uv_idx_pair, prefixes=True, distributed=False):
    return lev_helper(uv_idx_pair[0], uv_idx_pair[1], prefixes=prefixes, distributed=distributed)

def lev_helper(u_idx, v_idx, prefixes=True, distributed=False):
    if prefixes:
        u = Ps_tt[u_idx]
        v = Ps_tt[v_idx]
    else:
        u = Ws_tt[u_idx]
        v = Ws_tt[v_idx]
    return (u_idx, v_idx, lev.eval(u,v))

def lev_dist(uv_t_pair, final_func = None):
    if final_func is None:
        return (t2ds(uv_t_pair[0]), t2ds(uv_t_pair[1]), lev.eval(uv_t_pair[0], uv_t_pair[1]))
    else:
        result = (t2ds(uv_t_pair[0]), t2ds(uv_t_pair[1]), lev.eval(uv_t_pair[0], uv_t_pair[1]))
        return final_func(result)
#         return (final_func(result[0]), final_func(result[1]), result[2])

In [31]:
# lev_helper(3, 20, True)
# Ps_t[3]
# Ps_t[20]
# lev_dist((Ps_tt[3], Ps_tt[20]))

lev_helper(3, 20, False)
Ws_t[3]
Ws_t[20]

lev_dist((Ws_tt[3], Ws_tt[20]))

(3, 20, 3)

'⋊.aɪ.d.i.ʌ.z.⋉.⋉'

'⋊.aɪ.v.i.⋉.⋉'

('⋊.aɪ.d.i.ʌ.z.⋉.⋉', '⋊.aɪ.v.i.⋉.⋉', 3)

In [32]:
# L_d_np_P_updates = par(delayed(lev_helper)(u_idx, v_idx) for u_idx, v_idx in list(product(np.arange(len(Ps_t)), np.arange(len(Ps_t)))))

In [33]:
# my_prefixes = Ps_tt[:1000]
# my_prefix_pairs = list(product(my_prefixes, my_prefixes))
# "{:,}".format(len(my_prefix_pairs))

In [34]:
# Client?

In [35]:
# client.submit?

In [36]:
# foo = client.submit(lev_dist, my_prefix_pairs[23])

In [37]:
# foo.result()

In [38]:
# test_results_futures = client.map(lev_dist, my_prefix_pairs)

In [39]:
# test_results_futures[-1]

In [40]:
# test_results_gathered = client.gather(test_results_futures)

In [41]:
# joblib_only = par(delayed(lev_dist)(pair) for pair in my_prefix_pairs)

In [42]:
# my_prefixes_b = db.from_sequence(my_prefixes)
# my_prefix_pairs = db.from_sequence(my_prefix_pairs)

In [43]:
# dask_bag_mp = my_prefix_pairs.map(lev_dist)

In [44]:
# dask_bag_mpl = list(dask_bag_mp)

In [45]:
# dask_bag_mpl[0]

In [46]:
# test_results_gathered

In [47]:
# my_prefix_range = np.arange(100)

In [48]:
# my_prefix_range_pairs = list(product(my_prefix_range, my_prefix_range))

In [49]:
# test_results = client.map(lev_helper_wrapper, my_prefix_range_pairs)

In [50]:
# client.gather(test_results)

# Actual computation

In [51]:
from itertools import combinations

In [52]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        9.8G        142G        1.7M         36G        177G
Swap:          2.0G        110M        1.9G


In [53]:
# prefix_pairs = list(combinations(Ps_tt, 2)) #slower
# prefix_pairs = combinations(Ps_tt, 2) #faster

In [54]:
wordform_pairs = combinations(Ws_tt, 2)

In [55]:
# prefix_pairs = product(Ps_tt, Ps_tt)
# word_pairs = product(Ws_tt, Ws_tt)

In [56]:
from scipy.special import binom

In [57]:
len(Ws_t)

6823

In [58]:
"{:.2E}".format(binom(len(Ws_t), 2))
"{:,}".format(binom(len(Ws_t), 2))
# "{:.2E}".format(binom(len(Ps_t), 2))
# "{:,}".format(binom(len(Ps_t), 2))

'2.33E+07'

'23,273,253.0'

In [59]:
"{:.2E}".format(len(Ws_t) ** 2)
"{:,}".format(len(Ws_t) ** 2)
# "{:.2E}".format(len(Ps_t) ** 2)
# "{:,}".format(len(Ps_t) ** 2)

'4.66E+07'

'46,553,329'

In [60]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        9.8G        142G        1.7M         36G        177G
Swap:          2.0G        110M        1.9G


In [61]:
#5-6m wittgenstein + buckeye
#70m wittgenstein + nxt_swbd; about the same on sidious + nxt_swbd
distinct_wordform_distances = par(delayed(lev_dist)(pair) for pair in wordform_pairs)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0100s.) Setting batch_size=38.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0494s.) Setting batch_size=306.
[Parallel(n_jobs=-1)]: Done 710 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1356 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1137s.) Setting batch_size=1076.
[Parallel(n_jobs=-1)]: Done 2346 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 5212 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 11370 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1856

[Parallel(n_jobs=-1)]: Done 8967120 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 9037598 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 9109152 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 9180706 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 9253336 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 9325966 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 9399672 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 9473378 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 9548160 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 9622942 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 9698800 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 9774658 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 9851592 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 9928526 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 10006536 tasks      | elapsed:  2.6min
[Parallel

[Parallel(n_jobs=-1)]: Done 21255830 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 21392876 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0273s.) Setting batch_size=273.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1898s.) Setting batch_size=574.
[Parallel(n_jobs=-1)]: Done 21497435 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0038s.) Setting batch_size=287.
[Parallel(n_jobs=-1)]: Done 21629196 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1935s.) Setting batch_size=592.
[Parallel(n_jobs=-1)]: Done 21712446 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 21863406 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 22015550 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 22167694 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 22321022 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 22474350 tasks      | elapsed

In [62]:
identity = [(w,w,0) for w in Ws_t]

In [63]:
mirror = [(v,u,d) for u,v,d in tqdm(distinct_wordform_distances)]

100%|██████████| 23273253/23273253 [00:13<00:00, 1781451.66it/s]


In [64]:
wordform_distances = distinct_wordform_distances + identity + mirror

In [65]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         20G        132G        1.7M         36G        167G
Swap:          2.0G        110M        1.9G


In [65]:
# prefix_distances = results

In [66]:
# def is_wordform_distance(prefix_distance_tuple):
#     p0 = prefix_distance_tuple[0]
#     p1 = prefix_distance_tuple[1]
#     return p0 in Ws_t and p1 in Ws_t

In [67]:
# wordform_distances = {d for d in tqdm(prefix_distances) if is_wordform_distance(d)}
# wordform_distances = lfilter(is_wordform_distance, 
#                              prefix_distances)

In [68]:
# !free -h

In [69]:
# from toolz.sandbox.parallel import fold as parfold

In [70]:
# def parfilter(pred, seq, combine):
    
#     def keep_only_matches(acc, nxt):
#         if pred(nxt):
#             return combine(acc, nxt)
#         return acc
    
#     return parfold(keep_only_matches, seq)

# Convert distances to a numpy matrix

In [66]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         20G        132G        1.7M         36G        167G
Swap:          2.0G        110M        1.9G


In [67]:
def to_idx_rep(distance_triple, add_mirror=False):
    t = distance_triple
    if not add_mirror:
        return (Ws_t.index(t[0]), Ws_t.index(t[1]), t[2])
    else:
        return {(Ws_t.index(t[0]), Ws_t.index(t[1]), t[2]),
                (Ws_t.index(t[1]), Ws_t.index(t[0]), t[2])}

In [68]:
#8.5m wittgenstein + buckeye
#70.9m sidious + wittgenstein
wordform_distance_idx_rep = join(par(delayed(to_idx_rep)(d, True) for d in distinct_wordform_distances + identity))

# wordform_distance_idx_rep = par(delayed(to_idx_rep)(d) for d in distinct_wordform_distances + identity)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0071s.) Setting batch_size=56.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0555s.) Setting batch_size=402.
[Parallel(n_jobs=-1)]: Done 1016 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1968 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 3032 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 6864 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0511s.) Setting batch_size=201.
[Parallel(n_jobs=-1)]: Done 15306 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 2374

[Parallel(n_jobs=-1)]: Done 1152650 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1187670 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0619s.) Setting batch_size=206.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1866s.) Setting batch_size=440.
[Parallel(n_jobs=-1)]: Done 1222278 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1253072 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1292232 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1331392 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1060s.) Setting batch_size=220.
[Parallel(n_jobs=-1)]: Done 1371432 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1740s.) Setting batch_size=504.
[Parallel(n_jobs=-1)]: Done 1403972 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1450844 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1497716 tasks      | elapsed:  1.9min


[Parallel(n_jobs=-1)]: Done 5843915 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.6477s.) Setting batch_size=420.
[Parallel(n_jobs=-1)]: Batch computation too slow (6.2014s.) Setting batch_size=210.
[Parallel(n_jobs=-1)]: Done 5965505 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1928s.) Setting batch_size=434.
[Parallel(n_jobs=-1)]: Done 6017053 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 6091267 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 6166349 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 6241431 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 6317381 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 6393331 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.8735s.) Setting batch_size=217.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1766s.) Setting batch_size=490.
[Parallel(n_jobs=-1)]: Done 6463954 tasks      

[Parallel(n_jobs=-1)]: Done 14005443 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 14061991 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 14118539 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 14174454 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 14230791 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 14287128 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 14343887 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.3116s.) Setting batch_size=105.
[Parallel(n_jobs=-1)]: Done 14400646 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1283s.) Setting batch_size=326.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1995s.) Setting batch_size=652.
[Parallel(n_jobs=-1)]: Done 14513231 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 14689923 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 14867919 tasks      | elapsed:  9.1min
[Paralle

[Parallel(n_jobs=-1)]: Done 23176686 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 23256508 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 23280076 out of 23280076 | elapsed: 13.6min finished


In [69]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         35G        117G        1.7M         36G        152G
Swap:          2.0G        110M        1.9G


In [75]:
# wordform_distance_idx_rep = sorted(wordform_distance_idx_rep, key=lambda triple: triple[0])

In [70]:
#4m sidious + nxt_swbd
wordform_distance_idx_rep_grouped = group_by(lambda triple: triple[0], wordform_distance_idx_rep)

In [71]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         35G        116G        1.7M         36G        151G
Swap:          2.0G        110M        1.9G


In [72]:
def group_to_update_vecs(grouped_idx_rep_triples):
    row_idx = grouped_idx_rep_triples[0][0]
    
    sorted_by_col_idx = sorted(grouped_idx_rep_triples, key=second)
    
    col_idxs = np.array(lmap(second, sorted_by_col_idx))
    vals = np.array(lmap(lambda triple: triple[2], sorted_by_col_idx))
    return (row_idx, col_idxs, vals)

In [73]:
#<1m wittgenstein + buckeye
#5m sidious + nxt_swbd
group_update_vecs = par(delayed(group_to_update_vecs)(group) for key, group in wordform_distance_idx_rep_grouped.items())

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0634s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.3310s.) Setting batch_size=3.
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 382 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Batch computation too slow (4.4812s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done 472 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 535 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 598 tasks      

In [74]:
group_update_vecs = sorted(group_update_vecs, key=lambda triple: triple[0])

In [75]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         36G        116G        1.7M         36G        150G
Swap:          2.0G        110M        1.9G


In [76]:
updated_row_idxs = set(lmap(first, 
                            group_update_vecs))

In [77]:
missing_row_idxs = [i for i in range(len(Ws_t)) if i not in updated_row_idxs]
len(missing_row_idxs)
missing_row_idxs

0

[]

In [78]:
Ws_t[6403]

'⋊.ʃ.i.d.⋉.⋉'

In [79]:
col_idx_mat = np.array(lmap(second, group_update_vecs))

In [80]:
val_mat = np.array(lmap(lambda triple: triple[2],
                        group_update_vecs))

In [81]:
col_idx_mat = np.array(col_idx_mat)
val_mat = np.array(val_mat)

In [82]:
col_idx_mat.shape
val_mat.shape

(6823, 6823)

(6823, 6823)

In [83]:
col_idx_mat[:10]
set(lmap(lambda a: a.shape, col_idx_mat))

array([[   0,    1,    2, ..., 6820, 6821, 6822],
       [   0,    1,    2, ..., 6820, 6821, 6822],
       [   0,    1,    2, ..., 6820, 6821, 6822],
       ...,
       [   0,    1,    2, ..., 6820, 6821, 6822],
       [   0,    1,    2, ..., 6820, 6821, 6822],
       [   0,    1,    2, ..., 6820, 6821, 6822]])

{(6823,)}

In [84]:
np.vstack(col_idx_mat)#[:10]

array([[   0,    1,    2, ..., 6820, 6821, 6822],
       [   0,    1,    2, ..., 6820, 6821, 6822],
       [   0,    1,    2, ..., 6820, 6821, 6822],
       ...,
       [   0,    1,    2, ..., 6820, 6821, 6822],
       [   0,    1,    2, ..., 6820, 6821, 6822],
       [   0,    1,    2, ..., 6820, 6821, 6822]])

In [85]:
val_mat[:10]

array([[ 0,  3,  6, ...,  7,  7,  8],
       [ 3,  0,  5, ...,  5,  5,  6],
       [ 6,  5,  0, ...,  9,  9, 10],
       ...,
       [ 6,  4,  8, ...,  4,  4,  2],
       [ 6,  5,  8, ...,  5,  4,  6],
       [ 5,  4,  8, ...,  4,  4,  5]])

In [86]:
val_mat.shape

(6823, 6823)

In [87]:
L_d_np = val_mat

In [88]:
# random_rows = choices(np.arange(len(Ws_t)),k=1000)
# random_cols = choices(np.arange(len(Ws_t)),k=1000)
# random_pairs = lzip(random_rows, random_cols)

In [89]:
# for i,j in random_pairs:
#     if val_mat[i,j] != lev_dist((Ws_tt[i], Ws_tt[j]))[2]:
#         print(f'({i},{j},{val_mat[i,j]}) vs. {lev_dist((Ws_tt[i], Ws_tt[j]))}')

In [90]:
# for i,j in tqdm(product(np.arange(len(Ws_t)), np.arange(len(Ws_t))),
#                 total=(len(Ws_t) * len(Ws_t))):
#     if val_mat[i,j] != lev_dist((Ws_tt[i], Ws_tt[j]))[2]:
#         print(f'({i},{j},{val_mat[i,j]}) vs. {lev_dist((Ws_tt[i], Ws_tt[j]))}')

# Convert distances to a dictionary

In [91]:
def row_to_dict_items(row_idx):
    w_left = Ws_t[row_idx]
    row = L_d_np[row_idx]
    return {((w_left, Ws_t[col_idx]), row[col_idx])
            for col_idx in range(len(Ws_t))}

In [92]:
#time to beat is 17m on wittgenstein + buckeye

#4.5m wittgenstein + buckeye
# wordform_distance_dict = dict(join(par(delayed(row_to_dict_items)(row_idx)
#                                        for row_idx in range(len(Ws_t)))))

In [93]:
#'curried' dictionary (= straightforwardly serializable, unlike version just above)

def row_to_curried_dict_items(row_idx):
    w_left = Ws_t[row_idx]
    row = L_d_np[row_idx]
    return {w_left:{Ws_t[col_idx]:float(row[col_idx])
                    for col_idx in range(len(Ws_t))}}
#     return {((w_left, Ws_t[col_idx]), row[col_idx])
#             for col_idx in range(len(Ws_t))}

In [94]:
#2m wittgenstein + buckeye
#3.75m sidious + nxt_swbd
wordform_distance_dict = dict(join(par(delayed(row_to_curried_dict_items)(row_idx)
                                       for row_idx in range(len(Ws_t)))))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0462s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 1160 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1344 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]

In [95]:
random_w0 = choice(Ws_t); random_w0
random_w1 = choice(Ws_t); random_w1
wordform_distance_dict[random_w0][random_w1]

'⋊.l.aʊ.n.dʒ.⋉.⋉'

'⋊.k.ʌ.n.ɛ.k.t.ʌ.d.⋉.⋉'

7.0

# Calculate neighborhood sizes and densities

## Identify neighbors

In [96]:
N_np = L_d_np == 1

In [97]:
N_np #neighbor relation

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [98]:
np.sum(N_np, axis=1)
np.mean(np.sum(N_np, axis=1))

array([ 0,  0,  0, ...,  0,  2, 12])

3.1730910156822514

In [99]:
Ws_t[235]
N_np[235].sum()
N_np[235].nonzero()[0]
lmap(lambda w_idx: Ws_t[w_idx], 
    N_np[235].nonzero()[0])

'⋊.b.ɑ.t.ʌ.m.z.⋉.⋉'

1

array([236])

['⋊.b.ɑ.t.ʌ.m.⋉.⋉']

In [100]:
neighbor_dict = {Ws_t[w_idx]:lmap(lambda w_prime_idx: Ws_t[w_prime_idx],
                                  N_np[w_idx].nonzero()[0])
                 for w_idx in tqdm(np.arange(len(Ws_t)), total=len(Ws_t))}

100%|██████████| 6823/6823 [00:00<00:00, 91120.13it/s]


In [101]:
neighbor_dict[Ws_t[235]]

['⋊.b.ɑ.t.ʌ.m.⋉.⋉']

## Calculate neighborhood size

In [102]:
neighborhood_size = np.sum(N_np, axis=1)
neighborhood_size.shape

(6823,)

In [103]:
neighborhood_size[235]

1

In [104]:
neighborhood_size_dict = walk_values(len, neighbor_dict)

In [105]:
neighborhood_size_dict[Ws_t[235]]

1

## Weight by probability

In [106]:
neighborhood_density_dict = walk_values(lambda neighbors: sum(pW[w] for w in neighbors),
                                        neighbor_dict)

In [107]:
neighborhood_density_dict[Ws_t[235]]

3.56256354551401e-05

In [108]:
pW_np.shape

(6823,)

In [109]:
N_np.shape

(6823, 6823)

In [110]:
Nd_np = N_np * pW_np
Nd_np.shape

(6823, 6823)

In [111]:
words_with_nonempty_neighborhoods = lfilter(lambda w: neighborhood_density_dict[w] > 0,
                                            set(neighborhood_density_dict.keys()))

In [112]:
random_word_with_non_empty_neighborhood = choice(words_with_nonempty_neighborhoods)
random_word_with_non_empty_neighborhood
random_word_with_non_empty_neighborhood_idx = Ws_t.index(random_word_with_non_empty_neighborhood)
random_word_with_non_empty_neighborhood_idx

'⋊.ɛ.s.eɪ.⋉.⋉'

5773

In [113]:
neighborhood_density_dict[random_word_with_non_empty_neighborhood]
np.sum(Nd_np[random_word_with_non_empty_neighborhood_idx])
# Nd_np[random_word_with_non_empty_neighborhood_idx].nonzero()

0.0018430687617013541

0.0018430687617013541

In [114]:
Nd_np[3].nonzero()[0]

array([4])

In [115]:
Nd_np[3]
Nd_np[3][4]

neighborhood_density_dict[Ws_t[3]]

array([0., 0., 0., ..., 0., 0., 0.])

0.00026169571984171336

0.00026169571984171336

In [116]:
Ws_t[7]
Nd_np[7].nonzero()[0]
Nd_np[7].sum()

neighborhood_density_dict[Ws_t[7]]

'⋊.aɪ.d.⋉.⋉'

array([  10,   12,   18,   21,   23,   28,  451,  977, 1348, 1526, 3813,
       4589, 5039, 5322, 5469, 5576, 6005])

0.06058063834409145

0.060580638344091456

# Export

In [117]:
%pwd

'/mnt/cube/home/AD/emeinhar/wr'

In [118]:
o

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered'

## Levenshtein distances

In [119]:
o + '_Levenshtein_distances'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_Levenshtein_distances'

In [120]:
# exportDict(o + '_Levenshtein_distances' + '.json', wordform_distance_dict)

In [121]:
np.save(o + '_Levenshtein_distances' + '.npy', L_d_np, allow_pickle=False)

In [122]:
# exportMatrixMetadata?

In [123]:
L_d_md = {'W':{'from fp':p,
               'changes':'sorted',
               'size':len(Ws_t)}
         }
exportMatrixMetadata(o + '_Levenshtein_distances' + '.npy' + '_metadata.json',
                     o + '_Levenshtein_distances' + '.npy',
                     L_d_np,
                     L_d_md,
                     'Step 5d',
                     'Calculate Levenshtein distances and neighborhood density.ipynb',
                     {})

Wrote metadata for 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_Levenshtein_distances.npy
 to 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_Levenshtein_distances.npy_metadata.json


## Neighbors

In [124]:
o + '_neighbors'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighbors'

In [125]:
exportDict(o + '_neighbors' + '.json', neighbor_dict)

In [126]:
np.save(o + '_neighbors' + '.npy', N_np, allow_pickle=False)

In [127]:
N_np_md = {'W':{'from fp':p,
                'changes':'sorted',
                'size':len(Ws_t)}
         }
exportMatrixMetadata(o + '_neighbors' + '.npy' + '_metadata.json',
                     o + '_neighbors' + '.npy',
                     N_np,
                     N_np_md,
                     'Step 5d',
                     'Calculate Levenshtein distances and neighborhood density.ipynb',
                     {})

Wrote metadata for 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighbors.npy
 to 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighbors.npy_metadata.json


## Neighborhood size

In [128]:
o + '_neighborhood_size'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighborhood_size'

In [129]:
exportDict(o + '_neighborhood_size' + '.json', neighborhood_size_dict)

In [130]:
np.save(o + '_neighborhood_size' + '.npy', neighborhood_size, allow_pickle=False)

In [131]:
NS_np_md = {'W':{'from fp':p,
                 'changes':'sorted',
                 'size':len(Ws_t)}
           }
exportMatrixMetadata(o + '_neighbors' + '.npy' + '_metadata.json',
                     o + '_neighbors' + '.npy',
                     neighborhood_size,
                     NS_np_md,
                     'Step 5d',
                     'Calculate Levenshtein distances and neighborhood density.ipynb',
                     {})

Wrote metadata for 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighbors.npy
 to 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighbors.npy_metadata.json


## Neighborhood density

In [132]:
o + '_neighborhood_density'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighborhood_density'

In [133]:
exportDict(o + '_neighborhood_density' + '.json', neighborhood_density_dict)

In [134]:
np.save(o + '_neighborhood_density' + '.npy', Nd_np, allow_pickle=False)

In [135]:
Nd_np_md = {'W':{'from fp':p,
                 'changes':'sorted',
                 'size':len(Ws_t)}
           }
exportMatrixMetadata(o + '_neighborhood_density' + '.npy' + '_metadata.json',
                     o + '_neighborhood_density' + '.npy',
                     Nd_np,
                     Nd_np_md,
                     'Step 5d',
                     'Calculate Levenshtein distances and neighborhood density.ipynb',
                     {})

Wrote metadata for 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighborhood_density.npy
 to 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_neighborhood_density.npy_metadata.json


In [136]:
# L_d_P_md = {'W':{'from fp':p,
#                      'changes':'sorted',
#                      'size':len(Ws_t)},
#                      'P':{'from_fp':p,
#                           'changes':'extracted from W, sorted',
#                           'size':len(Ps_t)}}
# exportMatrixMetadata(o + '_L_d_P' + '.npy' + '_metadata.json',
#                      o + '_L_d_P' + '.npy' + '_metadata.json',
#                      L_d_np_P,
#                      L_d_P_md,
#                      FIXME #'Step 4b',
#                      'Calculate Levenshtein distances and neighborhood density.ipynb',
#                     {'Storage':'file is MEMORY MAPPED.'})