In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Import-libraries-and-data" data-toc-modified-id="Import-libraries-and-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import libraries and data</a></span></li><li><span><a href="#Basic-representations" data-toc-modified-id="Basic-representations-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Basic representations</a></span></li><li><span><a href="#Edit-distance-calculation-&amp;-sandbox" data-toc-modified-id="Edit-distance-calculation-&amp;-sandbox-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Edit distance calculation &amp; sandbox</a></span></li><li><span><a href="#Actual-computation" data-toc-modified-id="Actual-computation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Actual computation</a></span></li><li><span><a href="#Convert-distances-to-a-numpy-matrix" data-toc-modified-id="Convert-distances-to-a-numpy-matrix-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Convert distances to a numpy matrix</a></span></li><li><span><a href="#Convert-distances-to-a-dictionary" data-toc-modified-id="Convert-distances-to-a-dictionary-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Convert distances to a dictionary</a></span></li><li><span><a href="#Calculate-neighborhood-sizes-and-densities" data-toc-modified-id="Calculate-neighborhood-sizes-and-densities-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Calculate neighborhood sizes and densities</a></span><ul class="toc-item"><li><span><a href="#Identify-neighbors" data-toc-modified-id="Identify-neighbors-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>Identify neighbors</a></span></li><li><span><a href="#Calculate-neighborhood-size" data-toc-modified-id="Calculate-neighborhood-size-8.2"><span class="toc-item-num">8.2&nbsp;&nbsp;</span>Calculate neighborhood size</a></span></li><li><span><a href="#Weight-by-probability" data-toc-modified-id="Weight-by-probability-8.3"><span class="toc-item-num">8.3&nbsp;&nbsp;</span>Weight by probability</a></span></li></ul></li><li><span><a href="#Export" data-toc-modified-id="Export-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Export</a></span><ul class="toc-item"><li><span><a href="#Levenshtein-distances" data-toc-modified-id="Levenshtein-distances-9.1"><span class="toc-item-num">9.1&nbsp;&nbsp;</span>Levenshtein distances</a></span></li><li><span><a href="#Neighbors" data-toc-modified-id="Neighbors-9.2"><span class="toc-item-num">9.2&nbsp;&nbsp;</span>Neighbors</a></span></li><li><span><a href="#Neighborhood-size" data-toc-modified-id="Neighborhood-size-9.3"><span class="toc-item-num">9.3&nbsp;&nbsp;</span>Neighborhood size</a></span></li><li><span><a href="#Neighborhood-density" data-toc-modified-id="Neighborhood-density-9.4"><span class="toc-item-num">9.4&nbsp;&nbsp;</span>Neighborhood density</a></span></li></ul></li></ul></div>

# Motivation

This notebook calculates 
 - the set of all pairwise Levenshtein distances between pairs of segmental wordforms.
 - unigram probability weighted neighborhood distance.

# Import libraries and data

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
from probdist import *
from string_utils import *

In [5]:
from funcy import *

In [6]:
from tqdm import tqdm

from joblib import Parallel, delayed

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [7]:
# import toolz

In [8]:
# import dask.multiprocessing
# dask.config.set(scheduler='processes')

In [9]:
# # import dask
# from dask.distributed import Client
# client = Client('172.21.47.67:8786')

In [10]:
import editdistance as lev

In [11]:
import numpy as np

In [12]:
# import sparse

In [13]:
# Parameters

p = ''
p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json'

u = ''
u = 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'
# u = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'

o = ''
o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered'
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_X0X1X2'

# g = ''
# # g = 'False'

In [14]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         29G         53G        2.2M         43G         95G
Swap:          2.0G        1.0M        2.0G


In [15]:
if 'pW_V' in p:
    pW_V = condDistsAsProbDists(importProbDist(p))
elif 'pX0X1X2' in p:
    pW = ProbDist(importProbDist(p))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

In [16]:
if 'pV' in u:
    pV = ProbDist(importProbDist(u))
elif 'pX0X1X2' in u:
    pV = pW
else:
    raise Exception(f"Unknown type of 'u' parameter = {u}")

In [17]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         29G         53G        2.2M         43G         95G
Swap:          2.0G        1.0M        2.0G


In [18]:
testing = True
benchmark = True

In [19]:
my_dtype = np.int8

# Basic representations

In [20]:
if 'pW_V' in p:
    # Vs = set(pW_V.keys())
    Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                         pW_V).values())
elif 'pX0X1X2' in p:
    Ws = set(conditions(pW))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

# len(Vs)
len(Ws)

6404

In [21]:
Ws_t = tuple(sorted(list(Ws)))

In [22]:
# #≈200s on CMU on solomonoff
# Ps = union(list(par(delayed(getPrefixes)(w) for w in Ws)))
# # Ps = union(par(delayed(getPrefixes)(w) for w in Ws))
# # Ps = union([getPrefixes(w) for w in Ws])
# Ps_t = tuple(sorted(list(Ps)))
# len(Ps_t)

In [23]:
Ws_tt = lmap(ds2t, Ws_t)
# Ps_tt = lmap(ds2t, Ps_t)

Dask data structures:

In [24]:
# import dask.bag as db

In [25]:
# Ws_ttb = db.from_sequence(Ws_tt)

In [26]:
# Ps_ttb = db.from_sequence(Ps_tt)

In [27]:
if 'pW_V' in p:
#     Vs = set(pV.keys())
    my_Vs = set(pW_V.keys())
    
    missing_from_prior = {v for v in my_Vs if v not in pV}
    len(missing_from_prior)
    assert len(missing_from_prior) == 0
    
    missing_from_conditions = {v for v in pV if v not in pW_V}
    len(missing_from_conditions)
    
    pV_trim = ProbDist({v:pV[v] for v in my_Vs})
    assert all(v in pW_V for v in pV_trim)
    
    pW = MarginalProbDist(pW_V, pV_trim)
#     pW = ProbDist({w:sum(pV_trim[v] * pW_V[v][w]
#                      for v in pV_trim)
#                    for w in Ws_t})

0

37997

In [28]:
pW_np = distToNP(pW)
pW_np.shape

(6404,)

# Edit distance calculation & sandbox

In [29]:
def lev_helper_wrapper(uv_idx_pair, prefixes=True, distributed=False):
    return lev_helper(uv_idx_pair[0], uv_idx_pair[1], prefixes=prefixes, distributed=distributed)

def lev_helper(u_idx, v_idx, prefixes=True, distributed=False):
    if prefixes:
        u = Ps_tt[u_idx]
        v = Ps_tt[v_idx]
    else:
        u = Ws_tt[u_idx]
        v = Ws_tt[v_idx]
    return (u_idx, v_idx, lev.eval(u,v))

def lev_dist(uv_t_pair, final_func = None):
    if final_func is None:
        return (t2ds(uv_t_pair[0]), t2ds(uv_t_pair[1]), lev.eval(uv_t_pair[0], uv_t_pair[1]))
    else:
        result = (t2ds(uv_t_pair[0]), t2ds(uv_t_pair[1]), lev.eval(uv_t_pair[0], uv_t_pair[1]))
        return final_func(result)
#         return (final_func(result[0]), final_func(result[1]), result[2])

In [30]:
# lev_helper(3, 20, True)
# Ps_t[3]
# Ps_t[20]
# lev_dist((Ps_tt[3], Ps_tt[20]))

lev_helper(3, 20, False)
Ws_t[3]
Ws_t[20]

lev_dist((Ws_tt[3], Ws_tt[20]))

(3, 20, 4)

'⋊.aɪ.d.i.ʌ.z.⋉.⋉'

'⋊.aɪ.v.⋉.⋉'

('⋊.aɪ.d.i.ʌ.z.⋉.⋉', '⋊.aɪ.v.⋉.⋉', 4)

In [31]:
# L_d_np_P_updates = par(delayed(lev_helper)(u_idx, v_idx) for u_idx, v_idx in list(product(np.arange(len(Ps_t)), np.arange(len(Ps_t)))))

In [32]:
# my_prefixes = Ps_tt[:1000]
# my_prefix_pairs = list(product(my_prefixes, my_prefixes))
# "{:,}".format(len(my_prefix_pairs))

In [33]:
# Client?

In [34]:
# client.submit?

In [35]:
# foo = client.submit(lev_dist, my_prefix_pairs[23])

In [36]:
# foo.result()

In [37]:
# test_results_futures = client.map(lev_dist, my_prefix_pairs)

In [38]:
# test_results_futures[-1]

In [39]:
# test_results_gathered = client.gather(test_results_futures)

In [40]:
# joblib_only = par(delayed(lev_dist)(pair) for pair in my_prefix_pairs)

In [41]:
# my_prefixes_b = db.from_sequence(my_prefixes)
# my_prefix_pairs = db.from_sequence(my_prefix_pairs)

In [42]:
# dask_bag_mp = my_prefix_pairs.map(lev_dist)

In [43]:
# dask_bag_mpl = list(dask_bag_mp)

In [44]:
# dask_bag_mpl[0]

In [45]:
# test_results_gathered

In [46]:
# my_prefix_range = np.arange(100)

In [47]:
# my_prefix_range_pairs = list(product(my_prefix_range, my_prefix_range))

In [48]:
# test_results = client.map(lev_helper_wrapper, my_prefix_range_pairs)

In [49]:
# client.gather(test_results)

# Actual computation

In [50]:
from itertools import combinations

In [51]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         29G         53G        2.2M         43G         95G
Swap:          2.0G        1.0M        2.0G


In [52]:
# prefix_pairs = list(combinations(Ps_tt, 2)) #slower
# prefix_pairs = combinations(Ps_tt, 2) #faster

In [53]:
wordform_pairs = combinations(Ws_tt, 2)

In [54]:
# prefix_pairs = product(Ps_tt, Ps_tt)
# word_pairs = product(Ws_tt, Ws_tt)

In [55]:
from scipy.special import binom

In [56]:
len(Ws_t)

6404

In [57]:
"{:.2E}".format(binom(len(Ws_t), 2))
"{:,}".format(binom(len(Ws_t), 2))
# "{:.2E}".format(binom(len(Ps_t), 2))
# "{:,}".format(binom(len(Ps_t), 2))

'2.05E+07'

'20,502,406.0'

In [58]:
"{:.2E}".format(len(Ws_t) ** 2)
"{:,}".format(len(Ws_t) ** 2)
# "{:.2E}".format(len(Ps_t) ** 2)
# "{:,}".format(len(Ps_t) ** 2)

'4.10E+07'

'41,011,216'

In [59]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         29G         53G        2.2M         43G         95G
Swap:          2.0G        1.0M        2.0G


In [60]:
#5-6m wittgenstein + buckeye
distinct_wordform_distances = par(delayed(lev_dist)(pair) for pair in wordform_pairs)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0033s.) Setting batch_size=120.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0348s.) Setting batch_size=1380.
[Parallel(n_jobs=-1)]: Done 2104 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 4144 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 6424 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 18784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 47764 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 76744 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 108484 tasks      | el

[Parallel(n_jobs=-1)]: Done 11948884 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 12132424 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 12318724 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 12505024 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 12694084 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 12883144 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 13074964 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 13266784 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 13461364 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 13655944 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 13853284 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 14050624 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 14250724 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 14450824 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 14653684 tasks      | elapsed:  3.

In [61]:
identity = [(w,w,0) for w in Ws_t]

In [62]:
mirror = [(v,u,d) for u,v,d in tqdm(distinct_wordform_distances)]

100%|██████████| 20502406/20502406 [00:05<00:00, 3564193.14it/s]


In [63]:
wordform_distances = distinct_wordform_distances + identity + mirror

In [64]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         38G         44G        2.2M         43G         86G
Swap:          2.0G        1.0M        2.0G


In [65]:
# prefix_distances = results

In [66]:
# def is_wordform_distance(prefix_distance_tuple):
#     p0 = prefix_distance_tuple[0]
#     p1 = prefix_distance_tuple[1]
#     return p0 in Ws_t and p1 in Ws_t

In [67]:
# wordform_distances = {d for d in tqdm(prefix_distances) if is_wordform_distance(d)}
# wordform_distances = lfilter(is_wordform_distance, 
#                              prefix_distances)

In [68]:
# !free -h

In [69]:
# from toolz.sandbox.parallel import fold as parfold

In [70]:
# def parfilter(pred, seq, combine):
    
#     def keep_only_matches(acc, nxt):
#         if pred(nxt):
#             return combine(acc, nxt)
#         return acc
    
#     return parfold(keep_only_matches, seq)

# Convert distances to a numpy matrix

In [71]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         38G         44G        2.2M         43G         86G
Swap:          2.0G        1.0M        2.0G


In [72]:
def to_idx_rep(distance_triple, add_mirror=False):
    t = distance_triple
    if not add_mirror:
        return (Ws_t.index(t[0]), Ws_t.index(t[1]), t[2])
    else:
        return {(Ws_t.index(t[0]), Ws_t.index(t[1]), t[2]),
                (Ws_t.index(t[1]), Ws_t.index(t[0]), t[2])}

In [73]:
#8.5m wittgenstein + buckeye
wordform_distance_idx_rep = join(par(delayed(to_idx_rep)(d, True) for d in distinct_wordform_distances + identity))

# wordform_distance_idx_rep = par(delayed(to_idx_rep)(d) for d in distinct_wordform_distances + identity)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0045s.) Setting batch_size=88.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0392s.) Setting batch_size=898.
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 3056 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 4728 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.9103s.) Setting batch_size=449.
[Parallel(n_jobs=-1)]: Done 12880 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 31738 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 505

[Parallel(n_jobs=-1)]: Batch computation too slow (4.2218s.) Setting batch_size=337.
[Parallel(n_jobs=-1)]: Done 1884497 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1870s.) Setting batch_size=720.
[Parallel(n_jobs=-1)]: Done 1929655 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1976577 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2034897 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2093217 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.2798s.) Setting batch_size=360.
[Parallel(n_jobs=-1)]: Done 2152977 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1459s.) Setting batch_size=986.
[Parallel(n_jobs=-1)]: Done 2200871 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2284681 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2368491 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1718s.) Setti

[Parallel(n_jobs=-1)]: Batch computation too slow (2.1729s.) Setting batch_size=387.
[Parallel(n_jobs=-1)]: Done 7925543 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Batch computation too slow (5.2206s.) Setting batch_size=193.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1769s.) Setting batch_size=436.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1326s.) Setting batch_size=1314.
[Parallel(n_jobs=-1)]: Done 7991359 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1472s.) Setting batch_size=657.
[Parallel(n_jobs=-1)]: Batch computation too slow (5.9861s.) Setting batch_size=328.
[Parallel(n_jobs=-1)]: Done 8141483 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1992s.) Setting batch_size=658.
[Parallel(n_jobs=-1)]: Done 8224892 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 8326882 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 8430188 tasks      | elapsed:  4.3min
[Paralle

[Parallel(n_jobs=-1)]: Done 18046840 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 18119018 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 18191196 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 18263978 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 18336760 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 18410146 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 18483532 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 18557522 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 18631512 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 18706106 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 18780700 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 18855898 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 18931096 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 19006898 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 19082700 tasks      | elapsed:  7.

In [74]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         52G         30G        2.2M         43G         72G
Swap:          2.0G        1.0M        2.0G


In [75]:
# wordform_distance_idx_rep = sorted(wordform_distance_idx_rep, key=lambda triple: triple[0])

In [76]:
wordform_distance_idx_rep_grouped = group_by(lambda triple: triple[0], wordform_distance_idx_rep)

In [77]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         52G         29G        2.2M         43G         72G
Swap:          2.0G        1.0M        2.0G


In [78]:
def group_to_update_vecs(grouped_idx_rep_triples):
    row_idx = grouped_idx_rep_triples[0][0]
    
    sorted_by_col_idx = sorted(grouped_idx_rep_triples, key=second)
    
    col_idxs = np.array(lmap(second, sorted_by_col_idx))
    vals = np.array(lmap(lambda triple: triple[2], sorted_by_col_idx))
    return (row_idx, col_idxs, vals)

In [79]:
#<1m wittgenstein + buckeye
group_update_vecs = par(delayed(group_to_update_vecs)(group) for key, group in wordform_distance_idx_rep_grouped.items())

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0594s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 382 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0112s.) Setting batch_size=3.
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 622 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 748 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 832 tasks      | elapsed:    4.2s
[P

In [80]:
group_update_vecs = sorted(group_update_vecs, key=lambda triple: triple[0])

In [81]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         53G         29G        2.2M         43G         71G
Swap:          2.0G        1.0M        2.0G


In [83]:
updated_row_idxs = set(lmap(first, 
                            group_update_vecs))

In [84]:
missing_row_idxs = [i for i in range(len(Ws_t)) if i not in updated_row_idxs]
len(missing_row_idxs)
missing_row_idxs

0

[]

In [85]:
Ws_t[6403]

'⋊.θ.⋉.⋉'

In [86]:
col_idx_mat = np.array(lmap(second, group_update_vecs))

In [87]:
val_mat = np.array(lmap(lambda triple: triple[2],
                        group_update_vecs))

In [88]:
col_idx_mat = np.array(col_idx_mat)
val_mat = np.array(val_mat)

In [89]:
col_idx_mat.shape
val_mat.shape

(6404, 6404)

(6404, 6404)

In [90]:
col_idx_mat[:10]
set(lmap(lambda a: a.shape, col_idx_mat))

array([[   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       ...,
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403]])

{(6404,)}

In [91]:
np.vstack(col_idx_mat)#[:10]

array([[   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       ...,
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403]])

In [92]:
val_mat[:10]

array([[ 0,  3,  6, ...,  7,  7,  8],
       [ 3,  0,  5, ...,  5,  5,  6],
       [ 6,  5,  0, ...,  9,  9, 10],
       ...,
       [ 6,  4,  8, ...,  4,  4,  2],
       [ 6,  5,  8, ...,  5,  4,  6],
       [ 5,  4,  8, ...,  4,  4,  5]])

In [93]:
val_mat.shape

(6404, 6404)

In [94]:
L_d_np = val_mat

In [95]:
# random_rows = choices(np.arange(len(Ws_t)),k=1000)
# random_cols = choices(np.arange(len(Ws_t)),k=1000)
# random_pairs = lzip(random_rows, random_cols)

In [96]:
# for i,j in random_pairs:
#     if val_mat[i,j] != lev_dist((Ws_tt[i], Ws_tt[j]))[2]:
#         print(f'({i},{j},{val_mat[i,j]}) vs. {lev_dist((Ws_tt[i], Ws_tt[j]))}')

In [97]:
# for i,j in tqdm(product(np.arange(len(Ws_t)), np.arange(len(Ws_t))),
#                 total=(len(Ws_t) * len(Ws_t))):
#     if val_mat[i,j] != lev_dist((Ws_tt[i], Ws_tt[j]))[2]:
#         print(f'({i},{j},{val_mat[i,j]}) vs. {lev_dist((Ws_tt[i], Ws_tt[j]))}')

# Convert distances to a dictionary

In [98]:
def row_to_dict_items(row_idx):
    w_left = Ws_t[row_idx]
    row = L_d_np[row_idx]
    return {((w_left, Ws_t[col_idx]), row[col_idx])
            for col_idx in range(len(Ws_t))}

In [99]:
#time to beat is 17m on wittgenstein + buckeye

#4.5m wittgenstein + buckeye
# wordform_distance_dict = dict(join(par(delayed(row_to_dict_items)(row_idx)
#                                        for row_idx in range(len(Ws_t)))))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0520s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0071s.) Setting batch_size=3.
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 382 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Batch computation too slow (8.0661s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done 559 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 622 tasks      

In [171]:
#'curried' dictionary (= straightforwardly serializable, unlike version just above)

def row_to_curried_dict_items(row_idx):
    w_left = Ws_t[row_idx]
    row = L_d_np[row_idx]
    return {w_left:{Ws_t[col_idx]:float(row[col_idx])
                    for col_idx in range(len(Ws_t))}}
#     return {((w_left, Ws_t[col_idx]), row[col_idx])
#             for col_idx in range(len(Ws_t))}

In [172]:
#2m wittgenstein + buckeye
wordform_distance_dict = dict(join(par(delayed(row_to_curried_dict_items)(row_idx)
                                       for row_idx in range(len(Ws_t)))))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0159s.) Setting batch_size=24.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 472 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 880 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1336 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 2296 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 2800 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 3352 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 3904 tasks      | elapsed:    3.4s
[Parallel(n_jobs

In [180]:
random_w0 = choice(Ws_t); random_w0
random_w1 = choice(Ws_t); random_w1
wordform_distance_dict[random_w0][random_w1]

'⋊.k.ɛ.p.t.⋉.⋉'

'⋊.s.ʌ.b.ɚ.b.ʌ.n.⋉.⋉'

7.0

# Calculate neighborhood sizes and densities

## Identify neighbors

In [109]:
N_np = L_d_np == 1

In [110]:
N_np #neighbor relation

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [114]:
np.sum(N_np, axis=1)
np.mean(np.sum(N_np, axis=1))

array([ 0,  0,  0, ...,  0,  2, 11])

3.2816989381636477

In [122]:
Ws_t[235]
N_np[235].sum()
N_np[235].nonzero()[0]
lmap(lambda w_idx: Ws_t[w_idx], 
    N_np[235].nonzero()[0])

'⋊.b.ɔɪ.z.⋉.⋉'

7

array([  56,   97,  233,  234,  236, 2982, 4422])

['⋊.b.aɪ.z.⋉.⋉',
 '⋊.b.i.z.⋉.⋉',
 '⋊.b.ɔɪ.l.z.⋉.⋉',
 '⋊.b.ɔɪ.l.⋉.⋉',
 '⋊.b.ɔɪ.⋉.⋉',
 '⋊.n.ɔɪ.z.⋉.⋉',
 '⋊.t.ɔɪ.z.⋉.⋉']

In [124]:
neighbor_dict = {Ws_t[w_idx]:lmap(lambda w_prime_idx: Ws_t[w_prime_idx],
                                  N_np[w_idx].nonzero()[0])
                 for w_idx in tqdm(np.arange(len(Ws_t)), total=len(Ws_t))}

100%|██████████| 6404/6404 [00:46<00:00, 137.12it/s] 


In [129]:
neighbor_dict[Ws_t[235]]

['⋊.b.aɪ.z.⋉.⋉',
 '⋊.b.i.z.⋉.⋉',
 '⋊.b.ɔɪ.l.z.⋉.⋉',
 '⋊.b.ɔɪ.l.⋉.⋉',
 '⋊.b.ɔɪ.⋉.⋉',
 '⋊.n.ɔɪ.z.⋉.⋉',
 '⋊.t.ɔɪ.z.⋉.⋉']

## Calculate neighborhood size

In [131]:
neighborhood_size = np.sum(N_np, axis=1)
neighborhood_size.shape

(6404,)

In [132]:
neighborhood_size[235]

7

In [126]:
neighborhood_size_dict = walk_values(len, neighbor_dict)

In [130]:
neighborhood_size_dict[Ws_t[235]]

7

## Weight by probability

In [135]:
neighborhood_density_dict = walk_values(lambda neighbors: sum(pW[w] for w in neighbors),
                                        neighbor_dict)

In [136]:
neighborhood_density_dict[Ws_t[235]]

0.00019411727826629133

In [137]:
pW_np.shape

(6404,)

In [138]:
N_np.shape

(6404, 6404)

In [155]:
Nd_np = N_np * pW_np
Nd_np.shape

(6404, 6404)

In [156]:
Nd_np[3].nonzero()[0]

array([4])

In [158]:
Nd_np[3]
Nd_np[3][4]

neighborhood_density_dict[Ws_t[3]]

array([0., 0., 0., ..., 0., 0., 0.])

0.00026170730760796863

0.00026170730760796863

In [159]:
Ws_t[7]
Nd_np[7].nonzero()[0]
Nd_np[7].sum()

neighborhood_density_dict[Ws_t[7]]

'⋊.aɪ.d.⋉.⋉'

array([  10,   11,   17,   20,   21,   26,  420,  902, 1253, 1422, 3565,
       4300, 4719, 4983, 5120, 5222, 5629])

0.060583320826927425

0.06058332082692743

# Export

In [164]:
%pwd

'/mnt/cube/home/AD/emeinhar/wr'

In [165]:
o

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered'

## Levenshtein distances

In [163]:
o + '_Levenshtein_distances'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered'

In [173]:
exportDict(o + '_Levenshtein_distances' + '.json', wordform_distance_dict)

In [183]:
np.save(o + '_Levenshtein_distances' + '.npy', L_d_np, allow_pickle=False)

In [185]:
exportMatrixMetadata?

In [189]:
L_d_md = {'W':{'from fp':p,
               'changes':'sorted',
               'size':len(Ws_t)}
         }
exportMatrixMetadata(o + '_Levenshtein_distances' + '.npy' + '_metadata.json',
                     o + '_Levenshtein_distances' + '.npy',
                     L_d_np,
                     L_d_md,
                     FIXME #'Step 4b',
                     'Calculate Levenshtein distances and neighborhood density.ipynb',
                     {})

SyntaxError: invalid syntax (<ipython-input-189-11b3098007e1>, line 10)

## Neighbors

In [175]:
o + '_neighbors'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_neighbors'

In [174]:
exportDict(o + '_neighbors' + '.json', neighbor_dict)

In [190]:
np.save(o + '_neighbors' + '.npy', N_np, allow_pickle=False)

In [None]:
N_np_md = {'W':{'from fp':p,
                'changes':'sorted',
                'size':len(Ws_t)}
         }
exportMatrixMetadata(o + '_neighbors' + '.npy' + '_metadata.json',
                     o + '_neighbors' + '.npy',
                     N_np,
                     N_np_md,
                     FIXME #'Step 4b',
                     'Calculate Levenshtein distances and neighborhood density.ipynb',
                     {})

## Neighborhood size

In [176]:
o + '_neighborhood_size'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_neighborhood_size'

In [177]:
exportDict(o + '_neighborhood_size' + '.json', neighborhood_size_dict)

In [191]:
np.save(o + '_neighborhood_size' + '.npy', neighborhood_size, allow_pickle=False)

In [None]:
NS_np_md = {'W':{'from fp':p,
                 'changes':'sorted',
                 'size':len(Ws_t)}
           }
exportMatrixMetadata(o + '_neighbors' + '.npy' + '_metadata.json',
                     o + '_neighbors' + '.npy',
                     NS_np,
                     NS_np_md,
                     FIXME #'Step 4b',
                     'Calculate Levenshtein distances and neighborhood density.ipynb',
                     {})

## Neighborhood density

In [178]:
o + '_neighborhood_density'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_neighborhood_density'

In [179]:
exportDict(o + '_neighborhood_density' + '.json', neighborhood_density_dict)

In [194]:
np.save(o + '_neighborhood_density' + '.json', Nd_np, allow_pickle=False)

In [None]:
Nd_np_md = {'W':{'from fp':p,
                 'changes':'sorted',
                 'size':len(Ws_t)}
           }
exportMatrixMetadata(o + '_neighborhood_density' + '.npy' + '_metadata.json',
                     o + '_neighborhood_density' + '.npy',
                     Nd_np,
                     Nd_np_md,
                     FIXME #'Step 4b',
                     'Calculate Levenshtein distances and neighborhood density.ipynb',
                     {})

In [None]:
# L_d_P_md = {'W':{'from fp':p,
#                      'changes':'sorted',
#                      'size':len(Ws_t)},
#                      'P':{'from_fp':p,
#                           'changes':'extracted from W, sorted',
#                           'size':len(Ps_t)}}
# exportMatrixMetadata(o + '_L_d_P' + '.npy' + '_metadata.json',
#                      o + '_L_d_P' + '.npy' + '_metadata.json',
#                      L_d_np_P,
#                      L_d_P_md,
#                      FIXME #'Step 4b',
#                      'Calculate Levenshtein distances and neighborhood density.ipynb',
#                     {'Storage':'file is MEMORY MAPPED.'})