In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Import-libraries-and-data" data-toc-modified-id="Import-libraries-and-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import libraries and data</a></span></li><li><span><a href="#Basic-representations" data-toc-modified-id="Basic-representations-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Basic representations</a></span></li><li><span><a href="#Edit-distance-calculation-&amp;-sandbox" data-toc-modified-id="Edit-distance-calculation-&amp;-sandbox-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Edit distance calculation &amp; sandbox</a></span></li><li><span><a href="#Actual-computation" data-toc-modified-id="Actual-computation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Actual computation</a></span></li><li><span><a href="#Convert-distances-to-a-dictionary" data-toc-modified-id="Convert-distances-to-a-dictionary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Convert distances to a dictionary</a></span></li><li><span><a href="#Convert-distances-to-a-numpy-matrix..." data-toc-modified-id="Convert-distances-to-a-numpy-matrix...-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Convert distances to a numpy matrix...</a></span></li><li><span><a href="#Calculate-weighted-distances" data-toc-modified-id="Calculate-weighted-distances-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Calculate weighted distances</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Motivation

This notebook calculates 
 - the set of all pairwise Levenshtein distances between pairs of segmental wordforms.
 - unigram probability weighted neighborhood distance.

# Import libraries and data

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
from probdist import *
from string_utils import *

In [5]:
from funcy import *

In [6]:
from tqdm import tqdm

from joblib import Parallel, delayed

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [7]:
import toolz

In [8]:
# import dask.multiprocessing
# dask.config.set(scheduler='processes')

In [9]:
# # import dask
# from dask.distributed import Client
# client = Client('172.21.47.67:8786')

In [10]:
import editdistance as lev

In [11]:
import numpy as np

In [12]:
import sparse

In [13]:
# Parameters

p = ''
p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json'

u = ''
u = 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'
# u = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'

o = ''
o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered'
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_X0X1X2'

# g = ''
# # g = 'False'

In [14]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         28G         50G        3.0M         46G         96G
Swap:          2.0G          0B        2.0G


In [15]:
if 'pW_V' in p:
    pW_V = condDistsAsProbDists(importProbDist(p))
elif 'pX0X1X2' in p:
    pW = ProbDist(importProbDist(p))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

In [16]:
if 'pV' in u:
    pV = ProbDist(importProbDist(u))
elif 'pX0X1X2' in u:
    pV = pW
else:
    raise Exception(f"Unknown type of 'u' parameter = {u}")

In [17]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         28G         50G        3.0M         46G         96G
Swap:          2.0G          0B        2.0G


In [18]:
testing = True
benchmark = True

In [19]:
my_dtype = np.int8

# Basic representations

In [20]:
if 'pW_V' in p:
    # Vs = set(pW_V.keys())
    Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                         pW_V).values())
elif 'pX0X1X2' in p:
    Ws = set(conditions(pW))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

# len(Vs)
len(Ws)

6404

In [21]:
Ws_t = tuple(sorted(list(Ws)))

In [22]:
# #≈200s on CMU on solomonoff
# Ps = union(list(par(delayed(getPrefixes)(w) for w in Ws)))
# # Ps = union(par(delayed(getPrefixes)(w) for w in Ws))
# # Ps = union([getPrefixes(w) for w in Ws])
# Ps_t = tuple(sorted(list(Ps)))
# len(Ps_t)

In [23]:
Ws_tt = lmap(ds2t, Ws_t)
# Ps_tt = lmap(ds2t, Ps_t)

Dask data structures:

In [24]:
# import dask.bag as db

In [25]:
# Ws_ttb = db.from_sequence(Ws_tt)

In [26]:
# Ps_ttb = db.from_sequence(Ps_tt)

In [31]:
if 'pW_V' in p:
#     Vs = set(pV.keys())
    my_Vs = set(pW_V.keys())
    
    missing_from_prior = {v for v in my_Vs if v not in pV}
    len(missing_from_prior)
    assert len(missing_from_prior) == 0
    
    missing_from_conditions = {v for v in pV if v not in pW_V}
    len(missing_from_conditions)
    
    pV_trim = ProbDist({v:pV[v] for v in my_Vs})
    assert all(v in pW_V for v in pV_trim)
    
    pW = MarginalProbDist(pW_V, pV_trim)
#     pW = ProbDist({w:sum(pV_trim[v] * pW_V[v][w]
#                      for v in pV_trim)
#                    for w in Ws_t})

0

37997

In [32]:
pW_np = distToNP(pW)
pW_np.shape

(6404,)

# Edit distance calculation & sandbox

In [33]:
def lev_helper_wrapper(uv_idx_pair, prefixes=True, distributed=False):
    return lev_helper(uv_idx_pair[0], uv_idx_pair[1], prefixes=prefixes, distributed=distributed)

def lev_helper(u_idx, v_idx, prefixes=True, distributed=False):
    if prefixes:
        u = Ps_tt[u_idx]
        v = Ps_tt[v_idx]
    else:
        u = Ws_tt[u_idx]
        v = Ws_tt[v_idx]
    return (u_idx, v_idx, lev.eval(u,v))

def lev_dist(uv_t_pair, final_func = None):
    if final_func is None:
        return (t2ds(uv_t_pair[0]), t2ds(uv_t_pair[1]), lev.eval(uv_t_pair[0], uv_t_pair[1]))
    else:
        result = (t2ds(uv_t_pair[0]), t2ds(uv_t_pair[1]), lev.eval(uv_t_pair[0], uv_t_pair[1]))
        return final_func(result)
#         return (final_func(result[0]), final_func(result[1]), result[2])

In [34]:
# lev_helper(3, 20, True)
# Ps_t[3]
# Ps_t[20]
# lev_dist((Ps_tt[3], Ps_tt[20]))

lev_helper(3, 20, False)
Ws_t[3]
Ws_t[20]

lev_dist((Ws_tt[3], Ws_tt[20]))

(3, 20, 4)

'⋊.aɪ.d.i.ʌ.z.⋉.⋉'

'⋊.aɪ.v.⋉.⋉'

('⋊.aɪ.d.i.ʌ.z.⋉.⋉', '⋊.aɪ.v.⋉.⋉', 4)

In [35]:
# L_d_np_P_updates = par(delayed(lev_helper)(u_idx, v_idx) for u_idx, v_idx in list(product(np.arange(len(Ps_t)), np.arange(len(Ps_t)))))

In [36]:
# my_prefixes = Ps_tt[:1000]
# my_prefix_pairs = list(product(my_prefixes, my_prefixes))
# "{:,}".format(len(my_prefix_pairs))

In [37]:
# Client?

In [38]:
# client.submit?

In [39]:
# foo = client.submit(lev_dist, my_prefix_pairs[23])

In [40]:
# foo.result()

In [41]:
# test_results_futures = client.map(lev_dist, my_prefix_pairs)

In [42]:
# test_results_futures[-1]

In [43]:
# test_results_gathered = client.gather(test_results_futures)

In [44]:
# joblib_only = par(delayed(lev_dist)(pair) for pair in my_prefix_pairs)

In [45]:
# my_prefixes_b = db.from_sequence(my_prefixes)
# my_prefix_pairs = db.from_sequence(my_prefix_pairs)

In [46]:
# dask_bag_mp = my_prefix_pairs.map(lev_dist)

In [47]:
# dask_bag_mpl = list(dask_bag_mp)

In [48]:
# dask_bag_mpl[0]

In [49]:
# test_results_gathered

In [50]:
# my_prefix_range = np.arange(100)

In [51]:
# my_prefix_range_pairs = list(product(my_prefix_range, my_prefix_range))

In [52]:
# test_results = client.map(lev_helper_wrapper, my_prefix_range_pairs)

In [53]:
# client.gather(test_results)

# Actual computation

In [54]:
from itertools import combinations

In [55]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         28G         50G        3.0M         46G         96G
Swap:          2.0G          0B        2.0G


In [56]:
# prefix_pairs = list(combinations(Ps_tt, 2)) #slower
# prefix_pairs = combinations(Ps_tt, 2) #faster

In [57]:
wordform_pairs = combinations(Ws_tt, 2)

In [58]:
# prefix_pairs = product(Ps_tt, Ps_tt)
# word_pairs = product(Ws_tt, Ws_tt)

In [59]:
from scipy.special import binom

In [60]:
len(Ws_t)

6404

In [61]:
"{:.2E}".format(binom(len(Ws_t), 2))
"{:,}".format(binom(len(Ws_t), 2))
# "{:.2E}".format(binom(len(Ps_t), 2))
# "{:,}".format(binom(len(Ps_t), 2))

'2.05E+07'

'20,502,406.0'

In [62]:
"{:.2E}".format(len(Ws_t) ** 2)
"{:,}".format(len(Ws_t) ** 2)
# "{:.2E}".format(len(Ps_t) ** 2)
# "{:,}".format(len(Ps_t) ** 2)

'4.10E+07'

'41,011,216'

In [63]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         28G         50G        3.0M         46G         96G
Swap:          2.0G          0B        2.0G


In [64]:
#6m wittgenstein + buckeye
distinct_wordform_distances = par(delayed(lev_dist)(pair) for pair in wordform_pairs)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0035s.) Setting batch_size=114.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0357s.) Setting batch_size=1276.
[Parallel(n_jobs=-1)]: Done 2002 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 3940 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 6106 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 17568 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 44364 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 71160 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100508 tasks      | el

[Parallel(n_jobs=-1)]: Done 11048588 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 11218296 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 11390556 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 11562816 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 11737628 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 11912440 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 12089804 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 12267168 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 12447084 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 12627000 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 12809468 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 12991936 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 13176956 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 13361976 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 13549548 tasks      | elapsed:  2.

In [65]:
identity = [(w,w,0) for w in Ws_t]

In [66]:
mirror = [(v,u,d) for u,v,d in tqdm(distinct_wordform_distances)]

100%|██████████| 20502406/20502406 [00:05<00:00, 3590178.06it/s]


In [67]:
wordform_distances = distinct_wordform_distances + identity + mirror

In [68]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         37G         41G        3.0M         46G         87G
Swap:          2.0G          0B        2.0G


In [69]:
# prefix_distances = results

In [70]:
# def is_wordform_distance(prefix_distance_tuple):
#     p0 = prefix_distance_tuple[0]
#     p1 = prefix_distance_tuple[1]
#     return p0 in Ws_t and p1 in Ws_t

In [71]:
# wordform_distances = {d for d in tqdm(prefix_distances) if is_wordform_distance(d)}
# wordform_distances = lfilter(is_wordform_distance, 
#                              prefix_distances)

In [72]:
# !free -h

In [73]:
from toolz.sandbox.parallel import fold as parfold

In [74]:
def parfilter(pred, seq, combine):
    
    def keep_only_matches(acc, nxt):
        if pred(nxt):
            return combine(acc, nxt)
        return acc
    
    return parfold(keep_only_matches, seq)

In [75]:
# type(prefix_distances)

In [76]:
type(wordform_distances)

list

In [77]:
# prefix_distances_s = set(prefix_distances)

In [78]:
# wordform_distancez = parfilter(is_wordform_distance, prefix_distances_s, combine=func_partial(do, set.add))

In [79]:
# def keep_only_wordform_distances(acc, nxt, combine):
#     if is_wordform_distance(nxt):
#         return combine(acc, nxt)
#     return acc

In [80]:
# wordform_distancez2 = parfold(is_wordform_distance, prefix_distances_s, func_partial(do, set.add))

# Convert distances to a dictionary

In [81]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         37G         41G        3.0M         46G         87G
Swap:          2.0G          0B        2.0G


In [82]:
def to_dict_rep(distance_triple):
    return ((distance_triple[0], distance_triple[1]), distance_triple[2])

In [83]:
# prefix_distance_dict = dict(par(delayed(to_dict_rep)(d) for d in prefix_distances))

In [84]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         37G         41G        3.0M         46G         87G
Swap:          2.0G          0B        2.0G


In [66]:
wordform_distance_dict = dict(par(delayed(to_dict_rep)(d) for d in wordform_distances))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0075s.) Setting batch_size=52.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0584s.) Setting batch_size=356.
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1832 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2820 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 6240 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 13716 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 21192 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 29380 tasks      | elapsed

[Parallel(n_jobs=-1)]: Done 3485586 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3533333 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3581080 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3629545 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3678010 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3727193 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3776376 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3826277 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3876178 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3926797 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3977416 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4028753 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4080090 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4132145 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 4184200 tasks      | elapsed:  2.7min
[Parallel(

[Parallel(n_jobs=-1)]: Done 11427745 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 11511077 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 11594409 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 11678405 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 11762401 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 11847061 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 11931721 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 12017045 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0839s.) Setting batch_size=166.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1628s.) Setting batch_size=406.
[Parallel(n_jobs=-1)]: Done 12093903 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0847s.) Setting batch_size=203.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1859s.) Setting batch_size=436.
[Parallel(n_jobs=-1)]: Done 12183845 tasks      | elapsed

[Parallel(n_jobs=-1)]: Done 18498850 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1975s.) Setting batch_size=470.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.6511s.) Setting batch_size=235.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1993s.) Setting batch_size=470.
[Parallel(n_jobs=-1)]: Done 18617377 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1098s.) Setting batch_size=235.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1612s.) Setting batch_size=582.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1179s.) Setting batch_size=291.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1985s.) Setting batch_size=586.
[Parallel(n_jobs=-1)]: Done 18742409 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.6985s.) Setting batch_size=293.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1980s.) Setting batch_size=590.
[Parallel(n_jobs=-1)]: Batch compu

In [None]:
#fixme update to reflect symmetry...
# or to economize on space, only interface with the dictionary through a function that will try both orders

In [67]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         18G        168G        3.5M        1.9G        169G
Swap:          2.0G          0B        2.0G


In [90]:
# about the same speed or slower
# wordform_distance_dict = dict(par(delayed(lev_dist)(pair, to_dict_rep) for pair in combinations(Ws_tt, 2)))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0085s.) Setting batch_size=46.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0675s.) Setting batch_size=272.
[Parallel(n_jobs=-1)]: Done 846 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1628 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2502 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1995s.) Setting batch_size=544.
[Parallel(n_jobs=-1)]: Done 5184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 10896 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 16608

[Parallel(n_jobs=-1)]: Done 2465618 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2500543 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2536018 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2571493 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2607518 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2643543 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2680118 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2716693 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1996s.) Setting batch_size=550.
[Parallel(n_jobs=-1)]: Done 2753818 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0121s.) Setting batch_size=275.
[Parallel(n_jobs=-1)]: Done 2827243 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2881968 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2919643 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Batch computati

[Parallel(n_jobs=-1)]: Done 8298325 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 8366240 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 8434155 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 8502648 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 8571141 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 8640212 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 8709283 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 8778932 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 8848581 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 8918808 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 8989035 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 9059840 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 9130645 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1984s.) Setting batch_size=582.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.4431

[Parallel(n_jobs=-1)]: Batch computation too slow (2.8743s.) Setting batch_size=236.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1772s.) Setting batch_size=532.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1743s.) Setting batch_size=1220.
[Parallel(n_jobs=-1)]: Batch computation too slow (4.3766s.) Setting batch_size=610.
[Parallel(n_jobs=-1)]: Done 13792384 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.9861s.) Setting batch_size=305.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0968s.) Setting batch_size=152.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1438s.) Setting batch_size=422.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1172s.) Setting batch_size=211.
[Parallel(n_jobs=-1)]: Done 13910526 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1568s.) Setting batch_size=538.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.9537s.) Setting batch_size=269.
[Parallel(n_job

[Parallel(n_jobs=-1)]: Batch computation too fast (0.1971s.) Setting batch_size=452.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.1184s.) Setting batch_size=226.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1985s.) Setting batch_size=454.
[Parallel(n_jobs=-1)]: Done 16914411 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.7969s.) Setting batch_size=227.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1739s.) Setting batch_size=522.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1397s.) Setting batch_size=1494.
[Parallel(n_jobs=-1)]: Batch computation too slow (5.7429s.) Setting batch_size=747.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.1672s.) Setting batch_size=373.
[Parallel(n_jobs=-1)]: Done 17134331 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0694s.) Setting batch_size=186.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1780s.) Setting batch_size=416.
[Parallel(n_job

In [91]:
# !free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         25G        161G        3.5M        1.9G        162G
Swap:          2.0G          0B        2.0G


# Convert distances to a numpy matrix...

In [85]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         37G         41G        3.0M         46G         87G
Swap:          2.0G          0B        2.0G


In [86]:
def to_dok_rep(distance_triple):
    return ((Ws_t.index(distance_triple[0]), Ws_t.index(distance_triple[1])), distance_triple[2])

In [87]:
#20m sidious + buckeye
wordform_distance_dok_rep = dict(par(delayed(to_dok_rep)(d) for d in distinct_wordform_distances + identity))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0029s.) Setting batch_size=138.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0366s.) Setting batch_size=1506.
[Parallel(n_jobs=-1)]: Done 2410 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 4756 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 7378 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Batch computation too slow (6.4330s.) Setting batch_size=753.
[Parallel(n_jobs=-1)]: Done 20944 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 52570 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 8

[Parallel(n_jobs=-1)]: Done 1625513 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Batch computation too slow (6.1391s.) Setting batch_size=513.
[Parallel(n_jobs=-1)]: Done 1684055 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Batch computation too slow (3.0428s.) Setting batch_size=256.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0848s.) Setting batch_size=1206.
[Parallel(n_jobs=-1)]: Done 1727398 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Batch computation too slow (8.6018s.) Setting batch_size=603.
[Parallel(n_jobs=-1)]: Done 1779745 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1691s.) Setting batch_size=301.
[Parallel(n_jobs=-1)]: Done 1846075 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1599s.) Setting batch_size=752.
[Parallel(n_jobs=-1)]: Batch computation too slow (5.7917s.) Setting batch_size=376.
[Parallel(n_jobs=-1)]: Done 1875890 tasks      | elapsed:  3.0min
[Paralle

[Parallel(n_jobs=-1)]: Batch computation too slow (2.2185s.) Setting batch_size=277.
[Parallel(n_jobs=-1)]: Done 3941975 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1565s.) Setting batch_size=706.
[Parallel(n_jobs=-1)]: Batch computation too slow (5.8487s.) Setting batch_size=353.
[Parallel(n_jobs=-1)]: Done 3994875 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1959s.) Setting batch_size=720.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.2033s.) Setting batch_size=360.
[Parallel(n_jobs=-1)]: Done 4058695 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (3.2379s.) Setting batch_size=180.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0991s.) Setting batch_size=726.
[Parallel(n_jobs=-1)]: Done 4111975 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (7.2336s.) Setting batch_size=363.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.198

[Parallel(n_jobs=-1)]: Done 6159020 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (6.2818s.) Setting batch_size=491.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.4986s.) Setting batch_size=245.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1045s.) Setting batch_size=936.
[Parallel(n_jobs=-1)]: Done 6258068 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Batch computation too slow (6.0814s.) Setting batch_size=468.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.3422s.) Setting batch_size=234.
[Parallel(n_jobs=-1)]: Done 6348901 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1069s.) Setting batch_size=874.
[Parallel(n_jobs=-1)]: Batch computation too slow (6.4422s.) Setting batch_size=437.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.1464s.) Setting batch_size=218.
[Parallel(n_jobs=-1)]: Done 6430831 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.105

[Parallel(n_jobs=-1)]: Batch computation too slow (3.3209s.) Setting batch_size=370.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.1571s.) Setting batch_size=185.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0836s.) Setting batch_size=884.
[Parallel(n_jobs=-1)]: Done 8459346 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (6.9540s.) Setting batch_size=442.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.3588s.) Setting batch_size=221.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1648s.) Setting batch_size=536.
[Parallel(n_jobs=-1)]: Done 8550793 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Batch computation too slow (3.0972s.) Setting batch_size=268.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.3570s.) Setting batch_size=134.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0729s.) Setting batch_size=734.
[Parallel(n_jobs=-1)]: Done 8609168 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Batch computat

[Parallel(n_jobs=-1)]: Batch computation too slow (6.3002s.) Setting batch_size=407.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0797s.) Setting batch_size=203.
[Parallel(n_jobs=-1)]: Done 10785061 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1499s.) Setting batch_size=540.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.1655s.) Setting batch_size=270.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.4666s.) Setting batch_size=135.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0764s.) Setting batch_size=706.
[Parallel(n_jobs=-1)]: Done 10857933 tasks      | elapsed: 18.4min
[Parallel(n_jobs=-1)]: Batch computation too slow (3.2853s.) Setting batch_size=353.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.9788s.) Setting batch_size=176.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0852s.) Setting batch_size=824.
[Parallel(n_jobs=-1)]: Done 10935731 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Batch compu

[Parallel(n_jobs=-1)]: Batch computation too slow (3.2854s.) Setting batch_size=324.
[Parallel(n_jobs=-1)]: Batch computation too slow (4.3812s.) Setting batch_size=162.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0872s.) Setting batch_size=742.
[Parallel(n_jobs=-1)]: Done 12803307 tasks      | elapsed: 22.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (6.4029s.) Setting batch_size=371.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1517s.) Setting batch_size=185.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0879s.) Setting batch_size=842.
[Parallel(n_jobs=-1)]: Done 12894698 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Batch computation too slow (7.5342s.) Setting batch_size=421.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1733s.) Setting batch_size=970.
[Parallel(n_jobs=-1)]: Batch computation too slow (7.5313s.) Setting batch_size=485.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.3941s.) Setting batch_size=242.
[Parallel(n_jobs

[Parallel(n_jobs=-1)]: Batch computation too slow (3.2932s.) Setting batch_size=269.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1755s.) Setting batch_size=612.
[Parallel(n_jobs=-1)]: Batch computation too slow (6.8111s.) Setting batch_size=306.
[Parallel(n_jobs=-1)]: Done 15128762 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1960s.) Setting batch_size=624.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.3713s.) Setting batch_size=312.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1938s.) Setting batch_size=642.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.3920s.) Setting batch_size=321.
[Parallel(n_jobs=-1)]: Done 15240608 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Batch computation too slow (3.4467s.) Setting batch_size=160.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0790s.) Setting batch_size=810.
[Parallel(n_jobs=-1)]: Batch computation too slow (7.0824s.) Setting batch_size=405.
[Parallel(n_jobs

[Parallel(n_jobs=-1)]: Batch computation too slow (3.5349s.) Setting batch_size=131.
[Parallel(n_jobs=-1)]: Done 16904062 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0757s.) Setting batch_size=692.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.4613s.) Setting batch_size=346.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.7832s.) Setting batch_size=173.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0950s.) Setting batch_size=728.
[Parallel(n_jobs=-1)]: Batch computation too slow (7.5329s.) Setting batch_size=364.
[Parallel(n_jobs=-1)]: Done 17013557 tasks      | elapsed: 30.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1207s.) Setting batch_size=182.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1317s.) Setting batch_size=552.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.4026s.) Setting batch_size=276.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.8023s.) Setting batch_size=138.
[Parallel(n_jobs

[Parallel(n_jobs=-1)]: Batch computation too slow (3.8115s.) Setting batch_size=267.
[Parallel(n_jobs=-1)]: Done 18790440 tasks      | elapsed: 34.3min
[Parallel(n_jobs=-1)]: Batch computation too slow (3.4859s.) Setting batch_size=133.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1044s.) Setting batch_size=508.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.5880s.) Setting batch_size=254.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.5335s.) Setting batch_size=127.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0958s.) Setting batch_size=530.
[Parallel(n_jobs=-1)]: Done 18878573 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (4.1746s.) Setting batch_size=265.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.5740s.) Setting batch_size=132.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1058s.) Setting batch_size=498.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.4198s.) Setting batch_size=249.
[Parallel(n_jobs

[Parallel(n_jobs=-1)]: Batch computation too fast (0.1989s.) Setting batch_size=502.
[Parallel(n_jobs=-1)]: Batch computation too slow (7.1106s.) Setting batch_size=251.
[Parallel(n_jobs=-1)]: Done 20508810 out of 20508810 | elapsed: 38.1min finished


In [88]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         43G         35G        3.0M         46G         80G
Swap:          2.0G          0B        2.0G


In [97]:
#22m sidious + buckeye
# wordform_distance_dok_rep2 = dict(par(delayed(lev_dist)(pair, to_dok_rep) for pair in combinations(Ws_tt, 2)))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0085s.) Setting batch_size=46.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0654s.) Setting batch_size=280.
[Parallel(n_jobs=-1)]: Done 846 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1628 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2502 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1832s.) Setting batch_size=610.
[Parallel(n_jobs=-1)]: Done 5248 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 11128 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 17008

[Parallel(n_jobs=-1)]: Done 2623188 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2663753 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2704318 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2745493 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2786668 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2828453 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2870238 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2912633 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2955028 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2998033 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3041038 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3084653 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3128268 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3172493 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3216718 tasks      | elapsed:  2.4min
[Parallel(

[Parallel(n_jobs=-1)]: Batch computation too slow (2.0256s.) Setting batch_size=225.
[Parallel(n_jobs=-1)]: Done 8239553 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1960s.) Setting batch_size=458.
[Parallel(n_jobs=-1)]: Done 8304930 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0890s.) Setting batch_size=229.
[Parallel(n_jobs=-1)]: Done 8379355 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 8433170 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 8487443 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1945s.) Setting batch_size=470.
[Parallel(n_jobs=-1)]: Done 8548223 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0335s.) Setting batch_size=235.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1797s.) Setting batch_size=522.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1154s.) Setting batch_size=261.
[Parallel

[Parallel(n_jobs=-1)]: Batch computation too slow (2.1662s.) Setting batch_size=233.
[Parallel(n_jobs=-1)]: Done 14433994 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1915s.) Setting batch_size=486.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.8930s.) Setting batch_size=243.
[Parallel(n_jobs=-1)]: Done 14525497 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 14602528 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 14680045 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1938s.) Setting batch_size=500.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.9065s.) Setting batch_size=250.
[Parallel(n_jobs=-1)]: Done 14774906 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 14855156 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1987s.) Setting batch_size=502.
[Parallel(n_jobs=-1)]: Done 14935406 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]

[Parallel(n_jobs=-1)]: Done 17503242 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0011s.) Setting batch_size=163.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1959s.) Setting batch_size=332.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1584s.) Setting batch_size=166.
[Parallel(n_jobs=-1)]: Done 17588608 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1893s.) Setting batch_size=350.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0212s.) Setting batch_size=175.
[Parallel(n_jobs=-1)]: Done 17675206 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1980s.) Setting batch_size=352.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1213s.) Setting batch_size=176.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1694s.) Setting batch_size=414.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1664s.) Setting batch_size=207.
[Parallel(n_jobs=-1)]: Batch compu

[Parallel(n_jobs=-1)]: Batch computation too fast (0.1997s.) Setting batch_size=282.
[Parallel(n_jobs=-1)]: Done 20066089 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0015s.) Setting batch_size=141.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1991s.) Setting batch_size=282.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1016s.) Setting batch_size=141.
[Parallel(n_jobs=-1)]: Done 20157316 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0346s.) Setting batch_size=70.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0825s.) Setting batch_size=338.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0079s.) Setting batch_size=169.
[Parallel(n_jobs=-1)]: Done 20253484 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1953s.) Setting batch_size=346.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.3093s.) Setting batch_size=173.
[Parallel(n_jobs=-1)]: Done 2033300

In [98]:
# !free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         26G        160G        3.4M        2.0G        161G
Swap:          2.0G          0B        2.0G


In [89]:
#19m sidious + buckeye
wordform_distance_DOK = sparse.DOK((len(Ws_t), len(Ws_t)), wordform_distance_dok_rep)

In [103]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         26G        160G        3.4M        2.0G        160G
Swap:          2.0G          0B        2.0G


In [104]:
# wordform_distance_DOK2 = sparse.DOK((len(Ws_t), len(Ws_t)), wordform_distance_dok_rep2)

In [105]:
# !free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         29G        157G        3.4M        2.0G        158G
Swap:          2.0G          0B        2.0G


In [90]:
wordform_distance_np = wordform_distance_DOK.todense()
wordform_distance_np.nbytes / 1e9

0.328089728

In [91]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         45G         33G        3.0M         46G         78G
Swap:          2.0G          0B        2.0G


In [None]:
#fixme update to reflect symmetry...

In [None]:
# L_d_np = np.zeros((len(Ws_t), len(Ws_t)), dtype=np.int64)

In [92]:
def to_idx_rep(distance_triple, add_mirror=False):
    t = distance_triple
    if not add_mirror:
        return (Ws_t.index(t[0]), Ws_t.index(t[1]), t[2])
    else:
        return {(Ws_t.index(t[0]), Ws_t.index(t[1]), t[2]),
                (Ws_t.index(t[1]), Ws_t.index(t[0]), t[2])}

In [93]:
#?m wittgenstein + buckeye
# wordform_distance_idx_rep = par(delayed(to_idx_rep)(d) for d in distinct_wordform_distances + identity)
wordform_distance_idx_rep = join(par(delayed(to_idx_rep)(d, True) for d in distinct_wordform_distances + identity))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0034s.) Setting batch_size=116.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0393s.) Setting batch_size=1180.
[Parallel(n_jobs=-1)]: Done 2036 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 4008 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 6212 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Batch computation too slow (6.2023s.) Setting batch_size=590.
[Parallel(n_jobs=-1)]: Done 16928 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 41708 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 6

[Parallel(n_jobs=-1)]: Done 2223825 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2274495 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1972s.) Setting batch_size=1142.
[Parallel(n_jobs=-1)]: Done 2321224 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2412536 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0466s.) Setting batch_size=571.
[Parallel(n_jobs=-1)]: Done 2503325 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2558712 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1959s.) Setting batch_size=1164.
[Parallel(n_jobs=-1)]: Done 2613133 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2714401 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.2725s.) Setting batch_size=582.
[Parallel(n_jobs=-1)]: Done 2817997 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Batch computation too slow (6.2729s.) Set

[Parallel(n_jobs=-1)]: Done 7485369 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 7600209 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 7716633 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 7833057 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0732s.) Setting batch_size=396.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.0116s.) Setting batch_size=198.
[Parallel(n_jobs=-1)]: Done 7948293 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0992s.) Setting batch_size=798.
[Parallel(n_jobs=-1)]: Done 8025273 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.3087s.) Setting batch_size=399.
[Parallel(n_jobs=-1)]: Done 8144973 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1962s.) Setting batch_size=812.
[Parallel(n_jobs=-1)]: Done 8216562 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 8340798 tasks      

[Parallel(n_jobs=-1)]: Done 16881893 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 16960664 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 17039435 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 17118888 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 17198341 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 17278476 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 17358611 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 17439428 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 17520245 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 17601744 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 17683243 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 17765424 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 17847605 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 17930468 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 18013331 tasks      | elapsed:  7.

In [94]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         58G         20G        3.0M         46G         66G
Swap:          2.0G          0B        2.0G


In [95]:
# wordform_distance_idx_rep = sorted(wordform_distance_idx_rep, key=lambda triple: triple[0])

In [96]:
wordform_distance_idx_rep_grouped = group_by(lambda triple: triple[0], wordform_distance_idx_rep)

In [97]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         59G         20G        3.0M         46G         65G
Swap:          2.0G          0B        2.0G


In [98]:
def group_to_update_vecs(grouped_idx_rep_triples):
    row_idx = grouped_idx_rep_triples[0][0]
    
    sorted_by_col_idx = sorted(grouped_idx_rep_triples, key=second)
    
    col_idxs = np.array(lmap(second, sorted_by_col_idx))
    vals = np.array(lmap(lambda triple: triple[2], sorted_by_col_idx))
    return (row_idx, col_idxs, vals)

In [99]:
group_update_vecs = par(delayed(group_to_update_vecs)(group) for key, group in wordform_distance_idx_rep_grouped.items())

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0483s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0118s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 1040 tasks      | elapsed:    5.0s
[

In [100]:
group_update_vecs = sorted(group_update_vecs, key=lambda triple: triple[0])

In [101]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         59G         19G        3.0M         46G         65G
Swap:          2.0G          0B        2.0G


In [102]:
L_d_np = -99 * np.ones((len(Ws_t), len(Ws_t)), dtype=np.int64)

In [103]:
updated_row_idxs = set(lmap(first, 
                            group_update_vecs))

In [104]:
missing_row_idxs = [i for i in range(len(Ws_t)) if i not in updated_row_idxs]
len(missing_row_idxs)
missing_row_idxs

0

[]

In [105]:
Ws_t[6403]

'⋊.θ.⋉.⋉'

In [106]:
col_idx_mat = np.array(lmap(second, group_update_vecs))

In [107]:
val_mat = np.array(lmap(lambda triple: triple[2],
                        group_update_vecs))

In [108]:
col_idx_mat = np.array(col_idx_mat)
val_mat = np.array(val_mat)

In [109]:
col_idx_mat.shape
val_mat.shape

(6404, 6404)

(6404, 6404)

In [110]:
col_idx_mat[:10]
set(lmap(lambda a: a.shape, col_idx_mat))

array([[   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       ...,
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403]])

{(6404,)}

In [111]:
np.vstack(col_idx_mat)#[:10]

array([[   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       ...,
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403],
       [   0,    1,    2, ..., 6401, 6402, 6403]])

In [112]:
val_mat[:10]

array([[ 0,  3,  6, ...,  7,  7,  8],
       [ 3,  0,  5, ...,  5,  5,  6],
       [ 6,  5,  0, ...,  9,  9, 10],
       ...,
       [ 6,  4,  8, ...,  4,  4,  2],
       [ 6,  5,  8, ...,  5,  4,  6],
       [ 5,  4,  8, ...,  4,  4,  5]])

In [113]:
val_mat.shape

(6404, 6404)

In [119]:
random_rows = choices(np.arange(len(Ws_t)),k=1000)
random_cols = choices(np.arange(len(Ws_t)),k=1000)
random_pairs = lzip(random_rows, random_cols)

In [122]:
np.array_equal(val_mat, wordform_distance_np)

False

In [128]:
diff_mask = val_mat != wordform_distance_np

In [132]:
diff_mask.nonzero()[0].shape

(20502406,)

In [130]:
diff_mask.nonzero()

(array([   1,    2,    2, ..., 6403, 6403, 6403]),
 array([   0,    0,    1, ..., 6400, 6401, 6402]))

In [136]:
my_i, my_j = 2,1
val_mat[my_i, my_j]
wordform_distance_np[my_i, my_j]
lev_dist((Ws_tt[my_i], Ws_tt[my_j]))

5

0

('⋊.aɪ.d.i.ʌ.l.ɪ.s.t.ɪ.k.⋉.⋉', '⋊.aɪ.d.i.ʌ.l.i.⋉.⋉', 5)

In [120]:
wordform_distance_np.shape

(6404, 6404)

In [124]:
%psource lev_dist

In [125]:
for i,j in random_pairs:
    if val_mat[i,j] != lev_dist((Ws_tt[i], Ws_tt[j]))[2]:
        print(f'({i},{j},{val_mat[i,j]}) vs. {lev_dist((Ws_tt[i], Ws_tt[j]))}')

In [127]:
for i,j in tqdm(product(np.arange(len(Ws_t)), np.arange(len(Ws_t))),
                total=(len(Ws_t) * len(Ws_t))):
    if val_mat[i,j] != lev_dist((Ws_tt[i], Ws_tt[j]))[2]:
        print(f'({i},{j},{val_mat[i,j]}) vs. {lev_dist((Ws_tt[i], Ws_tt[j]))}')

41011216it [01:53, 361922.35it/s]


In [124]:
np.put_along_axis(L_d_np, col_idx_mat, val_mat, axis=1)

IndexError: `indices` must be an integer array

In [None]:
# for update in tqdm(group_update_vecs):
#     row
#     np.put_along_axis(L_d_np, )

# Calculate weighted distances

# Export

In [None]:
# exportDict

In [None]:
# L_d_P_md = {'W':{'from fp':p,
#                      'changes':'sorted',
#                      'size':len(Ws_t)},
#                      'P':{'from_fp':p,
#                           'changes':'extracted from W, sorted',
#                           'size':len(Ps_t)}}
# exportMatrixMetadata(o + '_L_d_P' + '.npy' + '_metadata.json',
#                      o + '_L_d_P' + '.npy' + '_metadata.json',
#                      L_d_np_P,
#                      L_d_P_md,
#                      FIXME #'Step 4b',
#                      'Calculate Levenshtein distances and neighborhood density.ipynb',
#                     {'Storage':'file is MEMORY MAPPED.'})