In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Import-libraries-and-data" data-toc-modified-id="Import-libraries-and-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import libraries and data</a></span></li><li><span><a href="#Basic-representations" data-toc-modified-id="Basic-representations-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Basic representations</a></span></li><li><span><a href="#Edit-distance-calculation-&amp;-sandbox" data-toc-modified-id="Edit-distance-calculation-&amp;-sandbox-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Edit distance calculation &amp; sandbox</a></span></li><li><span><a href="#Actual-computation" data-toc-modified-id="Actual-computation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Actual computation</a></span></li><li><span><a href="#Convert-distances-to-a-dictionary" data-toc-modified-id="Convert-distances-to-a-dictionary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Convert distances to a dictionary</a></span></li><li><span><a href="#Convert-distances-to-a-numpy-matrix..." data-toc-modified-id="Convert-distances-to-a-numpy-matrix...-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Convert distances to a numpy matrix...</a></span></li><li><span><a href="#Calculate-weighted-distances" data-toc-modified-id="Calculate-weighted-distances-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Calculate weighted distances</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Motivation

This notebook calculates 
 - the set of all pairwise Levenshtein distances between pairs of segmental wordforms.
 - unigram probability weighted neighborhood distance.

# Import libraries and data

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
from probdist import *
from string_utils import *

In [5]:
from funcy import *

In [6]:
from tqdm import tqdm

from joblib import Parallel, delayed

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [7]:
import toolz

In [8]:
# import dask.multiprocessing
# dask.config.set(scheduler='processes')

In [9]:
# # import dask
# from dask.distributed import Client
# client = Client('172.21.47.67:8786')

In [10]:
import editdistance as lev

In [101]:
import numpy as np

In [None]:
import sparse

In [11]:
# Parameters

p = ''
p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json'
# p = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json'
# p = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json'

u = ''
u = 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'
# u = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'

o = ''
o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered'
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered'
# o = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim'
# o = 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_X0X1X2'

# g = ''
# # g = 'False'

In [12]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        1.1G        185G        3.5M        1.9G        186G
Swap:          2.0G          0B        2.0G


In [13]:
if 'pW_V' in p:
    pW_V = condDistsAsProbDists(importProbDist(p))
elif 'pX0X1X2' in p:
    pW = ProbDist(importProbDist(p))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

In [14]:
if 'pV' in u:
    pV = ProbDist(importProbDist(u))
elif 'pX0X1X2' in u:
    pV = pW
else:
    raise Exception(f"Unknown type of 'u' parameter = {u}")

In [15]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        1.1G        185G        3.5M        1.9G        186G
Swap:          2.0G          0B        2.0G


In [16]:
testing = True
benchmark = True

In [17]:
my_dtype = np.int8

# Basic representations

In [18]:
if 'pW_V' in p:
    # Vs = set(pW_V.keys())
    Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                         pW_V).values())
elif 'pX0X1X2' in p:
    Ws = set(conditions(pW))
else:
    raise Exception(f"Unknown type of 'p' parameter = {p}")

# len(Vs)
len(Ws)

6404

In [19]:
Ws_t = tuple(sorted(list(Ws)))

In [20]:
# #≈200s on CMU on solomonoff
# Ps = union(list(par(delayed(getPrefixes)(w) for w in Ws)))
# # Ps = union(par(delayed(getPrefixes)(w) for w in Ws))
# # Ps = union([getPrefixes(w) for w in Ws])
# Ps_t = tuple(sorted(list(Ps)))
# len(Ps_t)

In [21]:
Ws_tt = lmap(ds2t, Ws_t)
# Ps_tt = lmap(ds2t, Ps_t)

Dask data structures:

In [22]:
# import dask.bag as db

In [23]:
# Ws_ttb = db.from_sequence(Ws_tt)

In [24]:
# Ps_ttb = db.from_sequence(Ps_tt)

In [None]:
if 'pW_V' in p:
#     Vs = set(pV.keys())
    pW = MarginalProbDist(pW_V, pV)

In [None]:
pW_np = distToNP(pW)
pW_np.shape

# Edit distance calculation & sandbox

In [88]:
def lev_helper_wrapper(uv_idx_pair, prefixes=True, distributed=False):
    return lev_helper(uv_idx_pair[0], uv_idx_pair[1], prefixes=prefixes, distributed=distributed)

def lev_helper(u_idx, v_idx, prefixes=True, distributed=False):
    if prefixes:
        u = Ps_tt[u_idx]
        v = Ps_tt[v_idx]
    else:
        u = Ws_tt[u_idx]
        v = Ws_tt[v_idx]
    return (u_idx, v_idx, lev.eval(u,v))

def lev_dist(uv_t_pair, final_func = None):
    if final_func is None:
        return (t2ds(uv_t_pair[0]), t2ds(uv_t_pair[1]), lev.eval(uv_t_pair[0], uv_t_pair[1]))
    else:
        result = (t2ds(uv_t_pair[0]), t2ds(uv_t_pair[1]), lev.eval(uv_t_pair[0], uv_t_pair[1]))
        return final_func(result)
#         return (final_func(result[0]), final_func(result[1]), result[2])

In [89]:
# lev_helper(3, 20, True)
# Ps_t[3]
# Ps_t[20]
# lev_dist((Ps_tt[3], Ps_tt[20]))

lev_helper(3, 20, False)
Ws_t[3]
Ws_t[20]

lev_dist((Ws_tt[3], Ws_tt[20]))

(3, 20, 4)

'⋊.aɪ.d.i.ʌ.z.⋉.⋉'

'⋊.aɪ.v.⋉.⋉'

('⋊.aɪ.d.i.ʌ.z.⋉.⋉', '⋊.aɪ.v.⋉.⋉', 4)

In [27]:
# L_d_np_P_updates = par(delayed(lev_helper)(u_idx, v_idx) for u_idx, v_idx in list(product(np.arange(len(Ps_t)), np.arange(len(Ps_t)))))

In [28]:
# my_prefixes = Ps_tt[:1000]
# my_prefix_pairs = list(product(my_prefixes, my_prefixes))
# "{:,}".format(len(my_prefix_pairs))

In [29]:
# Client?

In [30]:
# client.submit?

In [31]:
# foo = client.submit(lev_dist, my_prefix_pairs[23])

In [32]:
# foo.result()

In [33]:
# test_results_futures = client.map(lev_dist, my_prefix_pairs)

In [34]:
# test_results_futures[-1]

In [35]:
# test_results_gathered = client.gather(test_results_futures)

In [36]:
# joblib_only = par(delayed(lev_dist)(pair) for pair in my_prefix_pairs)

In [37]:
# my_prefixes_b = db.from_sequence(my_prefixes)
# my_prefix_pairs = db.from_sequence(my_prefix_pairs)

In [38]:
# dask_bag_mp = my_prefix_pairs.map(lev_dist)

In [39]:
# dask_bag_mpl = list(dask_bag_mp)

In [40]:
# dask_bag_mpl[0]

In [41]:
# test_results_gathered

In [42]:
# my_prefix_range = np.arange(100)

In [43]:
# my_prefix_range_pairs = list(product(my_prefix_range, my_prefix_range))

In [44]:
# test_results = client.map(lev_helper_wrapper, my_prefix_range_pairs)

In [45]:
# client.gather(test_results)

# Actual computation

In [46]:
from itertools import combinations

In [47]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        1.1G        185G        3.5M        1.9G        186G
Swap:          2.0G          0B        2.0G


In [48]:
# prefix_pairs = list(combinations(Ps_tt, 2)) #slower
# prefix_pairs = combinations(Ps_tt, 2) #faster

In [49]:
wordform_pairs = combinations(Ws_tt, 2)

In [50]:
# prefix_pairs = product(Ps_tt, Ps_tt)
# word_pairs = product(Ws_tt, Ws_tt)

In [51]:
from scipy.special import binom

In [52]:
len(Ws_t)

6404

In [53]:
"{:.2E}".format(binom(len(Ws_t), 2))
"{:,}".format(binom(len(Ws_t), 2))
# "{:.2E}".format(binom(len(Ps_t), 2))
# "{:,}".format(binom(len(Ps_t), 2))

'2.05E+07'

'20,502,406.0'

In [54]:
"{:.2E}".format(len(Ws_t) ** 2)
"{:,}".format(len(Ws_t) ** 2)
# "{:.2E}".format(len(Ps_t) ** 2)
# "{:,}".format(len(Ps_t) ** 2)

'4.10E+07'

'41,011,216'

In [55]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        1.1G        185G        3.5M        1.9G        186G
Swap:          2.0G          0B        2.0G


In [56]:
wordform_distances = par(delayed(lev_dist)(pair) for pair in wordform_pairs)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0073s.) Setting batch_size=54.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0549s.) Setting batch_size=392.
[Parallel(n_jobs=-1)]: Done 982 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1900 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2926 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 6656 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 14888 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 23120 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 32136 tasks      | elapsed

[Parallel(n_jobs=-1)]: Done 3447632 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done 3500552 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 3553472 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 3607176 tasks      | elapsed:   48.9s
[Parallel(n_jobs=-1)]: Done 3660880 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done 3715368 tasks      | elapsed:   50.3s
[Parallel(n_jobs=-1)]: Done 3769856 tasks      | elapsed:   51.1s
[Parallel(n_jobs=-1)]: Done 3825128 tasks      | elapsed:   51.9s
[Parallel(n_jobs=-1)]: Done 3880400 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done 3936456 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done 3992512 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done 4049352 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 4106192 tasks      | elapsed:   56.2s
[Parallel(n_jobs=-1)]: Done 4163816 tasks      | elapsed:   57.1s
[Parallel(n_jobs=-1)]: Done 4221440 tasks      | elapsed:   58.0s
[Parallel(

[Parallel(n_jobs=-1)]: Done 12974800 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 13076328 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 13177856 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 13280168 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 13382480 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 13485576 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 13588672 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 13692552 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 13796432 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 13901096 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 14005760 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 14111208 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 14216656 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 14322888 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 14429120 tasks      | elapsed:  4.

In [57]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        8.1G        178G        3.5M        1.9G        179G
Swap:          2.0G          0B        2.0G


In [None]:
# prefix_distances = results

In [None]:
# def is_wordform_distance(prefix_distance_tuple):
#     p0 = prefix_distance_tuple[0]
#     p1 = prefix_distance_tuple[1]
#     return p0 in Ws_t and p1 in Ws_t

In [None]:
# wordform_distances = {d for d in tqdm(prefix_distances) if is_wordform_distance(d)}
# wordform_distances = lfilter(is_wordform_distance, 
#                              prefix_distances)

In [None]:
# !free -h

In [59]:
from toolz.sandbox.parallel import fold as parfold

In [60]:
def parfilter(pred, seq, combine):
    
    def keep_only_matches(acc, nxt):
        if pred(nxt):
            return combine(acc, nxt)
        return acc
    
    return parfold(keep_only_matches, seq)

In [None]:
# type(prefix_distances)

In [61]:
type(wordform_distances)

list

In [None]:
# prefix_distances_s = set(prefix_distances)

In [None]:
# wordform_distancez = parfilter(is_wordform_distance, prefix_distances_s, combine=func_partial(do, set.add))

In [None]:
# def keep_only_wordform_distances(acc, nxt, combine):
#     if is_wordform_distance(nxt):
#         return combine(acc, nxt)
#     return acc

In [None]:
# wordform_distancez2 = parfold(is_wordform_distance, prefix_distances_s, func_partial(do, set.add))

# Convert distances to a dictionary

In [62]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        8.1G        178G        3.5M        1.9G        179G
Swap:          2.0G          0B        2.0G


In [63]:
def to_dict_rep(distance_triple):
    return ((distance_triple[0], distance_triple[1]), distance_triple[2])

In [64]:
# prefix_distance_dict = dict(par(delayed(to_dict_rep)(d) for d in prefix_distances))

In [65]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G        8.1G        178G        3.5M        1.9G        179G
Swap:          2.0G          0B        2.0G


In [66]:
wordform_distance_dict = dict(par(delayed(to_dict_rep)(d) for d in wordform_distances))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0075s.) Setting batch_size=52.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0584s.) Setting batch_size=356.
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1832 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2820 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 6240 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 13716 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 21192 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 29380 tasks      | elapsed

[Parallel(n_jobs=-1)]: Done 3485586 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3533333 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3581080 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3629545 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3678010 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3727193 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3776376 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3826277 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3876178 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3926797 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3977416 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4028753 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4080090 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4132145 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 4184200 tasks      | elapsed:  2.7min
[Parallel(

[Parallel(n_jobs=-1)]: Done 11427745 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 11511077 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 11594409 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 11678405 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 11762401 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 11847061 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 11931721 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 12017045 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0839s.) Setting batch_size=166.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1628s.) Setting batch_size=406.
[Parallel(n_jobs=-1)]: Done 12093903 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0847s.) Setting batch_size=203.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1859s.) Setting batch_size=436.
[Parallel(n_jobs=-1)]: Done 12183845 tasks      | elapsed

[Parallel(n_jobs=-1)]: Done 18498850 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1975s.) Setting batch_size=470.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.6511s.) Setting batch_size=235.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1993s.) Setting batch_size=470.
[Parallel(n_jobs=-1)]: Done 18617377 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1098s.) Setting batch_size=235.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1612s.) Setting batch_size=582.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1179s.) Setting batch_size=291.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1985s.) Setting batch_size=586.
[Parallel(n_jobs=-1)]: Done 18742409 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.6985s.) Setting batch_size=293.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1980s.) Setting batch_size=590.
[Parallel(n_jobs=-1)]: Batch compu

In [None]:
#fixme update to reflect symmetry...

In [67]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         18G        168G        3.5M        1.9G        169G
Swap:          2.0G          0B        2.0G


In [90]:
# wordform_distance_dict = dict(par(delayed(lev_dist)(pair, to_dict_rep) for pair in combinations(Ws_tt, 2)))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0085s.) Setting batch_size=46.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0675s.) Setting batch_size=272.
[Parallel(n_jobs=-1)]: Done 846 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1628 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2502 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1995s.) Setting batch_size=544.
[Parallel(n_jobs=-1)]: Done 5184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 10896 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 16608

[Parallel(n_jobs=-1)]: Done 2465618 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2500543 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2536018 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2571493 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2607518 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2643543 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2680118 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2716693 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1996s.) Setting batch_size=550.
[Parallel(n_jobs=-1)]: Done 2753818 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0121s.) Setting batch_size=275.
[Parallel(n_jobs=-1)]: Done 2827243 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2881968 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2919643 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Batch computati

[Parallel(n_jobs=-1)]: Done 8298325 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 8366240 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 8434155 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 8502648 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 8571141 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 8640212 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 8709283 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 8778932 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 8848581 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 8918808 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 8989035 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 9059840 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 9130645 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1984s.) Setting batch_size=582.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.4431

[Parallel(n_jobs=-1)]: Batch computation too slow (2.8743s.) Setting batch_size=236.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1772s.) Setting batch_size=532.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1743s.) Setting batch_size=1220.
[Parallel(n_jobs=-1)]: Batch computation too slow (4.3766s.) Setting batch_size=610.
[Parallel(n_jobs=-1)]: Done 13792384 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.9861s.) Setting batch_size=305.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0968s.) Setting batch_size=152.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1438s.) Setting batch_size=422.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1172s.) Setting batch_size=211.
[Parallel(n_jobs=-1)]: Done 13910526 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1568s.) Setting batch_size=538.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.9537s.) Setting batch_size=269.
[Parallel(n_job

[Parallel(n_jobs=-1)]: Batch computation too fast (0.1971s.) Setting batch_size=452.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.1184s.) Setting batch_size=226.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1985s.) Setting batch_size=454.
[Parallel(n_jobs=-1)]: Done 16914411 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.7969s.) Setting batch_size=227.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1739s.) Setting batch_size=522.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1397s.) Setting batch_size=1494.
[Parallel(n_jobs=-1)]: Batch computation too slow (5.7429s.) Setting batch_size=747.
[Parallel(n_jobs=-1)]: Batch computation too slow (3.1672s.) Setting batch_size=373.
[Parallel(n_jobs=-1)]: Done 17134331 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0694s.) Setting batch_size=186.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1780s.) Setting batch_size=416.
[Parallel(n_job

In [91]:
# !free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         25G        161G        3.5M        1.9G        162G
Swap:          2.0G          0B        2.0G


# Convert distances to a numpy matrix...

In [92]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         25G        161G        3.5M        1.9G        162G
Swap:          2.0G          0B        2.0G


In [94]:
def to_dok_rep(distance_triple):
    return ((Ws_t.index(distance_triple[0]), Ws_t.index(distance_triple[1])), distance_triple[2])

In [95]:
wordform_distance_dok_rep = dict(par(delayed(to_dok_rep)(d) for d in wordform_distances))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0107s.) Setting batch_size=36.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0577s.) Setting batch_size=248.
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1288 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1972 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1300s.) Setting batch_size=762.
[Parallel(n_jobs=-1)]: Done 4352 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 9560 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 14768 

[Parallel(n_jobs=-1)]: Done 3271599 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 3322272 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 3372945 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3424380 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3475815 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3528012 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3580209 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3633168 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3686127 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3739848 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3793569 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3848052 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3902535 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3957780 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4013025 tasks      | elapsed:  2.6min
[Parallel(

[Parallel(n_jobs=-1)]: Done 12551937 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 12636489 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 12721041 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 12806217 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 12891393 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 12977193 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 13062993 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 13149417 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 13235841 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0078s.) Setting batch_size=156.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1961s.) Setting batch_size=318.
[Parallel(n_jobs=-1)]: Done 13310019 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 13398741 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 13488099 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Don

[Parallel(n_jobs=-1)]: Batch computation too fast (0.1235s.) Setting batch_size=416.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0061s.) Setting batch_size=208.
[Parallel(n_jobs=-1)]: Done 18900576 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1962s.) Setting batch_size=422.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0017s.) Setting batch_size=211.
[Parallel(n_jobs=-1)]: Done 18999574 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1971s.) Setting batch_size=428.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0081s.) Setting batch_size=214.
[Parallel(n_jobs=-1)]: Done 19085612 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1984s.) Setting batch_size=430.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0904s.) Setting batch_size=215.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.2051s.) Setting batch_size=107.
[Parallel(n_jobs=-1)]: Batch compu

In [96]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         25G        161G        3.5M        2.0G        162G
Swap:          2.0G          0B        2.0G


In [97]:
# wordform_distance_dok_rep2 = dict(par(delayed(lev_dist)(pair, to_dok_rep) for pair in combinations(Ws_tt, 2)))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0085s.) Setting batch_size=46.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0654s.) Setting batch_size=280.
[Parallel(n_jobs=-1)]: Done 846 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1628 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2502 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1832s.) Setting batch_size=610.
[Parallel(n_jobs=-1)]: Done 5248 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 11128 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 17008

[Parallel(n_jobs=-1)]: Done 2623188 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2663753 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2704318 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2745493 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2786668 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2828453 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2870238 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2912633 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2955028 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2998033 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3041038 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3084653 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3128268 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3172493 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3216718 tasks      | elapsed:  2.4min
[Parallel(

[Parallel(n_jobs=-1)]: Batch computation too slow (2.0256s.) Setting batch_size=225.
[Parallel(n_jobs=-1)]: Done 8239553 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1960s.) Setting batch_size=458.
[Parallel(n_jobs=-1)]: Done 8304930 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0890s.) Setting batch_size=229.
[Parallel(n_jobs=-1)]: Done 8379355 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 8433170 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 8487443 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1945s.) Setting batch_size=470.
[Parallel(n_jobs=-1)]: Done 8548223 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0335s.) Setting batch_size=235.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1797s.) Setting batch_size=522.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1154s.) Setting batch_size=261.
[Parallel

[Parallel(n_jobs=-1)]: Batch computation too slow (2.1662s.) Setting batch_size=233.
[Parallel(n_jobs=-1)]: Done 14433994 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1915s.) Setting batch_size=486.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.8930s.) Setting batch_size=243.
[Parallel(n_jobs=-1)]: Done 14525497 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 14602528 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 14680045 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1938s.) Setting batch_size=500.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.9065s.) Setting batch_size=250.
[Parallel(n_jobs=-1)]: Done 14774906 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 14855156 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1987s.) Setting batch_size=502.
[Parallel(n_jobs=-1)]: Done 14935406 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]

[Parallel(n_jobs=-1)]: Done 17503242 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0011s.) Setting batch_size=163.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1959s.) Setting batch_size=332.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1584s.) Setting batch_size=166.
[Parallel(n_jobs=-1)]: Done 17588608 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1893s.) Setting batch_size=350.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0212s.) Setting batch_size=175.
[Parallel(n_jobs=-1)]: Done 17675206 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1980s.) Setting batch_size=352.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1213s.) Setting batch_size=176.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1694s.) Setting batch_size=414.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1664s.) Setting batch_size=207.
[Parallel(n_jobs=-1)]: Batch compu

[Parallel(n_jobs=-1)]: Batch computation too fast (0.1997s.) Setting batch_size=282.
[Parallel(n_jobs=-1)]: Done 20066089 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0015s.) Setting batch_size=141.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1991s.) Setting batch_size=282.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1016s.) Setting batch_size=141.
[Parallel(n_jobs=-1)]: Done 20157316 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0346s.) Setting batch_size=70.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0825s.) Setting batch_size=338.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0079s.) Setting batch_size=169.
[Parallel(n_jobs=-1)]: Done 20253484 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1953s.) Setting batch_size=346.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.3093s.) Setting batch_size=173.
[Parallel(n_jobs=-1)]: Done 2033300

In [98]:
# !free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         26G        160G        3.4M        2.0G        161G
Swap:          2.0G          0B        2.0G


In [102]:
wordform_distance_DOK = sparse.DOK((len(Ws_t), len(Ws_t)), wordform_distance_dok_rep)

In [103]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         26G        160G        3.4M        2.0G        160G
Swap:          2.0G          0B        2.0G


In [104]:
# wordform_distance_DOK2 = sparse.DOK((len(Ws_t), len(Ws_t)), wordform_distance_dok_rep2)

In [105]:
# !free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         29G        157G        3.4M        2.0G        158G
Swap:          2.0G          0B        2.0G


In [106]:
wordform_distance_np = wordform_distance_DOK.todense()
wordform_distance_np.nbytes / 1e9

0.328089728

In [107]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         29G        157G        3.4M        2.0G        158G
Swap:          2.0G          0B        2.0G


In [None]:
#fixme update to reflect symmetry...

In [None]:
# L_d_np = np.zeros((len(Ws_t), len(Ws_t)), dtype=np.int64)

In [108]:
def to_idx_rep(distance_triple):
    t = distance_triple
    return (Ws_t.index(t[0]), Ws_t.index(t[1]), t[2])

In [109]:
wordform_distance_idx_rep = par(delayed(to_idx_rep)(d) for d in wordform_distances)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0085s.) Setting batch_size=46.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0451s.) Setting batch_size=408.
[Parallel(n_jobs=-1)]: Done 846 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1628 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 2502 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1628s.) Setting batch_size=1002.
[Parallel(n_jobs=-1)]: Done 6272 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 14840 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 2340

[Parallel(n_jobs=-1)]: Done 4236518 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 4302149 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 4368782 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 4435415 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 4503050 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 4570685 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 4639322 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 4707959 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 4777598 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 4847237 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 4917878 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 4988519 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 5060162 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 5131805 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 5204450 tasks      | elapsed:  2.6min
[Parallel(

[Parallel(n_jobs=-1)]: Done 15497180 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 15629206 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 15762284 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0251s.) Setting batch_size=263.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1976s.) Setting batch_size=532.
[Parallel(n_jobs=-1)]: Done 15878004 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 16002366 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 16138026 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 16274750 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 16411474 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 16549262 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0043s.) Setting batch_size=266.
[Parallel(n_jobs=-1)]: Done 16646618 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1955s.) Setting batch_

In [110]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         32G        154G        3.5M        2.0G        155G
Swap:          2.0G          0B        2.0G


In [133]:
def mirror(triple):
    t = triple
    return (t[1], t[0], t[2])

In [None]:
wordform_distance_idx_rep_mirror = par(delayed())

In [None]:
# wordform_distance_idx_rep = sorted(wordform_distance_idx_rep, key=lambda triple: triple[0])

In [111]:
wordform_distance_idx_rep_grouped = group_by(lambda triple: triple[0], wordform_distance_idx_rep)

In [112]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         32G        154G        3.5M        2.0G        155G
Swap:          2.0G          0B        2.0G


In [113]:
def group_to_update_vecs(groupped_idx_rep_triples):
    row_idx = groupped_idx_rep_triples[0][0]
    col_idxs = np.array(lmap(second, groupped_idx_rep_triples))
    vals = np.array(lmap(lambda triple: triple[2], groupped_idx_rep_triples))
    return (row_idx, col_idxs, vals)

In [114]:
group_update_vecs = par(delayed(group_to_update_vecs)(group) for key, group in wordform_distance_idx_rep_grouped.items())

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0499s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 1160 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 1344 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]

In [117]:
group_update_vecs = sorted(group_update_vecs, key=lambda triple: triple[0])

In [115]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           188G         32G        154G        3.5M        2.0G        154G
Swap:          2.0G          0B        2.0G


In [116]:
L_d_np = -99 * np.ones((len(Ws_t), len(Ws_t)), dtype=np.int64)

In [118]:
updated_row_idxs = set(lmap(lambda triple: first, 
                            group_update_vecs))

In [119]:
missing_row_idxs = [i for i in range(len(Ws_t)) if i not in updated_row_idxs]
len(missing_row_idxs)
missing_row_idxs

1

[6403]

In [125]:
Ws_t[6403]

'⋊.θ.⋉.⋉'

In [120]:
col_idx_mat = np.array(lmap(second, group_update_vecs))

In [121]:
val_mat = np.array(lmap(lambda triple: triple[2],
                        group_update_vecs))

In [122]:
col_idx_mat = np.array(col_idx_mat)
val_mat = np.array(val_mat)

In [123]:
col_idx_mat.shape
val_mat.shape

(6403,)

(6403,)

In [131]:
col_idx_mat[:10]
set(lmap(lambda a: a.shape, col_idx_mat))

array([array([   1,    2,    3, ..., 6401, 6402, 6403]),
       array([   2,    3,    4, ..., 6401, 6402, 6403]),
       array([   3,    4,    5, ..., 6401, 6402, 6403]),
       array([   4,    5,    6, ..., 6401, 6402, 6403]),
       array([   5,    6,    7, ..., 6401, 6402, 6403]),
       array([   6,    7,    8, ..., 6401, 6402, 6403]),
       array([   7,    8,    9, ..., 6401, 6402, 6403]),
       array([   8,    9,   10, ..., 6401, 6402, 6403]),
       array([   9,   10,   11, ..., 6401, 6402, 6403]),
       array([  10,   11,   12, ..., 6401, 6402, 6403])], dtype=object)

{(4975,),
 (2554,),
 (1828,),
 (779,),
 (4082,),
 (5454,),
 (3033,),
 (4405,),
 (6403,),
 (1514,),
 (465,),
 (5933,),
 (3512,),
 (4884,),
 (2479,),
 (1993,),
 (944,),
 (4007,),
 (5619,),
 (2958,),
 (4570,),
 (1439,),
 (390,),
 (6098,),
 (2669,),
 (5049,),
 (1150,),
 (101,),
 (3148,),
 (5528,),
 (2099,),
 (4495,),
 (1629,),
 (580,),
 (3627,),
 (6023,),
 (2578,),
 (4206,),
 (6204,),
 (1059,),
 (10,),
 (5734,),
 (3313,),
 (4685,),
 (2264,),
 (1538,),
 (745,),
 (3792,),
 (5164,),
 (2759,),
 (4115,),
 (6369,),
 (1224,),
 (191,),
 (5643,),
 (3238,),
 (4850,),
 (2189,),
 (1719,),
 (670,),
 (3717,),
 (5329,),
 (2924,),
 (4280,),
 (6294,),
 (1405,),
 (356,),
 (5808,),
 (3403,),
 (4775,),
 (2354,),
 (1884,),
 (835,),
 (3882,),
 (5254,),
 (2833,),
 (4461,),
 (1314,),
 (265,),
 (5989,),
 (3568,),
 (4940,),
 (2535,),
 (1793,),
 (1000,),
 (4063,),
 (5419,),
 (3014,),
 (4370,),
 (1495,),
 (446,),
 (5898,),
 (3493,),
 (5105,),
 (2444,),
 (1974,),
 (925,),
 (3972,),
 (5584,),
 (2155,),
 (4551,),
 (636,

In [130]:
# np.vstack(col_idx_mat)#[:10]

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [124]:
np.put_along_axis(L_d_np, col_idx_mat, val_mat, axis=1)

IndexError: `indices` must be an integer array

In [None]:
# for update in tqdm(group_update_vecs):
#     row
#     np.put_along_axis(L_d_np, )

# Calculate weighted distances

# Export

In [None]:
# exportDict

In [None]:
# L_d_P_md = {'W':{'from fp':p,
#                      'changes':'sorted',
#                      'size':len(Ws_t)},
#                      'P':{'from_fp':p,
#                           'changes':'extracted from W, sorted',
#                           'size':len(Ps_t)}}
# exportMatrixMetadata(o + '_L_d_P' + '.npy' + '_metadata.json',
#                      o + '_L_d_P' + '.npy' + '_metadata.json',
#                      L_d_np_P,
#                      L_d_P_md,
#                      FIXME #'Step 4b',
#                      'Calculate Levenshtein distances and neighborhood density.ipynb',
#                     {'Storage':'file is MEMORY MAPPED.'})