In [1]:
# Display the value of every top-level expression in a cell, not just the last one.
# Cells below rely on this to show several results at once (e.g. `len(Vs)` and `len(Ws)`).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Define-basic-structures" data-toc-modified-id="Define-basic-structures-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Define basic structures</a></span></li><li><span><a href="#Calculate-prefix-relation-of-$W$" data-toc-modified-id="Calculate-prefix-relation-of-$W$-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculate prefix relation of $W$</a></span><ul class="toc-item"><li><span><a href="#Export" data-toc-modified-id="Export-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Export</a></span></li></ul></li><li><span><a href="#Calculate-$k$-cousin-relation-of-$W$" data-toc-modified-id="Calculate-$k$-cousin-relation-of-$W$-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Calculate $k$-cousin relation of $W$</a></span></li><li><span><a href="#Calculate-$k$-spheres-of-$W$" data-toc-modified-id="Calculate-$k$-spheres-of-$W$-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Calculate $k$-spheres of $W$</a></span></li></ul></div>

# Overview

Given 
 - a filepath $p$ to a conditional distribution on segmental wordforms given an orthographic wordform $p(W|V)$
 - an output directory $o$
 
this notebook calculates and writes to file 
 - what the prefix relation of $W$ is
 - what the set of (unique, alphabetically sorted) prefixes of $W$ is
 - what the prefixes of each segmental wordform in $W$ are
 - what the complete wordforms associated with each prefix $p$ are
 - what the $k$-cousins of each prefix are
 - what the $k$-spheres of each wordform are.

## Requirements

 - `tqdm`
 - `joblib`
 - `sparse`
 - `numpy`

## Usage

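**FIXME**: full usage instructions have not been written yet. In short: set the input filepath `p` and the output directory `o` in the *Parameters* section below, then run all cells from top to bottom.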

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
# Parameters
#
# p: path to the JSON file holding the conditional distribution p(W|V).
# o: output directory; every export below is written inside it.
# Each name is first set to an empty placeholder and then immediately
# overwritten with the concrete value used for this run.

p = ''
p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

o = ''
o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed'

In [5]:
# Prepare the output directory before anything is written into it.
# NOTE(review): ensure_dir_exists is not defined in this notebook; presumably it
# comes from the `boilerplate` star-import -- confirm.
ensure_dir_exists(o)

# Imports / load data

In [6]:
from probdist import *
from string_utils import *

In [7]:
from tqdm import tqdm

In [8]:
from joblib import Parallel, delayed

# Settings forwarded to joblib.Parallel by `par` below.
J = -1  # n_jobs; -1 asks joblib to use all available CPUs
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10  # verbosity of joblib's progress messages
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    """Return `x` unchanged."""
    return x

def par(gen_expr):
    """Run the delayed calls produced by `gen_expr` through joblib and return its results.

    The worker count, backend, verbosity and process/thread preference are read
    from the module-level settings J, BACKEND, V and PREFER.
    """
    runner = Parallel(n_jobs=J,
                      backend=BACKEND,
                      verbose=V,
                      prefer=PREFER)
    results = runner(gen_expr)
    return results

In [9]:
import sparse

In [10]:
# Load the distribution stored at `p`. Later cells use pW_V as a mapping whose keys
# are the orthographic wordforms and whose values are passed to `conditions`.
pW_V = condDistsAsProbDists(importProbDist(p))

# Define basic structures

In [11]:
# Vs: the keys of pW_V (orthographic wordforms).
Vs = set(pW_V.keys())
# Ws: `union` over the per-key sets built from `conditions(dist)`.
# NOTE(review): presumably this is the set of all segmental wordforms -- confirm
# against the definitions of `union`, `mapValues` and `conditions`.
Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                     pW_V).values())
len(Vs)
len(Ws)

6574

6403

In [12]:
# For every key v of pW_V: the set built from `conditions` applied to its value.
v_to_Ws = mapValues(lambda dist: set(conditions(dist)),
                    pW_V)

# The same association as a set of (v, w) pairs.
V_W_relation = {(v, w)
                for v in v_to_Ws
                for w in v_to_Ws[v]}


def _invert_relation(pairs, lefts, rights):
    """Map every element of `rights` to the set of `lefts` paired with it in `pairs`.

    Pairs whose left element is not in `lefts`, or whose right element is not in
    `rights`, are ignored. Elements of `rights` that occur in no pair map to an
    empty set.
    """
    inverse = {right: set() for right in rights}
    for left, right in pairs:
        if left in lefts and right in inverse:
            inverse[right].add(left)
    return inverse


# Inverse mapping w -> {v}. Built in one pass over the relation instead of testing
# every (v, w) combination of Vs x Ws; the resulting dict is the same.
w_to_Vs = _invert_relation(V_W_relation, Vs, Ws)

# Calculate prefix relation of $W$

In [13]:
# All (wordform, prefix) pairs: each w in Ws is paired with every p that
# getPrefixes(w) yields. tqdm shows progress over Ws.
prefix_relation = set(union({(w,p) for p in getPrefixes(w)} for w in tqdm(Ws)))
len(prefix_relation)

100%|██████████| 6403/6403 [00:03<00:00, 1909.85it/s] 


49429

In [14]:
list(prefix_relation)[:5]

[('⋊.b.ɑ.n.f.aɪ.ɚ.z.⋉', '⋊'),
 ('⋊.h.oʊ.l.d.⋉', '⋊.h'),
 ('⋊.s.ɛ.n.s.ɪ.t.ɪ.v.⋉', '⋊.s'),
 ('⋊.ɹ.ʊ.ɪ.n.ɪ.ŋ.⋉', '⋊.ɹ'),
 ('⋊.f.oʊ.ɹ.m.æ.t.⋉', '⋊')]

In [15]:
# Every distinct prefix: the second component of each (wordform, prefix) pair.
Ps = {pair[1] for pair in prefix_relation}

In [16]:
# for export as a TSV
def pref_rel_pair_to_dict(pair):
    """Turn one (segmental wordform, prefix) pair into a row dict for the TSV export."""
    wordform, prefix = pair[0], pair[1]
    row = dict(Segmental_Wordform=wordform, Prefix=prefix)
    return row

In [17]:
def extract_prefix_function(Ws):
    """Return a dict mapping each wordform in `Ws` to `getPrefixes(wordform)`."""
    prefixes_by_wordform = {}
    for wordform in Ws:
        prefixes_by_wordform[wordform] = getPrefixes(wordform)
    return prefixes_by_wordform

In [18]:
def completeWordformsWithPrefix(p, Ws):
    """Return the set of wordforms w in `Ws` for which `hasAsPrefix(w, p)` is truthy."""
    return {w for w in Ws if hasAsPrefix(w, p)}

In [19]:
def extract_w_to_P(pref_rel):
    """Group a prefix relation by wordform.

    Parameters
    ----------
    pref_rel : iterable of (wordform, prefix) pairs

    Returns
    -------
    dict
        Maps each wordform occurring in `pref_rel` to the set of prefixes it is
        paired with. An empty relation gives an empty dict.
    """
    # One pass over the relation. The previous version rescanned all of
    # `pref_rel` once per wordform, which made it quadratic.
    w_to_P = {}
    for w, p in pref_rel:
        w_to_P.setdefault(w, set()).add(p)
    return w_to_P

def extract_p_to_W(pref_rel, fast=True):
    """Group a prefix relation by prefix.

    Parameters
    ----------
    pref_rel : iterable of (wordform, prefix) pairs
    fast : bool, optional
        Accepted for backward compatibility only; it has no effect on the result.

    Returns
    -------
    dict
        Maps each prefix occurring in `pref_rel` to the set of wordforms it is
        paired with. An empty relation gives an empty dict.
    """
    # One pass over the relation. The previous version rescanned all of
    # `pref_rel` once per prefix, which made it quadratic.
    p_to_W = {}
    for w, p in pref_rel:
        p_to_W.setdefault(p, set()).add(w)
    return p_to_W

In [20]:
#49.4s on wittgenstein (w/ heavy load of other stuff)
# w_to_P = extract_w_to_P(prefix_relation)

In [21]:
# wordform -> getPrefixes(wordform), for every wordform in Ws.
prefixesOf = extract_prefix_function(Ws)

In [22]:
#2.75m on wittgenstein (w/ heavy load of other stuff)
# completionsOf = extract_p_to_W(prefix_relation)

In [23]:
# p_to_W = extract_p_to_W(prefix_relation)

In [24]:
def foo(p):
    """Pair prefix `p` with the set of wordforms in the global `Ws` that have it as a prefix."""
    # NOTE(review): presumably a named top-level function (rather than a lambda) so
    # that the multiprocessing backend can pickle it -- confirm.
    return (p, completeWordformsWithPrefix(p, Ws))

#21s on wittgenstein (w/ heavy load of other stuff)
# prefix -> set of complete wordforms, computed in parallel with one task per prefix.
completionsOf = dict(par(delayed(foo)(p)
                         for p in Ps))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0185s.) Setting batch_size=20.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 744 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1124 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1924 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 2344 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 2804 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 3264 tasks      | elapsed:    1.8s
[Parallel(n_jobs

## Export

In [25]:
# Freeze a sorted order so that every wordform / prefix has a stable integer
# position; these positions are the row and column indices of the exported arrays.
Ws_t = tuple(sorted(Ws))
Ps_t = tuple(sorted(Ps))
num_wordforms, num_prefixes = len(Ws_t), len(Ps_t)

num_wordforms
num_prefixes

6403

21475

In [26]:
# prefixes as .txt
# LTR_basename: the file name of `p` up to (not including) '.pW_V.json'.
LTR_basename = path.basename(p).split('.pW_V.json')[0]

# The sorted prefixes go to '<o>/prefixes of <LTR_basename>.txt'.
ps_fp = path.join(o, f'prefixes of {LTR_basename}.txt')

exportSeqs(ps_fp, Ps_t)

In [27]:
# prefix relation as tsv
# One row dict per (wordform, prefix) pair, written with the two columns in the order given.
prefix_relation_fp = path.join(o, 'prefix_relation.tsv')
pr_dl = [pref_rel_pair_to_dict(pair) for pair in prefix_relation]

saveDictList_as_TSV(prefix_relation_fp, pr_dl, ('Segmental_Wordform', 'Prefix'))

In [28]:
# prefix relation as sparse array = .npz

# Position of every wordform / prefix in the sorted tuples, so each lookup is a
# dict access instead of a linear `tuple.index` scan. Ws_t and Ps_t are built
# from sets, so every element has exactly one position.
_w_index = {wordform_: i for i, wordform_ in enumerate(Ws_t)}
_p_index = {prefix_: j for j, prefix_ in enumerate(Ps_t)}

def w_p_pair_to_indices(w,p):
    """Return (row, column) = (position of `w` in Ws_t, position of `p` in Ps_t).

    Raises KeyError if `w` is not in Ws_t or `p` is not in Ps_t.
    """
    return _w_index[w], _p_index[p]

# Spot-check one arbitrary pair and its indices.
example_pair = list(prefix_relation)[2322]
example_pair
w_p_pair_to_indices(*example_pair)

# Dense 0/1 matrix: rows are wordforms, columns are prefixes.
prefix_relation_np = np.zeros(shape=(num_wordforms, num_prefixes), dtype="uint8")

# The loop variables are deliberately NOT named `w` / `p`: a top-level `for` loop
# rebinds globals, and `p` is the input filepath parameter of this notebook.
for wordform, prefix in prefix_relation:
    prefix_relation_np[w_p_pair_to_indices(wordform, prefix)] = 1

# Size of the dense array in GB.
prefix_relation_np.nbytes / 1e9

# Sparse copy: density and size in GB.
prefix_relation_sparse = sparse.COO.from_numpy(prefix_relation_np)
prefix_relation_sparse.density
prefix_relation_sparse.nbytes / 1e9

pr_npz_fp = path.join(o, 'prefix_relation')

sparse.save_npz(pr_npz_fp,prefix_relation_sparse)

('⋊.k.ɑ.n.t.æ.k.t.s.⋉', '⋊.k')

(2071, 5683)

0.137504425

0.0003594720678989058

0.000840293

In [29]:
# prefixesOf
# Written to '<o>/prefixesOf.json' after passing through castSetValuesToTuples.

prefixesOf_fp = path.join(o, 'prefixesOf.json')

exportDict(prefixesOf_fp, castSetValuesToTuples(prefixesOf))

In [30]:
# completionsOf
# Written to '<o>/completionsOf.json' after passing through castSetValuesToTuples.

completionsOf_fp = path.join(o, 'completionsOf.json')

exportDict(completionsOf_fp, castSetValuesToTuples(completionsOf))

# Calculate $k$-cousin relation of $W$

Let $s$ be a finite-length string over $\Sigma$ and let $L$ be a finite set of strings over $\Sigma$.

**k-sphere**: $s'$ is in the *exact* $k$-sphere of $s$ w.r.t. $L$ iff $s' \in L$ and the Hamming distance of $s'$ from $s$ is *exactly* $k$.

**k-cousin**: string $p$ is an *exact* $k$-cousin of segmental wordform $w$ w.r.t. $L$ iff
 - $w \in L$
 - $p \in \text{prefixes}(L)$
 - $\exists p' \in k\text{-sphere}(p) \cap \text{prefixes}(w)$

**NB** In **this section *only*** $k$-spheres are defined w.r.t. $L = $ the set of *all prefixes* of $W$.

In [31]:
# Ws_t = tuple(sorted(list(Ws)))
# Ps_t = tuple(sorted(list(Ps)))

def idx(s, seq_tuple):
    """Return the position of the first occurrence of `s` in `seq_tuple`.

    Delegates to `seq_tuple.index`, so for a tuple this is a linear scan and a
    ValueError is raised when `s` is absent.
    """
    return seq_tuple.index(s)

In [32]:
from itertools import chain

In [33]:
def concat(lst_a, lst_b):
    """Return `lst_a + lst_b`."""
    joined = lst_a + lst_b
    return joined

def concat2(lst_pair_a, lst_pair_b):
    """Concatenate two pairs of sequences component-wise and return the resulting pair."""
    first_a, second_a = lst_pair_a[0], lst_pair_a[1]
    first_b, second_b = lst_pair_b[0], lst_pair_b[1]
    return (concat(first_a, first_b), concat(second_a, second_b))

def to_coords(p, d):
    """Return (row indices, column indices) for the entries of `d[p]`.

    Every row index is the position of `p` in Ps_t; the column indices are the
    positions in Ws_t of the elements of `d[p]`.
    """
    cousins_of_p = d[p]
    row_index = idx(p, Ps_t)
    row_coords = [row_index] * len(cousins_of_p)
    col_coords = []
    for w in cousins_of_p:
        col_coords.append(idx(w, Ws_t))
    return (row_coords, col_coords)

def kCousinsDict_to_sparse_array(d):
    """Build a sparse uint8 matrix from a prefix -> cousins mapping.

    Rows follow Ps_t and columns follow Ws_t. For every prefix p in Ps_t and every
    element w of d[p], one entry of value 1 is emitted at
    (position of p in Ps_t, position of w in Ws_t).

    Parameters
    ----------
    d : mapping
        Must contain every prefix in Ps_t; each value is a collection of
        wordforms taken from Ws_t.

    Returns
    -------
    sparse.COO of shape (len(Ps_t), len(Ws_t)).

    Raises
    ------
    KeyError
        If a prefix of Ps_t is missing from `d`, or a listed wordform is not in Ws_t.
    """
    my_shape = (len(Ps_t), len(Ws_t))

    # First-occurrence position maps (the same answer `tuple.index` gives), built
    # once so that every lookup is O(1) instead of a linear scan per coordinate.
    row_of = {}
    for i, p in enumerate(Ps_t):
        row_of.setdefault(p, i)
    col_of = {}
    for j, w in enumerate(Ws_t):
        col_of.setdefault(w, j)

    row_coords = []
    col_coords = []
    for p in Ps_t:
        cousin_cols = [col_of[w] for w in d[p]]
        row_coords.extend([row_of[p]] * len(cousin_cols))
        col_coords.extend(cousin_cols)

    data = np.ones((len(col_coords),), dtype='uint8')
    return sparse.COO((row_coords, col_coords), data, my_shape)

In [34]:
def kCousins_calc(p, k):
    """Return `(p, cousins)`, where cousins is get_k_cousins(p, k, Ws, Ps, exactlyK=True).

    Ws and Ps are the module-level wordform and prefix sets.
    """
    cousins = get_k_cousins(p, k, Ws, Ps, exactlyK = True)
    return (p, cousins)

In [101]:
def kCousins_calc_idx(p, k):
    """Return `[row_indices, col_indices]` for the exact k-cousins of prefix `p`.

    The column indices are the positions in Ws_t of each wordform returned by
    get_k_cousins(p, k, Ws, Ps, exactlyK=True); the row indices repeat the
    position of `p` in Ps_t once per cousin.
    """
    cousin_cols = []
    for w in get_k_cousins(p, k, Ws, Ps, exactlyK = True):
        cousin_cols.append(idx(w, Ws_t))
    row = idx(p, Ps_t)
    return [[row] * len(cousin_cols), cousin_cols]

# Shape shared by every k-cousin matrix: one row per prefix, one column per wordform.
sparse_mat_coords_shape = tuple(len(seq) for seq in (Ps_t, Ws_t))

def combine_coords(coords):
    """Merge per-prefix `[row_indices, col_indices]` pairs into one sparse uint8 matrix.

    `coords` is a sequence of such pairs (as returned by kCousins_calc_idx). All row
    lists and all column lists are flattened, every coordinate gets the value 1, and
    the result is a sparse.COO of shape `sparse_mat_coords_shape`.
    """
    per_prefix_rows, per_prefix_cols = zip(*coords)
    all_rows = tuple(chain.from_iterable(per_prefix_rows))
    all_cols = tuple(chain.from_iterable(per_prefix_cols))
    ones = np.ones((len(all_cols),), dtype='uint8')
    return sparse.COO((all_rows, all_cols), ones, sparse_mat_coords_shape)

In [36]:
# Output paths '<o>/<k>cousinsOf.json' for k = 0..4; cousin_fps[k] is the path for k.
cousin_fps = [path.join(o, f'{k}cousinsOf.json') for k in range(5)]

In [37]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.3G         70G        1.7M         53G        123G
Swap:          2.0G        266M        1.7G


In [39]:
# k = 0: prefix -> get_k_cousins result, one parallel task per prefix in Ps.
zeroCousinsOf = dict( par(delayed(kCousins_calc)(p, 0) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1281s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [86]:
# k = 0: the same relation as index lists -- one [row_indices, col_indices] pair per prefix.
zeroCousinsOfIdx = par(delayed(kCousins_calc_idx)(p, 0) for p in Ps)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0937s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 436 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 520 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 612 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 704 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: 

In [40]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.3G         70G        1.7M         53G        123G
Swap:          2.0G        266M        1.7G


In [41]:
# Size of each prefix's collection of exact 0-cousins.
numZeroCousinsOf = {prefix: len(cousins) for prefix, cousins in zeroCousinsOf.items()}

In [42]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.3G         70G        1.7M         53G        123G
Swap:          2.0G        266M        1.7G


In [43]:
# Write the prefix -> 0-cousins mapping to cousin_fps[0] ('0cousinsOf.json' inside `o`).
exportDict(cousin_fps[0], castSetValuesToTuples(zeroCousinsOf))

In [None]:
# Assemble the per-prefix index lists into one sparse (prefix x wordform) matrix,
# display its shape, dtype, size in GB and density, then save it under the JSON
# export's path with the '.json' suffix stripped.
zeroCousins_sparse = combine_coords(zeroCousinsOfIdx)
zeroCousins_sparse.shape
zeroCousins_sparse.dtype
zeroCousins_sparse.nbytes / 1e9
zeroCousins_sparse.density
sparse.save_npz(cousin_fps[0].split('.json')[0], zeroCousins_sparse)

In [44]:
# zeroCousins_sparse = kCousinsDict_to_sparse_array(zeroCousinsOf)
# zeroCousins_sparse.shape
# zeroCousins_sparse.dtype
# zeroCousins_sparse.nbytes / 1e9
# zeroCousins_sparse.density
# sparse.save_npz(cousin_fps[0].split('.json')[0], zeroCousins_sparse)

(21475, 6403)

dtype('uint8')

0.000840293

0.0003594720678989058

In [110]:
# baz = np.equal(zeroCousins_sparse2, zeroCousins_sparse)

In [None]:
# for i,row in tqdm(enumerate(zeroCousins_sparse.todense())):
#     for j,cell in enumerate(row):
#         if zeroCousins_sparse[i,j] != zeroCousins_sparse2[i,j]:
#             print(f"{i},{j}")

1358it [09:14,  2.52it/s]

In [122]:
# quux = baz.todense() == False
# for i,row in tqdm(enumerate(quux)):
#     for j,cell in enumerate(row):
#         if cell:
#             print(f"{i},{j}")

21475it [00:08, 2491.85it/s]


In [114]:
# np.array_equal(zeroCousins_sparse2, zeroCousins_sparse)

False

In [45]:
del zeroCousinsOf

In [46]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.3G         70G        1.7M         53G        123G
Swap:          2.0G        266M        1.7G


In [48]:
# k = 1: prefix -> get_k_cousins result, one parallel task per prefix in Ps.
oneCousinsOf = dict( par(delayed(kCousins_calc)(p, 1) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1258s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [47]:
# k = 1: the same relation as index lists -- one [row_indices, col_indices] pair per prefix.
oneCousinsOfIdx = par(delayed(kCousins_calc_idx)(p, 1) for p in Ps)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1286s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [49]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.5G         70G        1.7M         53G        123G
Swap:          2.0G        266M        1.7G


In [50]:
# Size of each prefix's collection of exact 1-cousins.
numOneCousinsOf = {prefix: len(cousins) for prefix, cousins in oneCousinsOf.items()}

In [51]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.5G         70G        1.7M         53G        123G
Swap:          2.0G        266M        1.7G


In [52]:
# Write the prefix -> 1-cousins mapping to cousin_fps[1] ('1cousinsOf.json' inside `o`).
exportDict(cousin_fps[1], castSetValuesToTuples(oneCousinsOf))

In [None]:
# Assemble the per-prefix index lists into one sparse (prefix x wordform) matrix,
# display its shape, dtype, size in GB and density, then save it under the JSON
# export's path with the '.json' suffix stripped.
oneCousins_sparse = combine_coords(oneCousinsOfIdx)
oneCousins_sparse.shape
oneCousins_sparse.dtype
oneCousins_sparse.nbytes / 1e9
oneCousins_sparse.density
sparse.save_npz(cousin_fps[1].split('.json')[0], oneCousins_sparse)

In [53]:
# oneCousins_sparse = kCousinsDict_to_sparse_array(oneCousinsOf)
# oneCousins_sparse.shape
# oneCousins_sparse.dtype
# oneCousins_sparse.nbytes / 1e9
# oneCousins_sparse.density
# sparse.save_npz(cousin_fps[1].split('.json')[0], oneCousins_sparse)

(21475, 6403)

dtype('uint8')

0.010039078

0.004294654517481892

In [54]:
del oneCousinsOf

In [55]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.5G         70G        1.7M         53G        123G
Swap:          2.0G        266M        1.7G


In [57]:
# k = 2: prefix -> get_k_cousins result, one parallel task per prefix in Ps.
twoCousinsOf = dict( par(delayed(kCousins_calc)(p, 2) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1269s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [56]:
# k = 2: the same relation as index lists -- one [row_indices, col_indices] pair per prefix.
twoCousinsOfIdx = par(delayed(kCousins_calc_idx)(p, 2) for p in Ps)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1273s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.6min remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.6min finished


In [58]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        2.4G         69G        1.7M         53G        122G
Swap:          2.0G        266M        1.7G


In [59]:
# Size of each prefix's collection of exact 2-cousins.
numTwoCousinsOf = {prefix: len(cousins) for prefix, cousins in twoCousinsOf.items()}

In [60]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        2.4G         69G        1.7M         53G        122G
Swap:          2.0G        266M        1.7G


In [61]:
# Write the prefix -> 2-cousins mapping to cousin_fps[2] ('2cousinsOf.json' inside `o`).
exportDict(cousin_fps[2], castSetValuesToTuples(twoCousinsOf))

In [None]:
# Assemble the per-prefix index lists into one sparse (prefix x wordform) matrix,
# display its shape, dtype, size in GB and density, then save it under the JSON
# export's path with the '.json' suffix stripped.
twoCousins_sparse = combine_coords(twoCousinsOfIdx)
twoCousins_sparse.shape
twoCousins_sparse.dtype
twoCousins_sparse.nbytes / 1e9
twoCousins_sparse.density
sparse.save_npz(cousin_fps[2].split('.json')[0], twoCousins_sparse)

In [62]:
# twoCousins_sparse = kCousinsDict_to_sparse_array(twoCousinsOf)
# twoCousins_sparse.shape
# twoCousins_sparse.dtype
# twoCousins_sparse.nbytes / 1e9
# twoCousins_sparse.density
# sparse.save_npz(cousin_fps[2].split('.json')[0], twoCousins_sparse)

(21475, 6403)

dtype('uint8')

0.082934738

0.035478960040740505

In [63]:
del twoCousinsOf

In [64]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        2.1G         69G        1.7M         53G        122G
Swap:          2.0G        266M        1.7G


In [66]:
# k = 3: prefix -> get_k_cousins result, one parallel task per prefix in Ps.
threeCousinsOf = dict( par(delayed(kCousins_calc)(p, 3) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1015s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [65]:
# k = 3: the same relation as index lists -- one [row_indices, col_indices] pair per prefix.
threeCousinsOfIdx = par(delayed(kCousins_calc_idx)(p, 3) for p in Ps)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1170s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  99 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  2.0min remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  2.0min finished


In [67]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        5.2G         66G        1.7M         53G        119G
Swap:          2.0G        266M        1.7G


In [68]:
# Size of each prefix's collection of exact 3-cousins.
numThreeCousinsOf = {prefix: len(cousins) for prefix, cousins in threeCousinsOf.items()}

In [69]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        5.2G         66G        1.7M         53G        119G
Swap:          2.0G        266M        1.7G


In [70]:
# Write the prefix -> 3-cousins mapping to cousin_fps[3] ('3cousinsOf.json' inside `o`).
exportDict(cousin_fps[3], castSetValuesToTuples(threeCousinsOf))

In [None]:
# Assemble the per-prefix index lists into one sparse (prefix x wordform) matrix,
# display its shape, dtype, size in GB and density, then save it under the JSON
# export's path with the '.json' suffix stripped.
threeCousins_sparse = combine_coords(threeCousinsOfIdx)
threeCousins_sparse.shape
threeCousins_sparse.dtype
threeCousins_sparse.nbytes / 1e9
threeCousins_sparse.density
sparse.save_npz(cousin_fps[3].split('.json')[0], threeCousins_sparse)

In [71]:
# threeCousins_sparse = kCousinsDict_to_sparse_array(threeCousinsOf)
# threeCousins_sparse.shape
# threeCousins_sparse.dtype
# threeCousins_sparse.nbytes / 1e9
# threeCousins_sparse.density
# sparse.save_npz(cousin_fps[3].split('.json')[0], threeCousins_sparse)

(21475, 6403)

dtype('uint8')

0.296305478

0.12675762252742048

In [72]:
del threeCousinsOf

In [73]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        4.4G         67G        1.7M         53G        120G
Swap:          2.0G        266M        1.7G


In [75]:
# k = 4: prefix -> get_k_cousins result, one parallel task per prefix in Ps.
fourCousinsOf = dict( par(delayed(kCousins_calc)(p, 4) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1116s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [74]:
fourCousinsOfIdx = par(delayed(kCousins_calc_idx)(p, 4) for p in Ps)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1269s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  2.3min remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  2.3min finished


In [76]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        7.4G         64G        1.7M         53G        117G
Swap:          2.0G        266M        1.7G


In [77]:
numFourCousinsOf = {p:len(fourCousinsOf[p]) for p in fourCousinsOf}

In [78]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        7.4G         64G        1.7M         53G        117G
Swap:          2.0G        266M        1.7G


In [79]:
exportDict(cousin_fps[4], castSetValuesToTuples(fourCousinsOf))

In [None]:
fourCousins_sparse = combine_coords(fourCousinsOfIdx)
fourCousins_sparse.shape
fourCousins_sparse.dtype
fourCousins_sparse.nbytes / 1e9
fourCousins_sparse.density
sparse.save_npz(cousin_fps[4].split('.json')[0], fourCousins_sparse)

In [135]:
# fourCousins_sparse = kCousinsDict_to_sparse_array(fourCousinsOf)
# fourCousins_sparse.shape
# fourCousins_sparse.dtype
# fourCousins_sparse.nbytes / 1e9
# fourCousins_sparse.density
# sparse.save_npz(cousin_fps[4].split('.json')[0], fourCousins_sparse)

(21475, 6403)

dtype('uint8')

0.443369367

0.18967063059970615

In [136]:
del fourCousinsOf

In [137]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.6G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [138]:
# Re-wrap the per-key cousin-count mappings as Counters so that they can be
# combined with `+` in the timing-estimate cells that follow.
numZeroCousinsOf = Counter(numZeroCousinsOf)
numOneCousinsOf = Counter(numOneCousinsOf)
numTwoCousinsOf = Counter(numTwoCousinsOf)
numThreeCousinsOf = Counter(numThreeCousinsOf)
numFourCousinsOf = Counter(numFourCousinsOf)

In [139]:
def numFormatter(n):
    """Render a number with thousands separators and two decimal places."""
    return "{:,.2f}".format(n)


def report(numKOrLessCousinsOf, seconds_per_calc):
    """Estimate the total cost of one calculation per counted item.

    Parameters
    ----------
    numKOrLessCousinsOf : mapping
        Mapping whose values are counts; the values are summed to obtain the
        total number of calculations.
    seconds_per_calc : number
        Time taken by a single calculation, in seconds.

    Returns
    -------
    tuple of str
        ``(total calculations, total seconds, total days)``, each formatted
        by ``numFormatter``.
    """
    total_calcs = sum(numKOrLessCousinsOf.values())
    total_seconds = total_calcs * seconds_per_calc
    # Seconds -> minutes -> hours -> days.
    total_days = total_seconds / 60 / 60 / 24
    return tuple(numFormatter(quantity)
                 for quantity in (total_calcs, total_seconds, total_days))

In [140]:
# Seconds per calculation: 30.6 ms converted to seconds.
# NOTE(review): the "50" suffix presumably names the benchmark setting this rate was measured at -- confirm.
rate50 = 30.6/1000 #30.6ms

# Each report(...) evaluates to (total calculations, total seconds, total days),
# formatted as strings; every bare call below is displayed as a cell output.
report(numZeroCousinsOf, rate50)

# Counter addition sums counts key by key and keeps only keys whose total is
# positive, so each "...OrLessCousinsOf" Counter accumulates the counts for
# cousin distances 0 through k.
numOneOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf
report(numOneOrLessCousinsOf, rate50)

numTwoOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf
report(numTwoOrLessCousinsOf, rate50)

numThreeOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf + numThreeCousinsOf
report(numThreeOrLessCousinsOf, rate50)

numFourOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf + numThreeCousinsOf + numFourCousinsOf
report(numFourOrLessCousinsOf, rate50)

('49,429.00', '1,512.53', '0.02')

('639,963.00', '19,582.87', '0.23')

('5,518,477.00', '168,865.40', '1.95')

('22,948,211.00', '702,215.26', '8.13')

('49,028,762.00', '1,500,280.12', '17.36')

In [141]:
# Seconds per calculation: 104 ms converted to seconds.
# NOTE(review): the "200" suffix presumably names the benchmark setting this rate was measured at -- confirm.
rate200 = 104/1000 #104ms

# Same sequence of estimates as the rate50 cell, at the slower per-calculation
# rate. The cumulative "...OrLessCousinsOf" Counters are rebuilt here with the
# same expressions as in that cell.
report(numZeroCousinsOf, rate200)

numOneOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf
report(numOneOrLessCousinsOf, rate200)

numTwoOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf
report(numTwoOrLessCousinsOf, rate200)

numThreeOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf + numThreeCousinsOf
report(numThreeOrLessCousinsOf, rate200)

numFourOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf + numThreeCousinsOf + numFourCousinsOf
report(numFourOrLessCousinsOf, rate200)

('49,429.00', '5,140.62', '0.06')

('639,963.00', '66,556.15', '0.77')

('5,518,477.00', '573,921.61', '6.64')

('22,948,211.00', '2,386,613.94', '27.62')

('49,028,762.00', '5,098,991.25', '59.02')

In [142]:
Counter(sorted(numZeroCousinsOf.values()))

Counter({1: 16966,
         2: 2371,
         3: 816,
         4: 441,
         5: 223,
         6: 131,
         7: 84,
         8: 62,
         9: 49,
         10: 25,
         11: 32,
         12: 23,
         13: 20,
         14: 19,
         15: 13,
         16: 11,
         17: 13,
         18: 10,
         19: 15,
         20: 7,
         21: 6,
         22: 2,
         23: 12,
         24: 6,
         25: 5,
         26: 8,
         27: 6,
         28: 6,
         29: 6,
         30: 3,
         31: 1,
         32: 2,
         33: 2,
         34: 4,
         35: 3,
         37: 2,
         38: 2,
         40: 4,
         41: 1,
         43: 2,
         44: 3,
         45: 2,
         46: 2,
         49: 1,
         51: 2,
         53: 4,
         54: 1,
         55: 1,
         57: 1,
         60: 3,
         61: 1,
         62: 1,
         66: 2,
         67: 1,
         70: 1,
         72: 1,
         74: 1,
         75: 2,
         76: 1,
         81: 2,
         92: 1,
    

In [143]:
Counter(sorted(numOneOrLessCousinsOf.values()))

Counter({1: 7501,
         2: 3444,
         3: 1772,
         4: 1148,
         5: 856,
         6: 624,
         7: 488,
         8: 428,
         9: 365,
         10: 283,
         11: 253,
         12: 232,
         13: 205,
         14: 187,
         15: 175,
         16: 145,
         17: 143,
         18: 132,
         19: 106,
         20: 107,
         21: 108,
         22: 72,
         23: 64,
         24: 79,
         25: 68,
         26: 62,
         27: 60,
         28: 55,
         29: 51,
         30: 51,
         31: 41,
         32: 39,
         33: 48,
         34: 47,
         35: 46,
         36: 44,
         37: 35,
         38: 34,
         39: 35,
         40: 28,
         41: 44,
         42: 38,
         43: 26,
         44: 22,
         45: 28,
         46: 33,
         47: 25,
         48: 30,
         49: 32,
         50: 26,
         51: 18,
         52: 14,
         53: 20,
         54: 25,
         55: 21,
         56: 24,
         57: 19,
         58: 18

In [144]:
Counter(sorted(numOneCousinsOf.values()))

Counter({0: 8606,
         1: 2913,
         2: 1605,
         3: 1063,
         4: 805,
         5: 552,
         6: 473,
         7: 400,
         8: 344,
         9: 285,
         10: 258,
         11: 195,
         12: 208,
         13: 169,
         14: 170,
         15: 133,
         16: 143,
         17: 125,
         18: 119,
         19: 107,
         20: 94,
         21: 71,
         22: 68,
         23: 73,
         24: 73,
         25: 60,
         26: 58,
         27: 64,
         28: 42,
         29: 45,
         30: 43,
         31: 46,
         32: 45,
         33: 43,
         34: 39,
         35: 37,
         36: 31,
         37: 36,
         38: 34,
         39: 33,
         40: 39,
         41: 30,
         42: 36,
         43: 24,
         44: 32,
         45: 20,
         46: 26,
         47: 34,
         48: 24,
         49: 31,
         50: 22,
         51: 12,
         52: 17,
         53: 23,
         54: 22,
         55: 27,
         56: 22,
         57: 24,


In [145]:
Counter(sorted(numTwoOrLessCousinsOf.values()))

Counter({1: 3573,
         2: 2215,
         3: 1295,
         4: 894,
         5: 627,
         6: 499,
         7: 427,
         8: 351,
         9: 332,
         10: 252,
         11: 260,
         12: 233,
         13: 201,
         14: 174,
         15: 180,
         16: 151,
         17: 129,
         18: 137,
         19: 121,
         20: 103,
         21: 121,
         22: 83,
         23: 89,
         24: 93,
         25: 102,
         26: 82,
         27: 89,
         28: 78,
         29: 78,
         30: 81,
         31: 83,
         32: 77,
         33: 80,
         34: 75,
         35: 92,
         36: 65,
         37: 88,
         38: 80,
         39: 68,
         40: 71,
         41: 86,
         42: 72,
         43: 59,
         44: 59,
         45: 64,
         46: 68,
         47: 67,
         48: 52,
         49: 51,
         50: 50,
         51: 53,
         52: 51,
         53: 47,
         54: 49,
         55: 51,
         56: 60,
         57: 52,
         58: 52

In [146]:
Counter(sorted(numThreeOrLessCousinsOf.values()))

Counter({1: 1667,
         2: 1141,
         3: 809,
         4: 586,
         5: 488,
         6: 365,
         7: 337,
         8: 258,
         9: 218,
         10: 225,
         11: 214,
         12: 198,
         13: 168,
         14: 141,
         15: 140,
         16: 134,
         17: 119,
         18: 118,
         19: 122,
         20: 78,
         21: 106,
         22: 80,
         23: 75,
         24: 74,
         25: 80,
         26: 63,
         27: 66,
         28: 78,
         29: 71,
         30: 84,
         31: 58,
         32: 64,
         33: 58,
         34: 54,
         35: 61,
         36: 56,
         37: 60,
         38: 56,
         39: 51,
         40: 37,
         41: 59,
         42: 47,
         43: 67,
         44: 48,
         45: 51,
         46: 54,
         47: 47,
         48: 54,
         49: 42,
         50: 46,
         51: 51,
         52: 45,
         53: 48,
         54: 38,
         55: 41,
         56: 55,
         57: 41,
         58: 28,
 

In [147]:
Counter(sorted(numFourOrLessCousinsOf.values()))

Counter({1: 776,
         2: 563,
         3: 435,
         4: 333,
         5: 259,
         6: 257,
         7: 227,
         8: 171,
         9: 157,
         10: 122,
         11: 156,
         12: 107,
         13: 106,
         14: 127,
         15: 111,
         16: 97,
         17: 91,
         18: 95,
         19: 103,
         20: 75,
         21: 81,
         22: 65,
         23: 64,
         24: 50,
         25: 67,
         26: 60,
         27: 78,
         28: 72,
         29: 52,
         30: 54,
         31: 44,
         32: 35,
         33: 38,
         34: 59,
         35: 67,
         36: 43,
         37: 46,
         38: 53,
         39: 42,
         40: 39,
         41: 40,
         42: 36,
         43: 45,
         44: 42,
         45: 28,
         46: 31,
         47: 33,
         48: 30,
         49: 35,
         50: 37,
         51: 30,
         52: 29,
         53: 24,
         54: 29,
         55: 27,
         56: 39,
         57: 31,
         58: 30,
       

In [148]:
# from functools import reduce

# def mergeDictsOfSets(d_a, d_b):
#     keys = set.union(set(d_a.keys()), set(d_b.keys()))
#     return {k:set.union(d_a[k], d_b[k]) for k in keys}

# oneOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf))
# twoOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf))
# threeOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf, threeCousinsOf))
# fourOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf, threeCousinsOf, fourCousinsOf))

In [149]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.6G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [150]:
# del oneOrLessCousinsOf
# del twoOrLessCousinsOf
# del threeOrLessCousinsOf
# del fourOrLessCousinsOf

# Calculate $k$-spheres of $W$

In this section, we calculate, for each full wordform $w$, the set of *full wordforms* $\{w'\}$ that are within $k$ edits of $w$.

In [151]:
def kSpheresDict_to_sparse_array(d):
    """Convert a k-sphere dictionary into a square ``sparse.COO`` array.

    For every ``w`` in the global ``Ws_t``, each ``w_prime`` in ``d[w]``
    contributes one stored entry of value 1 (dtype ``uint8``) at coordinates
    ``(idx(w, Ws_t), idx(w_prime, Ws_t))``. The result has shape
    ``(len(Ws_t), len(Ws_t))``.

    NOTE(review): ``idx(x, Ws_t)`` presumably returns the position of ``x`` in
    ``Ws_t``, and ``concat2`` presumably joins two (rows, cols) pairs
    component-wise -- both are defined elsewhere; confirm.
    NOTE(review): ``d`` is indexed with every element of ``Ws_t``, so it is
    assumed to have an entry for each of them -- confirm against callers.
    """
    n_wordforms = len(Ws_t)

    def row_col_indices(w):
        # One row index (repeated) and one column index per member of d[w].
        row = idx(w, Ws_t)
        members = d[w]
        cols = [idx(w_prime, Ws_t) for w_prime in members]
        return ([row] * len(members), cols)

    per_word_coords = [row_col_indices(w) for w in Ws_t]
    coords = reduce(concat2, per_word_coords)
    # One uint8 `1` per stored coordinate.
    data = np.ones((len(coords[1]),), dtype='uint8')
    return sparse.COO(coords, data, (n_wordforms, n_wordforms))

In [152]:
def kSphere_calc(w, k):
    """Pair ``w`` with ``h_sphere(k, w, Ws)``.

    Returning a ``(key, value)`` tuple lets the results of many calls be
    passed straight to ``dict(...)``, as the parallel cells below do.
    """
    sphere = h_sphere(k, w, Ws)
    return (w, sphere)

In [153]:
# One JSON export path per k in 0..4, each joined onto `o`; the later cells
# index this list by k (sphere_fps[0] ... sphere_fps[4]).
# NOTE(review): `o` is presumably the output directory set in the Parameters section -- confirm.
sphere_fps = [path.join(o, f'{k}spheresOf.json') for k in range(5)]

In [154]:
!free -h 

              total        used        free      shared  buff/cache   available
Mem:           125G        6.6G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [155]:
zeroSpheresOf = dict( par(delayed(kSphere_calc)(w, 0) for w in Ws) )
numZeroSpheresOf = {w:len(zeroSpheresOf[w]) for w in zeroSpheresOf}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0187s.) Setting batch_size=20.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 744 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1124 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1924 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 2344 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 2804 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 3264 tasks      | elapsed:    1.6s
[Parallel(n_jobs

In [156]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [157]:
exportDict(sphere_fps[0], castSetValuesToTuples(zeroSpheresOf))

In [158]:
zeroSpheres_sparse = kSpheresDict_to_sparse_array(zeroSpheresOf)
zeroSpheres_sparse.shape
zeroSpheres_sparse.dtype
zeroSpheres_sparse.nbytes / 1e9
zeroSpheres_sparse.density
sparse.save_npz(sphere_fps[0].split('.json')[0], zeroSpheres_sparse)

(6403, 6403)

dtype('uint8')

0.000108851

0.00015617679212868969

In [159]:
del zeroSpheresOf

In [160]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [161]:
oneSpheresOf = dict( par(delayed(kSphere_calc)(w, 1) for w in Ws) )
numOneSpheresOf = {w:len(oneSpheresOf[w]) for w in oneSpheresOf}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0142s.) Setting batch_size=28.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1016 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1548 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 2080 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 2668 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 3256 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 3900 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 4544 tasks      | elapsed:    2.3s
[Parallel(n_job

In [162]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [163]:
exportDict(sphere_fps[1], castSetValuesToTuples(oneSpheresOf))

In [164]:
oneSpheres_sparse = kSpheresDict_to_sparse_array(oneSpheresOf)
oneSpheres_sparse.shape
oneSpheres_sparse.dtype
oneSpheres_sparse.nbytes / 1e9
oneSpheres_sparse.density
sparse.save_npz(sphere_fps[1].split('.json')[0], oneSpheres_sparse)

(6403, 6403)

dtype('uint8')

0.00025347

0.0003636726488581545

In [165]:
del oneSpheresOf

In [166]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [167]:
twoSpheresOf = dict( par(delayed(kSphere_calc)(w, 2) for w in Ws) )
numTwoSpheresOf = {w:len(twoSpheresOf[w]) for w in twoSpheresOf}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0136s.) Setting batch_size=28.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1016 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1548 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 2080 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 2668 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 3256 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 3900 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 4544 tasks      | elapsed:    2.3s
[Parallel(n_job

In [168]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [169]:
exportDict(sphere_fps[2], castSetValuesToTuples(twoSpheresOf))

In [170]:
twoSpheres_sparse = kSpheresDict_to_sparse_array(twoSpheresOf)
twoSpheres_sparse.shape
twoSpheres_sparse.dtype
twoSpheres_sparse.nbytes / 1e9
twoSpheres_sparse.density
sparse.save_npz(sphere_fps[2].split('.json')[0], twoSpheres_sparse)

(6403, 6403)

dtype('uint8')

0.00294321

0.004222846793884124

In [171]:
del twoSpheresOf

In [172]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [173]:
threeSpheresOf = dict( par(delayed(kSphere_calc)(w, 3) for w in Ws) )
numThreeSpheresOf = {w:len(threeSpheresOf[w]) for w in threeSpheresOf}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0191s.) Setting batch_size=20.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 744 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1124 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1924 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 2344 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 2804 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 3264 tasks      | elapsed:    1.7s
[Parallel(n_jobs

In [174]:
!free -h 

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [175]:
exportDict(sphere_fps[3], castSetValuesToTuples(threeSpheresOf))

In [176]:
threeSpheres_sparse = kSpheresDict_to_sparse_array(threeSpheresOf)
threeSpheres_sparse.shape
threeSpheres_sparse.dtype
threeSpheres_sparse.nbytes / 1e9
threeSpheres_sparse.density
sparse.save_npz(sphere_fps[3].split('.json')[0], threeSpheres_sparse)

(6403, 6403)

dtype('uint8')

0.015549662

0.022310280381855792

In [177]:
del threeSpheresOf

In [178]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [179]:
fourSpheresOf = dict( par(delayed(kSphere_calc)(w, 4) for w in Ws) )
numFourSpheresOf = {w:len(fourSpheresOf[w]) for w in fourSpheresOf}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0266s.) Setting batch_size=14.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 302 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1072 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1366 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1660 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1982 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 2304 tasks      | elapsed:    1.2s
[Parallel(n_jobs=

In [180]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        6.4G         75G        1.7M         43G        118G
Swap:          2.0G        266M        1.7G


In [181]:
exportDict(sphere_fps[4], castSetValuesToTuples(fourSpheresOf))

In [182]:
fourSpheres_sparse = kSpheresDict_to_sparse_array(fourSpheresOf)
fourSpheres_sparse.shape
fourSpheres_sparse.dtype
fourSpheres_sparse.nbytes / 1e9
fourSpheres_sparse.density
sparse.save_npz(sphere_fps[4].split('.json')[0], fourSpheres_sparse)

(6403, 6403)

dtype('uint8')

0.025094482

0.03600495814361967

In [183]:
del fourSpheresOf

In [184]:
!free - h

              total        used        free      shared  buff/cache   available
Mem:      131961284     6677240    79498192        1752    45785852   124137828
Swap:       2097148      273008     1824140
