In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Define-basic-structures" data-toc-modified-id="Define-basic-structures-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Define basic structures</a></span></li><li><span><a href="#Calculate-prefix-relation-of-$W$" data-toc-modified-id="Calculate-prefix-relation-of-$W$-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculate prefix relation of $W$</a></span><ul class="toc-item"><li><span><a href="#Export" data-toc-modified-id="Export-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Export</a></span></li></ul></li><li><span><a href="#Calculate-$k$-cousin-relation-of-$W$" data-toc-modified-id="Calculate-$k$-cousin-relation-of-$W$-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Calculate $k$-cousin relation of $W$</a></span></li><li><span><a href="#Calculate-$k$-spheres-of-$W$" data-toc-modified-id="Calculate-$k$-spheres-of-$W$-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Calculate $k$-spheres of $W$</a></span></li></ul></div>

# Overview

Given 
 - a filepath $p$ to a conditional distribution $p(W|V)$ over segmental wordforms $W$ given an orthographic wordform $V$
 - an output directory $o$
 
this notebook calculates and writes to file 
 - what the prefix relation of $W$ is
 - what the set of (unique, alphabetically sorted) prefixes of $W$ is
 - what the prefixes of each segmental wordform in $W$ are
 - what the complete wordforms associated with each prefix $p$ are
 - what the $k$-cousins of each prefix are
 - what the $k$-spheres of each wordform are.

## Requirements

 - `tqdm`
 - `joblib`

## Usage

**FIXME**: usage instructions have not been written yet.

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [5]:
from boilerplate import *

In [3]:
# Parameters

p = ''
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

o = ''
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed'

In [6]:
ensure_dir_exists(o)

# Imports / load data

In [7]:
from probdist import *
from string_utils import *

In [8]:
from tqdm import tqdm

In [9]:
from joblib import Parallel, delayed

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    """Return the argument exactly as it was passed in."""
    unchanged = x
    return unchanged

def par(gen_expr):
    """Hand the delayed calls in ``gen_expr`` to a joblib ``Parallel`` executor.

    The executor is configured from the notebook-level constants ``J``
    (``n_jobs``), ``BACKEND`` (``backend``), ``V`` (``verbose``) and ``PREFER``
    (``prefer``). Returns whatever the ``Parallel`` call returns.
    """
    executor = Parallel(n_jobs=J,
                        backend=BACKEND,
                        verbose=V,
                        prefer=PREFER)
    return executor(gen_expr)

In [57]:
import sparse

In [10]:
pW_V = condDistsAsProbDists(importProbDist(p))

# Define basic structures

In [11]:
Vs = set(pW_V.keys())
Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                     pW_V).values())
len(Vs)
len(Ws)

6574

6403

In [12]:
# v -> set(conditions(dist)) for each entry of pW_V.
# NOTE(review): presumably these are the segmental wordforms listed under each
# orthographic wordform v; confirm against probdist.conditions.
v_to_Ws = mapValues(lambda dist: set(conditions(dist)),
                    pW_V)

# The same mapping flattened into a set of (v, w) pairs.
V_W_relation = {(v,w) 
                for v in v_to_Ws 
                for w in v_to_Ws[v]}

# Invert the relation in one pass over its pairs instead of testing every
# (v, w) combination of Vs x Ws. Every w in Ws gets an entry (possibly an
# empty set), and only pairs with v in Vs and w in Ws contribute, exactly as
# in the comprehension this replaces.
w_to_Vs = {w: set() for w in Ws}
for _v_rel, _w_rel in V_W_relation:
    if _v_rel in Vs and _w_rel in w_to_Vs:
        w_to_Vs[_w_rel].add(_v_rel)

# Calculate prefix relation of $W$

In [13]:
prefix_relation = set(union({(w,p) for p in getPrefixes(w)} for w in tqdm(Ws)))
len(prefix_relation)

100%|██████████| 6403/6403 [00:03<00:00, 1913.63it/s] 


49429

In [14]:
list(prefix_relation)[:5]

[('⋊.k.ɪ.s.⋉', '⋊.k.ɪ.s.⋉'),
 ('⋊.d.u.m.⋉', '⋊.d.u.m.⋉'),
 ('⋊.ʌ.k.s.ɛ.p.t.⋉', '⋊.ʌ.k.s.ɛ.p.t.⋉'),
 ('⋊.ɛ.k.s.aɪ.t.ʌ.d.⋉', '⋊.ɛ.k.s.aɪ.t.ʌ.d'),
 ('⋊.l.u.ɪ.n.s.k.i.⋉', '⋊.l.u.ɪ.n.s.k.i.⋉')]

In [15]:
Ps = set(map(lambda pair: pair[1],
             prefix_relation))

In [16]:
# for export as a TSV
def pref_rel_pair_to_dict(pair):
    """Turn one prefix-relation pair into a row dict for TSV export.

    Element 0 of ``pair`` is stored under ``'Segmental_Wordform'`` and
    element 1 under ``'Prefix'``.
    """
    row = dict(Segmental_Wordform=pair[0],
               Prefix=pair[1])
    return row

In [17]:
def extract_prefix_function(Ws):
    """Build a dict sending each element of ``Ws`` to ``getPrefixes`` of it."""
    prefix_lookup = {}
    for wordform in Ws:
        prefix_lookup[wordform] = getPrefixes(wordform)
    return prefix_lookup

In [18]:
def completeWordformsWithPrefix(p, Ws):
    """Return the set of elements ``w`` of ``Ws`` for which ``hasAsPrefix(w, p)`` is truthy."""
    return {w for w in Ws if hasAsPrefix(w, p)}

In [19]:
# Groups in a single pass; the earlier version rescanned the whole relation
# once per wordform, which is what made it slow.
def extract_w_to_P(pref_rel):
    """Group a prefix relation by wordform.

    Parameters
    ----------
    pref_rel : iterable of (wordform, prefix) pairs
        Consumed exactly once, so a one-shot iterator works as well as a set.

    Returns
    -------
    dict
        Maps each wordform that occurs as the first element of a pair to the
        set of prefixes it is paired with.
    """
    w_to_P = {}
    for w, p in pref_rel:
        w_to_P.setdefault(w, set()).add(p)
    return w_to_P

def extract_p_to_W(pref_rel, fast=True):
    """Group a prefix relation by prefix.

    Parameters
    ----------
    pref_rel : iterable of (wordform, prefix) pairs
        Consumed exactly once, so a one-shot iterator works as well as a set.
    fast : bool, optional
        Ignored. Kept only so existing calls that pass it keep working.

    Returns
    -------
    dict
        Maps each prefix that occurs as the second element of a pair to the
        set of wordforms it is paired with.
    """
    # One pass over the pairs rather than one full scan per distinct prefix.
    p_to_W = {}
    for w, p in pref_rel:
        p_to_W.setdefault(p, set()).add(w)
    return p_to_W

In [16]:
#49.4s on wittgenstein (w/ heavy load of other stuff)
# w_to_P = extract_w_to_P(prefix_relation)

In [20]:
prefixesOf = extract_prefix_function(Ws)

In [36]:
#2.75m on wittgenstein (w/ heavy load of other stuff)
# completionsOf = extract_p_to_W(prefix_relation)

In [17]:
# p_to_W = extract_p_to_W(prefix_relation)

In [21]:
def foo(p):
    """Pair prefix ``p`` with ``completeWordformsWithPrefix(p, Ws)``.

    ``Ws`` is the notebook-level set of wordforms. The result is a
    ``(p, completions)`` tuple, so a sequence of results can be fed to ``dict``.
    """
    completions = completeWordformsWithPrefix(p, Ws)
    return (p, completions)

#21s on wittgenstein (w/ heavy load of other stuff)
completionsOf = dict(par(delayed(foo)(p)
                         for p in Ps))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0206s.) Setting batch_size=18.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 370 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 1018 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 1360 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1738 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 2116 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 2530 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 2944 tasks      | elapsed:    1.5s
[Parallel(n_jobs

## Export

In [41]:
Ws_t = tuple(sorted(list(Ws)))
Ps_t = tuple(sorted(list(Ps)))
num_wordforms = len(Ws_t)
num_prefixes = len(Ps_t)

num_wordforms
num_prefixes

6403

21475

In [31]:
# prefixes as .txt
LTR_basename = path.basename(p).split('.pW_V.json')[0]

ps_fp = path.join(o, f'prefixes of {LTR_basename}.txt')

exportSeqs(ps_fp, Ps_t)

In [26]:
# prefix relation as tsv
pr_dl = list(map(pref_rel_pair_to_dict,
                 prefix_relation))

prefix_relation_fp = path.join(o, 'prefix_relation.tsv')

saveDictList_as_TSV(prefix_relation_fp, pr_dl, ('Segmental_Wordform', 'Prefix'))

In [61]:
# prefix relation as sparse array = .npz
def w_p_pair_to_indices(w,p):
    """Return ``(position of w in Ws_t, position of p in Ps_t)``.

    Both positions come from ``.index`` on the notebook-level tuples.
    """
    wordform_position = Ws_t.index(w)
    prefix_position = Ps_t.index(p)
    return wordform_position, prefix_position

list(prefix_relation)[2322]
w_p_pair_to_indices(*list(prefix_relation)[2322])

# Dense 0/1 matrix of the prefix relation: rows follow Ws_t, columns follow Ps_t.
prefix_relation_np = np.zeros(shape=(num_wordforms, num_prefixes), dtype="uint8")

# Look positions up in dicts built once, instead of two linear tuple.index
# scans per pair. Ws_t and Ps_t are built from sets, so each item has exactly
# one position.
_row_of_wordform = {wordform: i for i, wordform in enumerate(Ws_t)}
_col_of_prefix = {prefix: j for j, prefix in enumerate(Ps_t)}

# The loop variables deliberately avoid the name `p`: a module-level
# `for w, p in ...` would overwrite the notebook's input-path parameter `p`.
for _wordform, _prefix in prefix_relation:
    prefix_relation_np[_row_of_wordform[_wordform], _col_of_prefix[_prefix]] = 1
    
prefix_relation_np.nbytes / 1e9

prefix_relation_sparse = sparse.COO.from_numpy(prefix_relation_np)
prefix_relation_sparse.density
prefix_relation_sparse.nbytes / 1e9

pr_npz_fp = path.join(o, 'prefix_relation')

sparse.save_npz(pr_npz_fp,prefix_relation_sparse)

('⋊.tʃ.æ.t.⋉', '⋊.tʃ')

(4617, 14864)

0.137504425

0.0003594720678989058

0.000840293

In [27]:
# prefixesOf

prefixesOf_fp = path.join(o, 'prefixesOf.json')

exportDict(prefixesOf_fp, castSetValuesToTuples(prefixesOf))

In [28]:
# completionsOf

completionsOf_fp = path.join(o, 'completionsOf.json')

exportDict(completionsOf_fp, castSetValuesToTuples(completionsOf))

# Calculate $k$-cousin relation of $W$

Let $s$ be a finite-length string over $\Sigma$ and let $L$ be a finite set of strings over $\Sigma$.

**k-sphere**: $s'$ is in the *exact* $k$-sphere of $s$ w.r.t. $L$ iff $s' \in L \land $ the Hamming distance of $s'$ from $s$ is *exactly* $k$.

**k-cousin**: string $p$ is an *exact* $k$-cousin of segmental wordform $w$ w.r.t. $L$ iff
 - $w \in L$
 - $p \in \text{prefixes}(L)$
 - $\exists p' \in k\text{-sphere}(p) \cap \text{prefixes}(w)$

**NB** In **this section *only*** $k$-spheres are defined w.r.t. $L = $ the set of *all prefixes* of $W$.

In [79]:
# Ws_t = tuple(sorted(list(Ws)))
# Ps_t = tuple(sorted(list(Ps)))

def idx(s, seq_tuple):
    """Return ``seq_tuple.index(s)``: the position of the first occurrence of ``s``.

    Any exception raised by ``.index`` (``ValueError`` for a tuple or list that
    does not contain ``s``) propagates unchanged.
    """
    position = seq_tuple.index(s)
    return position

In [99]:
def concat(lst_a, lst_b):
    """Return ``lst_a + lst_b``."""
    joined = lst_a + lst_b
    return joined

def concat2(lst_pair_a, lst_pair_b):
    """Concatenate two pairs component-wise.

    Returns a 2-tuple: the first components of both pairs joined with
    ``concat``, then the second components joined with ``concat``.
    """
    firsts = concat(lst_pair_a[0], lst_pair_b[0])
    seconds = concat(lst_pair_a[1], lst_pair_b[1])
    return (firsts, seconds)

def kCousinsDict_to_sparse_array(d):
    """Encode a prefix -> k-cousins dict as a ``sparse.COO`` array of ones.

    Rows follow the notebook-level tuple ``Ps_t`` and columns follow ``Ws_t``.
    For every ``p`` in ``Ps_t`` and every ``w`` in ``d[p]``, a 1 is stored at
    ``(position of p in Ps_t, position of w in Ws_t)``.

    Parameters
    ----------
    d : dict
        Needs a key for every element of ``Ps_t``; each value is an iterable
        of elements of ``Ws_t``.

    Returns
    -------
    sparse.COO
        Built with shape ``(len(Ps_t), len(Ws_t))`` and ``uint8`` data.

    Raises
    ------
    KeyError
        If an element of ``Ps_t`` is not a key of ``d``.
    ValueError
        If a value of ``d`` contains something that is not in ``Ws_t``.
    """
    my_shape = (len(Ps_t), len(Ws_t))

    def first_positions(seq_tuple):
        # item -> index of its first occurrence, i.e. what seq_tuple.index(item)
        # returns, but computed once instead of by a linear scan per lookup.
        positions = {}
        for i, item in enumerate(seq_tuple):
            positions.setdefault(item, i)
        return positions

    row_of = first_positions(Ps_t)
    col_of = first_positions(Ws_t)

    # Accumulate coordinates with in-place extends; folding list concatenation
    # over all prefixes re-copies the growing lists at every step.
    rows, cols = [], []
    for p in Ps_t:
        cousins = d[p]
        try:
            cols_for_p = [col_of[w] for w in cousins]
        except KeyError as err:
            # tuple.index raised ValueError for unknown items; keep that type.
            raise ValueError(f'{err.args[0]!r} is not in Ws_t') from err
        rows.extend([row_of[p]] * len(cols_for_p))
        cols.extend(cols_for_p)

    data = np.ones((len(cols),), dtype='uint8')
    return sparse.COO((rows, cols), data, my_shape)

In [70]:
def kCousins_calc(p, k):
    """Pair ``p`` with ``get_k_cousins(p, k, Ws, Ps, exactlyK=True)``.

    ``Ws`` and ``Ps`` are the notebook-level wordform and prefix sets. The
    result is a ``(p, cousins)`` tuple, so a sequence of results can be fed to
    ``dict``.
    """
    cousins = get_k_cousins(p, k, Ws, Ps, exactlyK=True)
    return (p, cousins)

In [71]:
cousin_fps = [path.join(o, f'{k}cousinsOf.json') for k in range(5)]

In [72]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.1G         82G        1.7M         41G        123G
Swap:          2.0G        266M        1.7G


In [73]:
zeroCousinsOf = dict( par(delayed(kCousins_calc)(p, 0) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1239s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [74]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.2G         82G        1.7M         41G        123G
Swap:          2.0G        266M        1.7G


In [75]:
numZeroCousinsOf = {p:len(zeroCousinsOf[p]) for p in zeroCousinsOf}

In [76]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.1G         82G        1.7M         41G        123G
Swap:          2.0G        266M        1.7G


In [77]:
exportDict(cousin_fps[0], castSetValuesToTuples(zeroCousinsOf))

In [103]:
zeroCousins_sparse = kCousinsDict_to_sparse_array(zeroCousinsOf)
zeroCousins_sparse.shape
zeroCousins_sparse.dtype
zeroCousins_sparse.nbytes / 1e9
zeroCousins_sparse.density
sparse.save_npz(cousin_fps[0].split('.json')[0], zeroCousins_sparse)

(21475, 6403)

dtype('uint8')

0.000840293

0.0003594720678989058

In [104]:
del zeroCousinsOf

In [105]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.2G         82G        1.7M         41G        123G
Swap:          2.0G        266M        1.7G


In [106]:
oneCousinsOf = dict( par(delayed(kCousins_calc)(p, 1) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1251s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [107]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.3G         82G        1.7M         41G        123G
Swap:          2.0G        266M        1.7G


In [108]:
numOneCousinsOf = {p:len(oneCousinsOf[p]) for p in oneCousinsOf}

In [109]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.3G         82G        1.7M         41G        123G
Swap:          2.0G        266M        1.7G


In [110]:
exportDict(cousin_fps[1], castSetValuesToTuples(oneCousinsOf))

In [111]:
oneCousins_sparse = kCousinsDict_to_sparse_array(oneCousinsOf)
oneCousins_sparse.shape
oneCousins_sparse.dtype
oneCousins_sparse.nbytes / 1e9
oneCousins_sparse.density
sparse.save_npz(cousin_fps[1].split('.json')[0], oneCousins_sparse)

(21475, 6403)

dtype('uint8')

0.010039078

0.004294654517481892

In [112]:
del oneCousinsOf

In [113]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.3G         82G        1.7M         41G        123G
Swap:          2.0G        266M        1.7G


In [114]:
twoCousinsOf = dict( par(delayed(kCousins_calc)(p, 2) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1119s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [115]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        2.1G         81G        1.7M         41G        122G
Swap:          2.0G        266M        1.7G


In [None]:
numTwoCousinsOf = {p:len(twoCousinsOf[p]) for p in twoCousinsOf}

In [None]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        2.1G         81G        1.7M         41G        122G
Swap:          2.0G        266M        1.7G


In [None]:
exportDict(cousin_fps[2], castSetValuesToTuples(twoCousinsOf))

In [None]:
twoCousins_sparse = kCousinsDict_to_sparse_array(twoCousinsOf)
twoCousins_sparse.shape
twoCousins_sparse.dtype
twoCousins_sparse.nbytes / 1e9
twoCousins_sparse.density
sparse.save_npz(cousin_fps[2].split('.json')[0], twoCousins_sparse)

(21475, 6403)

dtype('uint8')

0.082934738

0.035478960040740505

In [None]:
del twoCousinsOf

In [None]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        1.9G         81G        1.7M         42G        122G
Swap:          2.0G        266M        1.7G


In [None]:
threeCousinsOf = dict( par(delayed(kCousins_calc)(p, 3) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0947s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 436 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 520 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 612 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 704 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: 

In [None]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        4.1G         79G        1.7M         42G        120G
Swap:          2.0G        266M        1.7G


In [None]:
numThreeCousinsOf = {p:len(threeCousinsOf[p]) for p in threeCousinsOf}

In [None]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        4.1G         79G        1.7M         42G        120G
Swap:          2.0G        266M        1.7G


In [None]:
exportDict(cousin_fps[3], castSetValuesToTuples(threeCousinsOf))

In [None]:
threeCousins_sparse = kCousinsDict_to_sparse_array(threeCousinsOf)
threeCousins_sparse.shape
threeCousins_sparse.dtype
threeCousins_sparse.nbytes / 1e9
threeCousins_sparse.density
sparse.save_npz(cousin_fps[3].split('.json')[0], threeCousins_sparse)

(21475, 6403)

dtype('uint8')

0.296305478

0.12675762252742048

In [None]:
del threeCousinsOf

In [None]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        4.8G         78G        1.7M         42G        119G
Swap:          2.0G        266M        1.7G


In [None]:
fourCousinsOf = dict( par(delayed(kCousins_calc)(p, 4) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1050s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  1.4min finished


In [None]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        7.2G         75G        1.7M         42G        117G
Swap:          2.0G        266M        1.7G


In [None]:
numFourCousinsOf = {p:len(fourCousinsOf[p]) for p in fourCousinsOf}

In [None]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        7.2G         75G        1.7M         42G        117G
Swap:          2.0G        266M        1.7G


In [None]:
exportDict(cousin_fps[4], castSetValuesToTuples(fourCousinsOf))

In [None]:
fourCousins_sparse = kCousinsDict_to_sparse_array(fourCousinsOf)
fourCousins_sparse.shape
fourCousins_sparse.dtype
fourCousins_sparse.nbytes / 1e9
fourCousins_sparse.density
sparse.save_npz(cousin_fps[4].split('.json')[0], fourCousins_sparse)

In [None]:
del fourCousinsOf

In [None]:
!free -h

In [None]:
numZeroCousinsOf = Counter(numZeroCousinsOf)
numOneCousinsOf = Counter(numOneCousinsOf)
numTwoCousinsOf = Counter(numTwoCousinsOf)
numThreeCousinsOf = Counter(numThreeCousinsOf)
numFourCousinsOf = Counter(numFourCousinsOf)

In [None]:
def numFormatter(n):
    """Format ``n`` with thousands separators and two decimal places."""
    return "{:,.2f}".format(n)

def report(numKOrLessCousinsOf, seconds_per_calc):
    """Estimate the cost of one calculation per counted cousin.

    Sums the values of ``numKOrLessCousinsOf`` to get the number of
    calculations, multiplies by ``seconds_per_calc`` for a duration in
    seconds, and converts that to days.

    Returns a 3-tuple of strings formatted by ``numFormatter``:
    ``(calculations, seconds, days)``.
    """
    total_calcs = sum(numKOrLessCousinsOf.values())
    total_seconds = total_calcs*seconds_per_calc
    total_days = total_seconds/60/60/24
    return tuple(numFormatter(quantity)
                 for quantity in (total_calcs, total_seconds, total_days))

In [None]:
rate50 = 30.6/1000 #30.6ms

report(numZeroCousinsOf, rate50)

numOneOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf
report(numOneOrLessCousinsOf, rate50)

numTwoOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf
report(numTwoOrLessCousinsOf, rate50)

numThreeOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf + numThreeCousinsOf
report(numThreeOrLessCousinsOf, rate50)

numFourOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf + numThreeCousinsOf + numFourCousinsOf
report(numFourOrLessCousinsOf, rate50)

In [None]:
rate200 = 104/1000 #104ms

report(numZeroCousinsOf, rate200)

numOneOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf
report(numOneOrLessCousinsOf, rate200)

numTwoOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf
report(numTwoOrLessCousinsOf, rate200)

numThreeOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf + numThreeCousinsOf
report(numThreeOrLessCousinsOf, rate200)

numFourOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf + numTwoCousinsOf + numThreeCousinsOf + numFourCousinsOf
report(numFourOrLessCousinsOf, rate200)

In [None]:
Counter(sorted(numZeroCousinsOf.values()))

In [None]:
Counter(sorted(numOneOrLessCousinsOf.values()))

In [None]:
Counter(sorted(numOneCousinsOf.values()))

In [None]:
Counter(sorted(numTwoOrLessCousinsOf.values()))

In [None]:
Counter(sorted(numThreeOrLessCousinsOf.values()))

In [None]:
Counter(sorted(numFourOrLessCousinsOf.values()))

In [None]:
# from functools import reduce

# def mergeDictsOfSets(d_a, d_b):
#     keys = set.union(set(d_a.keys()), set(d_b.keys()))
#     return {k:set.union(d_a[k], d_b[k]) for k in keys}

# oneOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf))
# twoOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf))
# threeOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf, threeCousinsOf))
# fourOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf, threeCousinsOf, fourCousinsOf))

In [None]:
!free -h

In [None]:
# del oneOrLessCousinsOf
# del twoOrLessCousinsOf
# del threeOrLessCousinsOf
# del fourOrLessCousinsOf

# Calculate $k$-spheres of $W$

In this section, we calculate, for each full wordform $w$, the set of *full wordforms* $\{w'\}$ that are within $k$ edits of $w$.

In [None]:
def kSpheresDict_to_sparse_array(d):
    """Encode a wordform -> k-sphere dict as a ``sparse.COO`` array of ones.

    Rows and columns both follow the notebook-level tuple ``Ws_t``. For every
    ``w`` in ``Ws_t`` and every ``w_prime`` in ``d[w]``, a 1 is stored at
    ``(position of w in Ws_t, position of w_prime in Ws_t)``.

    Parameters
    ----------
    d : dict
        Needs a key for every element of ``Ws_t``; each value is an iterable
        of elements of ``Ws_t``.

    Returns
    -------
    sparse.COO
        Built with shape ``(len(Ws_t), len(Ws_t))`` and ``uint8`` data.

    Raises
    ------
    KeyError
        If an element of ``Ws_t`` is not a key of ``d``.
    ValueError
        If a value of ``d`` contains something that is not in ``Ws_t``.
    """
    my_shape = (len(Ws_t), len(Ws_t))

    # item -> index of its first occurrence, i.e. what Ws_t.index(item)
    # returns, but computed once instead of by a linear scan per lookup.
    position_of = {}
    for i, wordform in enumerate(Ws_t):
        position_of.setdefault(wordform, i)

    # Accumulate coordinates with in-place extends; folding list concatenation
    # over all wordforms re-copies the growing lists at every step.
    rows, cols = [], []
    for w in Ws_t:
        sphere = d[w]
        try:
            cols_for_w = [position_of[w_prime] for w_prime in sphere]
        except KeyError as err:
            # tuple.index raised ValueError for unknown items; keep that type.
            raise ValueError(f'{err.args[0]!r} is not in Ws_t') from err
        rows.extend([position_of[w]] * len(cols_for_w))
        cols.extend(cols_for_w)

    data = np.ones((len(cols),), dtype='uint8')
    return sparse.COO((rows, cols), data, my_shape)

In [None]:
def kSphere_calc(w, k):
    """Pair ``w`` with ``h_sphere(k, w, Ws)``.

    ``Ws`` is the notebook-level set of wordforms. The result is a
    ``(w, sphere)`` tuple, so a sequence of results can be fed to ``dict``.
    """
    sphere = h_sphere(k, w, Ws)
    return (w, sphere)

In [None]:
sphere_fps = [path.join(o, f'{k}spheresOf.json') for k in range(5)]

In [None]:
!free -h 

In [None]:
zeroSpheresOf = dict( par(delayed(kSphere_calc)(w, 0) for w in Ws) )
numZeroSpheresOf = {w:len(zeroSpheresOf[w]) for w in zeroSpheresOf}

In [None]:
!free -h

In [None]:
exportDict(sphere_fps[0], castSetValuesToTuples(zeroSpheresOf))

In [None]:
zeroSpheres_sparse = kSpheresDict_to_sparse_array(zeroSpheresOf)
zeroSpheres_sparse.shape
zeroSpheres_sparse.dtype
zeroSpheres_sparse.nbytes / 1e9
zeroSpheres_sparse.density
sparse.save_npz(sphere_fps[0].split('.json')[0], zeroSpheres_sparse)

In [None]:
del zeroSpheresOf

In [None]:
!free -h

In [None]:
oneSpheresOf = dict( par(delayed(kSphere_calc)(w, 1) for w in Ws) )
numOneSpheresOf = {w:len(oneSpheresOf[w]) for w in oneSpheresOf}

In [None]:
!free -h

In [None]:
exportDict(sphere_fps[1], castSetValuesToTuples(oneSpheresOf))

In [None]:
oneSpheres_sparse = kSpheresDict_to_sparse_array(oneSpheresOf)
oneSpheres_sparse.shape
oneSpheres_sparse.dtype
oneSpheres_sparse.nbytes / 1e9
oneSpheres_sparse.density
sparse.save_npz(sphere_fps[1].split('.json')[0], oneSpheres_sparse)

In [None]:
del oneSpheresOf

In [None]:
!free -h

In [None]:
twoSpheresOf = dict( par(delayed(kSphere_calc)(w, 2) for w in Ws) )
numTwoSpheresOf = {w:len(twoSpheresOf[w]) for w in twoSpheresOf}

In [None]:
!free -h

In [None]:
exportDict(sphere_fps[2], castSetValuesToTuples(twoSpheresOf))

In [None]:
twoSpheres_sparse = kSpheresDict_to_sparse_array(twoSpheresOf)
twoSpheres_sparse.shape
twoSpheres_sparse.dtype
twoSpheres_sparse.nbytes / 1e9
twoSpheres_sparse.density
sparse.save_npz(sphere_fps[2].split('.json')[0], twoSpheres_sparse)

In [None]:
del twoSpheresOf

In [None]:
!free -h

In [None]:
threeSpheresOf = dict( par(delayed(kSphere_calc)(w, 3) for w in Ws) )
numThreeSpheresOf = {w:len(threeSpheresOf[w]) for w in threeSpheresOf}

In [None]:
!free -h 

In [None]:
exportDict(sphere_fps[3], castSetValuesToTuples(threeSpheresOf))

In [None]:
threeSpheres_sparse = kSpheresDict_to_sparse_array(threeSpheresOf)
threeSpheres_sparse.shape
threeSpheres_sparse.dtype
threeSpheres_sparse.nbytes / 1e9
threeSpheres_sparse.density
sparse.save_npz(sphere_fps[3].split('.json')[0], threeSpheres_sparse)

In [None]:
del threeSpheresOf

In [None]:
!free -h

In [None]:
fourSpheresOf = dict( par(delayed(kSphere_calc)(w, 4) for w in Ws) )
numFourSpheresOf = {w:len(fourSpheresOf[w]) for w in fourSpheresOf}

In [None]:
!free -h

In [None]:
exportDict(sphere_fps[4], castSetValuesToTuples(fourSpheresOf))

In [None]:
fourSpheres_sparse = kSpheresDict_to_sparse_array(fourSpheresOf)
fourSpheres_sparse.shape
fourSpheres_sparse.dtype
fourSpheres_sparse.nbytes / 1e9
fourSpheres_sparse.density
sparse.save_npz(sphere_fps[4].split('.json')[0], fourSpheres_sparse)

In [None]:
del fourSpheresOf

In [None]:
!free - h