In [1]:
# Display the value of *every* top-level expression in a cell, not just the last one.
# Several cells below rely on this to show more than one result (e.g. `len(Vs)` and `len(Ws)`).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Define-basic-structures" data-toc-modified-id="Define-basic-structures-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Define basic structures</a></span></li><li><span><a href="#Calculate-prefix-relation-of-$W$" data-toc-modified-id="Calculate-prefix-relation-of-$W$-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculate prefix relation of $W$</a></span></li><li><span><a href="#Calculate-$k$-cousin-relation-of-$W$" data-toc-modified-id="Calculate-$k$-cousin-relation-of-$W$-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Calculate $k$-cousin relation of $W$</a></span></li><li><span><a href="#Calculate-$k$-spheres-of-$W$" data-toc-modified-id="Calculate-$k$-spheres-of-$W$-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Calculate $k$-spheres of $W$</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

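This notebook loads the distribution `pW_V` from the JSON file at `p`, extracts its set of segmental wordforms $W$, and computes (1) the prefix relation of $W$, (2) the $k$-cousins of every prefix for $k \in \{0, \ldots, 4\}$, and (3) the $k$-spheres of every wordform for $k \in \{0, \ldots, 4\}$. The results are exported to the output directory `o`.

**FIXME:** define $k$-cousins and $k$-spheres here.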

## Requirements

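- Python 3.6+ (f-strings are used) running under IPython/Jupyter
- `joblib` and `tqdm`
- the project-local modules `boilerplate`, `probdist`, and `string_utils`, which must be importable
- the `free` command (used only for the memory-usage checks between steps)

**FIXME:** pin package versions.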

## Usage

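Set the two parameters in the *Parameters* section — `p`, the path to the input `pW_V` JSON file, and `o`, the output directory — then run all cells from top to bottom.

**FIXME:** document any non-interactive way of running this notebook.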

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [68]:
# Parameters

# Path to the JSON file that is loaded into `pW_V` below.
p = ''
# p = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json'

# Directory that every exported file is written into.
o = ''
# o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed'

In [69]:
# Create the output directory if it is missing.
# An empty `o` means "write into the current working directory": there is nothing
# to create in that case, and makedirs('') would raise FileNotFoundError.
if o and not path.exists(o):
    print(f"Making output directory '{o}'")
    makedirs(o)

# Imports / load data

In [4]:
from boilerplate import *
from probdist import *
from string_utils import *

In [5]:
from tqdm import tqdm

In [6]:
from joblib import Parallel, delayed

# joblib settings consumed by `par` below.
J = -1  # passed as n_jobs; the recorded runs below report 32 concurrent workers with this value
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10  # passed as joblib's `verbose` (progress lines printed by Parallel)
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    """Return `x` unchanged."""
    return x

def par(gen_expr):
    """Run an iterable of joblib `delayed(...)` calls in parallel and return the results.

    The pool is configured from the module-level settings `J`, `BACKEND`, `V`
    and `PREFER`.
    """
    pool = Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return pool(gen_expr)

In [7]:
# Load the distribution stored at `p`.
# NOTE(review): presumably a mapping from each v to a distribution over wordforms,
# judging by how it is used below — confirm against `probdist`.
pW_V = condDistsAsProbDists(importProbDist(p))

# Define basic structures

In [8]:
# Vs: the keys of pW_V.
Vs = set(pW_V.keys())
# Ws: `union` of, for each v, the set of `conditions(...)` of its distribution.
# NOTE(review): assumes `union` is set union over the collection it is given — confirm.
Ws = union(mapValues(lambda dist: set(conditions(dist)), 
                     pW_V).values())
# Both sizes are displayed (the notebook shows every top-level expression).
len(Vs)
len(Ws)

6574

6403

In [9]:
# v_to_Ws: for each v, the set of wordforms `conditions` returns for its distribution.
v_to_Ws = mapValues(lambda dist: set(conditions(dist)),
                    pW_V)
# V_W_relation: the same information flattened into a set of (v, w) pairs.
V_W_relation = {(v,w) 
                for v in v_to_Ws 
                for w in v_to_Ws[v]}
# w_to_Vs: the inverse mapping, wordform -> set of v related to it.
# Built in one pass over the relation instead of scanning all of Vs for every w.
# The membership guards keep the result identical to filtering Vs x Ws by the relation.
w_to_Vs = {w: set() for w in Ws}
for v_, w_ in V_W_relation:
    if v_ in Vs and w_ in w_to_Vs:
        w_to_Vs[w_].add(v_)

# Calculate prefix relation of $W$

In [10]:
# Set of (wordform, prefix) pairs: each w in Ws paired with every prefix getPrefixes(w) yields.
# The comprehension variable `p` is local to the comprehension and does not overwrite the
# path parameter `p`.
prefix_relation = set(union({(w,p) for p in getPrefixes(w)} for w in tqdm(Ws)))
len(prefix_relation)

100%|██████████| 6403/6403 [00:03<00:00, 1675.59it/s] 


49429

In [11]:
list(prefix_relation)[:5]

[('⋊.k.l.æ.s.ʌ.z.⋉', '⋊.k.l.æ.s'),
 ('⋊.ɪ.n.t.ɛ.g.ɹ.ʌ.t.i.⋉', '⋊.ɪ.n.t.ɛ.g.ɹ.ʌ'),
 ('⋊.k.oʊ.oʊ.ɹ.d.n.eɪ.t.ɚ.⋉', '⋊.k.oʊ.oʊ.ɹ.d.n.eɪ.t'),
 ('⋊.p.eɪ.ɹ.ʌ.n.t.s.⋉', '⋊.p'),
 ('⋊.ɛ.k.s.k.l.u.s.ɪ.v.l.i.⋉', '⋊.ɛ.k.s.k.l.u.s.ɪ')]

In [12]:
# for export as a TSV
def pref_rel_pair_to_dict(pair):
    """Turn one (wordform, prefix) pair into a row dict for TSV export.

    `pair[0]` is stored under 'Segmental_Wordform' and `pair[1]` under 'Prefix'.
    """
    wordform = pair[0]
    prefix = pair[1]
    return dict(Segmental_Wordform=wordform, Prefix=prefix)

In [13]:
def extract_prefix_function(Ws):
    """Map each wordform in `Ws` to the result of `getPrefixes` for that wordform."""
    prefixes_by_wordform = {}
    for wordform in Ws:
        prefixes_by_wordform[wordform] = getPrefixes(wordform)
    return prefixes_by_wordform

In [15]:
# Single pass over the relation; the earlier version rescanned it once per wordform.
def extract_w_to_P(pref_rel):
    """Group a prefix relation by wordform.

    Parameters
    ----------
    pref_rel : iterable of (wordform, prefix) pairs

    Returns
    -------
    dict mapping each wordform to the set of prefixes paired with it.
    """
    w_to_P = {}
    for w, p in pref_rel:
        # setdefault creates the bucket the first time a wordform is seen.
        w_to_P.setdefault(w, set()).add(p)
    return w_to_P

def extract_p_to_W(pref_rel):
    """Group a prefix relation by prefix.

    Parameters
    ----------
    pref_rel : iterable of (wordform, prefix) pairs

    Returns
    -------
    dict mapping each prefix to the set of wordforms paired with it.

    Runs in a single pass over `pref_rel` rather than rescanning it once per prefix.
    """
    p_to_W = {}
    for w, p in pref_rel:
        # setdefault creates the bucket the first time a prefix is seen.
        p_to_W.setdefault(p, set()).add(w)
    return p_to_W

In [16]:
#49.4s on wittgenstein (w/ heavy load of other stuff)
# w_to_P = extract_w_to_P(prefix_relation)

In [14]:
# wordform -> whatever getPrefixes returns for it (see extract_prefix_function).
prefixesOf = extract_prefix_function(Ws)

In [55]:
# prefix -> set of wordforms paired with it in prefix_relation.
# Recorded timing: 2.75m on wittgenstein (w/ heavy load of other stuff)
completionsOf = extract_p_to_W(prefix_relation)

In [17]:
# p_to_W = extract_p_to_W(prefix_relation)

In [16]:
# The set of all prefixes occurring in the prefix relation.
Ps = {pair[1] for pair in prefix_relation}

# Calculate $k$-cousin relation of $W$

In [17]:
def kCousins_calc(p, k):
    """Return `(p, cousins)`, where `cousins` is `get_k_cousins` for prefix `p` at exactly `k`.

    Reads the module-level `Ws` and `Ps`. Returning the key with the result lets the
    parallel output be fed straight into `dict(...)`.
    """
    cousins = get_k_cousins(p, k, Ws, Ps, exactlyK = True)
    return (p, cousins)

In [18]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         16G         44G         21M         65G        108G
Swap:          2.0G        101M        1.9G


In [19]:
# prefix -> its k-cousins for exactly k = 0, computed in parallel over all prefixes.
zeroCousinsOf = dict( par(delayed(kCousins_calc)(p, 0) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1149s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  2.3min remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  2.3min finished


In [20]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         16G         44G         21M         65G        108G
Swap:          2.0G        101M        1.9G


In [21]:
# prefix -> number of 0-cousins.
numZeroCousinsOf = {prefix: len(cousins) for prefix, cousins in zeroCousinsOf.items()}

In [22]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         16G         44G         21M         65G        108G
Swap:          2.0G        101M        1.9G


In [23]:
# prefix -> its k-cousins for exactly k = 1, computed in parallel over all prefixes.
oneCousinsOf = dict( par(delayed(kCousins_calc)(p, 1) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1080s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  2.3min remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  2.3min finished


In [24]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         16G         43G         21M         65G        108G
Swap:          2.0G        101M        1.9G


In [25]:
# prefix -> number of 1-cousins.
numOneCousinsOf = {prefix: len(cousins) for prefix, cousins in oneCousinsOf.items()}

In [26]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         16G         43G         21M         65G        108G
Swap:          2.0G        101M        1.9G


In [27]:
# prefix -> its k-cousins for exactly k = 2, computed in parallel over all prefixes.
twoCousinsOf = dict( par(delayed(kCousins_calc)(p, 2) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1132s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  67 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  2.4min remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  2.4min finished


In [28]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         17G         43G         21M         65G        107G
Swap:          2.0G        101M        1.9G


In [29]:
# prefix -> number of 2-cousins.
numTwoCousinsOf = {prefix: len(cousins) for prefix, cousins in twoCousinsOf.items()}

In [30]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         17G         43G         21M         65G        107G
Swap:          2.0G        101M        1.9G


In [31]:
# prefix -> its k-cousins for exactly k = 3, computed in parallel over all prefixes.
threeCousinsOf = dict( par(delayed(kCousins_calc)(p, 3) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1689s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  2.3min remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  2.3min finished


In [32]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         20G         40G         21M         65G        104G
Swap:          2.0G        101M        1.9G


In [33]:
# prefix -> number of 3-cousins.
numThreeCousinsOf = {prefix: len(cousins) for prefix, cousins in threeCousinsOf.items()}

In [34]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         20G         40G         21M         65G        104G
Swap:          2.0G        101M        1.9G


In [35]:
# prefix -> its k-cousins for exactly k = 4, computed in parallel over all prefixes.
fourCousinsOf = dict( par(delayed(kCousins_calc)(p, 4) for p in Ps) )

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1669s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: 

[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19690 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 19972 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20258 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 20544 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 20834 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 21124 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 21412 out of 21475 | elapsed:  2.4min remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 21475 out of 21475 | elapsed:  2.4min finished


In [36]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         23G         36G         21M         65G        100G
Swap:          2.0G        101M        1.9G


In [38]:
# prefix -> number of 4-cousins.
numFourCousinsOf = {prefix: len(cousins) for prefix, cousins in fourCousinsOf.items()}

In [39]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         23G         36G         21M         65G        100G
Swap:          2.0G        101M        1.9G


In [40]:
# Wrap the per-prefix counts in Counters so they can be summed key-wise with `+` below.
# NOTE(review): `Counter` is not imported explicitly in this notebook; presumably
# collections.Counter arriving through one of the star imports — confirm.
# If so, Counter addition drops any key whose summed count is not positive.
numZeroCousinsOf = Counter(numZeroCousinsOf)
numOneCousinsOf = Counter(numOneCousinsOf)
numTwoCousinsOf = Counter(numTwoCousinsOf)
numThreeCousinsOf = Counter(numThreeCousinsOf)
numFourCousinsOf = Counter(numFourCousinsOf)

In [41]:
def numFormatter(n):
    """Render a number with thousands separators and two decimal places."""
    return "{:,.2f}".format(n)

def report(numKOrLessCousinsOf, seconds_per_calc):
    """Estimate the cost of one calculation per counted cousin.

    Parameters
    ----------
    numKOrLessCousinsOf : mapping whose values are counts
    seconds_per_calc : time taken by a single calculation, in seconds

    Returns
    -------
    A 3-tuple of formatted strings: (total calculations, total seconds, total days).
    """
    calcs = sum(numKOrLessCousinsOf.values())
    time_s = calcs * seconds_per_calc
    time_d = time_s / 60 / 60 / 24  # seconds -> minutes -> hours -> days
    return tuple(numFormatter(quantity) for quantity in (calcs, time_s, time_d))

In [42]:
# Estimated total time at 30.6ms per calculation, for cumulative cousin counts k <= 0..4.
rate50 = 30.6/1000 #30.6ms

report(numZeroCousinsOf, rate50)

# Each cumulative count extends the previous one by the next exact-k count.
numOneOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf
report(numOneOrLessCousinsOf, rate50)

numTwoOrLessCousinsOf = numOneOrLessCousinsOf + numTwoCousinsOf
report(numTwoOrLessCousinsOf, rate50)

numThreeOrLessCousinsOf = numTwoOrLessCousinsOf + numThreeCousinsOf
report(numThreeOrLessCousinsOf, rate50)

numFourOrLessCousinsOf = numThreeOrLessCousinsOf + numFourCousinsOf
report(numFourOrLessCousinsOf, rate50)

('49,429.00', '1,512.53', '0.02')

('639,963.00', '19,582.87', '0.23')

('5,518,477.00', '168,865.40', '1.95')

('22,948,211.00', '702,215.26', '8.13')

('49,028,762.00', '1,500,280.12', '17.36')

In [43]:
# Estimated total time at 104ms per calculation, for cumulative cousin counts k <= 0..4.
rate200 = 104/1000 #104ms

report(numZeroCousinsOf, rate200)

# Each cumulative count extends the previous one by the next exact-k count.
numOneOrLessCousinsOf = numZeroCousinsOf + numOneCousinsOf
report(numOneOrLessCousinsOf, rate200)

numTwoOrLessCousinsOf = numOneOrLessCousinsOf + numTwoCousinsOf
report(numTwoOrLessCousinsOf, rate200)

numThreeOrLessCousinsOf = numTwoOrLessCousinsOf + numThreeCousinsOf
report(numThreeOrLessCousinsOf, rate200)

numFourOrLessCousinsOf = numThreeOrLessCousinsOf + numFourCousinsOf
report(numFourOrLessCousinsOf, rate200)

('49,429.00', '5,140.62', '0.06')

('639,963.00', '66,556.15', '0.77')

('5,518,477.00', '573,921.61', '6.64')

('22,948,211.00', '2,386,613.94', '27.62')

('49,028,762.00', '5,098,991.25', '59.02')

In [44]:
Counter(sorted(numZeroCousinsOf.values()))

Counter({1: 16966,
         2: 2371,
         3: 816,
         4: 441,
         5: 223,
         6: 131,
         7: 84,
         8: 62,
         9: 49,
         10: 25,
         11: 32,
         12: 23,
         13: 20,
         14: 19,
         15: 13,
         16: 11,
         17: 13,
         18: 10,
         19: 15,
         20: 7,
         21: 6,
         22: 2,
         23: 12,
         24: 6,
         25: 5,
         26: 8,
         27: 6,
         28: 6,
         29: 6,
         30: 3,
         31: 1,
         32: 2,
         33: 2,
         34: 4,
         35: 3,
         37: 2,
         38: 2,
         40: 4,
         41: 1,
         43: 2,
         44: 3,
         45: 2,
         46: 2,
         49: 1,
         51: 2,
         53: 4,
         54: 1,
         55: 1,
         57: 1,
         60: 3,
         61: 1,
         62: 1,
         66: 2,
         67: 1,
         70: 1,
         72: 1,
         74: 1,
         75: 2,
         76: 1,
         81: 2,
         92: 1,
    

In [45]:
Counter(sorted(numOneOrLessCousinsOf.values()))

Counter({1: 7501,
         2: 3444,
         3: 1772,
         4: 1148,
         5: 856,
         6: 624,
         7: 488,
         8: 428,
         9: 365,
         10: 283,
         11: 253,
         12: 232,
         13: 205,
         14: 187,
         15: 175,
         16: 145,
         17: 143,
         18: 132,
         19: 106,
         20: 107,
         21: 108,
         22: 72,
         23: 64,
         24: 79,
         25: 68,
         26: 62,
         27: 60,
         28: 55,
         29: 51,
         30: 51,
         31: 41,
         32: 39,
         33: 48,
         34: 47,
         35: 46,
         36: 44,
         37: 35,
         38: 34,
         39: 35,
         40: 28,
         41: 44,
         42: 38,
         43: 26,
         44: 22,
         45: 28,
         46: 33,
         47: 25,
         48: 30,
         49: 32,
         50: 26,
         51: 18,
         52: 14,
         53: 20,
         54: 25,
         55: 21,
         56: 24,
         57: 19,
         58: 18

In [46]:
Counter(sorted(numOneCousinsOf.values()))

Counter({0: 8606,
         1: 2913,
         2: 1605,
         3: 1063,
         4: 805,
         5: 552,
         6: 473,
         7: 400,
         8: 344,
         9: 285,
         10: 258,
         11: 195,
         12: 208,
         13: 169,
         14: 170,
         15: 133,
         16: 143,
         17: 125,
         18: 119,
         19: 107,
         20: 94,
         21: 71,
         22: 68,
         23: 73,
         24: 73,
         25: 60,
         26: 58,
         27: 64,
         28: 42,
         29: 45,
         30: 43,
         31: 46,
         32: 45,
         33: 43,
         34: 39,
         35: 37,
         36: 31,
         37: 36,
         38: 34,
         39: 33,
         40: 39,
         41: 30,
         42: 36,
         43: 24,
         44: 32,
         45: 20,
         46: 26,
         47: 34,
         48: 24,
         49: 31,
         50: 22,
         51: 12,
         52: 17,
         53: 23,
         54: 22,
         55: 27,
         56: 22,
         57: 24,


In [47]:
Counter(sorted(numTwoOrLessCousinsOf.values()))

Counter({1: 3573,
         2: 2215,
         3: 1295,
         4: 894,
         5: 627,
         6: 499,
         7: 427,
         8: 351,
         9: 332,
         10: 252,
         11: 260,
         12: 233,
         13: 201,
         14: 174,
         15: 180,
         16: 151,
         17: 129,
         18: 137,
         19: 121,
         20: 103,
         21: 121,
         22: 83,
         23: 89,
         24: 93,
         25: 102,
         26: 82,
         27: 89,
         28: 78,
         29: 78,
         30: 81,
         31: 83,
         32: 77,
         33: 80,
         34: 75,
         35: 92,
         36: 65,
         37: 88,
         38: 80,
         39: 68,
         40: 71,
         41: 86,
         42: 72,
         43: 59,
         44: 59,
         45: 64,
         46: 68,
         47: 67,
         48: 52,
         49: 51,
         50: 50,
         51: 53,
         52: 51,
         53: 47,
         54: 49,
         55: 51,
         56: 60,
         57: 52,
         58: 52

In [48]:
Counter(sorted(numThreeOrLessCousinsOf.values()))

Counter({1: 1667,
         2: 1141,
         3: 809,
         4: 586,
         5: 488,
         6: 365,
         7: 337,
         8: 258,
         9: 218,
         10: 225,
         11: 214,
         12: 198,
         13: 168,
         14: 141,
         15: 140,
         16: 134,
         17: 119,
         18: 118,
         19: 122,
         20: 78,
         21: 106,
         22: 80,
         23: 75,
         24: 74,
         25: 80,
         26: 63,
         27: 66,
         28: 78,
         29: 71,
         30: 84,
         31: 58,
         32: 64,
         33: 58,
         34: 54,
         35: 61,
         36: 56,
         37: 60,
         38: 56,
         39: 51,
         40: 37,
         41: 59,
         42: 47,
         43: 67,
         44: 48,
         45: 51,
         46: 54,
         47: 47,
         48: 54,
         49: 42,
         50: 46,
         51: 51,
         52: 45,
         53: 48,
         54: 38,
         55: 41,
         56: 55,
         57: 41,
         58: 28,
 

In [49]:
Counter(sorted(numFourOrLessCousinsOf.values()))

Counter({1: 776,
         2: 563,
         3: 435,
         4: 333,
         5: 259,
         6: 257,
         7: 227,
         8: 171,
         9: 157,
         10: 122,
         11: 156,
         12: 107,
         13: 106,
         14: 127,
         15: 111,
         16: 97,
         17: 91,
         18: 95,
         19: 103,
         20: 75,
         21: 81,
         22: 65,
         23: 64,
         24: 50,
         25: 67,
         26: 60,
         27: 78,
         28: 72,
         29: 52,
         30: 54,
         31: 44,
         32: 35,
         33: 38,
         34: 59,
         35: 67,
         36: 43,
         37: 46,
         38: 53,
         39: 42,
         40: 39,
         41: 40,
         42: 36,
         43: 45,
         44: 42,
         45: 28,
         46: 31,
         47: 33,
         48: 30,
         49: 35,
         50: 37,
         51: 30,
         52: 29,
         53: 24,
         54: 29,
         55: 27,
         56: 39,
         57: 31,
         58: 30,
       

In [51]:
# from functools import reduce

# def mergeDictsOfSets(d_a, d_b):
#     keys = set.union(set(d_a.keys()), set(d_b.keys()))
#     return {k:set.union(d_a[k], d_b[k]) for k in keys}

# oneOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf))
# twoOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf))
# threeOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf, threeCousinsOf))
# fourOrLessCousinsOf = reduce(mergeDictsOfSets, (zeroCousinsOf, oneCousinsOf, twoCousinsOf, threeCousinsOf, fourCousinsOf))

In [52]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         27G         32G         21M         65G         96G
Swap:          2.0G        101M        1.9G


In [53]:
# del oneOrLessCousinsOf
# del twoOrLessCousinsOf
# del threeOrLessCousinsOf
# del fourOrLessCousinsOf

# Calculate $k$-spheres of $W$

In [56]:
def kSphere_calc(w, k):
    """Return `(w, sphere)`, where `sphere` is `h_sphere(k, w, Ws)` for wordform `w`.

    Reads the module-level `Ws`. Returning the key with the result lets the
    parallel output be fed straight into `dict(...)`.
    """
    sphere = h_sphere(k, w, Ws)
    return (w, sphere)

In [57]:
!free -h 

              total        used        free      shared  buff/cache   available
Mem:           125G         25G         35G         21M         65G         99G
Swap:          2.0G        101M        1.9G


In [58]:
# wordform -> its 0-sphere (computed in parallel over all wordforms), then the size of each sphere.
zeroSpheresOf = dict( par(delayed(kSphere_calc)(w, 0) for w in Ws) )
numZeroSpheresOf = {wordform: len(sphere) for wordform, sphere in zeroSpheresOf.items()}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0127s.) Setting batch_size=30.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 574 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1084 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 1654 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 2224 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 2854 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 3484 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 4174 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 6403 out of 6403 | elapsed:    5.6s finished


In [59]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         25G         35G         21M         65G         99G
Swap:          2.0G        101M        1.9G


In [60]:
# wordform -> its 1-sphere (computed in parallel over all wordforms), then the size of each sphere.
oneSpheresOf = dict( par(delayed(kSphere_calc)(w, 1) for w in Ws) )
numOneSpheresOf = {wordform: len(sphere) for wordform, sphere in oneSpheresOf.items()}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0161s.) Setting batch_size=24.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 472 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 880 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1336 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 2296 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 2800 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 3352 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 3904 tasks      | elapsed:    3.5s
[Parallel(n_jobs

In [61]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         25G         35G         21M         65G         99G
Swap:          2.0G        101M        1.9G


In [62]:
# wordform -> its 2-sphere (computed in parallel over all wordforms), then the size of each sphere.
twoSpheresOf = dict( par(delayed(kSphere_calc)(w, 2) for w in Ws) )
numTwoSpheresOf = {wordform: len(sphere) for wordform, sphere in twoSpheresOf.items()}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0177s.) Setting batch_size=22.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 812 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 1230 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 1648 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 2110 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 2572 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 3078 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 3584 tasks      | elapsed:    3.2s
[Parallel(n_jobs

In [63]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         25G         35G         21M         65G         99G
Swap:          2.0G        101M        1.9G


In [64]:
# wordform -> its 3-sphere (computed in parallel over all wordforms), then the size of each sphere.
threeSpheresOf = dict( par(delayed(kSphere_calc)(w, 3) for w in Ws) )
numThreeSpheresOf = {wordform: len(sphere) for wordform, sphere in threeSpheresOf.items()}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0132s.) Setting batch_size=30.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 574 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1084 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1654 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 2224 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 2854 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 3484 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0496s.) Setting batch_size=15.
[Parallel(n_jobs=-1)]: Done 4174 tasks      | elapsed:   

In [65]:
!free -h 

              total        used        free      shared  buff/cache   available
Mem:           125G         25G         34G         21M         65G         99G
Swap:          2.0G        101M        1.9G


In [66]:
# wordform -> its 4-sphere (computed in parallel over all wordforms), then the size of each sphere.
fourSpheresOf = dict( par(delayed(kSphere_calc)(w, 4) for w in Ws) )
numFourSpheresOf = {wordform: len(sphere) for wordform, sphere in fourSpheresOf.items()}

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0123s.) Setting batch_size=32.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 608 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1152 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1760 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 2368 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 3040 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 3712 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 5699 out of 6403 | elapsed:    5.0s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done 6403 out of 6403 | elapsed:   

In [67]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G         25G         34G         21M         65G         99G
Swap:          2.0G        101M        1.9G


# Export

In [71]:
## prefix relation as tsv
# One row dict per (wordform, prefix) pair. The conversion must be applied to each
# pair: calling pref_rel_pair_to_dict on the whole list would build a single dict
# out of the list's first two elements.
pr_dl = [pref_rel_pair_to_dict(pair) for pair in prefix_relation]

prefix_relation_fp = path.join(o, 'prefix_relation.tsv')

saveDictList_as_TSV(prefix_relation_fp, pr_dl, ('Segmental_Wordform', 'Prefix'))

In [None]:
# prefixesOf
# NOTE(review): the values are whatever getPrefixes returns. If those are sets, they
# may need castSetValuesToTuples before export, as the cousin/sphere exports below do — confirm.

prefixesOf_fp = path.join(o, 'prefixesOf.json')

exportDict(prefixesOf_fp, prefixesOf)

In [None]:
# completionsOf
# The values are sets (see extract_p_to_W), so cast them to tuples before export,
# exactly as is done for the k-cousin and k-sphere dictionaries below.

completionsOf_fp = path.join(o, 'completionsOf.json')

exportDict(completionsOf_fp, castSetValuesToTuples(completionsOf))

In [None]:
# k-cousins
# One JSON file per k in 0..4, named '<k>_cousinsOf.json' inside the output directory.

cousin_fps = [path.join(o, f'{k}_cousinsOf.json') for k in range(5)]

# Set values are cast to tuples before each export.
exportDict(cousin_fps[0], castSetValuesToTuples(zeroCousinsOf))
exportDict(cousin_fps[1], castSetValuesToTuples(oneCousinsOf))
exportDict(cousin_fps[2], castSetValuesToTuples(twoCousinsOf))
exportDict(cousin_fps[3], castSetValuesToTuples(threeCousinsOf))
exportDict(cousin_fps[4], castSetValuesToTuples(fourCousinsOf))

In [None]:
# k-spheres
# One JSON file per k in 0..4, named '<k>_spheresOf.json' inside the output directory.

sphere_fps = [path.join(o, f'{k}_spheresOf.json') for k in range(5)]

# Set values are cast to tuples before each export.
exportDict(sphere_fps[0], castSetValuesToTuples(zeroSpheresOf))
exportDict(sphere_fps[1], castSetValuesToTuples(oneSpheresOf))
exportDict(sphere_fps[2], castSetValuesToTuples(twoSpheresOf))
exportDict(sphere_fps[3], castSetValuesToTuples(threeSpheresOf))
exportDict(sphere_fps[4], castSetValuesToTuples(fourSpheresOf))