In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Main-calculation" data-toc-modified-id="Main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Main calculation</a></span></li></ul></div>

# Overview

Given
  - a filepath $d$ to an `.npy` file defining $p(V|C)$, a distribution on orthographic wordforms $V$ given an (orthographic) $n$-gram context $c \in C$
  - a filepath $w$ to an `.npz` file defining $p(W|V)$, a distribution on full segmental wordforms $W$ given an orthographic wordform $v \in V$
  - a filepath $m$ to an `.npy` file defining $p(W|C)$, a distribution on full segmental wordforms $W$ given an (orthographic) $n$-gram context $c \in C$
  - an output filepath prefix $o$

this notebook calculates $p(V|W,C)$, i.e.

$$p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) = \frac{p(x_0^{'f}|v^*)p(v^*|c)}{p(x_0^{'f}|c)}$$

and exports it to $o$ as a `tiledb` sparse array.

## Requirements

 - `numpy`
 - the `pydata` `sparse` package
 - the `tiledb` package for high-dimensional, out-of-core, sparse matrix storage

## Usage

In [2]:
#FIXME

# Parameters

In [3]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [4]:
from boilerplate import *

In [5]:
# Parameters
# Each value is a filepath (or, for `o`, an output filepath prefix); the empty
# strings are placeholders to be filled in before the notebook is run.
# The commented-out lines show example values.

# d: `.npy` file defining p(V|C)
d = ''
# d = 'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C.npy'

# w: `.npz` file defining p(W|V)
w = ''
# w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz'

# m: `.npy` file defining p(W|C)
m = ''
# m = 'LD_Fisher_vocab_in_Buckeye_contexts/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_C.npy'

# o: output filepath prefix; used below as the name of the tiledb array holding p(V|W,C)
o = ''
# o = 'LD_Fisher_vocab_in_Buckeye_contexts/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pV_WC'

In [6]:
# Parameters
# d = "LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C.npy"
# w = "LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz"
# m = "LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pW_C.npy"
# o = "LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pV_WC"


In [7]:
# Directory component of the output prefix `o` (an empty string when `o` has no
# directory part, including when `o` is still the '' placeholder).
output_dir = path.dirname(o)
# `ensure_dir_exists` is not defined in this notebook — presumably it comes from
# `from boilerplate import *` and creates the directory if missing; TODO confirm.
ensure_dir_exists(output_dir)

# Imports / load data

In [8]:
from itertools import starmap, chain

In [9]:
import numpy as np

In [12]:
import sparse

In [13]:
from tqdm import tqdm, tqdm_notebook, tqdm_gui

In [14]:
#from joblib import Parallel, delayed
import joblib as jl

# Settings forwarded to `jl.Parallel` by `par` below.
# J is passed as `n_jobs`; the logged runs report "n_jobs=-1" with 160 concurrent workers.
J = -1
# BACKEND is passed as `backend`.
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
# V is passed as `verbose` (joblib progress logging); not related to the wordform set V.
V = 10
# PREFER is passed as `prefer`.
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    """Return the argument exactly as it was passed in."""
    same = x
    return same

def par(gen_expr):
    """Run an iterable of ``jl.delayed`` calls through ``jl.Parallel``.

    The runner is configured from the notebook-level settings ``J`` (n_jobs),
    ``BACKEND`` (backend), ``V`` (verbose) and ``PREFER`` (prefer), which are
    looked up at call time.

    Returns whatever the ``jl.Parallel`` call returns for ``gen_expr``.
    """
    runner = jl.Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return runner(gen_expr)

In [15]:
pV_C = np.load(d)
pV_C.shape
pV_C.dtype
pV_C.nbytes / 1e9

(9411, 106295)

dtype('float64')

8.00273796

In [16]:
# Each column of pV_C must sum to 1 (one distribution per context).
# np.allclose broadcasts the scalar 1.0, so the multi-GB array is reduced only
# once instead of a second time just to size an array of ones.
assert np.allclose(np.sum(pV_C, axis=0), 1.0), "columns of pV_C do not all sum to 1"

In [17]:
pW_V = sparse.load_npz(w)
pW_V.shape
pW_V.dtype
pW_V.nbytes / 1e9
pW_V.density

(9172, 9411)

dtype('float64')

0.000225864

0.00010902747492368077

In [18]:
# Each column of pW_V must sum to 1 (one distribution per orthographic wordform).
# The column sums are computed and densified once and compared against the
# scalar 1.0, which np.allclose broadcasts.
assert np.allclose(np.sum(pW_V, axis=0).todense(), 1.0), "columns of pW_V do not all sum to 1"

In [19]:
#takes <=1 min
pW_C = np.load(m)
pW_C.shape
pW_C.dtype
pW_C.nbytes / 1e9

(9172, 106295)

dtype('float64')

7.79950192

In [20]:
# Each column of pW_C must sum to 1 (one distribution per context).
# np.allclose broadcasts the scalar 1.0, so the multi-GB array is reduced only
# once instead of a second time just to size an array of ones.
assert np.allclose(np.sum(pW_C, axis=0), 1.0), "columns of pW_C do not all sum to 1"

# Main calculation

$$p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) = \frac{p(x_0^{'f}|v^*)p(v^*|c)}{p(x_0^{'f}|c)}$$

Let
 - $d = p(V|C)$
 - $w = p(W|V)$
 - $m = p(W|C)$
 - $o = p(V|W,C)$

$o_{i,j,k} = \frac{w_{j,i} d_{i,k}}{m_{j,k}}$

In [21]:
# Unpack the shape of each input and check that every axis shared between two
# arrays has the same length in both.
num_orthWords, num_contexts = pV_C.shape      # d is indexed [i, k]
num_segWords, num_orthWords2 = pW_V.shape     # w is indexed [j, i]
assert num_orthWords == num_orthWords2, "pV_C and pW_V disagree on the number of orthographic wordforms"
num_segWords2, num_contexts2 = pW_C.shape     # m is indexed [j, k]
assert num_segWords == num_segWords2, "pW_V and pW_C disagree on the number of segmental wordforms"
# Compare against pW_C's context count; the previous check compared
# num_contexts with itself and therefore could never fail.
assert num_contexts == num_contexts2, "pV_C and pW_C disagree on the number of contexts"

In [22]:
# Shape the full 3-D result o[i, j, k] would have if it were stored densely.
pV_WC_shape = (num_orthWords, num_segWords, num_contexts)
pV_WC_shape
# Total number of cells in that dense array, shown with thousands separators.
num_cells = np.prod(pV_WC_shape); f"{num_cells:,}"
# Dense size estimate at 8 bytes per cell (the inputs are float64), in units of 1e9 bytes.
n_GB = (num_cells * 8) / 1e9; f"{n_GB:,} GB"

(9411, 9172, 106295)

'9,175,139,071,140'

'73,401.11256912 GB'

In [23]:
pW_V.data.shape

(9411,)

Given how large the dense array would be (≈73 TB as `float64`, per the estimate above), we *can't* calculate it naively; and given that $p(W|V)$ is *incredibly* sparse (only 9,411 stored entries), we definitely don't *need* to.

In [24]:
# def pV_WC_calc(v, w, c):
#     i = Vs_t
#     return pV_WC_calc_np(i,j,k)

def pV_WC_calc_np(i,j,k):
    """Compute a single cell o[i, j, k] = w[j, i] * d[i, k] / m[j, k].

    Reads the notebook-level arrays ``pW_V`` (w), ``pV_C`` (d) and ``pW_C`` (m)
    at call time.

    NOTE: a later cell runs ``del pW_C``; once that cell has executed, calling
    this function raises ``NameError``.
    """
    w_ji = pW_V[j, i]
    d_ik = pV_C[i, k]
    m_jk = pW_C[j, k]
    return (w_ji * d_ik) / m_jk

In [25]:
assert not np.any(pW_C == 0) # if there are no 0s in pW_C, then we can take the element-wise inverse without anything blowing up
inv_pW_C = 1.0 / pW_C

In [26]:
inv_pW_C.shape
inv_pW_C.nbytes / 1e9

(9172, 106295)

7.79950192

In [27]:
del pW_C

In [28]:
# technically correct
# pV_WC = np.einsum('ji,ik,jk->ijk', [pW_V, pV_C, inv_pW_C])

Here's why we don't actually need to represent the full 3D matrix:

In [29]:
pW_V.coords
pW_V.data
pW_V.data.shape
pW_V.shape

array([[   0,    1,    2, ..., 9169, 9170, 9171],
       [3995, 3996, 3994, ..., 8499, 8498, 8500]])

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

(9411,)

(9172, 9411)

In [30]:
# H(W|V) = 0
assert pW_V.data.shape[0] == pW_V.shape[1]
assert np.array_equal(pW_V.data, np.ones(pW_V.data.shape))

**Recall:**

$$p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) = \frac{p(x_0^{'f}|v^*)p(v^*|c)}{p(x_0^{'f}|c)}$$

Let
 - $d = p(V|C)$
 - $w = p(W|V)$
 - $m = p(W|C)$
 - $o = p(V|W,C)$

$o_{i,j,k} = \frac{w_{j,i} d_{i,k}}{m_{j,k}}$

$w_{j,i}$ is 1 only at a certain set of coordinates and zero everywhere else, so $o$ is non-zero only at the corresponding coordinates: for every $(j,i)$ s.t. $w_{j,i} = 1$, there is a row of $d$ (row $i$) that we want to divide element-wise by a row of $m$ (row $j$):
$$o_{i,j,:} = \frac{d_{i,:}}{m_{j,:}}$$

In [31]:
pW_V.shape
pV_C.shape
# numerator = pW_V @ pV_C
# numerator.shape

(9172, 9411)

(9411, 106295)

In [32]:
pV_WC_shape

(9411, 9172, 106295)

In [33]:
pV_WC_shape[0]
pV_WC_shape[1]
pV_WC_shape[2]

9411

9172

106295

The cell below calculates one 'column' of $k$s at a time (as a `numpy array`), given $i$ and $j$.

In [34]:
# pW_V is indexed [j, i], so row 0 of its coordinate array holds the j indices
# and row 1 the i indices of the stored entries. Pair them up as (i, j) tuples
# and materialise the pairs so they can be iterated more than once.
ij_pairs = tuple(zip(pW_V.coords[1], pW_V.coords[0]))

def calc_layer(i,j):
    """Return the element-wise product of ``pV_C[i]`` and ``inv_pW_C[j]``.

    With ``inv_pW_C = 1.0 / pW_C`` this is o[i, j, :] = d[i, :] / m[j, :] for a
    pair (i, j) where w[j, i] == 1. Both arrays are notebook-level globals
    looked up at call time.
    """
    d_row = pV_C[i]
    inv_m_row = inv_pW_C[j]
    return d_row * inv_m_row
# def calc_layer(ij_pair):
#     i, j = ij_pair
#     return pV_C[i] * inv_pW_C[j]

# lazy_calc_layer = delayed(calc_layer)

In [35]:
len(ij_pairs)

9411

The cell below calculates - as a `sparse COO` array - one 'block' of all relevant $j$s and $k$s given a single $i$.

In [47]:
# there is one block of cells for every orthographic word v_i
#  spanning (num_segwords x num_contexts)

# block_shape = (1, num_segWords, num_contexts)
block_shape = (num_segWords, num_contexts)
block_shape

# each block is *sparse*

# @delayed
def block_constructor(i):
    """Build the block of the result belonging to index ``i`` as a ``sparse.COO``.

    The block has shape ``block_shape``. For every pair in ``ij_pairs`` whose
    first element equals ``i``, row ``j`` (the pair's second element) is set to
    ``calc_layer(i, j)``; all other rows keep the fill value 0.0.

    Relies on the notebook-level names ``ij_pairs``, ``block_shape`` and
    ``calc_layer``.
    """
    block = sparse.DOK(shape = block_shape,
                       dtype = 'float64',
                       fill_value = 0.0)
    for pair_i, pair_j in ij_pairs:
        if pair_i != i:
            continue
        # Slice assignment fills the whole row j in one step.
        block[pair_j,:] = calc_layer(i, pair_j)
    return sparse.COO(block)

#     #this is ≥2x slower - unlike the method above, 
#     #it doesn't use slice assignments.
#     my_DOK_data = {(i,j,k):val
#                    for k,val in enumerate(calc_layer(i,j))}
#     my_DOK = sparse.DOK(shape = (1, num_segWords, num_contexts),
#                         data = my_DOK_data,
#                         dtype = 'float64',
#                         fill_value = 0.0)
#     return sparse.COO(my_DOK)

(9172, 106295)

In [73]:
import tiledb

In [74]:
pV_WC_shape
block_shape

(9411, 9172, 106295)

(9172, 106295)

In [None]:
# #Two block sizes that make sense for reading:
# # |V| x 1 x 1 = each distribution/conditioning event is a block
# # |V| x |W| x 1 = each context is a block
# dom_read = tiledb.Domain(tiledb.Dim(name="orthWord", domain=(0, 9410), tile=9411, dtype=np.uint32),
#                          tiledb.Dim(name="segWord", domain=(0, 9171), tile=9172, dtype=np.uint32),
#                          tiledb.Dim(name="context", domain=(0, 106294), tile=1, dtype=np.uint32))

# schema_read = tiledb.ArraySchema(domain=dom_read, sparse=True,
#                                  attrs=[tiledb.Attr(name="O", dtype=np.float64)])

In [75]:
# Two block sizes that make sense for reading:
#  |V| x 1 x 1 = each distribution/conditioning event is a block
#  |V| x |W| x 1 = each context is a block
# The tile extents below are (pV_WC_shape[0], pV_WC_shape[1], 1), i.e. the
# second option. Each dimension's domain is the inclusive index range
# 0 .. size-1 of the corresponding axis of pV_WC_shape.
dom_read = tiledb.Domain(tiledb.Dim(name="orthWord", domain=(0, pV_WC_shape[0]-1), tile=pV_WC_shape[0], dtype=np.uint32),
                         tiledb.Dim(name="segWord", domain=(0, pV_WC_shape[1]-1), tile=pV_WC_shape[1], dtype=np.uint32),
                         tiledb.Dim(name="context", domain=(0, pV_WC_shape[2]-1), tile=1, dtype=np.uint32))

# Sparse array schema over that domain with a single float64 attribute named "O".
schema_read = tiledb.ArraySchema(domain=dom_read, sparse=True,
                                 attrs=[tiledb.Attr(name="O", dtype=np.float64)])

In [4]:
# %rm -r LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pV_WC_read

rm: cannot remove 'LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pV_WC_read': No such file or directory


In [76]:
# Display the output prefix, then create the (empty) sparse array at that path.
o
# array_name_read = o + "_read"
array_name_read = o
# NOTE(review): creation presumably fails if an array already exists at this
# path (a commented-out `%rm -r` precedes this cell) — confirm before re-running.
tiledb.SparseArray.create(array_name_read, schema_read)

'LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pV_WC'

In [77]:
def block_transformer(i, constructed_block_set_jk_tile):
    """Turn a 2-D sparse block for index ``i`` into 3-D coordinate/value arrays.

    Parameters
    ----------
    i
        Index along the first axis of the 3-D result that the block belongs to.
    constructed_block_set_jk_tile
        Object exposing ``coords`` (2 x nnz array; row 0 holds the j indices,
        row 1 the k indices) and ``data`` (the nnz stored values), e.g. the
        ``sparse.COO`` returned by ``block_constructor``.

    Returns
    -------
    tuple
        ``(I, J, K, data)`` where ``I`` repeats ``i`` once per stored entry and
        ``J``, ``K``, ``data`` are taken from the block unchanged.
    """
    coords = constructed_block_set_jk_tile.coords
    nnz = coords.shape[1]
    # np.full takes its dtype from `i`, so I stays an integer array even when
    # the block has no stored entries (np.array([i] * 0) would be an empty
    # float64 array), and no intermediate Python list is built.
    I = np.full(nnz, i)
    J = coords[0]
    K = coords[1]
    data = constructed_block_set_jk_tile.data
    return (I, J, K, data)

# def block_writer(block_tuple, arr_name):
#     with tiledb.SparseArray(arr_name, mode='w') as A:
def block_writer(block_tuple, A):
    """Write one block's values into ``A`` at the block's coordinates.

    ``block_tuple`` is an ``(I, J, K, data)`` 4-tuple; the write is performed as
    the item assignment ``A[I, J, K] = data``. ``A`` is supplied (already
    opened) by the caller. Returns ``None``.
    """
    first_idx, second_idx, third_idx, values = block_tuple
    A[first_idx, second_idx, third_idx] = values

In [78]:
# Configuration object passed to `tiledb.consolidate` after each batch of writes.
config = tiledb.Config()
# config["sm.consolidation.steps"] = 10
# NOTE(review): presumably the number of consolidation steps per call, with the
# two settings below bounding how many fragments are merged in one step —
# confirm against the TileDB configuration reference.
config["sm.consolidation.steps"] = 600
config["sm.consolidation.step_min_frags"] = 2
config["sm.consolidation.step_max_frags"] = 20
# tiledb.consolidate(array_name, config)

In [48]:
# # Each i index corresponds to a block; each "block set" is a sequence of ≈ pV_WC_shape[0]/4 indices
# block_sets = np.array_split(np.arange(pV_WC_shape[0]), 4)


# with tiledb.SparseArray(array_name_read, mode='w') as A:
#     for block_set in tqdm(block_sets):
#         constructed_blocks = par(jl.delayed(block_constructor)(i) for i in block_set)

#         block_tuples = jl.Parallel(n_jobs=J, backend='threading', verbose=V, prefer=PREFER)(jl.delayed(block_transformer)(i, constructed_block_tile)
#                                                                                             for i, constructed_block_tile in zip(block_set, constructed_blocks))

#         jl.Parallel(n_jobs=J, backend='threading', verbose=V, prefer=PREFER)(jl.delayed(block_writer)(block_tuple, A) 
#                                                                              for block_tuple in block_tuples);
# #         tiledb.consolidate(config, uri=array_name_read)

  0%|          | 0/4 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 160 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 365 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 441 tasks      | elapsed:   21.1s
[Paral

[Parallel(n_jobs=-1)]: Done 1541 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1602 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1665 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1793 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2270 out of 2353 | elapsed:  1.7min remaining:    3.8s
[Parallel(n_jobs=-1)]: Done 2353 out of 2353 | elapsed:  1.8min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

 25%|██▌       | 1/4 [04:46<14:19, 286.57s/it][Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 160 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 365 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 441 tasks      | elapsed:   20.

[Parallel(n_jobs=-1)]: Done 1665 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1793 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2270 out of 2353 | elapsed:  1.7min remaining:    3.8s
[Parallel(n_jobs=-1)]: Done 2353 out of 2353 | elapsed:  1.8min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

 50%|█████     | 2/4 [10:25<10:04, 302.24s/it][Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 160 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 365 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 441 tasks      | elapsed:   19.

[Parallel(n_jobs=-1)]: Done 1602 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1665 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1793 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2270 out of 2353 | elapsed:  1.7min remaining:    3.8s
[Parallel(n_jobs=-1)]: Done 2353 out of 2353 | elapsed:  1.8min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

 75%|███████▌  | 3/4 [16:45<05:25, 325.46s/it][Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 160 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 365 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 441 tasks      | elapsed:   21.

[Parallel(n_jobs=-1)]: Done 1602 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1665 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1793 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2269 out of 2352 | elapsed:  1.7min remaining:    3.8s
[Parallel(n_jobs=-1)]: Done 2352 out of 2352 | elapsed:  1.8min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

100%|██████████| 4/4 [23:51<00:00, 355.75s/it]


In [79]:
# Each i index corresponds to a block; each "block set" is a sequence of ≈ pV_WC_shape[0]/4 indices.
# For every block set: build its blocks in parallel, convert them to coordinate
# arrays, write them to the tiledb array, then consolidate the array.
block_sets = np.array_split(np.arange(pV_WC_shape[0]), 4)

for block_set in tqdm(block_sets):
    # One sparse block per i, built via `par` (configured by J / BACKEND / V / PREFER).
    constructed_blocks = par(jl.delayed(block_constructor)(i) for i in block_set)

    # Pair each block with its i and expand it to (I, J, K, data) arrays.
    block_tuples = jl.Parallel(n_jobs=J, backend='threading', verbose=V, prefer=PREFER)(
        jl.delayed(block_transformer)(i, constructed_block_tile)
        for i, constructed_block_tile in zip(block_set, constructed_blocks))

    # All writer threads share the one array handle opened here; it is closed
    # when the `with` block exits.
    with tiledb.SparseArray(array_name_read, mode='w') as A:
        jl.Parallel(n_jobs=J, backend='threading', verbose=V, prefer=PREFER)(
            jl.delayed(block_writer)(block_tuple, A)
            for block_tuple in block_tuples)

    # Keyword arguments only: `config` was previously passed positionally, but
    # current TileDB-Py takes `uri` as the first positional parameter, so the
    # positional form conflicts with `uri=` there.
    tiledb.consolidate(uri=array_name_read, config=config)

  0%|          | 0/4 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 160 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 365 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 441 tasks      | elapsed:   23.0s
[Paral

[Parallel(n_jobs=-1)]: Done 1602 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1665 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1793 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2270 out of 2353 | elapsed:  1.8min remaining:    3.9s
[Parallel(n_jobs=-1)]: Done 2353 out of 2353 | elapsed:  1.8min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

 25%|██▌       | 1/4 [05:06<15:18, 306.15s/it][Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 160 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 365 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 441 tasks      | elapsed:   21.

[Parallel(n_jobs=-1)]: Done 1541 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1602 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1665 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1793 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2270 out of 2353 | elapsed:  1.7min remaining:    3.8s
[Parallel(n_jobs=-1)]: Done 2353 out of 2353 | elapsed:  1.8min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

 50%|█████     | 2/4 [11:15<10:50, 325.07s/it][Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 160 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 365 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 441 tasks      | elapsed:   19.

[Parallel(n_jobs=-1)]: Done 1602 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1665 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1793 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2270 out of 2353 | elapsed:  1.7min remaining:    3.8s
[Parallel(n_jobs=-1)]: Done 2353 out of 2353 | elapsed:  1.8min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

 75%|███████▌  | 3/4 [18:16<05:53, 353.90s/it][Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 160 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 365 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 441 tasks      | elapsed:   18.

[Parallel(n_jobs=-1)]: Done 1602 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1665 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1793 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2269 out of 2352 | elapsed:  1.7min remaining:    3.7s
[Parallel(n_jobs=-1)]: Done 2352 out of 2352 | elapsed:  1.8min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

100%|██████████| 4/4 [25:54<00:00, 385.10s/it]


In [None]:
# This last consolidation step should clean up any fragments still remaining
# in the array at `array_name_read`.
# Both arguments are passed by keyword so the call does not depend on the
# positional parameter order of tiledb.consolidate (NOTE(review): the order of
# `config` and `uri` appears to differ across tiledb-py releases — confirm
# against the installed version).
tiledb.consolidate(uri=array_name_read, config=config)

In [79]:
# from random import choice

# def randomCoords(arr_shape):
#     return tuple(np.random.randint(0,arr_shape[dim]) for dim in range(len(arr_shape)))

# numRandomCoords = 10000

# test_coords = [randomCoords(pV_WC_shape) for each in range(numRandomCoords)]

In [84]:
# %%timeit

# with tiledb.SparseArray(array_name_read, mode='r') as A:
#     rand_coords = choice(test_coords)
# #     rand_coords
#     A[rand_coords]

35.6 s ± 262 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [80]:
# config

Parameter                        | Value      
-------------------------------- | -----------
sm.array_schema_cache_size       | '10000000' 
sm.check_coord_dups              | 'true'     
sm.check_coord_oob               | 'true'     
sm.check_global_order            | 'true'     
sm.consolidation.amplification   | '1'        
sm.consolidation.buffer_size     | '50000000' 
sm.consolidation.step_max_frags  | '20'       
sm.consolidation.step_min_frags  | '2'        
sm.consolidation.step_size_ratio | '0'        
sm.consolidation.steps           | '10'       
sm.dedup_coords                  | 'false'    
sm.enable_signal_handlers        | 'true'     
sm.fragment_metadata_cache_size  | '10000000' 
sm.num_async_threads             | '1'        
sm.num_reader_threads            | '1'        
sm.num_tbb_threads               | '-1'       
sm.num_writer_threads            | '1'        
sm.tile_cache_size               | '10000000' 
vfs.file.max_parallel_ops        | '160'      
vfs.hdfs.kerb

In [87]:
# config["sm.consolidation.steps"] = 200

In [89]:
#10 steps, min_frags=2, max_frags=20 => ~1m20s per cell run, w/ <=200 fewer fragments per cell run
# => estimate of 16.66 min per 2k fragments
#80 steps, min_frags=2, max_frags=20 => ~4m7s per cell run
#100 steps, min_frags=2, max_frags=20 => ~4m53s per cell run, w/ 1900 fewer fragments per cell run
#200 steps, min_frags=2, max_frags=20 => ~8m20s per cell run, w/ 3800 fewer fragments per cell run
# tiledb.consolidate(config, uri=array_name_read)

'LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pV_WC_read'

In [None]:
# %ls -l LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pV_WC_read/

In [91]:
# %%timeit

# with tiledb.SparseArray(array_name_read, mode='r') as A:
#     rand_coords = choice(test_coords)
# #     rand_coords
#     A[rand_coords]

36.9 ms ± 732 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Consolidation of fragments leads to a nearly 1000x speed-up for a random single-coordinate read (from 35.6 s to 36.9 ms per lookup).

In [92]:
# Display the number of segmental wordforms; this value is recorded below as
# the 'size' of the W axis in the exported metadata.
num_segWords

9172

In [None]:
# Per-axis metadata for the p(V|W,C) array. Each axis (V, W, C) records the
# pair of input filepaths it is implicitly associated with, a 'changes' note
# (the literal string 'None'), and the size of that axis.
pV_WC_md = {
    axis: {'from fp': f"implicitly associated with two files:\n\t{fp_a}\n\t{fp_b}",
           'changes': 'None',
           'size': axis_size}
    for axis, fp_a, fp_b, axis_size in (
        ('V', d, w, num_orthWords),
        ('W', w, m, num_segWords),
        ('C', d, m, num_contexts),
    )
}

# The first argument is the output prefix `o` with '_metadata.json' appended;
# the meaning of the remaining positional arguments is defined by
# exportMatrixMetadata (not shown in this part of the notebook).
my_fp = o
exportMatrixMetadata(
    my_fp + '_metadata.json',
    my_fp,
    None,
    pV_WC_md,
    'Step 5a',
    'Calculate orthographic posterior given segmental wordform + context',
    {},
)

In [85]:
# Show the entries of output_dir (presumably to confirm which exported files
# are present — the listing order is whatever listdir returns).
listdir(output_dir)

['LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C.npy',
 'LD_fisher_vocab_in_buckeye_contexts.pV_C',
 'buckeye_contexts.txt',
 'LM_filtered_buckeye_contexts.txt',
 '.ipynb_checkpoints',
 'LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C',
 'Producing Fisher vocab in Buckeye contexts contextual distributions.ipynb',
 'Filter LD_fisher_vocab_in_buckeye_contexts against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in buckeye contexts.ipynb',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C',
 'fisher_vocabulary_main.txt',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_WC.npz',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_C.npy']

In [None]:
# # pV_WC_temp = None
# temp_coords = None
# temp_data = None
# with tiledb.SparseArray(array_name_read, mode='r') as A:
# #     rand_coords = choice(test_coords)
# # #     rand_coords
# #     A[rand_coords]
# #     pV_WC_temp = A[:,:,:]
#     A[:,:,:]
#     temp_coords = A[:,:,:]['coords']
#     temp_data = A[:,:,:]['data']