In [1]:
# Display the value of *every* top-level expression in a cell, not just the
# last one. Later cells list several bare expressions (shape, dtype, nbytes...)
# and rely on this setting to show all of them.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Main-calculation" data-toc-modified-id="Main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Main calculation</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Given
  - a filepath $d$ to an `.npy` file defining $p(V|C)$, a distribution on orthographic wordforms $V$ given an (orthographic) $n$-gram context $c \in C$
  - a filepath $w$ to an `.npz` file defining $p(W|V)$, a distribution on full segmental wordforms $W$ given an orthographic wordform $v \in V$
  - a filepath $m$ to an `.npy` file defining $p(W|C)$, a distribution on full segmental wordforms $W$ given an (orthographic) $n$-gram context $c \in C$
  - an output filepath prefix $o$

this notebook calculates $p(V|W,C)$, i.e.

$$p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) = \frac{p(x_0^{'f}|v^*)p(v^*|c)}{p(x_0^{'f}|c)}$$

and exports it to $o$`.npz`.

## Requirements

 - `numpy`
 - the `pydata` `sparse` package

## Usage

In [2]:
#FIXME

# Parameters

In [3]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [4]:
from boilerplate import *

In [5]:
# Parameters
# Empty placeholders for the four filepaths described in the Overview; the
# commented-out lines show example values for the Buckeye data set.

# d: path to the `.npy` file holding p(V|C)
d = ''
# d = 'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C.npy'

# w: path to the `.npz` file holding p(W|V)
w = ''
# w = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz'

# m: path to the `.npy` file holding p(W|C)
m = ''
# m = 'LD_Fisher_vocab_in_Buckeye_contexts/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_C.npy'

# o: output filepath prefix (the export is written to o + '.npz')
o = ''
# o = 'LD_Fisher_vocab_in_Buckeye_contexts/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pV_WC'

In [6]:
# Parameters
# Concrete values for this run (swbd2003 contexts / newdic lexicon); these
# overwrite the empty placeholders assigned in the previous cell.
# NOTE(review): presumably injected by a parameterisation tool -- confirm.
d = "LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C.npy"
w = "LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz"
m = "LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pW_C.npy"
o = "LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pV_WC"


In [7]:
# Directory component of the output prefix `o`; also listed after the export.
output_dir = path.dirname(o)
# `ensure_dir_exists` comes from the star-import of `boilerplate`.
# NOTE(review): presumably creates the directory if missing; confirm it copes
# with output_dir == '' (i.e. when `o` has no directory component).
ensure_dir_exists(output_dir)

# Imports / load data

In [8]:
from itertools import starmap, chain

In [9]:
import sparse

In [10]:
from tqdm import tqdm, tqdm_notebook, tqdm_gui

In [11]:
# Load p(V|C) from `d` as a dense array, then display its shape, dtype and
# in-memory size in GB.
pV_C = np.load(d)
pV_C.shape
pV_C.dtype
pV_C.nbytes / 1e9

(9411, 106295)

dtype('float64')

8.00273796

In [12]:
# Load p(W|V) from `w` as a sparse array, then display its shape, dtype,
# in-memory size in GB and density (fraction of stored entries).
pW_V = sparse.load_npz(w)
pW_V.shape
pW_V.dtype
pW_V.nbytes / 1e9
pW_V.density

(9172, 9411)

dtype('float64')

0.000225864

0.00010902747492368077

In [13]:
#takes <=1 min
# Load p(W|C) from `m` as a dense array, then display its shape, dtype and
# in-memory size in GB.
pW_C = np.load(m)
pW_C.shape
pW_C.dtype
pW_C.nbytes / 1e9

(9172, 106295)

dtype('float64')

7.79950192

# Main calculation

$$p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) = \frac{p(x_0^{'f}|v^*)p(v^*|c)}{p(x_0^{'f}|c)}$$

Let
 - $d = p(V|C)$
 - $w = p(W|V)$
 - $m = p(W|C)$
 - $o = p(V|W,C)$

$o_{i,j,k} = \frac{w_{j,i} d_{i,k}}{m_{j,k}}$

In [14]:
# Unpack the dimensions of the three inputs and check that they agree with
# each other before they are combined.
num_orthWords, num_contexts = pV_C.shape     # p(V|C): (orth. words, contexts)
num_segWords, num_orthWords2 = pW_V.shape    # p(W|V): (seg. words, orth. words)
assert num_orthWords == num_orthWords2, \
    f"p(V|C) has {num_orthWords} orthographic words but p(W|V) has {num_orthWords2}"
num_segWords2, num_contexts2 = pW_C.shape    # p(W|C): (seg. words, contexts)
assert num_segWords == num_segWords2, \
    f"p(W|V) has {num_segWords} segmental words but p(W|C) has {num_segWords2}"
# The original compared num_contexts with itself, so this check could never
# fail; compare against the context count of p(W|C) instead.
assert num_contexts == num_contexts2, \
    f"p(V|C) has {num_contexts} contexts but p(W|C) has {num_contexts2}"

In [15]:
# Shape of the full (dense) p(V|W,C) tensor:
# (orthographic words, segmental words, contexts).
pV_WC_shape = (num_orthWords, num_segWords, num_contexts)
pV_WC_shape

# Number of cells a dense representation would need.
num_cells = np.prod(pV_WC_shape)
f"{num_cells:,}"

# Size of that dense tensor at 8 bytes (one float64) per cell, in GB.
n_GB = (num_cells * 8) / 1e9
f"{n_GB:,} GB"

(9411, 9172, 106295)

'9,175,139,071,140'

'73,401.11256912 GB'

In [16]:
# Number of explicitly stored entries in the sparse p(W|V) array.
pW_V.data.shape

(9411,)

Given how large this number is, we *can't* calculate this naively, and given that $p(W|V)$ is *incredibly* sparse, we definitely don't *need* to.

In [17]:
# def pV_WC_calc(v, w, c):
#     i = Vs_t
#     return pV_WC_calc_np(i,j,k)

def pV_WC_calc_np(i,j,k):
    """Return a single cell o[i,j,k] = w[j,i] * d[i,k] / m[j,k] of p(V|W,C).

    i indexes pV_C's first axis and pW_V's second axis, j indexes the first
    axis of pW_V and pW_C, and k indexes the second axis of pV_C and pW_C.

    Reads the notebook globals pW_V, pV_C and pW_C. NOTE: pW_C is deleted in
    a later cell, after which calling this function raises NameError.
    """
    return (pW_V[j,i] * pV_C[i,k]) / pW_C[j,k]

In [18]:
# Guard against division by zero: with no zero cells in pW_C, the element-wise
# reciprocal below is finite everywhere.
assert not (pW_C == 0).any()
# Element-wise 1 / p(W|C); same shape as pW_C.
inv_pW_C = np.divide(1.0, pW_C)

In [19]:
# Display the shape and in-memory size (GB) of the reciprocal array.
inv_pW_C.shape
inv_pW_C.nbytes / 1e9

(9172, 106295)

7.79950192

In [20]:
# Drop the original p(W|C) array to free memory; the cells below use
# inv_pW_C instead.
del pW_C

In [21]:
# technically correct
# pV_WC = np.einsum('ji,ik,jk->ijk', [pW_V, pV_C, inv_pW_C])

Here's why we don't actually need to represent the full 3D matrix:

In [22]:
# Inspect the sparse p(W|V): coordinates of the stored entries (first row is
# the W index, second row the V index), their values, how many there are, and
# the overall array shape.
pW_V.coords
pW_V.data
pW_V.data.shape
pW_V.shape

array([[   0,    1,    2, ..., 9169, 9170, 9171],
       [   0,    1,    2, ..., 9408, 9409, 9410]])

array([1., 1., 1., ..., 1., 1., 1.])

(9411,)

(9172, 9411)

In [23]:
# H(W|V) = 0
assert pW_V.data.shape[0] == pW_V.shape[1]
assert np.array_equal(pW_V.data, np.ones(pW_V.data.shape))

**Recall:**

$$p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) = \frac{p(x_0^{'f}|v^*)p(v^*|c)}{p(x_0^{'f}|c)}$$

Let
 - $d = p(V|C)$
 - $w = p(W|V)$
 - $m = p(W|C)$
 - $o = p(V|W,C)$

$o_{i,j,k} = \frac{w_{j,i} d_{i,k}}{m_{j,k}}$

$w_{j,i}$ is 1 at a certain set of coordinates and zero everywhere else, so $o$ is non-zero only at the corresponding coordinates: for every $(j,i)$ s.t. $w_{j,i} = 1$, there is a row of $d$ that we want to divide element-wise by a row of $m$:
$$o_{i,j,:} = \frac{d_{i,:}}{m_{j,:}}$$

In [24]:
# Display the shapes of p(W|V) and p(V|C) side by side.
pW_V.shape
pV_C.shape
# numerator = pW_V @ pV_C
# numerator.shape

(9172, 9411)

(9411, 106295)

In [25]:
pV_WC_shape

# p(W|V) is a 0/1 indicator (asserted in an earlier cell), so p(V|W,C) is
# non-zero only in the fibres (i, j, :) for which pW_V[j, i] == 1. Collect
# those coordinates as (i, j) = (orthographic word index, segmental word index).
pV_WC_coords_j = pW_V.coords[0]
pV_WC_coords_i = pW_V.coords[1]

ij_pairs = tuple(zip(pV_WC_coords_i, pV_WC_coords_j))
del pV_WC_coords_j
del pV_WC_coords_i

def calc_layer(i,j):
    """Return the fibre o[i, j, :] = pV_C[i, :] * inv_pW_C[j, :].

    Valid only for (i, j) pairs where pW_V[j, i] == 1, since the factor
    w[j, i] is omitted. Reads the notebook globals pV_C and inv_pW_C.
    """
    return pV_C[i] * inv_pW_C[j]

# Build the COO coordinate and data arrays directly with numpy rather than
# assigning each fibre into a sparse.DOK. The DOK route took 3h 7min and
# 122GB of memory for newdic, and its result could not be written with
# sparse.save_npz. Zero cells are skipped, as DOK assignment also leaves
# them unstored.
layer_is, layer_js, layer_sizes = [], [], []
k_chunks, data_chunks = [], []
for i,j in tqdm_notebook(ij_pairs, mininterval=6, total=len(ij_pairs)):
    layer = calc_layer(i,j)
    nonzero_ks = np.flatnonzero(layer)
    if nonzero_ks.size == 0:
        continue
    layer_is.append(i)
    layer_js.append(j)
    layer_sizes.append(nonzero_ks.size)
    k_chunks.append(nonzero_ks)
    data_chunks.append(layer[nonzero_ks])

if data_chunks:
    # One (i, j, k) coordinate column per stored value; i and j are constant
    # within a fibre, so they are expanded with np.repeat.
    pV_WC_coords = np.stack((np.repeat(layer_is, layer_sizes),
                             np.repeat(layer_js, layer_sizes),
                             np.concatenate(k_chunks)))
    pV_WC_data = np.concatenate(data_chunks)
else:
    # np.concatenate rejects an empty list; fall back to an empty array.
    pV_WC_coords = np.empty((3, 0), dtype=np.intp)
    pV_WC_data = np.empty(0, dtype='float64')
del layer_is, layer_js, layer_sizes, k_chunks, data_chunks

# COO sorts the coordinates itself (the pairs are ordered by j, not by i).
pV_WC = sparse.COO(pV_WC_coords, pV_WC_data, shape=pV_WC_shape)
del pV_WC_coords, pV_WC_data

(9411, 9172, 106295)

HBox(children=(IntProgress(value=0, max=9411), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [33]:
# Display the dtype, shape and density (fraction of stored cells) of the
# computed p(V|W,C) array.
pV_WC.dtype
pV_WC.shape
pV_WC.density

dtype('float64')

(9411, 9172, 106295)

7.801820511381734e-05

In [37]:
# Display the output filepath prefix.
o

'LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pV_WC'

In [40]:
# from pickle import dump

In [41]:
# dump(pV_WC, open(o + '.pickle', 'wb'))

MemoryError: 

In [38]:
# sparse.save_npz(o, pV_WC, compressed=False)

AttributeError: 'DOK' object has no attribute 'coords'

In [None]:
#takes 15s
# Alternative (experimental) construction: set up lazy generators for the
# (i, j, k) coordinates and the fibre values of every non-zero fibre, to be
# consumed by the following cells.

# ji_pairs = tuple(zip(*pW_V.coords))
pV_WC_coords_j = pW_V.coords[0]
pV_WC_coords_i = pW_V.coords[1]

# (i, j) index pairs of the stored entries of pW_V.
ij_pairs = tuple(zip(*(pV_WC_coords_i, pV_WC_coords_j)))
# ij_pairs = zip(*(pV_WC_coords_i, pV_WC_coords_j))
del pV_WC_coords_j
del pV_WC_coords_i

# ks = np.arange(num_contexts)
# All context indices 0 .. num_contexts-1.
ks = range(num_contexts)

# pV_WC_shape
#very slow
# pV_WC = sparse.DOK(pV_WC_shape, dtype='float64')
# for i,j in tqdm(ij_pairs):
#     pV_WC[i,j,:] = calc_layer(i,j)

# pV_WC_coords_k = np.arange(num_contexts)


# NOTE: same definition as the calc_layer in the main-calculation cell.
def calc_layer(i,j):
    """Return the fibre pV_C[i, :] * inv_pW_C[j, :] (reads notebook globals)."""
    return pV_C[i] * inv_pW_C[j]

#each line is pretty fast
# Generators (nothing is computed yet): for each (i, j) pair, a list repeating
# i (resp. j) once per context, the range of context indices, and the fibre.
Is = ([i] * len(ks) for i,j in ij_pairs)
Js = ([j] * len(ks) for i,j in ij_pairs)
Ks = (ks for i,j in ij_pairs)
# del ks
# layers = tuple(starmap(calc_layer, ij_pairs))
layers = starmap(calc_layer, ij_pairs)
# del ij_pairs

# len(layers)
# len(ij_pairs)
# len(ks)
# len(layers[0])
# ' '
# len(Is[0])
# len(Js[0])
# len(Ks[0])
# len(layers[0])

In [None]:
#takes 1.5m
# Flatten the per-fibre generators into one stream per coordinate axis. The
# three coordinate streams stay lazy (one-shot iterators); only `data` is
# materialised here.
# row_coords = tuple(chain.from_iterable(Is))
row_coords = chain.from_iterable(Is)
# del Is
# col_coords = tuple(chain.from_iterable(Js))
col_coords = chain.from_iterable(Js)
# del Js
# layer_coords = tuple(chain.from_iterable(Ks))
layer_coords = chain.from_iterable(Ks)
# del Ks

# coords = tuple(zip(row_coords, col_coords, layer_coords))
# Lazy stream of (i, j, k) triples.
coords = zip(row_coords, col_coords, layer_coords)

# Evaluates every fibre and stores all values in one flat tuple.
data = tuple(chain.from_iterable(layers))
# data = chain.from_iterable(layers)
# del layers
#113GB

In [None]:
#takes 11hours
# Alternative (experimental) fill: write the values into a DOK one cell at a
# time, pairing each lazily generated (i, j, k) triple with its value.
pV_WC = sparse.DOK(pV_WC_shape, dtype='float64')

for coord, datum in tqdm_notebook(zip(coords, data), total=len(data), mininterval=1):
    i,j,k = coord
    pV_WC[i,j,k] = datum

HBox(children=(IntProgress(value=0, max=1000342245), HTML(value='')))

In [32]:
# Release the flat tuple of values.
del data

In [31]:
# Drop the per-fibre generators.
# NOTE(review): a later cell deletes the same four names again; running both
# in one session raises NameError on the second.
# del ij_pairs
del Is
del Js
del Ks
del layers

In [None]:
# Materialise all (i, j, k) triples as a tuple and drop the source iterators.
# NOTE(review): row_coords/col_coords/layer_coords are one-shot iterators; if
# the lazy `coords` zip from the earlier cell has already been consumed, this
# produces an empty tuple.
coords = tuple(zip(row_coords, col_coords, layer_coords))
del row_coords
del col_coords
del layer_coords

In [30]:
# del pV_WC_coords_j
# del pV_WC_coords_i
# del ij_pairs
# del ks

In [29]:
# Drop the per-fibre generators (same deletions as an earlier cell).
# del pV_WC_coords_j
# del pV_WC_coords_i
# del ij_pairs
# del ks
del Is
del Js
del Ks
del layers
#91GB

In [None]:
# Drop the p(W|V) and p(V|C) inputs to free memory. calc_layer reads pV_C, so
# it cannot be called after this cell.
del pW_V
del pV_C
#?GB

In [28]:
#takes 3.5m to run out of mem
# Alternative (experimental) construction: materialise each coordinate axis as
# a Python tuple and build a COO array from them. The recorded output for
# this cell is a MemoryError.
coords = (tuple(row_coords), tuple(col_coords), tuple(layer_coords))
# coords = (row_coords, col_coords, layer_coords)
pV_WC = sparse.COO(coords, data, pV_WC_shape)
pV_WC.shape
pV_WC.dtype
pV_WC.nbytes / 1e9
pV_WC.density

MemoryError: 

In [30]:
# pW_V_t = torch.sparse.FloatTensor(torch.from_numpy(pW_V.coords), torch.from_numpy(pW_V.data), pW_V.shape)
# pV_C_t = torch.from_numpy(pV_C)
# pW_C_t = torch.from_numpy(pW_C)
# assert not np.any(pW_C == 0) # if there are no 0s in pW_C, then we can take the element-wise inverse without anything blowing up
# inv_pW_C_t = 1.0 / pW_C_t

# Export

In [84]:
# sparse.save_npz reads the array's `coords`, which a DOK does not have
# (AttributeError: 'DOK' object has no attribute 'coords'), so convert a DOK
# to COO before writing. NOTE: the conversion builds a second copy of the
# data in memory.
sparse.save_npz(o, pV_WC.to_coo() if isinstance(pV_WC, sparse.DOK) else pV_WC)

In [85]:
# List the output directory to confirm the exported file is present.
listdir(output_dir)

['LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C.npy',
 'LD_fisher_vocab_in_buckeye_contexts.pV_C',
 'buckeye_contexts.txt',
 'LM_filtered_buckeye_contexts.txt',
 '.ipynb_checkpoints',
 'LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C',
 'Producing Fisher vocab in Buckeye contexts contextual distributions.ipynb',
 'Filter LD_fisher_vocab_in_buckeye_contexts against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in buckeye contexts.ipynb',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C',
 'fisher_vocabulary_main.txt',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_WC.npz',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_C.npy']