In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Dependencies" data-toc-modified-id="Dependencies-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Dependencies</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-loading-data" data-toc-modified-id="Imports-/-loading-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / loading data</a></span><ul class="toc-item"><li><span><a href="#Language-model" data-toc-modified-id="Language-model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Language model</a></span></li><li><span><a href="#Contexts" data-toc-modified-id="Contexts-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Contexts</a></span></li><li><span><a href="#Vocabulary" data-toc-modified-id="Vocabulary-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Vocabulary</a></span></li></ul></li><li><span><a href="#Main-calculation" data-toc-modified-id="Main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Main calculation</a></span></li><li><span><a href="#Calculate-the-number-of-computations-+-estimate-required-space" data-toc-modified-id="Calculate-the-number-of-computations-+-estimate-required-space-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculate the number of computations + estimate required space</a></span></li><li><span><a href="#Ensure-matrix-metadata-is-standardized" data-toc-modified-id="Ensure-matrix-metadata-is-standardized-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Ensure matrix metadata is standardized</a></span></li><li><span><a 
href="#Pick-out-relevant-functions-for-mapping-between-context/word-and-index" data-toc-modified-id="Pick-out-relevant-functions-for-mapping-between-context/word-and-index-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Pick out relevant functions for mapping between context/word and index</a></span></li><li><span><a href="#Construct-and-write-distributions" data-toc-modified-id="Construct-and-write-distributions-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Construct and write distributions</a></span><ul class="toc-item"><li><span><a href="#Doing-calculations-in-place-/-via-memory-mapped-arrays" data-toc-modified-id="Doing-calculations-in-place-/-via-memory-mapped-arrays-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>Doing calculations in-place / via memory mapped arrays</a></span></li><li><span><a href="#Doing-calculations-in-memory-and-writing-to-disk" data-toc-modified-id="Doing-calculations-in-memory-and-writing-to-disk-8.2"><span class="toc-item-num">8.2&nbsp;&nbsp;</span>Doing calculations in memory and writing to disk</a></span></li><li><span><a href="#Creating-a-version-that-contains-probabilities-(and-is-normalized)" data-toc-modified-id="Creating-a-version-that-contains-probabilities-(and-is-normalized)-8.3"><span class="toc-item-num">8.3&nbsp;&nbsp;</span>Creating a version that contains probabilities (and is normalized)</a></span><ul class="toc-item"><li><span><a href="#Sketch-of-normalization-process" data-toc-modified-id="Sketch-of-normalization-process-8.3.1"><span class="toc-item-num">8.3.1&nbsp;&nbsp;</span>Sketch of normalization process</a></span></li></ul></li><li><span><a href="#Normalization" data-toc-modified-id="Normalization-8.4"><span class="toc-item-num">8.4&nbsp;&nbsp;</span>Normalization</a></span></li></ul></li></ul></div>

# Overview

Given 
 - a file path $m$ to a `.arpa` file (or, more realistically/practically, a `kenlm` memory mapped version of one) for a language model
 - a file path $c$ to a set of $n$-gram contexts $C$ (a `.txt` file with one context per line, where a context is a sequence of space-separated wordforms)
 - a file path $v$ to a vocabulary $W$ (a `.txt` file with one wordform per line)
 - a filepath $o$ for the main output of the notebook
 
this notebook will calculate the distribution $p(W|C)$ as a memory mapped `numpy` array (written to $o$).

## Dependencies

 - `kenlm`
 - `numpy`
 - `joblib`

## Usage

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from shutil import copyfile

In [4]:
# parameters
# The four file paths described in the Overview. They are left empty here and
# must be filled in before the notebook can run.
# NOTE(review): the bare "parameters" marker suggests values are injected by an
# external runner — confirm how this notebook is meant to be invoked.

# m: path to the language model file loaded with kenlm.LanguageModel(m)
m = ''
# m = '/home/AD/emeinhar/fisher-lm' + '/' + 'fisher_utterances_main_4gram.mmap'

# c: path to the contexts file (one context per line)
c = ''
# c = '/home/AD/emeinhar/buckeye-lm' + '/' + 'buckeye_contexts.txt'

# v: path to the vocabulary file (one wordform per line)
v = ''
# v = '/home/AD/emeinhar/fisher-lm' + '/' + 'fisher_vocabulary_main.txt'

# o: output path *prefix*; suffixes such as '.hV_C' and '.pV_C' are appended to it
o = ''
# o = '/home/AD/emeinhar/wr' + '/' + 'LD_Fisher_vocab_in_Buckeye_contexts' + '/' + 'LD_fisher_vocab_in_buckeye_contexts'

In [15]:
# Directory that will hold every output file. `o` is a path prefix, so only its
# parent directory has to exist.
# path.dirname() returns '' when `o` has no directory component; fall back to
# the current directory, because makedirs('') and listdir('') both raise
# FileNotFoundError.
output_dir = path.dirname(o) or '.'
if not path.exists(output_dir):
    print('Making ' + output_dir)
    makedirs(output_dir)

In [6]:
# Copy the context and vocabulary files into the output directory, keeping
# their original file names (presumably so the matrix labels sit next to the
# matrices — confirm intent).
# NOTE(review): shutil.copyfile raises SameFileError if a source file already
# lives in output_dir — confirm the inputs always come from elsewhere.
copyfile(c, path.join(output_dir, path.basename(c)))
copyfile(v, path.join(output_dir, path.basename(v)))

'/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/buckeye_contexts.txt'

'/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/fisher_vocabulary_main.txt'

In [7]:
getcwd()

'/mnt/cube/home/AD/emeinhar/wr'

In [8]:
listdir()

['probdist.py',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0',
 'Calculate orthographic posterior given segmental wordform + context (sparse + dask + tiledb).ipynb',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.05',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0',
 'Generate triphone lexicon distribution from channel model.ipynb',
 'swbd2003_contexts.txt',
 'Calculate orthographic posterior given segmental wordform + context (sparse tensor calculations + memory issues).ipynb',
 'boilerplate.py',
 'buckeye_contexts_filtered_against_fisher_vocabulary_main.txt',
 'GD_AmE-diphones - LTR_Buckeye alignment application to LTR_Buckeye.ipynb',
 'LTR_Buckeye',
 '.gitignore',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1',
 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0',
 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'Run n-phone analysis of gating data.ipynb',
 'swbd2003_contexts_filtered_against_fisher_vocabulary_

# Imports / loading data

In [11]:
import kenlm
import csv
import numpy as np

In [12]:
from boilerplate import stamp, stampedNote, exportMatrixMetadata

In [13]:
from tqdm import tqdm

from joblib import Parallel, delayed

# Settings forwarded to joblib.Parallel by par() below.
J = -1  # passed as n_jobs
BACKEND = 'multiprocessing'  # passed as backend
# BACKEND = 'loky'
V = 10  # passed as verbose
PREFER = 'processes'  # passed as prefer
# PREFER = 'threads'

def identity(x):
    """Return the argument unchanged."""
    return x

def par(gen_expr):
    """Execute an iterable of joblib ``delayed`` calls and return what the pool returns.

    The pool is configured from the notebook-level settings J (n_jobs),
    BACKEND, V (verbose) and PREFER.
    """
    pool = Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return pool(gen_expr)

## Language model

In [14]:
model = kenlm.LanguageModel(m)

## Contexts

In [16]:
# One context per line; trailing whitespace (including the newline) is stripped.
with open(c) as context_file:
    contexts = tuple(context_line.rstrip() for context_line in context_file)
len(contexts)
contexts[:10]

17415

('a',
 "aaron's",
 'ability',
 'about',
 'above',
 'absent',
 'absentee',
 'absolutely',
 'accept',
 'accommodate')

In [17]:
assert len(set(contexts)) == len(contexts), "Contexts must consist of unique strings."

## Vocabulary

In [18]:
# One wordform per line; trailing whitespace (including the newline) is stripped.
with open(v) as vocabulary_file:
    vocabulary = tuple(wordform_line.rstrip() for wordform_line in vocabulary_file)
len(vocabulary)
vocabulary[:10]

44064

("'and",
 "'berserkly'",
 "'bout",
 "'burb",
 "'burban",
 "'burbs",
 "'cau",
 "'cause",
 "'cept",
 "'cide")

In [19]:
assert len(set(vocabulary)) == len(vocabulary), "Vocabulary must consist of unique wordforms."

# Main calculation

In [20]:
from random import choice

In [21]:
ctxt = choice(contexts)
ctxt

wrd = choice(vocabulary)
wrd

'there <rem>'

'sustaining'

In [22]:
# Probes of how the model treats sentence boundaries. Every expression is
# displayed because ast_node_interactivity was set to "all" at the top.
model.score("this is a sentence", eos = False)
# In the recorded output the next three calls return the same value: the
# default call, eos=True, and an explicit "</s>" token with eos=False.
model.score("this is a sentence")
model.score("this is a sentence", eos = True)
model.score("this is a sentence </s>", eos=False)
model.score("this is a sentence </s>")
# full_scores yields one triple per scored token; summing the first element of
# each triple reproduces (up to float precision) the default model.score value.
tuple(model.full_scores("this is a sentence"))
sum(map(lambda triple: triple[0],
        tuple(model.full_scores("this is a sentence"))))
' '
# Same decomposition with both boundary markers switched off; this is the
# configuration that score() uses.
tuple(model.full_scores("this is a sentence", eos=False, bos=False))
sum(map(lambda triple: triple[0],
        tuple(model.full_scores("this is a sentence", eos=False, bos=False))))

-9.022207260131836

-9.479642868041992

-9.479642868041992

-9.479642868041992

-11.116127967834473

((-2.684236526489258, 2, False),
 (-0.21483998000621796, 3, False),
 (-1.1899677515029907, 4, False),
 (-4.933162689208984, 2, False),
 (-0.45743539929389954, 3, False))

-9.47964234650135

' '

((-2.610599994659424, 1, False),
 (-1.2509719133377075, 2, False),
 (-1.429597020149231, 3, False),
 (-4.933162689208984, 2, False))

-10.224331617355347

In [23]:
from math import log10, log2

In [24]:
def score(word, context, base2=True, surprisal=True):
    """Score `word` as the continuation of `context` under the language model.

    The string ``context + ' ' + word`` is passed to ``model.full_scores`` with
    ``bos=False`` and ``eos=False``; only the entry for the final token is used.
    NOTE(review): if `word` itself contains a space, only its last token is
    scored — confirm vocabulary entries are always single tokens.

    Parameters
    ----------
    word : str
        Wordform to score.
    context : str
        Space-separated preceding wordforms.
    base2 : bool
        If true, convert the model's log10 value to log2. If false, the log10
        value is kept.
    surprisal : bool
        If true, negate the log-probability so the result is a surprisal.

    Returns
    -------
    float
        Log-probability or surprisal of `word` given `context`.
    """
    score_infos = tuple(model.full_scores(context + ' ' + word, eos=False, bos=False))
    # First element of the last triple: the log10 score of the final token.
    # Bind key_score unconditionally so base2=False no longer leaves it
    # undefined (previously an UnboundLocalError).
    key_score = score_infos[-1][0]
    if base2:
        key_score = key_score / log10(2)
    if surprisal:
        key_score = -1.0 * key_score
    return key_score

In [25]:
ctxt
wrd
score(wrd, ctxt)

'there <rem>'

'sustaining'

17.7518716448236

# Calculate the number of computations + estimate required space

In [26]:
# Back-of-the-envelope size of a dense matrix with one 64-bit cell per
# (context, wordform) pair.
bits_per_cell = 64
bytes_per_cell = bits_per_cell / 8
num_cells = len(contexts) * len(vocabulary)

len(contexts)
len(vocabulary)
f"{num_cells:,}"
f"{num_cells * bytes_per_cell / 1e9:,} GB"

17415

44064

'767,374,560'

'6.13899648 GB'

In [None]:
# from itertools import product

In [None]:
# computations = product(contexts, vocabulary)

In [None]:
# computations = tuple(product(contexts, vocabulary))

In [None]:
# from random import choices

In [None]:
# example_computations = choices(computations, k=10)
# example_computations
# ex = choice(example_computations)
# ex

# Ensure matrix metadata is standardized

In [27]:
type(vocabulary)
type(contexts)

tuple

tuple

In [28]:
list(vocabulary) == sorted(list(vocabulary))
list(contexts) == sorted(list(contexts))

True

False

In [29]:
# Sorted orderings; these (not the file order) define the matrix row and
# column indices used from here on.
vocabulary_sorted = tuple(sorted(vocabulary))
contexts_sorted = tuple(sorted(contexts))

In [30]:
assert list(vocabulary_sorted) == sorted(list(vocabulary))
assert list(contexts_sorted) == sorted(list(contexts))

# Pick out relevant functions for mapping between context/word and index

In [31]:
wrd
vocabulary_sorted.index(wrd)
ctxt
contexts_sorted.index(ctxt)

'sustaining'

38367

'there <rem>'

12319

In [None]:
# ex
# contexts.index(ex[0])
# contexts[ contexts.index(ex[0]) ]

In [32]:
def score_np(word_idx, context_idx, base2=True, surprisal=True):
    """Index-based front end to ``score``.

    `word_idx` indexes ``vocabulary_sorted`` and `context_idx` indexes
    ``contexts_sorted``; `base2` and `surprisal` are forwarded unchanged.
    """
    wordform = vocabulary_sorted[word_idx]
    preceding = contexts_sorted[context_idx]
    return score(wordform, preceding, base2=base2, surprisal=surprisal)

# Construct and write distributions

In [33]:
# Rows index wordforms, columns index contexts.
num_words, num_contexts = len(vocabulary_sorted), len(contexts_sorted)
my_shape = (num_words, num_contexts) #columns are distributions
my_shape
num_words * num_contexts

(44064, 17415)

767374560

In [34]:
memory_map = False

## Doing calculations in-place / via memory mapped arrays

This will only begin to make sense if the array is going to be too large to fit in memory or very small; otherwise joblib will use threads to parallelize the computation (because it involves lots of IO).

In [35]:
if memory_map:
    # Allocate the surprisal matrix directly on disk at o + '.hV_C'.
    # NOTE(review): mode='w+' creates the file or overwrites an existing one,
    # so re-running this cell discards any previous result.
    hVC = np.memmap(o + '.hV_C', dtype='float64', mode='w+', shape=my_shape)
    hVC.nbytes / 1e9  # size in GB
    hVC.dtype

In [36]:
score_np_vec = np.vectorize(score_np)

In [37]:
if memory_map:
    def define_score(word_idx, context_idx, base2=True, surprisal=True):
        """Compute one score with ``score_np`` and store it in ``hVC[word_idx, context_idx]``.

        Nothing is returned; the result is written into the notebook-level ``hVC``.
        """
        hVC[word_idx, context_idx] = score_np(word_idx, context_idx, base2=base2, surprisal=surprisal)

In [38]:
if memory_map:
    stampedNote("Started calculations")

In [39]:
if memory_map:
    # Schedules one task per (word, context) cell: for each context index,
    # every word index is scored and written by define_score.
    # est 7h on wittgenstein with J=30 and other computations going on in the background
    # pretty sure that means it's using threads rather than processes
    par(delayed(define_score)(w_idx, ctxt_idx) 
        for ctxt_idx in range(num_contexts) 
        for w_idx in range(num_words))

In [40]:
if memory_map:
    stampedNote("Ended calculations")

In [None]:
# no need for testing ordering because of the way define_score is defined...

In [None]:
if memory_map:
    # Describe the two matrix dimensions: 'C' (contexts) and 'V' (vocabulary).
    # NOTE(review): the 'changes' strings are hard-coded, not derived from the
    # data — confirm 'none - already sorted' holds for the vocabulary in use.
    hVC_dim_md = {'C':{'from fp':c,
                       'changes':'sorted alphabetically',
                       'size':len(contexts_sorted)},
                  'V':{'from fp':v,
                       'changes':'none - already sorted',
                       'size':len(vocabulary_sorted)}}
    # other_md = {'Produced in step':'Step 2b',
    #             'Base notebook name':'Producing contextual distributions'}

    # Arguments, per the commented-out reference definition later in this
    # notebook: metadata path, matrix path, matrix, dimension metadata,
    # step name, notebook name, extra metadata.
    # NOTE(review): the same export is repeated unconditionally in a later cell.
    exportMatrixMetadata(o+'.hV_C'+'_metadata.json',
                         o+'.hV_C',
                         hVC,
                         hVC_dim_md,
                         'Step 2b',
                         'Producing contextual distributions',
                         {})

## Doing calculations in memory and writing to disk

In [41]:
if not memory_map:
    # Serial alternative, ~30m on wittgenstein:
    #     hVC = np.vstack([score_np_vec(np.array(range(num_words)), c_idx) for c_idx in tqdm(range(num_contexts))]).T

    # One parallel task per context: each task scores every word index for
    # that context. Stacking the per-context results puts contexts on the rows,
    # so the final transpose gives (words, contexts).
    # takes ~2.75m on wittgenstein with J=30 and other things going on in the background
    all_word_indices = np.arange(num_words)
    hVC = np.vstack(par(delayed(score_np_vec)(all_word_indices, c_idx)
                        for c_idx in range(num_contexts))).T

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 240 tasks      |

[Parallel(n_jobs=-1)]: Done 9752 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 9893 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 10034 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 10177 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 10320 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 10465 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 10610 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 10757 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 10904 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 11053 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 11202 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 11353 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 11504 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 11657 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 11810 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 11965 tasks   

Now, for reasons of caution and paranoia, we check that the ordering on contexts and wordforms is preserved by picking ten thousand random context-wordform pairs, doing the calculations manually and checking that the calculation matches what's at the corresponding location in `hVC`:

In [44]:
hVC[0,0]
score_np(0,0) #word idx, context idx

hVC[2,3]
score_np(2,3)

hVC[3,2]
score_np(3,2)

21.966872680419925

21.966872680419925

16.888473711424368

16.888473711424368

19.400280759634796

19.400280759634796

In [45]:
from random import choices

# Spot-check the matrix layout: draw random (word index, context index) pairs
# (random.choices samples with replacement) and recompute each cell.
N_test_pairs = 10000
random_context_indices = choices(range(num_contexts), k=N_test_pairs)
random_orthWord_indices = choices(range(num_words), k=N_test_pairs)

# Pairs are ordered (word index, context index), matching hVC's (row, column).
random_index_pairs = tuple(zip(random_orthWord_indices,
                               random_context_indices))

# Exact float equality is used: the stored value and the recomputed value both
# come from score_np.
tests = [hVC[i,j] == score_np(i,j) for i,j in random_index_pairs]
all(tests)
assert all(tests)

True

In [46]:
if not memory_map:
    hVC.nbytes / 1e9  # size in GB
    hVC.dtype
    # Create (or overwrite) the on-disk array at o + '.hV_C' with the same shape.
    hVC_on_disk = np.memmap(o + '.hV_C', dtype='float64', mode='w+', shape=my_shape)

    #takes ~1.25m on wittgenstein
    hVC_on_disk[:,:] = hVC
    # Write the data out explicitly: the memmap stays referenced through `hVC`
    # below, so `del hVC_on_disk` only drops a name and does not flush.
    hVC_on_disk.flush()
    # From here on `hVC` refers to the disk-backed array; the in-memory copy
    # loses its last reference.
    hVC = hVC_on_disk
    del hVC_on_disk

6.13899648

dtype('float64')

In [47]:
o

'/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts'

In [48]:
listdir(output_dir)

['LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C.npy',
 'LD_fisher_vocab_in_buckeye_contexts.pV_C',
 'buckeye_contexts.txt',
 'LM_filtered_buckeye_contexts.txt',
 '.ipynb_checkpoints',
 'LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C',
 'Producing Fisher vocab in Buckeye contexts contextual distributions.ipynb',
 'Filter LD_fisher_vocab_in_buckeye_contexts against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in buckeye contexts.ipynb',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C',
 'fisher_vocabulary_main.txt',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_C.npy']

In [49]:
hVC.nbytes / 1e9
hVC.shape
hVC.dtype

6.13899648

(44064, 17415)

dtype('float64')

In [60]:
# def exportMatrixMetadata(md_fp, matrix_fp, matrix, dim_md, step_name, nb_name, other_md):
#     md = {'matrix fp':matrix_fp,
#           'matrix shape':matrix.shape,
#           'Produced in step':step_name,
#           'Produced in notebook':nb_name}
#     md.update(dim_md)
#     md.update(other_md)
#     exportDict(md_fp, md)
#     print(f'Wrote metadata for \n\t{matrix_fp}\n to \n\t{md_fp}')

In [55]:
o

'/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts'

In [61]:
# Describe the two matrix dimensions: 'C' (contexts) and 'V' (vocabulary).
# NOTE(review): the 'changes' strings are hard-coded, not derived from the
# data — confirm 'none - already sorted' holds for the vocabulary in use.
hVC_dim_md = {'C':{'from fp':c,
                   'changes':'sorted alphabetically',
                   'size':len(contexts_sorted)},
              'V':{'from fp':v,
                   'changes':'none - already sorted',
                   'size':len(vocabulary_sorted)}}
# other_md = {'Produced in step':'Step 2b',
#             'Base notebook name':'Producing contextual distributions'}

# Arguments, per the commented-out reference definition in the cell above:
# metadata path, matrix path, matrix, dimension metadata, step name,
# notebook name, extra metadata.
exportMatrixMetadata(o+'.hV_C'+'_metadata.json',
                     o+'.hV_C',
                     hVC,
                     hVC_dim_md,
                     'Step 2b',
                     'Producing contextual distributions',
                     {})

Wrote metadata for 
	/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.hV_C
 to 
	/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.hV_C_metadata.json


In [62]:
!cat /home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.hV_C_metadata.json

{
    "matrix fp": "/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.hV_C",
    "matrix shape": [
        44064,
        17415
    ],
    "Produced in step": "Step 2b",
    "Produced in notebook": "Producing contextual distributions",
    "C": {
        "from fp": "/home/AD/emeinhar/buckeye-lm/buckeye_contexts.txt",
        "changes": "sorted alphabetically",
        "size": 17415
    },
    "V": {
        "from fp": "/home/AD/emeinhar/fisher-lm/fisher_vocabulary_main.txt",
        "changes": "none - already sorted",
        "size": 44064
    }
}

## Creating a version that contains probabilities (and is normalized)

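We can't just naively convert (-) log-probabilities to probabilities if we're interested in distributions: restricted to the vocabulary $W$, the resulting values in each column do not sum to 1, as the column sums below show: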

In [63]:
# NB! assumes hVC is in base 2 surprisals...
dist_norms = np.sum(np.exp2(-1.0 * hVC), axis=0)
dist_norms

array([ 0.98952166,  0.92990437,  0.97399201, ...,  0.96380412,
        0.9685743 ,  0.86971511])

### Sketch of normalization process

In [64]:
random_context = choice(contexts_sorted)
random_context

'i did'

In [65]:
# Columns of hVC follow contexts_sorted (not the file order in `contexts`),
# so look the index up there.
contexts_sorted.index(random_context)

2993

In [66]:
hW_rc = np.array(par(delayed(score)(w, random_context) for w in vocabulary_sorted))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0117s.) Setting batch_size=34.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0252s.) Setting batch_size=538.
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 626 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 2218 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1808s.) Setting batch_size=1190.
[Parallel(n_jobs=-1)]: Done 44064 out of 44064 | elapsed:    0.4s finished


In [67]:
np.array_equal( hVC[:,contexts_sorted.index(random_context)], hW_rc )

True

In [68]:
pW_rc = np.exp2(-1.0 * hW_rc)
pW_rc[:10]
np.sum(pW_rc)

array([  6.08164937e-08,   6.08164937e-08,   2.41221365e-06,
         8.52301967e-08,   6.08164937e-08,   6.08164937e-08,
         6.08164937e-08,   1.48785937e-03,   2.66976272e-07,
         6.08164937e-08])

0.95962677681213815

In [69]:
hVC.shape

(44064, 17415)

In [70]:
# from https://stats.stackexchange.com/a/66621
# Worked example for a single column (hW_rc): shift the log-probabilities by
# their maximum, zero out anything below a precision threshold, exponentiate
# in base 2 and renormalize so the result sums to 1.
my_logprobs = -1.0 * hW_rc  # surprisals -> base-2 log-probabilities
my_epsilon = 10 ** (-1.0 * 16)
print('𝛆 = {0}'.format( my_epsilon ))

my_n = my_logprobs.shape[0]
print('n = {0}'.format( my_n ))

my_max = np.max(my_logprobs)
print('max(λᵢ) = λᵦ = {0}'.format( my_max ))

# Cut-off, in base-2 log units, applied to (log-probability - maximum).
my_threshold = np.log2(my_epsilon) - np.log2(my_n)
print('𝚹 = {0}'.format( my_threshold ))

# Number of entries that survive the cut-off.
mask = my_logprobs - my_max >= my_threshold
np.sum(mask)
# Scalar and vectorized forms of the same transformation; the assert below
# checks that they agree.
to_alpha = lambda logprob: np.exp2(logprob - my_max) if (logprob - my_max) >= my_threshold else 0.0
to_alpha_vec = lambda logprobs: np.exp2(logprobs - my_max) * (logprobs - my_max >= my_threshold)
my_alphas = np.array([to_alpha(l) for l in my_logprobs])
assert np.array_equal(my_alphas, to_alpha_vec(my_logprobs))
my_alpha_norm = np.sum(my_alphas)
my_probs = my_alphas / my_alpha_norm
np.sum(my_probs)

𝛆 = 1e-16
n = 44064
max(λᵢ) = λᵦ = -4.113823025793098
𝚹 = -68.57816236233276


44064

0.99999999999999989

## Normalization

In [71]:
# from https://stats.stackexchange.com/a/66621
# from https://stats.stackexchange.com/a/66621
def normalize_logprobs(logprobs, d=16, axis=0, b=None):
    """Turn unnormalized log-probabilities into probabilities that sum to 1.

    Each slice along `axis` is shifted by its own maximum, entries more than a
    precision threshold below that maximum are set to 0, the rest are
    exponentiated in base `b`, and the slice is divided by its sum.

    Parameters
    ----------
    logprobs : array_like
        Log-probabilities (1-D, or N-D with distributions laid out along `axis`).
    d : int or float
        Number of decimal digits of precision; the cut-off is
        ``log_b(10**-d) - log_b(n)`` where ``n`` is the length along `axis`.
    axis : int
        Axis along which each distribution lies.
    b : None or number
        Base of the logarithms. ``None`` means natural logarithms.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `logprobs`, summing to 1 along `axis`.

    Raises
    ------
    ValueError
        If `b` is given but is not a valid logarithm base (``b <= 0`` or ``b == 1``).
    """
    logprobs = np.asarray(logprobs, dtype=float)
    n = logprobs.shape[axis]
    epsilon = 10**(-1.0 * d)

    # Pick the antilog matching the base, plus the cut-off in that base.
    if b is None:
        threshold = np.log(epsilon) - np.log(n)
        antilog = np.exp
    elif b == 2:
        threshold = np.log2(epsilon) - np.log2(n)
        antilog = np.exp2
    elif b == 10:
        threshold = np.log10(epsilon) - np.log10(n)
        antilog = lambda x: np.power(10.0, x)
    else:
        if b <= 0 or b == 1:
            raise ValueError("b must be a positive number other than 1, got {0!r}".format(b))
        threshold = (np.log(epsilon) / np.log(b)) - (np.log(n) / np.log(b))
        antilog = lambda x: np.power(float(b), x)

    # Maximum of each distribution (reduced along `axis`, kept broadcastable),
    # taken from the argument itself, not from any notebook-level variable.
    maxlogp = np.max(logprobs, axis=axis, keepdims=True)
    shifted = logprobs - maxlogp

    # Entries below the cut-off contribute nothing at the requested precision.
    alpha_is = np.where(shifted >= threshold, antilog(shifted), 0.0)
    alpha_norm = np.sum(alpha_is, axis=axis, keepdims=True)
    probs = alpha_is / alpha_norm
    return probs

In [72]:
normalize_logprobs(-1.0 * hW_rc, b=2)

array([  6.33751529e-08,   6.33751529e-08,   2.51369982e-06, ...,
         6.33751529e-08,   8.88159842e-08,   6.33751529e-08])

In [73]:
normalize_logprobs(-1.0 * hVC[:,0], b=2)
normalize_logprobs(-1.0 * hVC[:,0], b=2).sum()

array([  2.46539838e-07,   2.46539838e-07,   1.45738607e-05, ...,
         2.46539838e-07,   3.45509264e-07,   2.46539838e-07])

1.0

In [74]:
if memory_map:
    pVC = np.memmap(o + '.pV_C', dtype='float64', mode='w+', shape=my_shape)

In [75]:
# if memory_map:
#     for j in range(my_shape[1]):
#         pVC[:,j] = normalize_logprobs(-1.0 * hVC[:,j], b=2)

In [76]:
def normColumn(j):
    """Normalize column `j` of ``hVC`` and store the result in column `j` of ``pVC``.

    The surprisals are negated to obtain base-2 log-probabilities before
    being passed to ``normalize_logprobs`` with ``b=2``.
    """
    column_logprobs = -1.0 * hVC[:, j]
    pVC[:, j] = normalize_logprobs(column_logprobs, b=2)

if memory_map:
    # takes 3.4m on wittgenstein with J=30 and other stuff going on in the background
    par(delayed(normColumn)(j) for j in range(num_contexts))

In [77]:
if not memory_map:
    #takes ~30s on wittgenstein with J=30 and other stuff going on in the background
    pVC = np.vstack(par(delayed(normalize_logprobs)(-1.0 * hVC[:,j])
                        for j in range(num_contexts))).T

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0191s.) Setting batch_size=20.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 228 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 548 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 1168 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 1428 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 1688 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 1988 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 2288 tasks      | elapsed:   14.7s
[Parallel(n_jobs=

In [79]:
N_test_indices = 5000
random_context_indices = choices(range(num_contexts), k=N_test_indices)

tests = [np.allclose(pVC[:,j], normalize_logprobs(-1.0 * hVC[:,j])) for j in tqdm(random_context_indices)]

all(tests)
assert all(tests)

100%|██████████| 5000/5000 [00:28<00:00, 177.12it/s]


True

In [80]:
if not memory_map:
    #takes ~1.25m on wittgenstein with other stuff going on in the background
    # Create (or overwrite) the on-disk array at o + '.pV_C' and copy the
    # in-memory probabilities into it.
    pVC_on_disk = np.memmap(o + '.pV_C', dtype='float64', mode='w+', shape=my_shape)
    pVC_on_disk[:,:] = pVC
    # Write the data out explicitly; pVC_on_disk stays alive after this cell,
    # so nothing else triggers a flush before the metadata is exported.
    pVC_on_disk.flush()

In [81]:
pVC.shape
pVC.dtype
pVC.nbytes / 1e9
np.sum(pVC, axis=0)

(44064, 17415)

dtype('float64')

6.13899648

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [82]:
listdir(output_dir)

['LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C.npy',
 'LD_fisher_vocab_in_buckeye_contexts.pV_C',
 'buckeye_contexts.txt',
 'LM_filtered_buckeye_contexts.txt',
 '.ipynb_checkpoints',
 'LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C',
 'Producing Fisher vocab in Buckeye contexts contextual distributions.ipynb',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C_metadata.json',
 'Filter LD_fisher_vocab_in_buckeye_contexts against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in buckeye contexts.ipynb',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C',
 'fisher_vocabulary_main.txt',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_C.npy']

In [84]:
# Describe the two matrix dimensions: 'C' (contexts) and 'V' (vocabulary).
# NOTE(review): the 'changes' strings are hard-coded, not derived from the
# data — confirm 'none - already sorted' holds for the vocabulary in use.
pVC_dim_md = {'C':{'from fp':c,
                   'changes':'sorted alphabetically',
                   'size':len(contexts_sorted)},
              'V':{'from fp':v,
                   'changes':'none - already sorted',
                   'size':len(vocabulary_sorted)}}
# other_md = {'Produced in step':'Step 2b',
#             'Base notebook name':'Producing contextual distributions'}

# Same call as for hVC, but pointing at the '.pV_C' file and adding a
# free-text comment as extra metadata.
exportMatrixMetadata(o+'.pV_C'+'_metadata.json',
                     o+'.pV_C',
                     pVC,
                     pVC_dim_md,
                     'Step 2b',
                     'Producing contextual distributions',
                     {'Comment':'Non-trivially normalized version of hVC with nearly the same name'})

Wrote metadata for 
	/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C
 to 
	/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C_metadata.json


In [85]:
!cat /home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C_metadata.json

{
    "matrix fp": "/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C",
    "matrix shape": [
        44064,
        17415
    ],
    "Produced in step": "Step 2b",
    "Produced in notebook": "Producing contextual distributions",
    "C": {
        "from fp": "/home/AD/emeinhar/buckeye-lm/buckeye_contexts.txt",
        "changes": "sorted alphabetically",
        "size": 17415
    },
    "V": {
        "from fp": "/home/AD/emeinhar/fisher-lm/fisher_vocabulary_main.txt",
        "changes": "none - already sorted",
        "size": 44064
    },
    "Comment": "Non-trivially normalized version of hVC with nearly the same name"
}