In [2]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Dependencies" data-toc-modified-id="Dependencies-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Dependencies</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-loading-data" data-toc-modified-id="Imports-/-loading-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / loading data</a></span><ul class="toc-item"><li><span><a href="#Language-model" data-toc-modified-id="Language-model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Language model</a></span></li><li><span><a href="#Contexts" data-toc-modified-id="Contexts-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Contexts</a></span></li><li><span><a href="#Vocabulary" data-toc-modified-id="Vocabulary-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Vocabulary</a></span></li></ul></li><li><span><a href="#Main-calculation" data-toc-modified-id="Main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Main calculation</a></span></li><li><span><a href="#Estimate-the-number-of-computations-+-space-required" data-toc-modified-id="Estimate-the-number-of-computations-+-space-required-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Estimate the number of computations + space required</a></span></li><li><span><a href="#Pick-out-relevant-functions-for-mapping-between-context/word-and-index" data-toc-modified-id="Pick-out-relevant-functions-for-mapping-between-context/word-and-index-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Pick out relevant functions for mapping between context/word and 
index</a></span></li><li><span><a href="#Construct-the-(memory-mapped)-array" data-toc-modified-id="Construct-the-(memory-mapped)-array-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Construct the (memory mapped) array</a></span></li></ul></div>

# Overview

Given 
 - a file path $m$ to a `.arpa` file (or, more realistically/practically, a `kenlm` memory mapped version of one) for a language model
 - a file path $c$ to a set of $n$-gram contexts $C$ (a `.txt` file with one context per line, where a context is a sequence of space-separated wordforms)
 - a file path $v$ to a vocabulary $W$ (a `.txt` file with one wordform per line)
 - a file path $o$ for the main output of the notebook
 
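this notebook will calculate the distribution $p(W|C)$ and store it as a memory-mapped `numpy` array of base-2 surprisals $-\log_2 p(w|c)$ (written to $o$), with one row per wordform and one column per context. A second memory-mapped array holding the corresponding probabilities, renormalized so that each column sums to 1, is written to $o$ + `.normed`.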

## Dependencies

 - `kenlm`
 - `numpy`
 - `joblib`

## Usage

# Parameters

In [3]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [4]:
# parameters

# m = ''
m = '/home/AD/emeinhar/fisher-lm' + '/' + 'fisher_utterances_main_4gram.mmap'

# c = ''
c = '/home/AD/emeinhar/buckeye-lm' + '/' + 'buckeye_contexts.txt'

#v = ''
v = '/home/AD/emeinhar/fisher-lm' + '/' + 'fisher_vocabulary_main.txt'

# o = ''
o = '/home/AD/emeinhar/wr' + '/' + 'LD_fisher_vocab_in_buckeye_contexts' + '/' + 'fisher_vocab_in_buckeye_contexts_LM.npy'

In [5]:
# Directory that will hold the memory-mapped output array(s).
output_dir = path.dirname(o)
# `path.dirname` returns '' when `o` has no directory component; the output then
# lands in the current working directory and there is nothing to create
# (`makedirs('')` would raise).
if output_dir and not path.exists(output_dir):
    print('Making ' + output_dir)
    # exist_ok covers the directory appearing between the check and this call.
    makedirs(output_dir, exist_ok=True)

In [6]:
%pwd

'/mnt/cube/home/AD/emeinhar/wr'

In [7]:
%ls

'1 initial directory setup.txt'
'2a alignment_paths_and_cmds.sh'
'Align transcriptions.ipynb'
 boilerplate.py
 [0m[01;34mCM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.05[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.05[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.05[0m/
 [01;34mCM_AmE_destressed_aligned_w_LTR_newdic_destre

# Imports / loading data

In [8]:
import kenlm
import csv
import numpy as np

In [88]:
from boilerplate import stamp, stampedNote

In [9]:
from joblib import Parallel, delayed

J = 30
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    """Return the argument unchanged (the very same object)."""
    result = x
    return result

def par(gen_expr):
    """
    Execute the `delayed(...)` calls produced by `gen_expr` with joblib.

    The pool is configured from the notebook-level settings `J` (number of jobs),
    `BACKEND`, `V` (verbosity) and `PREFER`. Returns whatever the `Parallel`
    object returns for `gen_expr`.
    """
    pool = Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return pool(gen_expr)

## Language model

In [10]:
model = kenlm.LanguageModel(m)

## Contexts

In [11]:
# Read the contexts file: one context per line, trailing whitespace removed,
# then freeze the result as a tuple.
contexts = []
with open(c) as file:
    contexts.extend(map(str.rstrip, file))
contexts = tuple(contexts)
len(contexts)
contexts[:10]

17415

('a',
 "aaron's",
 'ability',
 'about',
 'above',
 'absent',
 'absentee',
 'absolutely',
 'accept',
 'accommodate')

In [12]:
assert len(set(contexts)) == len(contexts), "Contexts must consist of unique strings."

## Vocabulary

In [13]:
# Read the vocabulary file: one wordform per line, trailing whitespace removed,
# then freeze the result as a tuple.
vocabulary = []
with open(v) as file:
    vocabulary.extend(map(str.rstrip, file))
vocabulary = tuple(vocabulary)
len(vocabulary)
vocabulary[:10]

44064

("'and",
 "'berserkly'",
 "'bout",
 "'burb",
 "'burban",
 "'burbs",
 "'cau",
 "'cause",
 "'cept",
 "'cide")

In [14]:
assert len(set(vocabulary)) == len(vocabulary), "Vocabulary must consist of unique wordforms."

# Main calculation

In [15]:
from random import choice

In [16]:
# Draw one random context to use as a worked example.
# NOTE(review): this rebinds `c`, which until this point held the contexts file
# path set in the parameters cell. Re-running the contexts-loading cell
# (`open(c)`) after this cell would try to open a context string, not the file.
c = choice(contexts)
c

# Draw one random wordform to pair with the context above.
w = choice(vocabulary)
w

'subleasing'

"doin'"

In [17]:
model.score("this is a sentence", eos = False)
model.score("this is a sentence")
model.score("this is a sentence", eos = True)
model.score("this is a sentence </s>", eos=False)
model.score("this is a sentence </s>")
tuple(model.full_scores("this is a sentence"))
sum(map(lambda triple: triple[0],
        tuple(model.full_scores("this is a sentence"))))
' '
tuple(model.full_scores("this is a sentence", eos=False, bos=False))
sum(map(lambda triple: triple[0],
        tuple(model.full_scores("this is a sentence", eos=False, bos=False))))

-9.022207260131836

-9.479642868041992

-9.479642868041992

-9.479642868041992

-11.116127967834473

((-2.684236526489258, 2, False),
 (-0.21483998000621796, 3, False),
 (-1.1899677515029907, 4, False),
 (-4.933162689208984, 2, False),
 (-0.45743539929389954, 3, False))

-9.47964234650135

' '

((-2.610599994659424, 1, False),
 (-1.2509719133377075, 2, False),
 (-1.429597020149231, 3, False),
 (-4.933162689208984, 2, False))

-10.224331617355347

In [18]:
from math import log10, log2

In [None]:
def score(word, context, base2=True, surprisal=True):
    """
    Score `word` in `context` using the module-level kenlm `model`.

    The string `context + ' ' + word` is passed to `model.full_scores` with
    `bos=False` and `eos=False`, and the first field of the last yielded entry
    is used as the log10 score of `word`.
    NOTE(review): this assumes `word` is a single whitespace-free token, so that
    the last entry corresponds to it — confirm for the vocabulary in use.

    Parameters
    ----------
    word : str
        The wordform to score.
    context : str
        Space-separated preceding wordforms.
    base2 : bool
        If True, convert the log10 score to log base 2; otherwise leave it in
        base 10.
    surprisal : bool
        If True, negate the log score.

    Returns
    -------
    float
        The (optionally base-converted, optionally negated) log score.
    """
    score_infos = tuple(model.full_scores(context + ' ' + word, eos=False, bos=False))
    # Start from the raw log10 value so the result is defined even when
    # base2=False (previously that path raised UnboundLocalError).
    key_score = score_infos[-1][0]
    if base2:
        key_score = key_score / log10(2)
    if surprisal:
        key_score = -1.0 * key_score
    return key_score

In [None]:
c
w
score(w, c)

'subleasing'

"doin'"

15.189254027088547

# Estimate the number of computations + space required

In [154]:
bits_per_cell = 64
bytes_per_cell = bits_per_cell / 8

len(contexts)
len(vocabulary)
"{:,}".format( len(contexts) * len(vocabulary) )
"{:,} GB".format( len(contexts) * len(vocabulary) * bytes_per_cell / 1e9)

17415

44064

'767,374,560'

'6.13899648 GB'

In [None]:
# from itertools import product

In [None]:
# computations = product(contexts, vocabulary)

In [None]:
# computations = tuple(product(contexts, vocabulary))

In [None]:
# from random import choices

In [None]:
# example_computations = choices(computations, k=10)
# example_computations
# ex = choice(example_computations)
# ex

# Pick out relevant functions for mapping between context/word and index

In [None]:
type(contexts)
type(vocabulary)

tuple

tuple

In [None]:
# ex
# contexts.index(ex[0])
# contexts[ contexts.index(ex[0]) ]

In [None]:
def score_np(word_idx, context_idx, base2=True, surprisal=True):
    """
    Index-based wrapper around `score`.

    Looks up `vocabulary[word_idx]` and `contexts[context_idx]` and returns
    `score` of that pair, forwarding `base2` and `surprisal` unchanged.
    """
    wordform = vocabulary[word_idx]
    ctxt = contexts[context_idx]
    return score(wordform, ctxt, base2=base2, surprisal=surprisal)

# Construct the (memory mapped) array

In [None]:
num_words = len(vocabulary)
num_contexts = len(contexts)
my_shape = (num_words, num_contexts) #columns are distributions
my_shape
num_words * num_contexts

(44064, 17415)

767374560

In [None]:
hVC = np.memmap(o, dtype='float64', mode='w+', shape=my_shape)
hVC.nbytes / 1e9
hVC.dtype

6.13899648

dtype('float64')

In [None]:
def define_score(word_idx, context_idx, base2=True, surprisal=True):
    """
    Compute the score for one (word, context) index pair and store it in
    `hVC[word_idx, context_idx]`. Returns None; the array write is the result.
    """
    cell_value = score_np(word_idx, context_idx, base2=base2, surprisal=surprisal)
    hVC[word_idx, context_idx] = cell_value

In [None]:
stampedNote("Started calculations")

In [None]:
# est 4h on wittgenstein with J=30 and other computations going on in the background
# Submits one `define_score` task per (context, word) pair, i.e.
# num_contexts * num_words tasks in total, looping over every word index for
# each context index. Each task stores its value into `hVC` as a side effect
# and returns None, so the value of this expression carries no information.
par(delayed(define_score)(w_idx, ctxt_idx) 
    for ctxt_idx in range(num_contexts) 
    for w_idx in range(num_words))

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0148s.) Setting batch_size=26.
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0976s.) Setting batch_size=106.
[Parallel(n_jobs=30)]: Done 268 tasks      | elapsed:    0.4s
[Parallel(n_jobs=30)]: Done 710 tasks      | elapsed:    1.0s
[Parallel(n_jobs=30)]: Done 1152 tasks      | elapsed:    1.4s
[Parallel(n_jobs=30)]: Done 1726 tasks      | elapsed:    2.2s
[Parallel(n_jobs=30)]: Batch computation too slow (2.0162s.) Setting batch_size=53.
[Parallel(n_jobs=30)]: Done 3740 tasks      | elapsed:    4.5s
[Parallel(n_jobs=30)]: Done 5966 tas

[Parallel(n_jobs=30)]: Done 8091341 tasks      | elapsed:  1.6min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0202s.) Setting batch_size=675.
[Parallel(n_jobs=30)]: Done 8260216 tasks      | elapsed:  1.7min
[Parallel(n_jobs=30)]: Batch computation too slow (3.9556s.) Setting batch_size=337.
[Parallel(n_jobs=30)]: Batch computation too slow (4.5944s.) Setting batch_size=168.
[Parallel(n_jobs=30)]: Batch computation too slow (2.4378s.) Setting batch_size=84.
[Parallel(n_jobs=30)]: Done 8326675 tasks      | elapsed:  1.8min
[Parallel(n_jobs=30)]: Done 8338519 tasks      | elapsed:  1.9min
[Parallel(n_jobs=30)]: Done 8352646 tasks      | elapsed:  1.9min
[Parallel(n_jobs=30)]: Done 8365000 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Batch computation too slow (2.9880s.) Setting batch_size=42.
[Parallel(n_jobs=30)]: Batch computation too slow (2.8551s.) Setting batch_size=21.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0087s.) Setting batch_size=966.
[Parallel(n_

[Parallel(n_jobs=30)]: Batch computation too slow (2.7299s.) Setting batch_size=442.
[Parallel(n_jobs=30)]: Batch computation too slow (2.1255s.) Setting batch_size=221.
[Parallel(n_jobs=30)]: Done 21124249 tasks      | elapsed:  5.1min
[Parallel(n_jobs=30)]: Batch computation too slow (3.3994s.) Setting batch_size=110.
[Parallel(n_jobs=30)]: Batch computation too slow (5.5248s.) Setting batch_size=55.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1957s.) Setting batch_size=112.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0156s.) Setting batch_size=2876.
[Parallel(n_jobs=30)]: Done 21163005 tasks      | elapsed:  5.2min
[Parallel(n_jobs=30)]: Done 21653355 tasks      | elapsed:  5.3min
[Parallel(n_jobs=30)]: Done 22179663 tasks      | elapsed:  5.3min
[Parallel(n_jobs=30)]: Done 22711723 tasks      | elapsed:  5.4min
[Parallel(n_jobs=30)]: Done 23243783 tasks      | elapsed:  5.4min
[Parallel(n_jobs=30)]: Done 23781595 tasks      | elapsed:  5.5min
[Parallel(n_jobs=30)]

[Parallel(n_jobs=30)]: Batch computation too slow (3.3932s.) Setting batch_size=301.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0565s.) Setting batch_size=2130.
[Parallel(n_jobs=30)]: Done 38353768 tasks      | elapsed:  9.4min
[Parallel(n_jobs=30)]: Done 38851177 tasks      | elapsed:  9.5min
[Parallel(n_jobs=30)]: Done 39351727 tasks      | elapsed:  9.5min
[Parallel(n_jobs=30)]: Done 39856537 tasks      | elapsed:  9.6min
[Parallel(n_jobs=30)]: Done 40361347 tasks      | elapsed:  9.7min
[Parallel(n_jobs=30)]: Done 40870417 tasks      | elapsed:  9.7min
[Parallel(n_jobs=30)]: Batch computation too slow (2.1506s.) Setting batch_size=1065.
[Parallel(n_jobs=30)]: Batch computation too slow (4.3300s.) Setting batch_size=532.
[Parallel(n_jobs=30)]: Batch computation too slow (2.0016s.) Setting batch_size=266.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0637s.) Setting batch_size=1668.
[Parallel(n_jobs=30)]: Done 41197881 tasks      | elapsed:  9.9min
[Parallel(n_jobs=3

[Parallel(n_jobs=30)]: Done 56442654 tasks      | elapsed: 15.3min
[Parallel(n_jobs=30)]: Done 57084354 tasks      | elapsed: 15.3min
[Parallel(n_jobs=30)]: Done 57730654 tasks      | elapsed: 15.4min
[Parallel(n_jobs=30)]: Batch computation too slow (2.2491s.) Setting batch_size=1150.
[Parallel(n_jobs=30)]: Batch computation too slow (10.3239s.) Setting batch_size=575.
[Parallel(n_jobs=30)]: Batch computation too slow (11.4936s.) Setting batch_size=287.
[Parallel(n_jobs=30)]: Batch computation too slow (4.3354s.) Setting batch_size=143.
[Parallel(n_jobs=30)]: Done 58243833 tasks      | elapsed: 16.0min
[Parallel(n_jobs=30)]: Batch computation too slow (2.4161s.) Setting batch_size=71.
[Parallel(n_jobs=30)]: Batch computation too slow (2.3274s.) Setting batch_size=35.
[Parallel(n_jobs=30)]: Batch computation too slow (14.5890s.) Setting batch_size=17.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0085s.) Setting batch_size=800.
[Parallel(n_jobs=30)]: Done 58338825 tasks      | e

[Parallel(n_jobs=30)]: Done 69782843 tasks      | elapsed: 22.0min
[Parallel(n_jobs=30)]: Batch computation too slow (2.2462s.) Setting batch_size=972.
[Parallel(n_jobs=30)]: Batch computation too slow (7.0462s.) Setting batch_size=486.
[Parallel(n_jobs=30)]: Done 70384817 tasks      | elapsed: 22.3min
[Parallel(n_jobs=30)]: Batch computation too slow (24.2854s.) Setting batch_size=243.
[Parallel(n_jobs=30)]: Batch computation too slow (3.7952s.) Setting batch_size=121.
[Parallel(n_jobs=30)]: Batch computation too slow (2.1941s.) Setting batch_size=60.
[Parallel(n_jobs=30)]: Batch computation too slow (2.6196s.) Setting batch_size=30.
[Parallel(n_jobs=30)]: Batch computation too slow (2.0024s.) Setting batch_size=15.
[Parallel(n_jobs=30)]: Done 70491327 tasks      | elapsed: 23.0min
[Parallel(n_jobs=30)]: Batch computation too slow (4.4487s.) Setting batch_size=7.
[Parallel(n_jobs=30)]: Batch computation too slow (2.2271s.) Setting batch_size=3.
[Parallel(n_jobs=30)]: Batch computation

[Parallel(n_jobs=30)]: Done 79823681 tasks      | elapsed: 29.1min
[Parallel(n_jobs=30)]: Batch computation too fast (0.1984s.) Setting batch_size=100.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1766s.) Setting batch_size=226.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0273s.) Setting batch_size=3314.
[Parallel(n_jobs=30)]: Done 80046361 tasks      | elapsed: 29.2min
[Parallel(n_jobs=30)]: Done 81183063 tasks      | elapsed: 29.4min
[Parallel(n_jobs=30)]: Batch computation too slow (2.3210s.) Setting batch_size=1657.
[Parallel(n_jobs=30)]: Done 82308166 tasks      | elapsed: 29.5min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0640s.) Setting batch_size=828.
[Parallel(n_jobs=30)]: Batch computation too slow (7.6861s.) Setting batch_size=414.
[Parallel(n_jobs=30)]: Batch computation too slow (4.4960s.) Setting batch_size=207.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1680s.) Setting batch_size=492.
[Parallel(n_jobs=30)]: Batch computation too slow 

[Parallel(n_jobs=30)]: Batch computation too slow (2.3669s.) Setting batch_size=463.
[Parallel(n_jobs=30)]: Batch computation too slow (15.2584s.) Setting batch_size=231.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1726s.) Setting batch_size=534.
[Parallel(n_jobs=30)]: Done 105251922 tasks      | elapsed: 35.3min
[Parallel(n_jobs=30)]: Batch computation too fast (0.1963s.) Setting batch_size=1088.
[Parallel(n_jobs=30)]: Batch computation too slow (2.0845s.) Setting batch_size=544.
[Parallel(n_jobs=30)]: Batch computation too slow (9.1902s.) Setting batch_size=272.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0622s.) Setting batch_size=1746.
[Parallel(n_jobs=30)]: Batch computation too slow (3.3986s.) Setting batch_size=873.
[Parallel(n_jobs=30)]: Batch computation too slow (2.1757s.) Setting batch_size=436.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0730s.) Setting batch_size=2388.
[Parallel(n_jobs=30)]: Done 105669883 tasks      | elapsed: 35.7min
[Parallel(

[Parallel(n_jobs=30)]: Batch computation too slow (2.3760s.) Setting batch_size=687.
[Parallel(n_jobs=30)]: Batch computation too slow (10.1131s.) Setting batch_size=343.
[Parallel(n_jobs=30)]: Batch computation too slow (4.8352s.) Setting batch_size=171.
[Parallel(n_jobs=30)]: Batch computation too slow (6.6974s.) Setting batch_size=85.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0231s.) Setting batch_size=1470.
[Parallel(n_jobs=30)]: Batch computation too slow (2.1630s.) Setting batch_size=735.
[Parallel(n_jobs=30)]: Batch computation too slow (27.2831s.) Setting batch_size=367.
[Parallel(n_jobs=30)]: Batch computation too slow (15.8696s.) Setting batch_size=183.
[Parallel(n_jobs=30)]: Batch computation too slow (4.5464s.) Setting batch_size=91.
[Parallel(n_jobs=30)]: Batch computation too slow (7.3119s.) Setting batch_size=45.
[Parallel(n_jobs=30)]: Done 111249118 tasks      | elapsed: 46.3min
[Parallel(n_jobs=30)]: Batch computation too fast (0.0126s.) Setting batch_size=1

[Parallel(n_jobs=30)]: Done 127035446 tasks      | elapsed: 53.6min
[Parallel(n_jobs=30)]: Done 127401807 tasks      | elapsed: 53.7min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0054s.) Setting batch_size=413.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1984s.) Setting batch_size=832.
[Parallel(n_jobs=30)]: Done 127725342 tasks      | elapsed: 53.7min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0001s.) Setting batch_size=416.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1930s.) Setting batch_size=862.
[Parallel(n_jobs=30)]: Done 128062556 tasks      | elapsed: 53.8min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0131s.) Setting batch_size=431.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0945s.) Setting batch_size=1822.
[Parallel(n_jobs=30)]: Batch computation too slow (2.0703s.) Setting batch_size=911.
[Parallel(n_jobs=30)]: Done 128619065 tasks      | elapsed: 54.0min
[Parallel(n_jobs=30)]: Done 129026282 tasks      | elapsed: 54.0

[Parallel(n_jobs=30)]: Done 149668962 tasks      | elapsed: 59.0min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0286s.) Setting batch_size=929.
[Parallel(n_jobs=30)]: Done 150303200 tasks      | elapsed: 59.1min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0162s.) Setting batch_size=464.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1170s.) Setting batch_size=1586.
[Parallel(n_jobs=30)]: Batch computation too slow (2.1173s.) Setting batch_size=793.
[Parallel(n_jobs=30)]: Done 150853328 tasks      | elapsed: 59.3min
[Parallel(n_jobs=30)]: Batch computation too fast (0.1972s.) Setting batch_size=1608.
[Parallel(n_jobs=30)]: Batch computation too slow (2.0261s.) Setting batch_size=804.
[Parallel(n_jobs=30)]: Done 151485010 tasks      | elapsed: 59.4min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0361s.) Setting batch_size=402.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1954s.) Setting batch_size=822.
[Parallel(n_jobs=30)]: Done 151859770 tasks   

[Parallel(n_jobs=30)]: Done 172355058 tasks      | elapsed: 64.7min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0046s.) Setting batch_size=647.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1973s.) Setting batch_size=1310.
[Parallel(n_jobs=30)]: Batch computation too slow (2.0066s.) Setting batch_size=655.
[Parallel(n_jobs=30)]: Done 172836213 tasks      | elapsed: 64.9min
[Parallel(n_jobs=30)]: Batch computation too fast (0.1964s.) Setting batch_size=1332.
[Parallel(n_jobs=30)]: Done 173270041 tasks      | elapsed: 65.0min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0320s.) Setting batch_size=666.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1991s.) Setting batch_size=1336.
[Parallel(n_jobs=30)]: Done 173788917 tasks      | elapsed: 65.2min
[Parallel(n_jobs=30)]: Batch computation too slow (2.0477s.) Setting batch_size=668.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1058s.) Setting batch_size=2526.
[Parallel(n_jobs=30)]: Batch computation too

[Parallel(n_jobs=30)]: Batch computation too slow (2.0751s.) Setting batch_size=803.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1996s.) Setting batch_size=1608.
[Parallel(n_jobs=30)]: Done 187413389 tasks      | elapsed: 71.2min
[Parallel(n_jobs=30)]: Batch computation too slow (2.5587s.) Setting batch_size=804.
[Parallel(n_jobs=30)]: Batch computation too slow (20.6492s.) Setting batch_size=402.
[Parallel(n_jobs=30)]: Batch computation too slow (18.2973s.) Setting batch_size=201.
[Parallel(n_jobs=30)]: Batch computation too slow (9.6873s.) Setting batch_size=100.
[Parallel(n_jobs=30)]: Batch computation too slow (8.3791s.) Setting batch_size=50.
[Parallel(n_jobs=30)]: Batch computation too slow (4.5537s.) Setting batch_size=25.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0077s.) Setting batch_size=1292.
[Parallel(n_jobs=30)]: Batch computation too slow (2.4615s.) Setting batch_size=646.
[Parallel(n_jobs=30)]: Batch computation too slow (14.9831s.) Setting batch_size

In [None]:
stampedNote("Ended calculations")

In [37]:
o

'/home/AD/emeinhar/wr/LD_fisher_in_buckeye/fisher_in_buckeye_LM.npy'

In [89]:
listdir(output_dir)

['fisher_in_buckeye_LM.npy']

In [43]:
hVC.nbytes / 1e9
hVC.shape
hVC.dtype

6.13899648

(44064, 17415)

dtype('float64')

In [75]:
dist_norms = np.sum(np.exp2(-1.0 * hVC), axis=0)
dist_norms

array([0.98952166, 0.98384789, 0.86949554, ..., 0.93416071, 0.96380412,
       0.86971511])

In [58]:
random_context = choice(contexts)
random_context

'oh mine drives'

In [61]:
contexts.index(random_context)

12416

In [68]:
hW_rc = np.array(par(delayed(score)(w, random_context) for w in vocabulary))

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0045s.) Setting batch_size=88.
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0256s.) Setting batch_size=1372.
[Parallel(n_jobs=30)]: Done 764 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done 2260 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done 44064 out of 44064 | elapsed:    0.2s finished


In [69]:
np.array_equal( hVC[:,contexts.index(random_context)], hW_rc )

True

In [87]:
pW_rc = np.exp2(-1.0 * hW_rc)
pW_rc[:10]
np.sum(pW_rc)

array([6.36551410e-07, 6.36551410e-07, 2.52480522e-05, 8.92084666e-07,
       6.36551410e-07, 6.36551410e-07, 6.36551410e-07, 1.03142636e-03,
       2.79437554e-06, 6.36551410e-07])

0.8959708740033729

In [113]:
my_logprobs = -1.0 * hW_rc
my_epsilon = 10 ** (-1.0 * 16)
print('𝛆 = {0}'.format( my_epsilon ))

my_n = my_logprobs.shape[0]
print('n = {0}'.format( my_n ))

my_max = np.max(my_logprobs)
print('max(λᵢ) = λᵦ = {0}'.format( my_max ))

my_threshold = np.log2(my_epsilon) - np.log2(my_n)
print('𝚹 = {0}'.format( my_threshold ))

mask = my_logprobs - my_max >= my_threshold
np.sum(mask)
to_alpha = lambda logprob: np.exp2(logprob - my_max) if (logprob - my_max) >= my_threshold else 0.0
my_alphas = np.array([to_alpha(l) for l in my_logprobs])
my_alpha_norm = np.sum(my_alphas)
my_probs = my_alphas / my_alpha_norm
np.sum(my_probs)

𝛆 = 1e-16
n = 44064
max(λᵢ) = λᵦ = -2.995823580275348
𝚹 = -68.57816236233276


44064

1.0

In [115]:
np.isclose(1.0, 1.00000001)

True

In [142]:
# from https://stats.stackexchange.com/a/66621
# def normalize_logprobs(logprobs, d=16, axis=0, b=None):
def normalize_logprobs(logprobs, d=16, b=None):
    """
    Turn (possibly unnormalized) log-probabilities into probabilities summing to 1.

    Implements the max-shift approach described at
    https://stats.stackexchange.com/a/66621 : subtract the largest
    log-probability, drop terms too small to affect the sum at the requested
    precision, exponentiate, and divide by the total.

    Parameters
    ----------
    logprobs : array-like
        Log-probabilities expressed in base `b`. A 1-D input is treated as a
        single distribution; for higher-dimensional input each slice along
        axis 0 (e.g. each column of a 2-D array) is normalized independently.
    d : int or float
        Desired number of decimal digits of precision. Entries whose shifted
        log-probability is below log_b(10**-d) - log_b(n) are set to exactly 0,
        where n is the length of axis 0.
    b : None or positive number other than 1
        Base of the logarithms in `logprobs`. None means natural log.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as `logprobs`, summing to 1 along axis 0.

    Raises
    ------
    ValueError
        If `logprobs` is empty or zero-dimensional, or `b` is not a valid base.
    """
    axis = 0
    logprobs = np.asarray(logprobs, dtype=float)
    if logprobs.ndim == 0 or logprobs.shape[axis] == 0:
        raise ValueError("logprobs must be a non-empty array with at least one dimension.")
    if b is not None and (b <= 0 or b == 1):
        raise ValueError("b must be None or a positive number other than 1.")

    n = logprobs.shape[axis]
    # Natural log of the base; multiplying a base-b log by this gives a natural log.
    ln_base = 1.0 if b is None else np.log(b)
    # log_b(epsilon) - log_b(n) with epsilon = 10**-d, computed analytically.
    threshold = (-1.0 * d * np.log(10.0) - np.log(n)) / ln_base

    # Shift by the maximum of *this* input (not a notebook global) so the largest
    # term becomes b**0 == 1 and nothing overflows.
    shifted = logprobs - np.max(logprobs, axis=axis, keepdims=True)
    # b**shifted == exp(shifted * ln(b)); terms below the threshold contribute
    # nothing at the requested precision and are zeroed.
    alpha_is = np.where(shifted >= threshold, np.exp(shifted * ln_base), 0.0)
    alpha_norm = np.sum(alpha_is, axis=axis, keepdims=True)
    probs = alpha_is / alpha_norm
    assert np.allclose(np.sum(probs, axis=axis), 1.0)
    return probs

In [143]:
normalize_logprobs(-1.0 * hW_rc, b=2)

array([7.10459937e-07, 7.10459937e-07, 2.81795457e-05, ...,
       7.10459937e-07, 9.95662574e-07, 7.10459937e-07])

In [144]:
normalize_logprobs(-1.0 * hVC[:,0], b=2)
normalize_logprobs(-1.0 * hVC[:,0], b=2).sum()

array([2.46539838e-07, 2.46539838e-07, 1.45738607e-05, ...,
       2.46539838e-07, 3.45509264e-07, 2.46539838e-07])

0.9999999999999998

In [148]:
pVC_normed = np.memmap(o + '.normed', dtype='float64', mode='w+', shape=my_shape)

In [147]:
# for j in range(my_shape[1]):
#     pVC_normed[:,j] = normalize_logprobs(-1.0 * hVC[:,j], b=2)

KeyboardInterrupt: 

In [149]:
def normColumn(j):
    """
    Normalize column `j` of the surprisal array and store the resulting
    probabilities in column `j` of `pVC_normed`. Returns None.
    """
    # `hVC` holds base-2 surprisals (filled via `define_score` with its defaults),
    # so negating gives base-2 log-probabilities. The previous reference to
    # `pVC` named an array that is never defined in this notebook.
    pVC_normed[:,j] = normalize_logprobs(-1.0 * hVC[:,j], b=2)
    
# takes 3.4m on wittgenstein with J=30 and other stuff going on in the background
par(delayed(normColumn)(j) for j in range(num_contexts))

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done  85 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 102 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 121 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 161 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 182 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 205 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 228 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 253 tasks      |

[Parallel(n_jobs=30)]: Done 9881 tasks      | elapsed:  2.3min
[Parallel(n_jobs=30)]: Done 10022 tasks      | elapsed:  2.3min
[Parallel(n_jobs=30)]: Done 10165 tasks      | elapsed:  2.4min
[Parallel(n_jobs=30)]: Done 10308 tasks      | elapsed:  2.4min
[Parallel(n_jobs=30)]: Done 10453 tasks      | elapsed:  2.4min
[Parallel(n_jobs=30)]: Done 10598 tasks      | elapsed:  2.4min
[Parallel(n_jobs=30)]: Done 10745 tasks      | elapsed:  2.4min
[Parallel(n_jobs=30)]: Done 10892 tasks      | elapsed:  2.4min
[Parallel(n_jobs=30)]: Done 11041 tasks      | elapsed:  2.6min
[Parallel(n_jobs=30)]: Done 11190 tasks      | elapsed:  2.6min
[Parallel(n_jobs=30)]: Done 11341 tasks      | elapsed:  2.6min
[Parallel(n_jobs=30)]: Done 11492 tasks      | elapsed:  2.7min
[Parallel(n_jobs=30)]: Done 11645 tasks      | elapsed:  2.7min
[Parallel(n_jobs=30)]: Done 11798 tasks      | elapsed:  2.7min
[Parallel(n_jobs=30)]: Done 11953 tasks      | elapsed:  2.7min
[Parallel(n_jobs=30)]: Done 12108 tasks  

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [146]:
# pVC_normed[:,:] = normalize_logprobs(-1.0 * pVC, axis=0, b=2)
# pVC_normed.shape

In [155]:
pVC_normed.shape
pVC_normed.dtype
pVC_normed.nbytes / 1e9
np.sum(pVC_normed, axis=0)

(44064, 17415)

dtype('float64')

6.13899648

array([1., 1., 1., ..., 1., 1., 1.])

In [152]:
listdir(output_dir)

['fisher_in_buckeye_LM.npy.normed', 'fisher_in_buckeye_LM.npy']