In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Usage" data-toc-modified-id="Usage-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Usage</a></span></li><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Requirements</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Main-calculation" data-toc-modified-id="Main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Main calculation</a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Overview

Given 
 - a filepath $d$ to an `.npy` file defining a distribution $p(V|C)$ on orthographic wordforms given $n$-gram contexts $C$
 - a filepath prefix $w$ pointing towards an `.npz` file defining a sparse distribution $p(W|V)$ on segmental wordforms given an orthographic wordform $v$ (where $V$ is the same as in $d$)
 - output filepath prefix $o$
 
this notebook calculates $p(W|C)$ and writes it to the filepath $o$ as an `.npy` file.

## Usage

In [None]:
#FIXME

## Requirements

 - `numpy`
 - the `pydata` `sparse` package

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
# Parameters

d = ''
# d = 'LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C.npy'

w = ''
# w = 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz'

o = ''
# o = 'LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pW_C'

In [5]:
ensure_dir_exists(path.dirname(o))

# Imports / load data

In [6]:
from boilerplate import *

In [7]:
from probdist import *

In [8]:
import numpy as np
# import torch
import sparse

In [8]:
# from tqdm import tqdm

In [9]:
# # pVC_mmap = np.memmap(d, dtype='float64', mode='r', shape=new_shape)
# pVC_mmap = np.memmap(d, dtype='float64', mode='r')
# pVC_mmap.shape

In [9]:
# pV_C = np.load(d + '.npy')
pV_C = np.load(d)
pV_C.shape
pV_C.dtype
pV_C.nbytes / 1e9

(9411, 106295)

dtype('float64')

8.00273796

In [10]:
np.sum(pV_C, axis=0) #should all be close to 1
# np.sum(pV_C, axis=1) #nonsense

assert np.allclose(np.sum(pV_C, axis=0), np.ones(shape = np.sum(pV_C, axis=0).shape ))

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [11]:
# pW_V = sparse.load_npz(w + '.pW_V.npz')
pW_V = sparse.load_npz(w)
pW_V.shape
pW_V.dtype
pW_V.nbytes / 1e9
pW_V.density

(9172, 9411)

dtype('float64')

0.000225864

0.00010902747492368077

In [12]:
dist_sums = pW_V.sum(axis=0).todense() #should all be 1
dist_sums
assert np.allclose(dist_sums, np.ones(shape=dist_sums.shape))

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

# Main calculation

In [13]:
pW_C = sparse.matmul(pW_V, pV_C)
pW_C.shape
pW_C.dtype
pW_C.nbytes / 1e9

(9172, 106295)

dtype('float64')

7.79950192

In [14]:
# np.sum(pV_C, axis=0)
# np.sum(pV_C, axis=1) #nonsense

In [15]:
np.sum(pW_C, axis=0)
# np.sum(pW_C, axis=1) #nonsense
assert np.allclose(np.sum(pW_C, axis=0), np.ones(shape = np.sum(pW_C, axis=0).shape) )

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [None]:
#sanity checking

In [16]:
d
w

'LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C.npy'

'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz'

In [20]:
d.split('.npy')[0]
importDict(d.split('.npy')[0] + '_metadata.json')

'LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C'

{'matrix fp': 'LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C',
 'matrix shape': [9411, 106295],
 'Produced in step': 'Step 3d',
 'Produced in notebook': 'Filter contextual lexicon distribution by transcription lexicon',
 'C': {'from fp': 'swbd2003_contexts.txt',
  'changes': '1. Filtered by removing contexts with orthWords not in the LM = not in LM_Fisher/fisher_vocabulary_main.txt\n2. sorted',
  'new fp': 'LD_Fisher_vocab_in_swbd2003_contexts/LM_filtered_swbd2003_contexts.txt',
  'size': 106295},
 'V': {'from fp': 'LM_Fisher/fisher_vocabulary_main.txt',
  'changes': '1. Filtered by removing orthWords not in LTR @ LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.tsv\n2. sorted',
  'new fp': 'orthographic words in LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.tsv',
  'size': 9411}}

In [21]:
importDict(w + '_metadata.json')

{'matrix fp': 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'matrix shape': [9172, 9411],
 'Produced in step': 'Step 3e',
 'Produced in notebook': 'Define a conditional distribution on segmental wordforms given an orthographic one',
 'V': {'from fp': 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.tsv',
  'changes': 'sorted',
  'size': 9411,
  'new fp': 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_Orthographic_Wordforms.txt'},
 'W': {'from fp': 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.tsv',
  'changes': 'sorted',
  'size': 9172,
  'new fp': 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_Transcriptions.txt'},
 'Comment': 'This contains exactly the same information as the json file L

# Export

In [23]:
np.save(o, pW_C)

In [22]:
o

'LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pW_C'

In [24]:
listdir(path.dirname(o))

['LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C_metadata.json',
 '.ipynb_checkpoints',
 'Calculate segmental wordform distribution for LTR_CMU_destressed_aligned_CM_filtered_LM_filtered in swbd2003 contexts.ipynb',
 'Calculate orthographic posterior given segmental wordform + context for LTR_newdic_destressed_aligned_CM_filtered_LM_filtered in swbd2003 contexts.ipynb',
 'LD_fisher_vocab_in_swbd2003_contexts.hV_C',
 'fisher_vocabulary_main.txt',
 'swbd2003_contexts.txt',
 'Calculate segmental wordform distribution for LTR_newdic_destressed_aligned_CM_filtered_LM_filtered in swbd2003 contexts.ipynb',
 'Filter LD_fisher_vocab_in_swbd2003_contexts against LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.ipynb',
 'LTR_CMU_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pW_C.npy',
 'LM_filtered_swbd2003_contexts.txt',
 'LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C',
 'LD_fisher_vocab_in_swbd2003_contexts_projected_

In [None]:
# pVC_dim_md = {'C':{'from fp':c,
#                    'changes':'sorted alphabetically',
#                    'size':len(contexts_sorted)},
#               'V':{'from fp':v,
#                    'changes':'none - already sorted',
#                    'size':len(vocabulary_sorted)}}
# # other_md = {'Produced in step':'Step 2b',
# #             'Base notebook name':'Producing contextual distributions'}

# exportMatrixMetadata(o+'.pV_C'+'_metadata.json',
#                      o+'.pV_C',
#                      pVC,
#                      pVC_dim_md,
#                      'Step 2b',
#                      'Producing contextual distributions',
#                      {'Comment':'Non-trivially normalized version of hVC with nearly the same name'})

In [25]:
pV_C.shape
pW_V.shape

(9411, 106295)

(9172, 9411)

In [26]:
pW_C_md = {'W':{'from fp':f'implicitly associated with {w}',
                'changes':'N/A',
                'size':pW_V.shape[0]},
           'V':{'from fp':f'implicitly associated with both \n\t{d} and \n\t{w}',
                'changes':'N/A',
                'size':pW_V.shape[1]},
           'C':{'from fp':f'implicitly associated with {d}',
                'changes':'N/A',
                'size':pV_C.shape[1]}}

exportMatrixMetadata(o+'.pW_C'+'_metadata.json',
                     o+'.pW_C'+'.npy',
                     pW_C,
                     pW_C_md,
                     'Step 4c',
                     'Calculate segmental wordform distribution given corpus contexts',
                     {})

Wrote metadata for 
	LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pW_C.pW_C.npy
 to 
	LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pW_C.pW_C_metadata.json


In [27]:
importDict(o + '.pW_C' + '_metadata.json')

{'matrix fp': 'LD_Fisher_vocab_in_swbd2003_contexts/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_in_swbd2003_contexts.pW_C.pW_C.npy',
 'matrix shape': [9172, 106295],
 'Produced in step': 'Step 4c',
 'Produced in notebook': 'Calculate segmental wordform distribution given corpus contexts',
 'W': {'from fp': 'implicitly associated with LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
  'changes': 'N/A',
  'size': 9172},
 'V': {'from fp': 'implicitly associated with both \n\tLD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C.npy and \n\tLTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
  'changes': 'N/A',
  'size': 9411},
 'C': {'from fp': 'implicitly associated with LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts_projected_LTR_newdic_destressed.pV_C.npy',
  'change