In [2]:
# Display the value of every top-level expression statement in a cell, not just
# the last one. Several cells below rely on this to show more than one
# intermediate value (e.g. a shape and a length) from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Usage" data-toc-modified-id="Usage-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Usage</a></span></li><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Requirements</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Filter-the-input-$p(V|C)$" data-toc-modified-id="Filter-the-input-$p(V|C)$-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Filter the input $p(V|C)$</a></span></li><li><span><a href="#...and-write-to-disk" data-toc-modified-id="...and-write-to-disk-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>...and write to disk</a></span></li></ul></div>

# Overview

Given 
 - a filepath $d$ to a memory-mapped `numpy` array defining a distribution over an orthographic vocabulary $p(V|C)$ (conditioned on $n$-gram contexts $C$), where each column is a distribution
 - a filepath $v$ to a `.txt` file containing the orthographic vocabulary associated with $d$
    - 'associated with $d$' means that the ordering of items in $v$ corresponds to the ordering of rows of $d$
 - a filepath $c$ to a `.txt` file containing the $n$-gram contexts associated with $d$
    - 'associated with $d$' means that the ordering of items in $c$ corresponds to the ordering of columns of $d$
 - a filepath $l$ to a `.tsv` file (e.g. a transcription lexicon file) with a subset of the vocabulary in $v$ (under a column labeled `Orthographic_Wordform`)
 - an output filepath $o$
 
this notebook produces a version $d'$ of $d$ that is defined only on the words in $l$, renormalized so that each column is again a distribution, and writes it to $o$. The orthographic words (rows) of $d'$ are sorted alphabetically.

If the optional flag $f$ is `'True'` (the only other accepted value is the empty string, which disables this step), then, in the course of producing $d'$, this notebook also filters out every column whose associated context contains a wordform not in the language model's vocabulary $v$, and writes the list of remaining contexts to a new file $c'$ in the same output directory as $o$.

## Usage

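**FIXME:** usage instructions have not been written yet. The inputs $d$, $v$, $c$, $l$, $o$ and $f$ described above are set in the [Parameters](#Parameters) section.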

## Requirements

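 - `numpy`
 - `IPython`
 - the modules `boilerplate` and `probdist`, which this notebook imports with `from ... import *` (presumably project-local helpers)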

# Parameters

In [3]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [4]:
from boilerplate import *

In [5]:
# Parameters
#
# Every parameter is a string; '' means "not provided". The commented-out
# assignment under each parameter is an example value.

# d: filepath of the input distribution. When v and c are both given, d is
# memory-mapped as a float64 array with one row per vocabulary item and one
# column per context. When v and c are both '', d must be a unigram
# distribution whose filename contains '.pV.json'.
d = ''
# d = 'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C'

# v: filepath of the .txt vocabulary file for d. v and c must either both be
# given or both be ''.
v = ''
# v = 'LD_Fisher_vocab_in_Buckeye_contexts/fisher_vocabulary_main.txt'

# c: filepath of the .txt file of n-gram contexts for d (see the note on v).
c = ''
# c = 'LD_Fisher_vocab_in_Buckeye_contexts/buckeye_contexts.txt'

# l: filepath of a .tsv file with an 'Orthographic_Wordform' column; the output
# distribution is restricted to these wordforms.
l = ''
# l = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv'
# this default/testing choice of l should throw an error below...

# o: output filepath; its directory part is also used for the other outputs.
o = ''
# o = 'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_filtered_LTR_Buckeye.pV_C'

# f: 'True' to also drop contexts containing wordforms that are not in the
# vocabulary; otherwise ''. Any other value (including 'False') fails the
# assertion on f further down.
f = ''
# f = 'True'

In [10]:
# d = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json'
# v = ''
# c = ''
# l = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv'
# o = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered'
# f = ''

In [7]:
# d = "LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C"
# v = "LM_Fisher/fisher_vocabulary_main.txt"
# c = "buckeye_contexts.txt"
# l = "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv"
# o = "LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C"
# f = 'True'

In [8]:
# Directory that receives every file this notebook writes.
# path.dirname returns '' when o has no directory component; fall back to '.'
# so that the directory check here and the later listdir(output_dir) refer to
# the current working directory instead of failing on an empty path.
output_dir = path.dirname(o) or '.'
ensure_dir_exists(output_dir)

In [11]:
# f is an optional flag: only 'True' (filter contexts) or '' (do not) are accepted.
assert f in ('True', ''), f"f argument, if provided, must either be 'True' or the empty string, got {f} instead."

In [12]:
if f == 'True':
    # Provisional name for the filtered context list: the context filepath
    # without its extension, tagged with the basename of the vocabulary file.
    # NOTE(review): c_prime is reassigned in the write-to-disk section before
    # anything visible in this notebook reads it - confirm this value is still needed.
    c_no_ext, _ = path.splitext(c)
    v_stem = path.basename(path.splitext(v)[0])
    c_prime = f'{c_no_ext}_filtered_against_{v_stem}.txt'

# Imports / load data

In [13]:
# import csv
import numpy as np

In [14]:
from probdist import *

In [17]:
# Load the row labels (vocabulary) and the column labels (contexts) of the input.
if v == '' and c == '':
    # Unigram case: there are no contexts, and the vocabulary is read off the
    # distribution itself.
    assert '.pV.json' in d, f"d must be a unigram distribution with filename ending in .pV.json, instead got d=\n\t'{d}'"
    pV = importProbDist(d)
    vocabulary = sorted(pV.keys())
    assert len(vocabulary) == len(set(vocabulary))
    contexts = []
elif v != '' and c != '':
    # Contextual case.
    # NOTE(review): both lists are sorted here, yet rows/columns of d are
    # described as following the order of the items in v and c. This is only
    # consistent if both files are already sorted - confirm against the
    # notebook that produced d.
    vocabulary = sorted(importSeqs(v, list))
    contexts = sorted(importSeqs(c, list))
else:
    raise Exception(f"Either v and c must be both empty strings or both specified. Got v,c = \n\t{v}\n\t{c}")

num_orthographic_words, num_contexts = len(vocabulary), len(contexts)
my_shape = (num_orthographic_words, num_contexts)
my_shape

# With no contexts the distribution still has one cell per orthographic word.
num_cells = num_orthographic_words if num_contexts == 0 else np.prod(my_shape)
f"{num_cells:,}"

(46202, 0)

'46,202'

In [18]:
if v != '' and c != '':
    # Contextual case: map the on-disk float64 array read-only rather than
    # loading it into memory; its shape comes from the label lists loaded above.
    pVC = np.memmap(d, dtype='float64', mode='r', shape=my_shape)
    pVC.shape
    # size of the mapped array in gigabytes (10^9 bytes)
    pVC.nbytes / 1e9

In [19]:
if v != '' and c != '':
    # Load and display the metadata stored next to d (at d + '_metadata.json').
    pVC_md = importDict(d+'_metadata.json')
    pVC_md

In [13]:
# this requires a very high peak memory usage (as high as 80-90GB), 
# and any time gained by doing calculations in memory is 
# more than offset by the time to load large arrays into memory

# pVC_on_disk = pVC
# pVC = np.zeros(dtype='float64', shape=my_shape)
# pVC[:,:] = pVC_on_disk

In [20]:
# Read the lexicon file l and preview its first rows.
new_vocabulary_tsv = loadTSV_as_dictlist(l)
new_vocabulary_tsv[:5]

# The target vocabulary: one orthographic wordform per row of l, in file order.
new_vocabulary = tuple(entry['Orthographic_Wordform'] for entry in new_vocabulary_tsv)
len(new_vocabulary)

[OrderedDict([('Orthographic_Wordform', '007'),
              ('Transcription', 'd.ʌ.b.ə.l.oʊ.s.ɛ.v.ɪ.n')]),
 OrderedDict([('Orthographic_Wordform', '1'), ('Transcription', 'w.ʌ.n')]),
 OrderedDict([('Orthographic_Wordform', '101'),
              ('Transcription', 'w.ʌ.n.oʊ.w.ʌ.n')]),
 OrderedDict([('Orthographic_Wordform', '128'),
              ('Transcription', 'w.ʌ.n.t.u.eɪ.t')]),
 OrderedDict([('Orthographic_Wordform', '2'), ('Transcription', 't.u')])]

15317

In [21]:
# Sanity check: display any rows of l whose orthographic wordform is the empty string.
[row for row in new_vocabulary_tsv if row['Orthographic_Wordform'] == '']

[]

In [22]:
# Every wordform in the lexicon l must already be in the input vocabulary.
# `vocabulary` is a list at this point, so membership is tested against a set
# built once instead of rescanning the list for every wordform. The loop
# variable is named `w` so it does not reuse the name of the parameter `v`.
vocabulary_lookup = set(vocabulary)
illegal_vocab = [w for w in new_vocabulary if w not in vocabulary_lookup]
len(illegal_vocab)
assert len(illegal_vocab) == 0, f'{len(illegal_vocab)} wordforms in \n\t{l}\n are not in \n\t{v}:\n{illegal_vocab}'

0

In [23]:
# At least one wordform of the lexicon l must be in the input vocabulary.
# `vocabulary` is a list at this point, so membership is tested against a set
# built once instead of rescanning the list for every wordform. The loop
# variable is named `w` so it does not reuse the name of the parameter `v`.
vocabulary_lookup = set(vocabulary)
legal_vocab = [w for w in new_vocabulary if w in vocabulary_lookup]
len(legal_vocab)
assert len(legal_vocab) > 0, f'No wordforms in \n\t{l}\n are in \n\t{v}.'

15317

In [24]:
# Freeze the label sequences as tuples and display their sizes.
vocabulary = tuple(vocabulary)
len(vocabulary)

contexts = tuple(contexts)
len(contexts)

# The output rows follow the alphabetically sorted target vocabulary.
new_vocabulary = tuple(sorted(new_vocabulary))
len(new_vocabulary)

46202

0

15317

# Filter the input $p(V|C)$

In [25]:
# Show how many contexts there are and, when there are enough of them, a
# ten-item sample from the middle of the list (the slice bounds are arbitrary).
len(contexts)
if len(contexts) >= 12355:
    contexts[12345:12355]

0

In [26]:
def hasNoUnks(ctxt):
    """
    Return True iff every wordform of the context string `ctxt` is in `vocabulary`.

    `ctxt` is split on single space characters and each resulting piece is
    looked up in the module-level `vocabulary`; the first piece that is
    missing makes the result False.
    """
    for wordform in ctxt.split(' '):
        if wordform not in vocabulary:
            return False
    return True

if f == 'True':
    # Keep only the contexts all of whose wordforms are in the vocabulary.
    filtered_contexts = [ctxt for ctxt in contexts if hasNoUnks(ctxt)]
    num_filtered_contexts = len(filtered_contexts)
    num_lost_contexts = len(contexts) - num_filtered_contexts

    print(f'Contexts before filtering = {len(contexts)}')
    print(f'Contexts after filtering = {len(filtered_contexts)}')
    print(f'|Context loss| = {num_lost_contexts}')
    if len(contexts) > 0:
        print(f'Relative context loss = {num_lost_contexts/len(contexts) * 100}%')
    else:
        # There are no contexts at all (e.g. the unigram case), so the relative
        # loss is undefined; report that instead of dividing by zero.
        print('Relative context loss = n/a (there are no contexts to filter)')

In [27]:
if f == 'True':
    # Filtering must leave at least one context, unless there were none to begin with.
    assert num_contexts == 0 or num_filtered_contexts > 0, f'No contexts left in {c} after removing those containing any wordforms not in {v}.'

In [28]:
if f == 'True':
    # Freeze the surviving contexts as a sorted tuple and display how many there are.
    filtered_contexts = tuple(sorted(filtered_contexts))
    len(filtered_contexts)

In [23]:
# already done down below...
# if f == 'True':
#     # export 
#     exportSeqs(c_prime, filtered_contexts)

In [29]:
# Shape of the projected distribution: one row per target wordform, and one
# column per context (only the surviving contexts when filtering is enabled).
num_orthographic_words_new = len(new_vocabulary)
num_columns_new = num_filtered_contexts if f == 'True' else num_contexts
new_shape = (num_orthographic_words_new, num_columns_new)
new_shape

(15317, 0)

In [30]:
if v != '' and c != '':
    # Row i of the output is taken from the row of pVC labelled new_vocabulary[i],
    # so the ordering of desired_row_indices reflects the (already alphabetically
    # sorted) new_vocabulary.
    # A lookup table built once replaces repeated tuple.index scans; setdefault
    # keeps the first position of a wordform, exactly as tuple.index would.
    row_of_wordform = {}
    for row, wordform in enumerate(vocabulary):
        row_of_wordform.setdefault(wordform, row)
    desired_row_indices = np.array([row_of_wordform[w] for w in new_vocabulary], dtype=np.intp)

    # list(desired_row_indices) == sorted(list(desired_row_indices))
    # assert list(desired_row_indices) == sorted(list(desired_row_indices))

In [31]:
if v != '' and c != '':
    if f == 'True':
        # Column j of the output is taken from the column of pVC labelled
        # filtered_contexts[j], so the ordering of desired_col_indices reflects
        # the (already alphabetically sorted) filtered_contexts.
        # A lookup table built once replaces repeated tuple.index scans;
        # setdefault keeps the first position of a context, exactly as
        # tuple.index would.
        col_of_context = {}
        for col, ctxt in enumerate(contexts):
            col_of_context.setdefault(ctxt, col)
        desired_col_indices = np.array([col_of_context[ctxt] for ctxt in filtered_contexts], dtype=np.intp)
    #     list(desired_col_indices) == sorted(list(desired_col_indices))
    #     assert list(desired_col_indices) == sorted(list(desired_col_indices))

In [38]:
if v != '' and c != '':
    # Contextual case: copy the selected rows (and, when filtering, the selected
    # columns) of the memory-mapped input into a fresh in-memory array. Assigning
    # into a preallocated array also checks the result against new_shape.
    new_pVC = np.zeros(dtype='float64', shape=new_shape)
    if f != 'True':
        new_pVC[:,:] = pVC[desired_row_indices,:]
    else:
        # desired_col_indices was already computed in the preceding cell under
        # exactly these conditions, so it is not rebuilt here.
        new_pVC[:,:] = pVC[np.ix_(desired_row_indices, desired_col_indices)]
else:
    # Unigram case: restrict the distribution to the target wordforms.
    new_pV = {w:pV[w] for w in new_vocabulary}

In [39]:
# Display how much probability mass the projection kept.
if v == '' or c == '':
    # Unigram case (norm presumably returns the total mass - confirm in probdist).
    norm(new_pV)
else:
    # Contextual case: one sum per context (column).
    unnormalized_column_sums = new_pVC.sum(axis=0)
    unnormalized_column_sums

0.8630214796151175

In [40]:
# Display how much probability mass the projection removed.
if v == '' or c == '':
    1.0 - norm(new_pV)
else:
    # Per-context loss, followed by its average over all contexts.
    mass_losses_by_context = 1.0 - unnormalized_column_sums
    mass_losses_by_context
    mass_losses_by_context.mean()

0.13697852038488245

In [44]:
# Renormalize the projected distribution.
if v == '' or c == '':
    # Unigram case (ProbDist presumably renormalizes its argument; the next
    # cell asserts that the result is normalized).
    normalized_new_pV = ProbDist(new_pV)
else:
    # Contextual case: divide every column by its own sum (broadcast over rows).
    normalized_new_pVC = np.divide(new_pVC, unnormalized_column_sums)

In [45]:
# Verify that the renormalized result really is a distribution.
if v == '' or c == '':
    assert isNormalized(normalized_new_pV)
else:
    normalized_column_sums = normalized_new_pVC.sum(axis=0)
    normalized_column_sums

    # every column must sum to 1 (up to floating-point tolerance)
    assert np.allclose(normalized_column_sums, 1.0)

In [57]:
if v == '' and c == '':
    # Unigram case: convert the normalized distribution to a numpy array and
    # add a trailing axis so that it has a single column.
    new_pV_np = distToNP(normalized_new_pV)
    new_pV_np.shape
    new_pV_np = np.expand_dims(new_pV_np, axis=1)
    new_pV_np.shape
    assert isNormalized_np(new_pV_np)

(15317,)

(15317, 1)

In [53]:
# Display the output path before writing.
o

'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered'

# ...and write to disk

In [32]:
# new_pVC_on_disk = np.memmap(o, dtype='float64', mode='w+', shape=new_shape)
# new_pVC_on_disk[:,:] = normalized_new_pVC
# np.save appends '.npy' to a filename that does not already end in it, so the
# arrays saved below end up at o + '.npy' and o + '.pV.npy' respectively
# (unless the name passed in already has that extension).
if v != '' and c != '':
    # Contextual case: save the renormalized matrix.
    np.save(o, normalized_new_pVC)
else:
    # Unigram case: export the distribution itself and its numpy form.
    exportProbDist(o + '.pV.json', normalized_new_pV)
    np.save(o + '.pV', new_pV_np)

In [33]:
if f == 'True' and c != '':
    # Write the surviving contexts next to the other outputs, under the
    # original context filename prefixed with 'LM_filtered_'.
    c_basename = path.basename(c)
    c_prime_basename = f'LM_filtered_{c_basename}'
    c_prime = path.join(output_dir, c_prime_basename)
    exportSeqs(c_prime, filtered_contexts)

In [34]:
# Display the output path that was just written to.
o

'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C'

In [35]:
# List the output directory to confirm that the new files are there.
listdir(output_dir)

['LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C.npy',
 'LD_fisher_vocab_in_buckeye_contexts.pV_C',
 'buckeye_contexts.txt',
 'LM_filtered_buckeye_contexts.txt',
 '.ipynb_checkpoints',
 'LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C',
 'Producing Fisher vocab in Buckeye contexts contextual distributions.ipynb',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C_metadata.json',
 'Filter LD_fisher_vocab_in_buckeye_contexts against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb',
 'Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in buckeye contexts.ipynb',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C',
 'fisher_vocabulary_main.txt',
 'LD_fisher_vocab_in_buckeye_contexts.pV_C_metadata.json',
 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_buckeye_contexts.pW_C.npy']

In [60]:
# Record where the output came from: for each dimension (C = contexts,
# V = vocabulary), the source file, what was changed, and the new size.
if v != '' and c != '':
    if f == 'True':
        c_changes = f'1. Filtered by removing contexts with orthWords not in the LM = not in {v}\n2. sorted'
        c_size = len(filtered_contexts)
        c_new_fp = c_prime
    else:
        c_changes = 'none'
        c_size = len(contexts)
        c_new_fp = 'N/A'

    normalized_new_pVC_dim_md = {'C':{'from fp':c,
                                      'changes':c_changes,
                                      'new fp':c_new_fp,
                                      'size':c_size},
                                 'V':{'from fp':v,
                                      'changes':f'1. Filtered by removing orthWords not in LTR @ {l}\n2. sorted',
                                      'new fp':f'orthographic words in {l}',
                                      'size':len(new_vocabulary)
                                      }}
    # NOTE(review): the matrix is saved with np.save(o, ...), which appends
    # '.npy' when o does not already end in it, while the matrix path recorded
    # here is o itself - confirm which path downstream readers expect.
    exportMatrixMetadata(o + '_metadata.json',
                         o,
                         normalized_new_pVC,
                         normalized_new_pVC_dim_md,
                         'Step 3d',
                         'Filter contextual lexicon distribution by transcription lexicon',
                         {})
else:
    normalized_new_pV_dim_md = {'C':{'from fp':'NA',
                                     'changes':'NA',
                                      'new fp':'',
                                      'size':'0'},
                                 'V':{'from fp':d,
                                      'changes':f'1. Filtered by removing orthWords not in LTR @ {l}\n2. sorted',
                                      'new fp':f'orthographic words in {l}',
                                      'size':len(new_vocabulary)
                                      }}
    # The metadata file is named after the saved matrix file (o + '.pV.npy'),
    # which is the path the read-back cell below loads it from.
    exportMatrixMetadata(o + '.pV.npy' + '_metadata.json',
                         o + '.pV.npy',
                         new_pV_np,
                         normalized_new_pV_dim_md,
                         'Step 3d',
                         'Filter contextual lexicon distribution by transcription lexicon',
                         {})

Wrote metadata for 
	LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.pV.npy
 to 
	LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.pV.npy_metadata.json


In [61]:
# Read back and display the metadata file for the case that was run
# (contextual if both c and v were given, unigram otherwise).
if c != '' and v != '':
    importDict(o+'_metadata.json')
else:
    importDict(o + '.pV.npy' + '_metadata.json')

{'matrix fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.pV.npy',
 'matrix shape': [15317, 1],
 'Produced in step': 'Step 3d',
 'Produced in notebook': 'Filter contextual lexicon distribution by transcription lexicon',
 'C': {'from fp': 'NA', 'changes': 'NA', 'new fp': '', 'size': '0'},
 'V': {'from fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model.pV.json',
  'changes': '1. Filtered by removing orthWords not in LTR @ LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv\n2. sorted',
  'new fp': 'orthographic words in LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
  'size': 15317}}