In [1]:
# Display the value of every expression statement in a cell, not just the last one.
# Later cells rely on this to show several intermediate values (shapes, counts, samples) each.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Usage" data-toc-modified-id="Usage-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Usage</a></span></li><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Requirements</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Filter-the-input-$p(V|C)$" data-toc-modified-id="Filter-the-input-$p(V|C)$-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Filter the input $p(V|C)$</a></span></li><li><span><a href="#...and-write-to-disk" data-toc-modified-id="...and-write-to-disk-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>...and write to disk</a></span></li></ul></div>

# Overview

Given 
 - a filepath $d$ to a memory-mapped `numpy` array defining a distribution over an orthographic vocabulary $p(V|C)$ (conditioned on $n$-gram contexts $C$), where each column is a distribution
 - a filepath $v$ to a `.txt` file containing the orthographic vocabulary associated with $d$
    - 'associated with $d$' means that the ordering of items in $v$ corresponds to the ordering of rows of $d$
 - a filepath $c$ to a `.txt` file containing the $n$-gram contexts associated with $d$
    - 'associated with $d$' means that the ordering of items in $c$ corresponds to the ordering of columns of $d$
 - a filepath $l$ to a `.tsv` file (e.g. a transcription lexicon file) with a subset of the vocabulary in $v$ (under a column labeled `Orthographic_Wordform`)
 - an output filepath $o$
 
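this notebook produces a version $d'$ of $d$ defined only on the words in $l$, with each column renormalized to sum to 1, and writes it to $o$. The order of orthographic words in $d'$ reflects the order of words in $l$. Note that $d'$ is saved with `numpy.save` (`.npy` format), so `.npy` is appended to $o$ if $o$ does not already end in it.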

If the optional flag $f$ is `'True'`, then this notebook will also, in the course of producing $d'$, filter out those columns whose associated context contains wordforms not in the language model's vocabulary $v$, and will write a correspondingly filtered version of the file at $c$ (to $c'$, in the same output directory as $o$) listing the contexts that remain.

## Usage

#FIXME

## Requirements

 - `numpy`
 - the project-local `boilerplate` module (star-imported in the Parameters section)

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from boilerplate import *

In [4]:
# Parameters
# Every parameter is a string and defaults to the empty string (i.e. not set).
# The commented-out line under each assignment is an example value.

# d: filepath to the memory-mapped numpy array holding p(V|C); it is opened below with
#    rows = orthographic wordforms (ordered as in v) and columns = contexts (ordered as in c).
d = ''
# d = 'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C'

# v: filepath to the .txt file listing the orthographic vocabulary (row labels of d).
v = ''
# v = 'LD_Fisher_vocab_in_Buckeye_contexts/fisher_vocabulary_main.txt'

# c: filepath to the .txt file listing the n-gram contexts (column labels of d).
c = ''
# c = 'LD_Fisher_vocab_in_Buckeye_contexts/buckeye_contexts.txt'

# l: filepath to a .tsv file with an 'Orthographic_Wordform' column; its wordforms must all
#    be in v and define the rows (and row order) of the output.
l = ''
# l = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv'
# this default/testing choice of l should throw an error below...

# o: output filepath for the projected, renormalized array.
o = ''
# o = 'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_filtered_LTR_Buckeye.pV_C'

# f: optional flag; 'True' additionally drops contexts containing wordforms that are not in v.
# NOTE(review): the validation cell below accepts only 'True' or '', so the example value
# 'False' shown here would fail that assertion.
f = ''
# f = 'False'

In [5]:
# d = "LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C"
# v = "LM_Fisher/fisher_vocabulary_main.txt"
# c = "buckeye_contexts.txt"
# l = "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv"
# o = "LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C"
# f = 'True'

In [6]:
# Directory that receives the output array (and, when f == 'True', the filtered contexts file).
# path.dirname returns '' when o has no directory component; fall back to '.' so that the
# directory check here and listdir(output_dir) at the end of the notebook still work.
output_dir = path.dirname(o) or '.'
ensure_dir_exists(output_dir)

In [7]:
# f is a string-valued flag: only 'True' (filter contexts) and '' (keep all contexts) are accepted.
assert f in ('True', ''), f"f argument, if provided, must either be 'True' or the empty string, got {f} instead."

In [8]:
# When filtering contexts, derive a path for the filtered contexts file from c and v:
#   <c without extension>_filtered_against_<basename of v without extension>.txt
if f == 'True':
    c_no_ext = path.splitext(c)[0]
    v_stem = path.basename(path.splitext(v)[0])
    c_prime = ''.join((c_no_ext, '_filtered_against_', v_stem, '.txt'))

# Imports / load data

In [9]:
# import csv
import numpy as np

In [10]:
# Row labels (wordforms) and column labels (contexts) of the array stored at d.
vocabulary = importSeqs(v, list)
contexts = importSeqs(c, list)

# The shape used to open the array is derived from the sizes of the two label files.
num_orthographic_words, num_contexts = len(vocabulary), len(contexts)
my_shape = (num_orthographic_words, num_contexts)
my_shape

# Total number of cells, displayed with thousands separators.
num_cells = np.prod(my_shape)
"{:,}".format(num_cells)

(44064, 17415)

'767,374,560'

In [11]:
# Map the on-disk float64 array read-only, using the shape derived from the label files.
pVC = np.memmap(d, mode='r', dtype=np.float64, shape=my_shape)
pVC.shape
# Size of the mapped array in units of 10^9 bytes.
pVC.nbytes / 1e9

(44064, 17415)

6.13899648

In [12]:
# this requires a very high peak memory usage (as high as 80-90GB), 
# and any time gained by doing calculations in memory is 
# more than offset by the time to load large arrays into memory

# pVC_on_disk = pVC
# pVC = np.zeros(dtype='float64', shape=my_shape)
# pVC[:,:] = pVC_on_disk

In [13]:
# Rows of the lexicon file l; show the first few as a sanity check.
new_vocabulary_tsv = loadTSV_as_dictlist(l)
new_vocabulary_tsv[:5]

# Target vocabulary: the orthographic wordforms of l, in file order.
new_vocabulary = tuple([entry['Orthographic_Wordform'] for entry in new_vocabulary_tsv])
len(new_vocabulary)

[OrderedDict([('Orthographic_Wordform', "'em"), ('Transcription', 'ɛ.m')]),
 OrderedDict([('Orthographic_Wordform', 'a'), ('Transcription', 'eɪ')]),
 OrderedDict([('Orthographic_Wordform', "a's"), ('Transcription', 'eɪ.z.z')]),
 OrderedDict([('Orthographic_Wordform', "aaron's"),
              ('Transcription', 'eɪ.ɹ.ʌ.n.z')]),
 OrderedDict([('Orthographic_Wordform', 'abandoned'),
              ('Transcription', 'ʌ.b.æ.n.d.ʌ.n.d')])]

6574

In [14]:
# Sanity check: list any lexicon rows whose orthographic wordform is the empty string.
list(filter(lambda row: row['Orthographic_Wordform'] == '', new_vocabulary_tsv))

[]

In [15]:
# Wordforms requested in l that the language model's vocabulary v does not contain.
# `vocabulary` is a list, so `x in vocabulary` is a linear scan per wordform; a set built
# once makes every lookup constant-time.
vocabulary_lookup = set(vocabulary)
illegal_vocab = [wordform for wordform in new_vocabulary if wordform not in vocabulary_lookup]
len(illegal_vocab)
# Every wordform of l must be a row of the input array, otherwise it cannot be projected.
assert len(illegal_vocab) == 0, f'{len(illegal_vocab)} wordforms in \n\t{l}\n are not in \n\t{v}:\n{illegal_vocab}'

0

In [16]:
# Wordforms of l that are present in the language model's vocabulary v.
# `vocabulary` is a list, so `x in vocabulary` is a linear scan per wordform; a set built
# once makes every lookup constant-time.
vocabulary_lookup = set(vocabulary)
legal_vocab = [wordform for wordform in new_vocabulary if wordform in vocabulary_lookup]
len(legal_vocab)
# There must be at least one wordform to project onto.
assert len(legal_vocab) > 0, f'No wordforms in \n\t{l}\n are in \n\t{v}.'

6574

# Filter the input $p(V|C)$

In [17]:
# Number of contexts, followed by an arbitrary ten-item sample to show what they look like.
len(contexts)
sample_start = 12345
contexts[sample_start:sample_start + 10]

17415

['of the medical',
 'of the movies',
 'of the uh',
 'of the year',
 'of them are',
 'of uh all',
 'of uh myths',
 'of uh the',
 'of what you',
 'off of coral']

In [18]:
# `vocabulary` is a list; membership tests against it are linear scans, and hasNoUnks
# performs one per wordform of every context. Build a set once for constant-time lookups.
vocabulary_set = set(vocabulary)

def hasNoUnks(ctxt):
    """
    Return True iff every wordform of the context is in the language model's vocabulary.

    ctxt: a context string; its wordforms are obtained by splitting on single spaces.
    """
    return all(w in vocabulary_set for w in ctxt.split(' '))

if f == 'True':
    # Keep only contexts made up entirely of in-vocabulary wordforms, preserving their order.
    filtered_contexts = [ctxt for ctxt in contexts if hasNoUnks(ctxt)]
    num_filtered_contexts = len(filtered_contexts)

    num_lost_contexts = len(contexts) - num_filtered_contexts
    print(f'Contexts before filtering = {len(contexts)}')
    print(f'Contexts after filtering = {len(filtered_contexts)}')
    print(f'|Context loss| = {num_lost_contexts}')
    # An empty contexts file is tolerated by the assertion in the next cell, so do not
    # divide by zero here when there are no contexts at all.
    relative_loss = num_lost_contexts / len(contexts) * 100 if contexts else 0.0
    print(f'Relative context loss = {relative_loss}%')

Contexts before filtering = 17415
Contexts after filtering = 16443
|Context loss| = 972
Relative context loss = 5.5813953488372094%


In [19]:
# Filtering must leave at least one context, unless there were no contexts to begin with.
if f == 'True':
    assert num_contexts == 0 or num_filtered_contexts > 0, f'No contexts left in {c} after removing those containing any wordforms not in {v}.'

In [20]:
if f == 'True':
    # Write the filtered contexts to c_prime. At this point c_prime is the path derived
    # from c and v near the top of the notebook; the final section recomputes c_prime
    # inside output_dir and exports the same contexts again there.
    exportSeqs(c_prime, filtered_contexts)

In [21]:
# Shape of the projected array: one row per wordform in l and one column per retained
# context (all contexts unless f == 'True', in which case only the filtered ones).
num_orthographic_words_new = len(new_vocabulary)
num_retained_contexts = num_filtered_contexts if f == 'True' else num_contexts
new_shape = (num_orthographic_words_new, num_retained_contexts)
new_shape

(6574, 16443)

In [22]:
# Row of pVC holding each wordform of new_vocabulary, in new_vocabulary order.
# A dict built once replaces a linear vocabulary.index() scan per wordform. Enumerating in
# reverse makes the first occurrence win, matching list.index() if a wordform is repeated.
# (An earlier assertion guarantees every wordform of new_vocabulary is in vocabulary.)
row_index_of = {wordform: i for i, wordform in reversed(list(enumerate(vocabulary)))}
desired_row_indices = np.array([row_index_of[wordform] for wordform in new_vocabulary])

In [23]:
# Copy the selected rows (and, when filtering, the selected columns) into a new in-memory
# array. Assigning into a preallocated array of new_shape also checks the selection's shape.
new_pVC = np.zeros(dtype='float64', shape=new_shape)
if f != 'True':
    new_pVC[:,:] = pVC[desired_row_indices,:]
else:
    # A dict built once replaces a linear contexts.index() scan per retained context.
    # Enumerating in reverse makes the first occurrence win, matching list.index().
    col_index_of = {ctxt: j for j, ctxt in reversed(list(enumerate(contexts)))}
    desired_col_indices = np.array([col_index_of[ctxt] for ctxt in filtered_contexts])
    # np.ix_ builds an open mesh so the row and column selections are applied jointly.
    new_pVC[:,:] = pVC[np.ix_(desired_row_indices, desired_col_indices)]

In [24]:
# Sum of each column of the projected array, before renormalizing.
unnormalized_column_sums = new_pVC.sum(axis=0)
unnormalized_column_sums

array([0.79411324, 0.98925214, 0.9920863 , ..., 0.97166786, 0.91852775,
       0.99592878])

In [25]:
# Per-column shortfall from 1 after projection, followed by its average over all columns.
mass_losses_by_context = 1.0 - unnormalized_column_sums
mass_losses_by_context
np.mean(mass_losses_by_context)

array([0.20588676, 0.01074786, 0.0079137 , ..., 0.02833214, 0.08147225,
       0.00407122])

0.055962087541014914

In [26]:
# normalize!
normalized_new_pVC = new_pVC / unnormalized_column_sums

In [30]:
# Verify the renormalization: every column must now sum to 1 (within np.allclose tolerances).
normalized_column_sums = normalized_new_pVC.sum(axis=0)
normalized_column_sums

assert np.allclose(normalized_column_sums, np.ones_like(normalized_column_sums))

array([1., 1., 1., ..., 1., 1., 1.])

# ...and write to disk

In [None]:
# Earlier approach, kept for reference: write the result as a raw memory-mapped file at o.
# new_pVC_on_disk = np.memmap(o, dtype='float64', mode='w+', shape=new_shape)
# new_pVC_on_disk[:,:] = normalized_new_pVC

# Save in NumPy's .npy format. NOTE: when o is a path string that does not already end in
# '.npy', np.save appends that extension, so the file on disk is o + '.npy'. Unlike the raw
# input at d, the saved file starts with a .npy header and should be read with np.load.
np.save(o, normalized_new_pVC)

In [None]:
# When contexts were filtered, also write them next to the output array, under the name
# 'LM_filtered_' + <basename of c>. This replaces the c_prime path computed earlier.
if f == 'True':
    c_basename = path.split(c)[1]
    c_prime_basename = 'LM_filtered_' + c_basename
    c_prime = path.join(output_dir, c_prime_basename)
    exportSeqs(c_prime, filtered_contexts)

In [None]:
# Output path as given. The array was saved with np.save, which appends '.npy' when the
# path does not already end in it.
o

In [None]:
# Show the contents of the output directory.
listdir(output_dir)