In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Usage" data-toc-modified-id="Usage-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Usage</a></span></li><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Requirements</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Filter-the-input-$p(V|C)$" data-toc-modified-id="Filter-the-input-$p(V|C)$-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Filter the input $p(V|C)$</a></span></li><li><span><a href="#...and-write-to-disk" data-toc-modified-id="...and-write-to-disk-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>...and write to disk</a></span></li></ul></div>

# Overview

Given 
 - a filepath $d$ to a memory-mapped `numpy` array defining a distribution over an orthographic vocabulary $p(V|C)$ (conditioned on $n$-gram contexts $C$), where each column is a distribution
 - a filepath $v$ to a `.txt` file containing the orthographic vocabulary associated with $d$
 - a filepath $c$ to a `.txt` file containing the $n$-gram contexts associated with $d$
 - a filepath $l$ to a `.tsv` file (e.g. a transcription lexicon file) with a subset of the vocabulary in $v$ (under a column labeled `Orthographic_Wordform`)
 - an output filepath $o$
 
this notebook produces a version $d'$ of $d$ defined only on the words in $l$. The order of orthographic words in $d'$ reflects the order of words in $l$.

## Usage

#FIXME

## Requirements

 - `joblib`

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
# Parameters

d = ''
# d = 'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C'

v = ''
# v = 'LD_Fisher_vocab_in_Buckeye_contexts/fisher_vocabulary_main.txt'

c = ''
# c = 'LD_Fisher_vocab_in_Buckeye_contexts/buckeye_contexts.txt'

l = ''
# l = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv'
# this default/testing choice of l should throw an error below...

o = ''
# o = 'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_filtered_LTR_Buckeye.pV_C'

In [4]:
# d = "LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C"
# v = "LM_Fisher/fisher_vocabulary_main.txt"
# c = "buckeye_contexts.txt"
# l = "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv"
# o = "LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts_projected_LTR_Buckeye.pV_C"


In [5]:
output_dir = path.dirname(o)
if not path.exists(output_dir):
    print('Making output path {0}'.format(output_dir))
    makedirs(output_dir)

# Imports / load data

In [6]:
from boilerplate import *
# import csv
import numpy as np

In [7]:
vocabulary = importSeqs(v, list)
contexts = importSeqs(c, list)

num_orthographic_words = len(vocabulary)
num_contexts = len(contexts)
my_shape = (num_orthographic_words, num_contexts)
my_shape

num_cells = np.prod(my_shape)
"{:,}".format(num_cells)

(44064, 17415)

'767,374,560'

In [8]:
pVC = np.memmap(d, dtype='float64', mode='r', shape=my_shape)
pVC.shape
pVC.nbytes / 1e9

(44064, 17415)

6.13899648

In [9]:
new_vocabulary_tsv = loadTSV_as_dictlist(l)
new_vocabulary_tsv[:5]

new_vocabulary = tuple(row['Orthographic_Wordform'] for row in new_vocabulary_tsv)
len(new_vocabulary)

[OrderedDict([('Orthographic_Wordform', "aaron's"),
              ('Transcription', 'eɪ.ɹ.ʌ.n.z')]),
 OrderedDict([('Orthographic_Wordform', 'abandoned'),
              ('Transcription', 'ʌ.b.æ.n.d.ʌ.n.d')]),
 OrderedDict([('Orthographic_Wordform', 'abercrombie'),
              ('Transcription', 'æ.b.ɚ.k.ɹ.ɑ.m.b.i')]),
 OrderedDict([('Orthographic_Wordform', 'abhorrent'),
              ('Transcription', 'ʌ.b.h.oʊ.ɹ.ʌ.n.t')]),
 OrderedDict([('Orthographic_Wordform', 'abide'),
              ('Transcription', 'ʌ.b.aɪ.d')])]

6573

In [12]:
[row for row in new_vocabulary_tsv if row['Orthographic_Wordform'] == '']

[OrderedDict([('Orthographic_Wordform', ''), ('Transcription', 'n.ʌ.n')]),
 OrderedDict([('Orthographic_Wordform', ''), ('Transcription', 'n.ʌ.n')]),
 OrderedDict([('Orthographic_Wordform', ''), ('Transcription', 'n.ʌ.n')])]

In [10]:
illegal_vocab = [v for v in new_vocabulary if not v in vocabulary]
len(illegal_vocab)
# assert len(illegal_vocab) == 0, '{0} wordforms in l are not in v:\n{1}'.format(len(illegal_vocab), illegal_vocab)
assert len(illegal_vocab) == 0, f'{len(illegal_vocab)} wordforms in \n\t{l}\n are not in \n\t{v}:\n{illegal_vocab}'

3

AssertionError: 3 wordforms in 
	LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv
 are not in 
	LM_Fisher/fisher_vocabulary_main.txt:
['', '', '']

In [64]:
legal_vocab = [v for v in new_vocabulary if v in vocabulary]
len(legal_vocab)
# assert len(legal_vocab) > 0, 'No wordforms in l are in v.'
assert len(legal_vocab) > 0, f'No wordforms in \n\t{l}\n are in \n\t{v}.'

7491

# Filter the input $p(V|C)$

In [54]:
num_orthographic_words_new = len(new_vocabulary)
new_shape = (num_orthographic_words_new, num_contexts)
new_shape

(7998, 17415)

In [63]:
desired_row_indices = np.array([vocabulary.index(v) for v in new_vocabulary])

ValueError: 'Ellimen' is not in list

In [55]:
new_pVC = np.zeros(dtype='float64', shape=new_shape)
new_pVC[:,:] = pVC[desired_row_indices,:]

# ...and write to disk

In [None]:
new_pVC_on_disk = np.memmap(o, dtype='float64', mode='w+', shape=new_shape)
new_pVC_on_disk[:,:] = new_pVC

In [None]:
o

In [None]:
listdir(output_dir)