In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Motivation</a></span><ul class="toc-item"><li><span><a href="#What-we-want-to-calculate" data-toc-modified-id="What-we-want-to-calculate-1.1.1"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>What we <em>want</em> to calculate</a></span></li><li><span><a href="#What-we-can-calculate" data-toc-modified-id="What-we-can-calculate-1.1.2"><span class="toc-item-num">1.1.2&nbsp;&nbsp;</span>What we <em>can</em> calculate</a></span></li><li><span><a href="#The-structure-of-calculations" data-toc-modified-id="The-structure-of-calculations-1.1.3"><span class="toc-item-num">1.1.3&nbsp;&nbsp;</span>The structure of calculations</a></span></li><li><span><a href="#The-structure-of-the-pipeline" data-toc-modified-id="The-structure-of-the-pipeline-1.1.4"><span class="toc-item-num">1.1.4&nbsp;&nbsp;</span>The structure of the pipeline</a></span></li></ul></li><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Requirements</a></span><ul class="toc-item"><li><span><a href="#Python-environment" data-toc-modified-id="Python-environment-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Python environment</a></span></li><li><span><a href="#Hardware-/-runtime-expectations" data-toc-modified-id="Hardware-/-runtime-expectations-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Hardware / runtime expectations</a></span></li></ul></li><li><span><a href="#Todo" data-toc-modified-id="Todo-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Todo</a></span></li></ul></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Permitted-steps-/-control-flow" data-toc-modified-id="Permitted-steps-/-control-flow-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Permitted steps / control flow</a></span></li><li><span><a href="#Step-0:-Import/check-for-foundational-files" data-toc-modified-id="Step-0:-Import/check-for-foundational-files-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Step 0: Import/check for foundational files</a></span><ul class="toc-item"><li><span><a href="#Importing-existing-data-and-creating-directories" data-toc-modified-id="Importing-existing-data-and-creating-directories-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Importing existing data and creating directories</a></span></li><li><span><a href="#Step-0a:-Check-for-gating-data" data-toc-modified-id="Step-0a:-Check-for-gating-data-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Step 0a: Check for gating data</a></span></li><li><span><a href="#Step-0b:-Check-for-transcribed-lexicons" data-toc-modified-id="Step-0b:-Check-for-transcribed-lexicons-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Step 0b: Check for transcribed lexicons</a></span></li><li><span><a href="#Step-0c:-Check-for-n-gram-contexts" data-toc-modified-id="Step-0c:-Check-for-n-gram-contexts-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Step 0c: Check for n-gram contexts</a></span></li><li><span><a href="#Step-0d:-Check-for-language-model(s)" data-toc-modified-id="Step-0d:-Check-for-language-model(s)-4.5"><span class="toc-item-num">4.5&nbsp;&nbsp;</span>Step 0d: Check for language model(s)</a></span></li></ul></li><li><span><a href="#Step-1:-Segment-inventory-alignment" data-toc-modified-id="Step-1:-Segment-inventory-alignment-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Step 1: Segment inventory alignment</a></span><ul class="toc-item"><li><span><a href="#Step-1a:-Define-inventory-alignment-projections" data-toc-modified-id="Step-1a:-Define-inventory-alignment-projections-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Step 1a: Define inventory alignment projections</a></span></li><li><span><a href="#Step-1b:-Apply-inventory-alignment-projections" data-toc-modified-id="Step-1b:-Apply-inventory-alignment-projections-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Step 1b: Apply inventory alignment projections</a></span><ul class="toc-item"><li><span><a href="#Check-for-projection-definitions" data-toc-modified-id="Check-for-projection-definitions-5.2.1"><span class="toc-item-num">5.2.1&nbsp;&nbsp;</span>Check for projection definitions</a></span></li><li><span><a href="#How-are-inventory-alignment-projections-actually-applied?" data-toc-modified-id="How-are-inventory-alignment-projections-actually-applied?-5.2.2"><span class="toc-item-num">5.2.2&nbsp;&nbsp;</span>How are inventory alignment projections actually applied?</a></span></li><li><span><a href="#Apply-projection-definitions" data-toc-modified-id="Apply-projection-definitions-5.2.3"><span class="toc-item-num">5.2.3&nbsp;&nbsp;</span>Apply projection definitions</a></span></li></ul></li></ul></li><li><span><a href="#Step-2:-Generating-channel-and-(orthographic)-lexicon-distributions" data-toc-modified-id="Step-2:-Generating-channel-and-(orthographic)-lexicon-distributions-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Step 2: Generating channel and (orthographic) lexicon distributions</a></span><ul class="toc-item"><li><span><a href="#Step-2a:-Generating-channel-distributions-and-associated-metadata" data-toc-modified-id="Step-2a:-Generating-channel-distributions-and-associated-metadata-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Step 2a: Generating channel distributions and associated metadata</a></span><ul class="toc-item"><li><span><a href="#Metadata" data-toc-modified-id="Metadata-6.1.1"><span class="toc-item-num">6.1.1&nbsp;&nbsp;</span>Metadata</a></span></li><li><span><a href="#Channel-distributions" data-toc-modified-id="Channel-distributions-6.1.2"><span class="toc-item-num">6.1.2&nbsp;&nbsp;</span>Channel distributions</a></span></li></ul></li><li><span><a href="#Step-2b:-Generating-(contextual)-lexicon-distributions-(over-orthographic-vocabularies)" data-toc-modified-id="Step-2b:-Generating-(contextual)-lexicon-distributions-(over-orthographic-vocabularies)-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Step 2b: Generating (contextual) lexicon distributions (over orthographic vocabularies)</a></span></li></ul></li><li><span><a href="#Step-3:-Creating-combinable-models" data-toc-modified-id="Step-3:-Creating-combinable-models-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Step 3: Creating combinable models</a></span><ul class="toc-item"><li><span><a href="#Step-3a:-Filter-transcription-lexicons-to-only-include-words-that-can-be-modeled-by-a-given-channel-distribution" data-toc-modified-id="Step-3a:-Filter-transcription-lexicons-to-only-include-words-that-can-be-modeled-by-a-given-channel-distribution-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Step 3a: Filter transcription lexicons to only include words that can be modeled by a given channel distribution</a></span></li><li><span><a href="#Step-3b:-Filter-transcription-lexicons-to-only-include-words-that-are-in-a-language-model's-vocabulary" data-toc-modified-id="Step-3b:-Filter-transcription-lexicons-to-only-include-words-that-are-in-a-language-model's-vocabulary-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Step 3b: Filter transcription lexicons to only include words that are in a language model's vocabulary</a></span></li><li><span><a href="#Step-3c:-Filter-the-conditioning-events-of-channel-distributions-to-only-include-$k$-factors-contained-in-elements-of-a-transcription-lexicon's-segmental-wordforms" data-toc-modified-id="Step-3c:-Filter-the-conditioning-events-of-channel-distributions-to-only-include-$k$-factors-contained-in-elements-of-a-transcription-lexicon's-segmental-wordforms-7.3"><span class="toc-item-num">7.3&nbsp;&nbsp;</span>Step 3c: Filter the conditioning events of channel distributions to only include $k$-factors contained in elements of a transcription lexicon's segmental wordforms</a></span></li><li><span><a href="#Step-3d:-For-each-(filtered)-transcribed-lexicon-relation,-define-the-relevant-contextual-lexicon-distributions-over-orthographic-wordforms" data-toc-modified-id="Step-3d:-For-each-(filtered)-transcribed-lexicon-relation,-define-the-relevant-contextual-lexicon-distributions-over-orthographic-wordforms-7.4"><span class="toc-item-num">7.4&nbsp;&nbsp;</span>Step 3d: For each (filtered) transcribed lexicon relation, define the relevant contextual lexicon distributions over orthographic wordforms</a></span></li><li><span><a href="#Step-3e:-For-each-(filtered)-transcribed-lexicon-relation,-define-a-conditional-distribution-on-segmental-wordforms-given-an-orthographic-wordform" data-toc-modified-id="Step-3e:-For-each-(filtered)-transcribed-lexicon-relation,-define-a-conditional-distribution-on-segmental-wordforms-given-an-orthographic-wordform-7.5"><span class="toc-item-num">7.5&nbsp;&nbsp;</span>Step 3e: For each (filtered) transcribed lexicon relation, define a conditional distribution on segmental wordforms given an orthographic wordform</a></span></li></ul></li><li><span><a href="#Step-4:-Pre-calculate-remaining-forward-model-components-and-meta-data" data-toc-modified-id="Step-4:-Pre-calculate-remaining-forward-model-components-and-meta-data-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Step 4: Pre-calculate remaining forward model components and meta-data</a></span><ul class="toc-item"><li><span><a href="#Step-4a:-Generate-triphone-lexicon-distributions-for-every-triphone-channel-model" data-toc-modified-id="Step-4a:-Generate-triphone-lexicon-distributions-for-every-triphone-channel-model-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>Step 4a: Generate triphone lexicon distributions for every triphone channel model</a></span></li><li><span><a href="#Step-4b:-Pre-calculate-prefix-relation,-$k$-cousins,-and-$k$-spheres-for-each-segmental-lexicon" data-toc-modified-id="Step-4b:-Pre-calculate-prefix-relation,-$k$-cousins,-and-$k$-spheres-for-each-segmental-lexicon-8.2"><span class="toc-item-num">8.2&nbsp;&nbsp;</span>Step 4b: Pre-calculate prefix relation, $k$-cousins, and $k$-spheres for each segmental lexicon</a></span></li><li><span><a href="#Step-4c:-Calculate-the-marginal-probability-$p(W|C)$-of-each-segmental-wordform-$w$-given-$n$-gram-contexts-$C$" data-toc-modified-id="Step-4c:-Calculate-the-marginal-probability-$p(W|C)$-of-each-segmental-wordform-$w$-given-$n$-gram-contexts-$C$-8.3"><span class="toc-item-num">8.3&nbsp;&nbsp;</span>Step 4c: Calculate the marginal probability $p(W|C)$ of each segmental wordform $w$ given $n$-gram contexts $C$</a></span></li><li><span><a href="#Step-4d:-Define-observation-distributions" data-toc-modified-id="Step-4d:-Define-observation-distributions-8.4"><span class="toc-item-num">8.4&nbsp;&nbsp;</span>Step 4d: Define observation distributions</a></span></li><li><span><a href="#Step-4e:-Define-channel-distributions-on-a-set-of-segmental-wordforms(+prefixes)" data-toc-modified-id="Step-4e:-Define-channel-distributions-on-a-set-of-segmental-wordforms(+prefixes)-8.5"><span class="toc-item-num">8.5&nbsp;&nbsp;</span>Step 4e: Define channel distributions on a set of segmental wordforms(+prefixes)</a></span></li></ul></li><li><span><a href="#Step-5:-Calculate-posterior-probabilities" data-toc-modified-id="Step-5:-Calculate-posterior-probabilities-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Step 5: Calculate posterior probabilities</a></span><ul class="toc-item"><li><span><a href="#Step-5a:-Calculate-$p(V|W,-C)$" data-toc-modified-id="Step-5a:-Calculate-$p(V|W,-C)$-9.1"><span class="toc-item-num">9.1&nbsp;&nbsp;</span>Step 5a: Calculate $p(V|W, C)$</a></span></li><li><span><a href="#Step-5b:-Calculate-$p(\hat{X}_0^f|X_0^f,-C)$" data-toc-modified-id="Step-5b:-Calculate-$p(\hat{X}_0^f|X_0^f,-C)$-9.2"><span class="toc-item-num">9.2&nbsp;&nbsp;</span>Step 5b: Calculate $p(\hat{X}_0^f|X_0^f, C)$</a></span></li><li><span><a href="#Step-5c:-Calculate-$p(\hat{V}-=-v^*|-V-=-v^*,-C)$" data-toc-modified-id="Step-5c:-Calculate-$p(\hat{V}-=-v^*|-V-=-v^*,-C)$-9.3"><span class="toc-item-num">9.3&nbsp;&nbsp;</span>Step 5c: Calculate $p(\hat{V} = v^*| V = v^*, C)$</a></span></li></ul></li><li><span><a href="#Step-6:-Analysis" data-toc-modified-id="Step-6:-Analysis-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Step 6: Analysis</a></span><ul class="toc-item"><li><span><a href="#Step-6a:-Ensure-copy-of-word-analysis-relations-from-corpus-repositories-is-present-here..." data-toc-modified-id="Step-6a:-Ensure-copy-of-word-analysis-relations-from-corpus-repositories-is-present-here...-10.1"><span class="toc-item-num">10.1&nbsp;&nbsp;</span>Step 6a: Ensure copy of word analysis relations from corpus repositories is present here...</a></span></li><li><span><a href="#Step-6b:-Add-desired-probability-annotations..." data-toc-modified-id="Step-6b:-Add-desired-probability-annotations...-10.2"><span class="toc-item-num">10.2&nbsp;&nbsp;</span>Step 6b: Add desired probability annotations...</a></span></li><li><span><a href="#Step-6c:-Export-as-a-dataframe" data-toc-modified-id="Step-6c:-Export-as-a-dataframe-10.3"><span class="toc-item-num">10.3&nbsp;&nbsp;</span>Step 6c: Export as a dataframe</a></span></li></ul></li></ul></div>

# Overview

This notebook describes the processing pipeline from 
 - gating data
 - transcribed lexicon
 - a language model and (possibly empty) n-gram contexts

to 
 - channel distribution
 - lexicon distribution(s) (distributions over wordforms)
 - expected posterior distribution over intended wordform given what has been produced of what was intended.
 
 
It describes what happens at each step, checks some pre- and post-conditions, describes what you, the user must do (if anything), and scripts some commands to automatically do the necessary processing.

## Motivation

There are about one-and-a-half calculations that this processing pipeline enables.

Let 
 - $C = \{c_0, c_1 ... c_{n-1}\}$ denote a set of $n$-gram contexts (sequences of $\leq (n-1)$ orthographic wordforms) drawn from a speech corpus (e.g. Buckeye or Switchboard).
 - $V = \{v_0, v_1 ... v_V\}$ denote a set of orthographic wordforms (a 'vocabulary') drawn from a speech corpus (the same corpus $C$ is drawn from).
 - $W = \{w_0, w_1 ... w_W\}$ denote a set of segmental wordforms ('transcribed wordforms'). Each element $w \in W$ consists of a sequence $x_0 x_1 ... x_f = x_0^f$ of segments ('phonemes') drawn from a set of symbols $\Sigma_x$.
 
(All sets are finite, unless otherwise noted.)

 - A language model describes $p(V|C)$.
 - A lexical database or speech corpus describes a relation $r$ between orthographic wordforms $V$ and segmental wordforms $W$, where $(v,w) \in r$ iff $v$ can be produced as $w$.
 - Let $p(W|V)$ be defined as follows: $p(w|v) = \frac{1}{r_{v}}$, where $r_v = |\{w' | (v, w') \in r\}|$
 - Given that the $i$th segment that a speaker intended to produce is $x_i$, a triphone channel model describes $p(Y_i|x_{i-1}, x_i; x_{i+1})$, a distribution over received/perceived segments $\Sigma_y$. This is estimated from diphone gating data. Note that it doesn't permit modeling insertions or deletions.
 - $p(Y_0^f|W) = p(Y_0^f|X_0^f)$ is a (channel) distribution over perceived (segmental) wordforms given an intended (segmental) wordform. It is completely defined by a choice of $p(W|V)$ and a choice of $p(Y_i|x_{i-1}, x_i; x_{i+1})$.

The *forward model*, then defines a distribution $p(Y_0^f, X_0^f, V|C) = p(Y_0^f|X_0^f)p(X_0^f|V)p(V|C)$.

### What we *want* to calculate

We are interested in calculating the *expected posterior of a listener* over what orthographic wordform the speaker intended given what the speaker actually intended (taking the expectation over the speaker's actual intended segmental wordform, the perceived segmental wordform, and the listener's estimate of the speaker's intended segmental wordform):

$$p(\hat{V} = v^*|V = v^*, c) = \sum\limits_{x_0^{*f}, y_0^f, x_0^{'f}} p(\hat{V} = v^*, \hat{X}_0^f = x_0^{'f}, y_0^f, X_0^f = x_0^{*f}|V = v^*, c)$$

$$p(\hat{V} = v^*|V = v^*, c) = \sum\limits_{x_0^{*f}, y_0^f, x_0^{'f}} p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) p(\hat{X}_0^f = x_0^{'f}|Y_0^f = y_0^f, c) p(Y_0^f = y_0^f | X_0^f = x_0^{*f})p(X_0^f = x_0^{*f}|V = v^*)$$

where 

1. $p(\hat{X}_0^f = x_0^{'f}|Y_0^f = y_0^f, c) = \frac{p(y_0^f|x_0^{'f})p(x_0^{'f}|c)}{p(y_0^f | c)}$
2. $p(x_0^{'f}|c) = \sum\limits_{v'} p(x_0^{'f}|v')p(v'|c)$
3. $p(y_0^f| c) = \sum\limits_{v', x_0^{''f}} p(y_0^f|x_0^{''f})p(x_0^{''f}|v')p(v'|c) = \sum\limits_{x_0^{''f}} p(y_0^f|x_0^{''f})p(x_0^{''f}|c)$
4. $p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) = \frac{p(x_0^{'f}|v^*)p(v^*|c)}{p(x_0^{'f}|c)}$ 


### What we *can* calculate

 1. Unfortunately, because of the enormous size of $Y_0^f$, exact marginalization over all $y_0^f$ is *not* feasible.
 2. Fortunately, because, for any given $x_0^{*f}$, the fraction of $Y_0^f$ with non-negligible mass in $p(Y_0^f|x_0^{*f})$ is small, we can construct a Monte Carlo estimate of 
 $$p(\hat{X}_0^f = x_0^{'f}|x_0^{*f}, c) = \sum\limits_{y_0^f} p(\hat{X}_0^f = x_0^{'f}|y_0^f, c) p(y_0^f|x_0^{*f})$$
 as
 $$\hat{p}(\hat{X}_0^f = x_0^{'f}|x_0^{*f}, c) = \frac{1}{n} \sum\limits_{y_0^f \in S} p(\hat{X}_0^f = x_0^{'f}|y_0^f, c)$$
 where $S = $ a set of $n$ samples from $p(Y_0^f|x_0^{*f})$. In practice an $n \approx 50$ seems to result in estimates that are within $10^{-6}$ of the true estimate. 
 
 3. Unfortunately, even with this approximation $p(\hat{X}_0^f|X_0^{*f}, c)$ has about $10^8 - 10^9$ entries: too many to feasibly calculate exactly, especially across *all choices of $c$* as well.
 4. However, because of the relative dispersion of wordforms, the relatively low overall rate of channel noise, and the information provided by sentential context, the fraction of $\hat{X}_0^f$ assigned non-negligible mass in $p(\hat{X}_0^f|x_0^{*f}, c)$ for any given $x_0^{*f}$ is relatively small, and largely concentrated on $x_0^{'f}$ that are within $k$ edits (substitutions) of $x_0^{*f}$ for small $k$. Accordingly, we can get an arbitrarily good approximation $\hat{p}^{k}$ of $p(\hat{X}_0^f|x_0^{*f}, c)$ by 
  - choosing an arbitrarily small threshold $\epsilon$
  - calculating
$$p^k = \{p(\hat{X}_0^f = x_0^{'f}|x_0^{*f}, c) | x_0^{'f} \text{ is within } k \text{ edits of }x_0^{*f}\}$$ for progressively increasing $k$, stopping when $1 - \sum p_i^k \leq \epsilon$ and assigning $0$ probability to all $x_0^{'f}$ not associated with $p^k$.

Note that this means that if we choose the same $\epsilon$ for all $(x_0^{*f}, c)$ pairs, some segmental wordforms $x_0^f$ will have approximations involving higher or lower $k$ values than others, but that all will have distributions that are at least as accurate as some same minimum (determined by $\epsilon$).

### The structure of calculations

$$p(\hat{V} = v^*|V = v^*, c) = \sum\limits_{x_0^{*f}, x_0^{'f}} p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) p(\hat{X}_0^f = x_0^{'f}| X_0^f = x_0^{*f}, c) p(X_0^f = x_0^{*f}|V = v^*)$$

so $$p(\hat{V} = v^*|V = v^*, c) \approx \sum\limits_{x_0^{*f}, x_0^{'f}. d(x_0^{*f}, x_0^{'f}) \leq k} p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) \hat{p}(\hat{X}_0^f = x_0^{'f}| X_0^f = x_0^{*f}, c) p(X_0^f = x_0^{*f}|V = v^*)$$

We want to have the following distributions pre-computed (or as close to that as possible) as `numpy` arrays
 1. $p(W|V) = p(X_0^f|V)$
 2. $\hat{p}(\hat{X}_0^f|X_0^f, C)$
 3. $p(Y_0^f|W)$
 4. $p(W|C)$
 5. $p(V|W, C)$
 
so that we can efficiently calculate $p(\hat{V} = v^*| V = v^*, c)$ for all pairs of $(v^*, c)$ and
where 
 - $p(\hat{V} = v^*| V = v^*, c)$
 - $\hat{p}(\hat{X}_0^f|X_0^f, C)$, and 
 - $p(V|W,C)$ 
 
involve decreasing amounts of non-trivial computation.

### The structure of the pipeline

Given foundational files:
 - a language model $m$ over some orthographic vocabulary $V_m$
 - $n$-gram contexts $C$ taken from one or more speech corpora
 - a target orthographic vocabulary $V$ taken from each of the speech corpora that contexts are taken from
 - diphone gating data
 - one or more transcription relations $r$
 
the first step in the processing pipeline involves aligning segmental inventories of gating data and transcription relations. Once that's been done, we can define triphone channel distributions on transcription-relation aligned gating data that can be applied to gating-data aligned transcribed wordforms. We also need to define $p(V_m|C)$ for each choice of $C$.

Next, we create mutually combinable versions of transcription relations, channel distributions, and distributions over orthographic vocabularies - e.g.
 - we need to remove words from each $r$ that contain triphones that can't be modeled by the aligned triphone channel distribution
 - we need to remove words from each $r$ that aren't in the language model vocabulary $V_m$
 - $p(V_m|C)$ needs to be projected down to remove orthographic words that aren't in transcription relations and context sets $C$ need to be scrubbed of contexts containing orthographic wordforms not in the language model

With all the atomic components of the model(/each possible combined model) constructed, we then pre-calculate remaining components of the forward model(s). Finally, we calculate components of the posterior. (The separation of this last step from *everything previous* facilitates parallel computation.)

## Requirements

### Python environment
This repository was developed using Python 3.6 with the following package requirements (upstream dependencies not included):
 - **Jupyter/notebook related packages**: `jupyter` `jupyter_contrib_nbextensions` `papermill`
    - Notable notebook extensions used include `ExecuteTime` and `Table of Contents (2)`.
 - **Numeric computing and data processing**: `scipy` `numpy` `pandas` `tqdm` `joblib` `numba` `sparse` `pytorch` `torchvision`
 - **Language modeling**: `kenlm`, `arpa`
 - **Plotting**: `matplotlib` `plotnine`
 - **Misc**: `funcy` `more_itertools`

See `dev_environment.sh` for `conda` and `pip` commands to create an environment that has all of these packages. (Not all are available on conda, and not all of those that are available on conda are available from the main channel.)


### Hardware / runtime expectations

The last machine this repository was developed on (`wittgenstein`) has 32 cores. `joblib` is used extensively to parallelize data processing. On this machine, using `joblib` and often using nearly all of those cores:
1. Step 1 takes about a minute.
2. Step 2 takes about 3 hours.
3. Step 3 takes about 15-20 minutes.
4. Step 4 takes about 4.5 hours.

To run all of these scripts comfortably and without modification, you will need about 128GB of memory and ≈1 TB of hard disk space. Little or no attempt has been made to compress or avoid unnecessary file outputs except insofar as it creates representations that lead to usefully smaller memory loads or facilitate matrix-based compuations. Some unused or older versions of scripts in this repository were developed on machines with 180-190GB of RAM and use somewhere between 140 and 160GB of RAM on the largest inputs (invariably something related to the CMU dictionary - the largest lexicon by far).

In [2]:
import watermark
%load_ext watermark

In [3]:
%watermark -ihmuv

last updated: 2019-10-18T17:41:55-07:00

CPython 3.7.4
IPython 7.8.0

compiler   : GCC 7.3.0
system     : Linux
release    : 4.15.0-65-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 12
interpreter: 64bit
host name  : kotoba


In [4]:
!lscpu

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              12
On-line CPU(s) list: 0-11
Thread(s) per core:  2
Core(s) per socket:  6
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               158
Model name:          Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
Stepping:            10
CPU MHz:             2946.056
CPU max MHz:         4600.0000
CPU min MHz:         800.0000
BogoMIPS:            6384.00
Virtualization:      VT-x
L1d cache:           32K
L1i cache:           32K
L2 cache:            256K
L3 cache:            12288K
NUMA node0 CPU(s):   0-11
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq d

In [5]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            62G        1.1G         56G        6.2M        5.1G         61G
Swap:          2.0G        3.0M        2.0G


In [6]:
!modinfo nvidia | grep version

version:        430.50
srcversion:     820C553BA7509F4D25C0B8E


In [7]:
from platform import node

my_hostname = node(); my_hostname

'kotoba'

## Todo

0. **Extensibility**: Step 3b (filtering transcribed lexicons against language model vocabularies) uses output filenames that won't scale if any transcription lexicon is used with more than one language model. (This also affects 3c.)
1. **Extensibility/Modularity/Maintainability**: Every notebook that depends on the behavior or output of another notebook being a certain way should have that assumption flagged at the top of the notebook. 
   - **Most common example**: one of notebook $B$'s arguments is a *directory path* $d$, where it expects to find a specific set of files with certain fixed filenames (or with filenames that are derived in some way from one of $B$'s arguments - often $d$); this assumption is predicated on the behavior of some notebook $A$ earlier in the pipeline producing exactly some set of files in a common directory all with certain filenames.
2. **Portability/Reproducibility**: For every file that this repository depends on that *isn't* tracked by the repository (e.g. processed versions of swbd2003, Buckeye, etc.), there should be *something* (e.g. a cell in this script) that lets the user identify where those files are located, and then said something copies them to wherever this repository is expecting to find them.
3. **Portability/Reproducibility**: Check for and remove absolute paths in this and other files.
4. **Portability/Reproducibility**: For platform independence (read: supporting windows users, I guess?) use `path.join` instead of manually choosing directory slashes...
5. **Documentation**: 
   1. Make sure motivation section is up to date.
   2. Math-y documentation in channel distribution and posterior distribution notebooks probably needs to be updated / at least have notation overhauled.
   3. Go through notebooks used here and make sure `Overview` cells are accurate.
   4. Go through notebooks used here and make sure Papermill-related `Usage` cells are filled in / accurate.

# Imports

In [8]:
import papermill as pm

In [9]:
from tqdm import tqdm

In [10]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

import json
import csv

In [11]:
from copy import deepcopy

In [12]:
from collections import OrderedDict

In [13]:
from itertools import product
from boilerplate import ensure_dir_exists, union, endNote, startNote, stampedNote, stamp

In [14]:
def progress_report(nb_fp, arg_dict):
    startNote()
    output_dir = path.dirname(nb_fp)
    nb_fn = path.basename(nb_fp)
    print(f"Running notebook:\n\t{nb_fn}")
    print(f"Output directory:\n\t{output_dir}")
    print("Arguments:")
    print(json.dumps(arg_dict, indent=1))

In [15]:
from funcy import *

In [16]:
repo_dir = getcwd()
repo_dir

'/mnt/cube/home/AD/emeinhar/wr'

In [17]:
repo_contents = listdir()
repo_contents

['CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0',
 'MeasBasAnalysis.ipynb',
 'probdist.py',
 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model',
 'GD_AmE-diphones - LTR_NXT_swbd_destressed alignment application to LTR_NXT_swbd_destressed.ipynb',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0',
 'Calculate prefix data, k-cousins, and k-spheres (vec-dev).ipynb',
 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0',
 'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model',
 'Calculate orthographic posterior given segmental wordform + context (sparse + dask + tiledb).ipynb',
 'Calculate segmental posterior given segmental wordform + context - LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_4gram_model - pc=0.01, λ=0.25.ipynb',
 'Wo

# Permitted steps / control flow

To facilitate quick (re)running of just the steps you want to (re)do and to avoid tediously (re)running steps you do not want to calculate, this list here controls what calculation steps will actually be done. (Combine with 'Run All' to facilitate quick calculation.)

In [163]:
permittedSteps = [
##     '0', #always checked
#     '1a',
#     '1b',
#     '2ai',
#     '2aii',
#     '2b',
#     '3a',
#     '3b',
    '3c',
#     '3d',
#     '3e',
#     '4a',
#     '4b',
#     '4c',
#     '4d',
#     '4e',
#     '5a',
#     '5b',
#     '5c',
#     '6a',
#     '6b',
#     '6c',
]

assert '0' not in permittedSteps, "Step 0 is currently always performed / skipping is not supported."

If the flag below is `False`, then (in selected places where I've added support in this notebook) the driver notebook will check for and not overwrite existing output notebooks (evidence of a previous and assumed-to-be-successful run).

In [19]:
overwrite = True

If the flag below is `True`, then this machine will skip (*only* in Steps 4e, 5b) processing 'trim' inputs. (See parameters in the Step 3e notebook.)

In [20]:
skip_trim = True

# Step 0: Import/check for foundational files

**NOTE:**
 - I assume all relevant transcriptions have been converted to Unicode IPA characters and that segment sequences have `.` as a separator. For each data source used here, the IPA alignment step is documented in a GitHub repository elsewhere. (While I don't *think* any script here depends on use of Unicode IPA symbols, I haven't - and won't - test that idea, and it is *absolutely required* that contiguous sequences of segments be separated by `.` in data files.)
 - Where language models and n-gram contexts (drawn from speech corpora) are referenced, each of these is assumed to have come from as is from other GitHub repositories/from executing the notebooks in those repositories.

## Importing existing data and creating directories

In [21]:
from shutil import copy

In [22]:
# AmE_gating_data_dir = 'GD_AmE'
# AmE_gating_data_fn = 'AmE-diphones-IPA-annotated-columns.csv'
# AmE_GD_fp = path.join(AmE_gating_data_dir, AmE_gating_data_fn)

# ensure_dir_exists(AmE_gating_data_dir)

# if not path.exists(AmE_GD_fp):
#     'Gating data file {0} does not exist. Attempting to copy from repository in sister folder.'.format(AmE_GD_fp)
    
#     wmc2014ipa_repo_dir = '../wmc2014-ipa'

#     assert path.exists(wmc2014ipa_repo_dir), "Cannot move gating data file from repository for you because it cannot be found.\n For automatic placement, you must download the wmc2014-ipa repository (from https://github.com/emeinhardt/wmc2014-ipa) and place it in the same folder as 'wr'"

#     copy(path.join(wmc2014ipa_repo_dir, AmE_gating_data_fn),
#          path.join(repo_dir, AmE_gating_data_dir, AmE_gating_data_fn))

In [23]:
# newdic_destressed_ltr_folder = 'LTR_newdic_destressed'
# cmu_destressed_ltr_folder = 'LTR_CMU_destressed'
# cmu_stressed_ltr_folder = 'LTR_CMU_stressed'
# buckeye_ltr_folder = 'LTR_Buckeye'
# nxt_swbd_ltr_folder = 'LTR_NXT_swbd_destressed'

# LTR_folders = (newdic_destressed_ltr_folder, cmu_destressed_ltr_folder, cmu_stressed_ltr_folder, buckeye_ltr_folder, nxt_swbd_ltr_folder)
# LTR_folders_to_process = (newdic_destressed_ltr_folder, cmu_destressed_ltr_folder, buckeye_ltr_folder, nxt_swbd_ltr_folder)

# LTR_folders_to_repo_dir = {
#     newdic_destressed_ltr_folder:'../newdic-nettalk-ipa',
#     cmu_destressed_ltr_folder:'../cmu-ipa',
#     cmu_stressed_ltr_folder:'../cmu-ipa',
#     buckeye_ltr_folder:'../buckeye-lm',
#     nxt_swbd_ltr_folder:'../switchboard-lm'
# }

# for LTR_dir in LTR_folders_to_process:
#     ensure_dir_exists(LTR_dir)
#     LTR_fp = path.join(LTR_dir, LTR_dir + '.tsv')
    
#     if not path.exists(LTR_fp):
#         'Transcribed lexicon relation {0} does not exist. Attempting to copy from repository in sister folder.'.format(LTR_fp)
    
#     LTR_repo_dir = LTR_folders_to_repo_dir[LTR_dir]
#     assert path.exists(LTR_repo_dir), "Cannot move transcribed lexicon relation .tsv from repository for you because it cannot be found.\n For automatic placement, you must download the {0} repository (from https://github.com/emeinhardt/{0}) and place it in the same folder as 'wr'".format(LTR_repo_dir)
    
    

The four steps below verify that foundational files assumed to be present by downstream notebooks are, in fact, present in the repository directory. If for some reason those files are not present, the processing pipeline will be aborted.

## Step 0a: Check for gating data

In [24]:
AmE_gating_data_dir = 'GD_AmE'
AmE_gating_data_fn = 'AmE-diphones-IPA-annotated-columns.csv'
AmE_GD_fp = path.join(AmE_gating_data_dir, AmE_gating_data_fn)

ensure_dir_exists(AmE_gating_data_dir)

if not path.exists(AmE_GD_fp):
    'Gating data file {0} does not exist. Attempting to copy from repository in sister folder.'.format(AmE_GD_fp)
    
    wmc2014ipa_repo_dir = '../wmc2014-ipa'

    assert path.exists(wmc2014ipa_repo_dir), "Cannot move gating data file from repository for you because it cannot be found.\n For automatic placement, you must download the wmc2014-ipa repository (from https://github.com/emeinhardt/wmc2014-ipa) and place it in the same folder as 'wr'"

    copy(path.join(wmc2014ipa_repo_dir, AmE_gating_data_fn),
         path.join(repo_dir, AmE_gating_data_dir, AmE_gating_data_fn))

In [25]:
# AmE_gating_data_dir = 'GD_AmE'
# AmE_gating_data_fn = 'AmE-diphones-IPA-annotated-columns.csv'
# AmE_GD_fp = path.join(AmE_gating_data_dir, AmE_gating_data_fn)
assert path.exists(AmE_gating_data_dir), 'Gating data directory {0} does not exist.'.format(AmE_gating_data_dir)
assert path.exists(AmE_GD_fp), 'Gating data file {0} does not exist.'.format(AmE_GD_fp)

The processed gating data used here come from 
 - https://github.com/emeinhardt/wmc2014-ipa
 
See those repositories for information on how they were produced.

## Step 0b: Check for transcribed lexicons

 - Each transcribed lexicon `LEXNAME` should be in a folder (e.g. `LTR_LEXNAME`) containing a file `LTR_LEXNAME.tsv`. For documentation purposes, the source file and a notebook documenting the production of the `.tsv` file should, if practicable be included in the folder as well.
   - A transcribed lexicon `LTR_....tsv` file should have two columns: `Orthographic_Wordform` and `Transcription`.
   - NB: The `LTR_` prefix on transcribed lexicon data files and containing folders is simply a convention for organization and readability, but is not required or expected by any file or script in this repository.

The assertions in the code below will only succeed if the condition described above is true for all transcribed lexicons listed for checking below.

In [26]:
newdic_destressed_ltr_folder = 'LTR_newdic_destressed'
cmu_destressed_ltr_folder = 'LTR_CMU_destressed'
cmu_stressed_ltr_folder = 'LTR_CMU_stressed'
buckeye_ltr_folder = 'LTR_Buckeye'
nxt_swbd_ltr_folder = 'LTR_NXT_swbd_destressed'

LTR_folders = (newdic_destressed_ltr_folder, cmu_destressed_ltr_folder, cmu_stressed_ltr_folder, buckeye_ltr_folder, nxt_swbd_ltr_folder)
# LTR_folders_to_process = (newdic_destressed_ltr_folder, cmu_destressed_ltr_folder, buckeye_ltr_folder, nxt_swbd_ltr_folder)
LTR_folders_to_process = (buckeye_ltr_folder, nxt_swbd_ltr_folder)

# LTR_folders_to_repo_dir = {
#     newdic_destressed_ltr_folder:'../newdic-nettalk-ipa',
#     cmu_destressed_ltr_folder:'../cmu-ipa',
#     cmu_stressed_ltr_folder:'../cmu-ipa',
#     buckeye_ltr_folder:'../buckeye-lm',
#     nxt_swbd_ltr_folder:'../switchboard-lm'
# }

# for LTR_dir in LTR_folders_to_process:
#     ensure_dir_exists(LTR_dir)
#     LTR_fp = path.join(LTR_dir, LTR_dir + '.tsv')
    
#     if not path.exists(LTR_fp):
#         'Transcribed lexicon relation {0} does not exist.'.format(LTR_fp)
    
#         if LTR_dir in {buckeye_ltr_folder, nxt_swbd_ltr_folder}:
#             print('Attempting to copy transcribed lexicon relation .tsv from repository folder...')
            
#             LTR_repo_dir = LTR_folders_to_repo_dir[LTR_dir]
            
#             assert path.exists(LTR_repo_dir), "Cannot move transcribed lexicon relation .tsv from repository for you because it cannot be found.\n For automatic placement, you must download the {0} repository (from https://github.com/emeinhardt/{0}) and place it in the same folder as 'wr'".format(LTR_repo_dir)
            
#             copy(path.join(LTR_repo_dir, FIXME), 
#                  path.join(repo_dir, LTR_dir, LTR_dir + '.tsv'))
    
    

for dirname in tqdm(LTR_folders_to_process):
    assert path.exists(dirname), 'Transcribed lexicon directory {0} not found in repo directory'.format(dirname)
    fname = path.join(dirname, dirname + '.tsv')
    assert path.exists(fname), 'Transcribed lexicon {0} not found in repo directory'.format(fname)

100%|██████████| 2/2 [00:00<00:00, 688.21it/s]


**How are transcribed lexicon relations made?**

Each was created by processing a transcription source (a lexical database, an annotated corpus, etc.). The processing step is described in other repositories:
 - https://github.com/emeinhardt/newdic-nettalk-ipa
 - https://github.com/emeinhardt/cmu-ipa
 - https://github.com/emeinhardt/buckeye-lm
 - https://github.com/emeinhardt/switchboard-lm

Given the processed outputs of these repositories, the `Making a Transcribed Lexicon Relation - <LEXNAME>.ipynb` notebook in each `LTR_LEXNAME` folder describes how the homogeneous `.tsv` files downstream steps are created.

**To be clear, if the filepaths checked above don't already exist, you will have to create them. The expected way of doing that (e.g. for out-of-the-box currently supported transcription lexicons) is opening the specified directory and running the notebook already there.**

**If no such directory or notebook exists, I suggest you create it, following the existing directories and notebooks as templates.**

In [27]:
for dirname in LTR_folders_to_process:
    listdir(dirname)

['LTR_Buckeye.tsv',
 'buckeye_orthography_phonemic_transcription_relation.tsv',
 'Making a Transcribed Lexicon Relation - Buckeye-old.ipynb',
 'buckeye_words_analysis_relation.json',
 'buckeye_vocabulary_main.txt',
 '.ipynb_checkpoints',
 'Making a Transcribed Lexicon Relation - Buckeye.ipynb']

['nxt_swbd_orthography_transcription_relation.tsv',
 '.ipynb_checkpoints',
 'NXT_swbd_vocabulary_main.txt',
 'Making a Transcribed Lexicon Relation - NXT_swbd.ipynb',
 'LTR_NXT_swbd_destressed.tsv']

## Step 0c: Check for n-gram contexts

In [28]:
from funcy import str_join, walk_values

In [29]:
# context_size_range = (1,2,3,4) 
context_size_range = (1,2,3)

In [30]:
context_fns = {
    #swbd2003 wordforms need to be POS tagged and 
    # contexts need to be organized by size + correctly constructed and
    # exclusion criteria need to be applied
#     'swbd2003':FIXME,
    'Buckeye':{'preceding':{l:str_join('_', ['buckeye', 'contexts', 'preceding', str(l), 'filtered']) + '.txt' 
                            for l in (1,2,3,4)},
               'following':{l:str_join('_', ['buckeye', 'contexts', 'following', str(l), 'filtered']) + '.txt' 
                            for l in (1,2,3,4)},
               'bidirectional':'buckeye_contexts_bidirectional_filtered.json'},
    'NXT_swbd':{'preceding':{l:str_join('_', ['nxt_swbd', 'contexts', 'preceding', str(l), 'filtered']) + '.txt' 
                            for l in (1,2,3,4)},
                'following':{l:str_join('_', ['nxt_swbd', 'contexts', 'following', str(l), 'filtered']) + '.txt' 
                            for l in (1,2,3,4)},
                'bidirectional':'nxt_swbd_contexts_bidirectional_filtered.json'}
}

context_dirs = tuple([str_join('_', ('C', corpus_name)) 
                      for corpus_name in context_fns])

for context_dir in context_dirs:
    ensure_dir_exists(context_dir)

context_dir_to_repo_dir = {
#     swbd2003_ltr_folder:'../switchboard-lm',
    'Buckeye':'../buckeye-lm',
    'NXT_swbd':'../switchboard-lm'
}

corpus_to_contexts = {
    'Buckeye':list(sorted({context_fns['Buckeye'][direction][l] 
                           for direction in ('preceding', 'following') for l in context_size_range})) + ['buckeye_contexts_bidirectional_filtered.json'],
    'NXT_swbd':list(sorted({context_fns['NXT_swbd'][direction][l] 
                           for direction in ('preceding', 'following') for l in context_size_range})) + ['nxt_swbd_contexts_bidirectional_filtered.json'],
}
corpus_to_contexts = walk_values(tuple, corpus_to_contexts)
corpus_to_contexts

for context_dir in context_dirs:
    context_name = context_dir[2:]
    for direction in context_fns[context_name]:
        if direction == 'bidirectional':
            context_fn = context_fns[context_name][direction]
            print(f"Checking in\n\t'{context_dir}'\nfor\n\t'{context_fn}'\n")
            
            if overwrite or not path.exists(path.join(context_dir, context_fn)):
                if not path.exists(path.join(context_dir, context_fn)):
                    print('{0} not found in {1}.'.format(context_fn, context_dir))
                repo_dir = context_dir_to_repo_dir[context_name]
                print('Attempting to copy context file {0} from {1}.'.format(context_fn, repo_dir))
                
                copy(path.join(repo_dir, context_fn),
                     path.join(context_dir, context_fn))
        else:
            for l in context_fns[context_name][direction]:
                context_fn = context_fns[context_name][direction][l]
                print(f"Checking in\n\t'{context_dir}'\nfor\n\t'{context_fn}'\n")
                
                if overwrite or not path.exists(path.join(context_dir, context_fn)):
                    if not path.exists(path.join(context_dir, context_fn)):
                        print('{0} not found in {1}.'.format(context_fn, context_dir))
                    repo_dir = context_dir_to_repo_dir[context_name]
                    print('Attempting to copy context file {0} from {1}.'.format(context_fn, repo_dir))
                
                    copy(path.join(repo_dir, context_fn),
                         path.join(context_dir, context_fn))
                

# buckeye_contexts = 'buckeye_contexts.txt'
# swbd2003_contexts = 'swbd2003_contexts.txt'

# contexts = (buckeye_contexts, swbd2003_contexts)

# for c_fn in contexts:
#     assert path.exists(c_fn), "N-gram contexts file {0} does not exist.".format(c_fn)

{'Buckeye': ('buckeye_contexts_following_1_filtered.txt',
  'buckeye_contexts_following_2_filtered.txt',
  'buckeye_contexts_following_3_filtered.txt',
  'buckeye_contexts_preceding_1_filtered.txt',
  'buckeye_contexts_preceding_2_filtered.txt',
  'buckeye_contexts_preceding_3_filtered.txt',
  'buckeye_contexts_bidirectional_filtered.json'),
 'NXT_swbd': ('nxt_swbd_contexts_following_1_filtered.txt',
  'nxt_swbd_contexts_following_2_filtered.txt',
  'nxt_swbd_contexts_following_3_filtered.txt',
  'nxt_swbd_contexts_preceding_1_filtered.txt',
  'nxt_swbd_contexts_preceding_2_filtered.txt',
  'nxt_swbd_contexts_preceding_3_filtered.txt',
  'nxt_swbd_contexts_bidirectional_filtered.json')}

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_preceding_1_filtered.txt'

Attempting to copy context file buckeye_contexts_preceding_1_filtered.txt from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_preceding_1_filtered.txt'

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_preceding_2_filtered.txt'

Attempting to copy context file buckeye_contexts_preceding_2_filtered.txt from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_preceding_2_filtered.txt'

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_preceding_3_filtered.txt'

Attempting to copy context file buckeye_contexts_preceding_3_filtered.txt from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_preceding_3_filtered.txt'

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_preceding_4_filtered.txt'

Attempting to copy context file buckeye_contexts_preceding_4_filtered.txt from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_preceding_4_filtered.txt'

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_following_1_filtered.txt'

Attempting to copy context file buckeye_contexts_following_1_filtered.txt from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_following_1_filtered.txt'

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_following_2_filtered.txt'

Attempting to copy context file buckeye_contexts_following_2_filtered.txt from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_following_2_filtered.txt'

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_following_3_filtered.txt'

Attempting to copy context file buckeye_contexts_following_3_filtered.txt from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_following_3_filtered.txt'

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_following_4_filtered.txt'

Attempting to copy context file buckeye_contexts_following_4_filtered.txt from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_following_4_filtered.txt'

Checking in
	'C_Buckeye'
for
	'buckeye_contexts_bidirectional_filtered.json'

Attempting to copy context file buckeye_contexts_bidirectional_filtered.json from ../buckeye-lm.


'C_Buckeye/buckeye_contexts_bidirectional_filtered.json'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_preceding_1_filtered.txt'

Attempting to copy context file nxt_swbd_contexts_preceding_1_filtered.txt from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_preceding_1_filtered.txt'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_preceding_2_filtered.txt'

Attempting to copy context file nxt_swbd_contexts_preceding_2_filtered.txt from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_preceding_2_filtered.txt'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_preceding_3_filtered.txt'

Attempting to copy context file nxt_swbd_contexts_preceding_3_filtered.txt from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_preceding_3_filtered.txt'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_preceding_4_filtered.txt'

Attempting to copy context file nxt_swbd_contexts_preceding_4_filtered.txt from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_preceding_4_filtered.txt'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_following_1_filtered.txt'

Attempting to copy context file nxt_swbd_contexts_following_1_filtered.txt from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_following_1_filtered.txt'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_following_2_filtered.txt'

Attempting to copy context file nxt_swbd_contexts_following_2_filtered.txt from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_following_2_filtered.txt'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_following_3_filtered.txt'

Attempting to copy context file nxt_swbd_contexts_following_3_filtered.txt from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_following_3_filtered.txt'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_following_4_filtered.txt'

Attempting to copy context file nxt_swbd_contexts_following_4_filtered.txt from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_following_4_filtered.txt'

Checking in
	'C_NXT_swbd'
for
	'nxt_swbd_contexts_bidirectional_filtered.json'

Attempting to copy context file nxt_swbd_contexts_bidirectional_filtered.json from ../switchboard-lm.


'C_NXT_swbd/nxt_swbd_contexts_bidirectional_filtered.json'

The n-gram context files are taken from
 - https://github.com/emeinhardt/buckeye-lm
 - https://github.com/emeinhardt/switchboard-lm
 
See those repositories for more information on how the contexts were extracted. (*NB*: Like the transcription lexicons, the context files are not included in this repository both to avoid duplication and because of licensing restrictions: to recreate these contexts, you will need access to your own copy of the Buckeye and (various) Switchboard corpora.)

## Step 0d: Check for language model(s)

In [31]:
# order_range = (2,3,4,5)
order_range = (2,3,4)

In [32]:
fisher_lm_dir = 'LM_Fisher'

ensure_dir_exists(fisher_lm_dir)

LM_fn_stem = 'fisher_utterances_main'
LM_fns = {
    '':{l:LM_fn_stem + '_' + str(l) + 'gram.mmap'
        for l in order_range},
    'rev':{l:LM_fn_stem + '_' + 'rev' + '_' + str(l) + 'gram.mmap'
           for l in order_range}
}

fisher_lm_repo_dir = '../fisher-lm'

for direction in LM_fns:
    for l in LM_fns[direction]:
        lm_fn = LM_fns[direction][l]
        
        if not path.exists(path.join(fisher_lm_dir, lm_fn)):
            print('{0} not found in {1}'.format(lm_fn, fisher_lm_dir))
            print('Attempting to copy from repository directory...')
            copy(path.join(fisher_lm_repo_dir, lm_fn),
                 path.join(fisher_lm_dir, lm_fn))
            

# fisher_lm_fn = 'fisher_utterances_main_4gram.mmap'
# fisher_lm_fp = path.join(fisher_lm_dir, fisher_lm_fn)

fisher_lm_vocab_fn = 'fisher_vocabulary_main.txt'
if not path.exists(path.join(fisher_lm_dir, fisher_lm_vocab_fn)):
    print('{0} not found in {1}'.format(fisher_lm_vocab_fn, fisher_lm_dir))
    print('Attempting to copy from repository directory...')
    copy(path.join(fisher_lm_repo_dir, fisher_lm_vocab_fn),
         path.join(fisher_lm_dir, fisher_lm_vocab_fn))

# fisher_lm_vocab_fp = path.join(fisher_lm_dir, fisher_lm_vocab_fn)

# assert path.exists(fisher_lm_fp), 'Language model {0} not found'.format(fisher_lm_fp)
# assert path.exists(fisher_lm_vocab_fp), 'Language model vocabulary {0} not found'.format(fisher_lm_vocab_fp)

# fisher_lm_fps = {'lm':fisher_lm_fp, 
#                  'vocab':fisher_lm_vocab_fp}

The (memory mapped) $n$-gram language models (for $n \geq 2$ are copied from the output of this repository:
 - https://github.com/emeinhardt/fisher-lm
 
See that repository for more information. (*NB* Again, the language model file is not included in this repository both to avoid duplication and because of licensing restrictions: to recreate these contexts, you will need access to your own copy of the Buckeye and Switchboard corpora.

In [33]:
unigram_fisher_lm_repo_dir = '../fisher-lm-srilm'

unigram_LM_fns = {'fisher_unigram_counts.tsv',
                  'LD_Fisher_vocab_add1_unigram_model.arpa',
                  'LD_Fisher_vocab_add1_unigram_model.pV.npy',
                  'LD_Fisher_vocab_add1_unigram_model.pV.npy_metadata.json',
                  'LD_Fisher_vocab_add1_unigram_model.pV.json'}

for lm_fn in unigram_LM_fns:
    if not path.exists(path.join(fisher_lm_dir, lm_fn)):
        print('{0} not found in {1}'.format(lm_fn, fisher_lm_dir))
        print('Attempting to copy from repository directory...')
        copy(path.join(unigram_fisher_lm_repo_dir, lm_fn),
             path.join(fisher_lm_dir, lm_fn))

In [34]:
main_unigram_counts_fp = path.join(fisher_lm_dir, 
                                   'fisher_unigram_counts.tsv')
main_unigram_LM_fp = path.join(fisher_lm_dir, 
                               'LD_Fisher_vocab_add1_unigram_model.arpa')

Unigram model files are copied from the output of this repository:
 - https://github.com/emeinhardt/fisher-lm-srilm
 
See that repository for more information.

# Step 1: Segment inventory alignment

## Step 1a: Define inventory alignment projections

The segment inventory of any given transcribed lexicon and the segment inventory of the gating data often do not line up. For the gating data to be usefully applied to a given lexicon of transcriptions, the strings in the (segmental) lexicon must contain only segments found in the gating data stimuli inventory.

To ensure this happens, the notebook `Gating Data - Transcription Lexicon Alignment Maker.ipynb` 
 - takes as inputs 
     - a transcribed lexicon file path and a gating data file path
     - a lexicon projection file path and a gating data projection file path
 - identifies the inventories of each and what symbols are relatively unique to the lexicon and the gating data
 - produces 
   - *a Jupyter notebook* for **you to open and finish by defining two projection functions** (i.e. Python dictionaries) to be applied to strings in the transcribed lexicon and to the gating data (one function for each). When you finish doing this (and set an export flag in the notebook to True and run the remainder of the notebook), this notebook will produce
     - two *.json files storing these projections* according to the previously provided output file paths.

The cell below will clear all existing alignment folders created using the code in this subsection:

In [35]:
# %rm -rf LTR*_aligned_w_*
# %rm -rf *" alignment definition"*

The cell below will only succeed if the American English gating data of Warner, McQueen, and Cutler (2014) is contained in the repo directory with a particular directory and filename.

In [36]:
gating_data_folder = 'GD_AmE'
gating_data_fn = 'AmE-diphones-IPA-annotated-columns.csv'
gating_data_fp = path.join(gating_data_folder, gating_data_fn)

assert path.exists(gating_data_folder), 'AmE gating data folder {0} not found in repo directory'.format(gating_data_folder)
assert path.exists(gating_data_fp), 'AmE gating data {0} not found in repo directory'.format(gating_data_fp)

The third cell below will create a notebook for alignment projection definitions for each of the transcribed lexicons from the previous step and the AmE gating data.

In [37]:
#FIXME replace usage with path.splitext
def removeExtension(fp):
    dir_name = path.dirname(fp)
    file_name = path.basename(fp)
    ext = file_name.split('.')[-1]
    rest = '.'.join( file_name.split('.')[:-1] )
    return path.join(dir_name, rest)

In [38]:
alignment_arg_bundles = []
for LTR_dirname in LTR_folders_to_process:
    LTR_fn = LTR_dirname + '.tsv'
    LTR_fp = path.join(LTR_dirname, LTR_fn)
    
    nb_output_name = 'GD_AmE-diphones - ' + LTR_dirname + ' alignment definition' + '.ipynb'
    my_g = gating_data_fp
    my_l = LTR_fp
    my_s = 'destressed'
    
    gd_alignment_dn = 'GD_AmE_' + my_s + '_' + 'aligned_w_' + LTR_dirname
    gd_alignment_fn = 'alignment_of_' + removeExtension(gating_data_fn) + '_w_' + LTR_dirname + '.json'
    gd_alignment_fp = path.join(gd_alignment_dn, gd_alignment_fn)
    if not path.exists(gd_alignment_dn):
        makedirs(gd_alignment_dn)
    my_gp = gd_alignment_fp
    
    ltr_alignment_dn = LTR_dirname + '_aligned_w_' + 'GD_AmE_' + my_s
    ltr_alignment_fn = 'alignment_of_' + LTR_dirname + '_w_' + removeExtension(gating_data_fn) + '.json'
    ltr_alignment_fp = path.join(ltr_alignment_dn, ltr_alignment_fn)
    if not path.exists(ltr_alignment_dn):
        makedirs(ltr_alignment_dn)
    my_lp = ltr_alignment_fp
    
    
    my_arg_bundle = OrderedDict({
        'LTR_dirname':LTR_dirname,
        'LTR_fn':LTR_fn,
        'LTR_fp':LTR_fp,
        'gd_alignment_dn':gd_alignment_dn,
        'gd_alignment_fn':gd_alignment_fn,
        'gd_alignment_fp':gd_alignment_fp,
        'ltr_alignment_dn':ltr_alignment_dn,
        'ltr_alignment_fn':ltr_alignment_fn,
        'ltr_alignment_fp':ltr_alignment_fp,
        'align_def_nb_output_name':nb_output_name,
        'my_g':my_g,
        'my_l':my_l,
        'my_s':my_s,
        'my_gp':my_gp,
        'my_lp':my_lp,
    })
    my_arg_bundle
    alignment_arg_bundles.append(my_arg_bundle)

OrderedDict([('LTR_dirname', 'LTR_Buckeye'),
             ('LTR_fn', 'LTR_Buckeye.tsv'),
             ('LTR_fp', 'LTR_Buckeye/LTR_Buckeye.tsv'),
             ('gd_alignment_dn', 'GD_AmE_destressed_aligned_w_LTR_Buckeye'),
             ('gd_alignment_fn',
              'alignment_of_AmE-diphones-IPA-annotated-columns_w_LTR_Buckeye.json'),
             ('gd_alignment_fp',
              'GD_AmE_destressed_aligned_w_LTR_Buckeye/alignment_of_AmE-diphones-IPA-annotated-columns_w_LTR_Buckeye.json'),
             ('ltr_alignment_dn', 'LTR_Buckeye_aligned_w_GD_AmE_destressed'),
             ('ltr_alignment_fn',
              'alignment_of_LTR_Buckeye_w_AmE-diphones-IPA-annotated-columns.json'),
             ('ltr_alignment_fp',
              'LTR_Buckeye_aligned_w_GD_AmE_destressed/alignment_of_LTR_Buckeye_w_AmE-diphones-IPA-annotated-columns.json'),
             ('align_def_nb_output_name',
              'GD_AmE-diphones - LTR_Buckeye alignment definition.ipynb'),
             ('my_g', 'GD_AmE

OrderedDict([('LTR_dirname', 'LTR_NXT_swbd_destressed'),
             ('LTR_fn', 'LTR_NXT_swbd_destressed.tsv'),
             ('LTR_fp', 'LTR_NXT_swbd_destressed/LTR_NXT_swbd_destressed.tsv'),
             ('gd_alignment_dn',
              'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed'),
             ('gd_alignment_fn',
              'alignment_of_AmE-diphones-IPA-annotated-columns_w_LTR_NXT_swbd_destressed.json'),
             ('gd_alignment_fp',
              'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/alignment_of_AmE-diphones-IPA-annotated-columns_w_LTR_NXT_swbd_destressed.json'),
             ('ltr_alignment_dn',
              'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed'),
             ('ltr_alignment_fn',
              'alignment_of_LTR_NXT_swbd_destressed_w_AmE-diphones-IPA-annotated-columns.json'),
             ('ltr_alignment_fp',
              'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/alignment_of_LTR_NXT_swbd_destressed_w_AmE-diphones-IPA-

In [39]:
if '1a' in permittedSteps:
    # takes ~30s on wittgenstein
    for arg_bundle in tqdm(alignment_arg_bundles):
        nb = pm.execute_notebook(
            'Gating Data - Transcription Lexicon Alignment Maker.ipynb',
            arg_bundle['align_def_nb_output_name'],
            parameters=dict(g = arg_bundle['my_g'], 
                            l = arg_bundle['my_l'], 
                            s = arg_bundle['my_s'], 
                            gp = arg_bundle['my_gp'], 
                            lp = arg_bundle['my_lp'])
        )
    #     pm.execute_notebook(
    #        'Gating Data - Transcription Lexicon Alignment Maker.ipynb',
    #        nb_output_name,
    #        parameters=dict(g = my_g, l = my_l, s = my_s, gp = my_gp, lp = my_lp)
    #     )
        print("Finished creating alignment definition notebook '{0}'.\nOpen and run the notebook, complete the projection definition, and run the remainder of the notebook (remembering to change the export flag to 'True').\n".format(arg_bundle['align_def_nb_output_name']))

## Step 1b: Apply inventory alignment projections

The cell below will clear all existing alignment folders created using the code in this subsection:

In [40]:
# %rm -rf *" alignment application "*

### Check for projection definitions

The cell below will succeed if you have run each of the previously produced notebooks correctly and produced a projection mapping file.

In [41]:
for arg_bundle in alignment_arg_bundles:
    args = arg_bundle
    assert path.exists(args['gd_alignment_fp']), 'Gating data alignment projection mapping not found:\n\t{0}'.format(args['gd_alignment_fp'])
    assert path.exists(args['ltr_alignment_fp']), 'Transcribed lexicon data alignment projection mapping not found:\n\t{0}'.format(args['ltr_alignment_fp'])

### How are inventory alignment projections actually applied?

See `Align transcriptions.ipynb`.

### Apply projection definitions

The cell below applies each pair of alignment projections to each matched pair of gating data and transcribed lexicon choice:

In [42]:
for arg_bundle in alignment_arg_bundles:
    args = arg_bundle
    LTR_fn = args['LTR_fn']
    
#     my_pg = args['my_gp']
#     my_g = args['my_g']
    my_o_fn = 'GD_AmE-diphones' + '_aligned_w_' + removeExtension(LTR_fn) + '.tsv'
    my_og = path.join(args['gd_alignment_dn'], my_o_fn)
    args['align_apply_gd_nb_output_name'] = 'GD_AmE-diphones - ' + removeExtension(LTR_fn) + ' alignment application to ' + 'AmE-diphones' + '.ipynb'
    args['my_og'] = my_og
    
#     my_pl = args['my_lp']
#     my_l = args['my_l']
    my_o_fn = removeExtension(LTR_fn) + '_aligned_w_' + 'GD_AmE-diphones' + '.tsv'
    my_ol = path.join(args['ltr_alignment_dn'], my_o_fn)
    args['align_apply_ltr_nb_output_name'] = 'GD_AmE-diphones - ' + removeExtension(LTR_fn) + ' alignment application to ' + removeExtension(LTR_fn) + '.ipynb'
    args['my_ol'] = my_ol

In [43]:
if '1b' in permittedSteps:
    # takes ~45s on wittgenstein
    for arg_bundle in alignment_arg_bundles:
        args = arg_bundle
    #     LTR_fn = args['LTR_fn']
        startNote()
        my_pg = args['my_gp']
        my_g = args['my_g']
    #     my_o_fn = 'GD_AmE-diphones' + '_aligned_w_' + removeExtension(LTR_fn) + '.tsv'
    #     my_og = path.join(args['gd_alignment_dn'], my_o_fn)
    #     args['align_apply_gd_nb_output_name'] = 'GD_AmE-diphones - ' + removeExtension(LTR_fn) + ' alignment application to ' + 'AmE-diphones' + '.ipynb'
    #     args['my_og'] = my_og
        my_og = args['my_og']
        print("Creating notebook '{0}' w/ args p, g, o = \n\t{1}\n\t{2}\n\t{3}".format(args['align_apply_gd_nb_output_name'], my_pg, my_g, my_og))
        nb = pm.execute_notebook(
            'Align transcriptions.ipynb',
            args['align_apply_gd_nb_output_name'],
            parameters=dict(p = my_pg,
                            g = my_g,
                            o = my_og)
        )
        print('Finished applying alignment projection\n\tp = {0}\nto\n\tg = {1}\nResult saved to\n\t{2}'.format(my_pg, my_g, my_og))
        print(' ')

        my_pl = args['my_lp']
        my_l = args['my_l']
    #     my_o_fn = removeExtension(LTR_fn) + '_aligned_w_' + 'GD_AmE-diphones' + '.tsv'
    #     my_ol = path.join(args['ltr_alignment_dn'], my_o_fn)
    #     args['align_apply_ltr_nb_output_name'] = 'GD_AmE-diphones - ' + removeExtension(LTR_fn) + ' alignment application to ' + removeExtension(LTR_fn) + '.ipynb'
    #     args['my_ol'] = my_ol
        my_ol = args['my_ol']
        print('Creating notebook {0} w/ args p, g, o = \n\t{1}\n\t{2}\n\t{3}'.format(args['align_apply_ltr_nb_output_name'], my_pg, my_l, my_ol))
        nb = pm.execute_notebook(
            'Align transcriptions.ipynb',
            args['align_apply_ltr_nb_output_name'],
            parameters=dict(p = my_pl,
                            l = my_l,
                            o = my_ol)
        )
        print('Finished applying alignment projection\n\tp = {0}\nto\n\tl = {1}\nResult saved to\n\t{2}'.format(my_pl, my_l, my_ol))
        endNote()
        print('\n')

# Step 2: Generating channel and (orthographic) lexicon distributions

## Step 2a: Generating channel distributions and associated metadata

In [44]:
%ls -d GD_*

 [0m[01;34mGD_AmE[0m/
 [01;34mGD_AmE_destressed_aligned_w_LTR_Buckeye[0m/
 [01;34mGD_AmE_destressed_aligned_w_LTR_CMU_destressed[0m/
 [01;34mGD_AmE_destressed_aligned_w_LTR_newdic_destressed[0m/
 [01;34mGD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed[0m/
'GD_AmE-diphones - LTR_Buckeye alignment application to AmE-diphones.ipynb'
'GD_AmE-diphones - LTR_Buckeye alignment application to LTR_Buckeye.ipynb'
'GD_AmE-diphones - LTR_Buckeye alignment definition.ipynb'
'GD_AmE-diphones - LTR_CMU_destressed alignment application to AmE-diphones.ipynb'
'GD_AmE-diphones - LTR_CMU_destressed alignment application to LTR_CMU_destressed.ipynb'
'GD_AmE-diphones - LTR_CMU_destressed alignment definition.ipynb'
'GD_AmE-diphones - LTR_newdic_destressed alignment application to AmE-diphones.ipynb'
'GD_AmE-diphones - LTR_newdic_destressed alignment application to LTR_newdic_destressed.ipynb'
'GD_AmE-diphones - LTR_newdic_destressed alignment definition.ipynb'
'GD_AmE-diphones -

In [45]:
gating_data_folders = ('GD_AmE', ) + tuple(map(lambda ab: ab['gd_alignment_dn'], alignment_arg_bundles))
gating_data_folders

('GD_AmE',
 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed')

In [46]:
gating_data_fps = ('GD_AmE/AmE-diphones-IPA-annotated-columns.csv',) + \
                  tuple(map(lambda ab: ab['my_og'], alignment_arg_bundles))
gating_data_fps

('GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv')

### Metadata

First (for downstream convenience) we identify the $n$-phones (not) contained in and (not) constructible from each of the versions of the gating data.

**NB:** This is done by calling the notebook `Run n-phone analysis of gating data.ipynb`, passing it **the filepath to a gating data `.csv`/`.tsv` file** and **a path to an output directory** for the dozen or so files the notebook will produce.

In [47]:
if '2ai' in permittedSteps:
    # takes ~2m on wittgenstein
    for gating_data_fp in gating_data_fps:
        gd_dir = path.dirname(gating_data_fp)

        progress_report(path.join(gd_dir, gd_dir) + " n-phone analysis.ipynb",
                        dict(g = gating_data_fp,
                            o = gd_dir))
        nb = pm.execute_notebook(
            'Run n-phone analysis of gating data.ipynb',
    #         args['align_apply_ltr_nb_output_name'],
            path.join(gd_dir, gd_dir) + " n-phone analysis.ipynb",
            parameters=dict(g = gating_data_fp,
                            o = gd_dir)
        )
        listdir(gd_dir)
        endNote()
        print('\n')

### Channel distributions

Next, the notebook `Producing channel distributions.ipynb` will create `.json` files defining (among other things) a uniphone and triphone channel distribution. It requires the following arguments to specify information about what kind of channel model to build and where to put it:
 - **a filepath** to a gating data file
 - **a directory** containing metadata indicating possible/impossible $n$-phones
 - **a string argument** ("stressed" or "destressed") indicating whether the distribution will be over a segment inventory with or without stress information
 - **a real valued, non-negative**  smoothing parameter (a pseudocount to add to every channel outcome)
 - **a noise scaling (mixing) fraction in the unit interval** (scaling the relative probability of channel error vs. correct transmission)
 - **an output directory** to write the channel model to.

In [48]:
pseudocounts = (0, 0.001, 0.01, 0.1)
# pseudocounts = (0, 0.01, 0.05, 0.1)
scaling_factors = (1.0, 0.5, 0.25, 0.125)

pseudocount_to_scaling_factors = {c:(1.0,) if c == 0 else scaling_factors
                                  for c in pseudocounts}

In [49]:
gating_data_fps

('GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv')

In [50]:
cm_arg_bundles = []
for gating_data_fp in gating_data_fps:
    metadata_dir = path.dirname(gating_data_fp)
    s = "destressed"
    channel_model_dir_stem = 'CM' + metadata_dir[2:]
    
    for pc in pseudocounts:
        for sf in pseudocount_to_scaling_factors[pc]:
            channel_model_dir_suffix = '_pseudocount' + str(pc) + '_lambda' + str(sf)
            
            if metadata_dir == 'GD_AmE':
                channel_model_dir = channel_model_dir_stem + '_' + s + '_unaligned' + channel_model_dir_suffix
            else:
                channel_model_dir = channel_model_dir_stem + channel_model_dir_suffix
            nb_output_name = 'Producing channel distributions from ' + metadata_dir + ', pc={0}'.format(pc) + ', λ={0}'.format(sf) + '.ipynb'
            new_arg_bundle = {'gating_data_fp':gating_data_fp,
                              'metadata_dir':metadata_dir,
                              's':s,
                              'c':pc,
                              'L':sf,
                              'cm_dir':channel_model_dir,
                              'nb_output_name':nb_output_name,
                              'nb_fp':path.join(channel_model_dir, nb_output_name)}
            new_arg_bundle
            cm_arg_bundles.append(new_arg_bundle)

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0_lambda1.0/Producing channel distributions from GD_AmE, pc=0, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.001,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.001, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0/Producing channel distributions from GD_AmE, pc=0.001, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.001,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.001, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5/Producing channel distributions from GD_AmE, pc=0.001, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.001,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.001, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25/Producing channel distributions from GD_AmE, pc=0.001, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.001,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.001, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125/Producing channel distributions from GD_AmE, pc=0.001, λ=0.125.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.01,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.01, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0/Producing channel distributions from GD_AmE, pc=0.01, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.01,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.01, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5/Producing channel distributions from GD_AmE, pc=0.01, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.01,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.01, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25/Producing channel distributions from GD_AmE, pc=0.01, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.01,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.01, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125/Producing channel distributions from GD_AmE, pc=0.01, λ=0.125.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.1,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.1, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/Producing channel distributions from GD_AmE, pc=0.1, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.1,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.1, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5/Producing channel distributions from GD_AmE, pc=0.1, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.1,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.1, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25/Producing channel distributions from GD_AmE, pc=0.1, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0.1,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0.1, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125/Producing channel distributions from GD_AmE, pc=0.1, λ=0.125.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0_lambda1.0/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.001,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.001, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.001, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.001,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.001, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.001, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.001,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.001, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.001, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.001,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.001, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.001, λ=0.125.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.01,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.01,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.01,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.01,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.01, λ=0.125.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.1,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.1, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.1, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.1,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.1, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.1, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.1,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.1, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.1, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 's': 'destressed',
 'c': 0.1,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.1, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_Buckeye, pc=0.1, λ=0.125.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0_lambda1.0/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.001,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.001, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.001, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.001,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.001, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.001, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.001,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.001, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.001, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.001,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.001, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.001, λ=0.125.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.01,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.01, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.01, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.01,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.01, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.01, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.01,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.01, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.01, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.01,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.01, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.01, λ=0.125.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.1,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.1, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.1, λ=1.0.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.1,
 'L': 0.5,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.1, λ=0.5.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.1, λ=0.5.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.1,
 'L': 0.25,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.1, λ=0.25.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.1, λ=0.25.ipynb'}

{'gating_data_fp': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed/GD_AmE-diphones_aligned_w_LTR_NXT_swbd_destressed.tsv',
 'metadata_dir': 'GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed',
 's': 'destressed',
 'c': 0.1,
 'L': 0.125,
 'cm_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125',
 'nb_output_name': 'Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.1, λ=0.125.ipynb',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Producing channel distributions from GD_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed, pc=0.1, λ=0.125.ipynb'}

In [51]:
len(cm_arg_bundles)

39

In [52]:
cm_arg_bundles[0]

{'gating_data_fp': 'GD_AmE/AmE-diphones-IPA-annotated-columns.csv',
 'metadata_dir': 'GD_AmE',
 's': 'destressed',
 'c': 0,
 'L': 1.0,
 'cm_dir': 'CM_AmE_destressed_unaligned_pseudocount0_lambda1.0',
 'nb_output_name': 'Producing channel distributions from GD_AmE, pc=0, λ=1.0.ipynb',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0_lambda1.0/Producing channel distributions from GD_AmE, pc=0, λ=1.0.ipynb'}

In [53]:
if '2aii' in permittedSteps:
    # used to take ~110m on wittgenstein
    # with two corpora (NXT_swbd, buckeye) x 
    for ab in cm_arg_bundles:
        ensure_dir_exists(ab['cm_dir'])

        progress_report(ab['nb_fp'],
#                         path.join(ab['cm_dir'], ab['nb_output_name']),
                        dict(g = ab['gating_data_fp'],
                             m = ab['metadata_dir'],
                             s = ab['s'],
                             c = ab['c'],
                             L = ab['L'],
                             o = ab['cm_dir']))

        if not overwrite and path.exists(ab['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(ab['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue
        
        nb = pm.execute_notebook(
        'Producing channel distributions.ipynb',
        ab['nb_fp'],
#         path.join(ab['cm_dir'], ab['nb_output_name']),
        parameters=dict(g = ab['gating_data_fp'],
                        m = ab['metadata_dir'],
                        s = ab['s'],
                        c = ab['c'],
                        L = ab['L'],
                        o = ab['cm_dir'])
        )
        endNote()
        print('\n')

**How are channel distributions created?**

Take a look at `Producing channel distributions.ipynb`. Besides removing stress information, there are some mathematically non-trivial details that go into defining both uniphone and triphone channel distributions.

## Step 2b: Generating (contextual) lexicon distributions (over orthographic vocabularies)

Given 
 - a language model $m$ (defined by a `.arpa` file or `kenlm` memory-mapped analogue) 
 - a choice of n-gram contexts $C$ (a `.txt` file with one context per line)
 - a vocabulary $V$ (a `.txt` file with one word per line)
 - a (partial) output filepath $o$ / output filepath prefix $o$
 
`Producing contextual distributions.ipynb` will write a serialized/memory-mapped `numpy` array to $o$.hV_C that defines $-log_2( p(V|C) )$ - slightly transformed output from `kenlm`. It will also write $p(V|C)$ to $o$.pV_C, and copy both $V$ and $C$ to the base directory specified by $o$. (In both cases, each column is associated with the distribution $p(V|c)$ for some $c$.)

In [54]:
context_fns
context_dirs
corpus_to_contexts
context_dir_to_repo_dir

{'Buckeye': {'preceding': {1: 'buckeye_contexts_preceding_1_filtered.txt',
   2: 'buckeye_contexts_preceding_2_filtered.txt',
   3: 'buckeye_contexts_preceding_3_filtered.txt',
   4: 'buckeye_contexts_preceding_4_filtered.txt'},
  'following': {1: 'buckeye_contexts_following_1_filtered.txt',
   2: 'buckeye_contexts_following_2_filtered.txt',
   3: 'buckeye_contexts_following_3_filtered.txt',
   4: 'buckeye_contexts_following_4_filtered.txt'},
  'bidirectional': 'buckeye_contexts_bidirectional_filtered.json'},
 'NXT_swbd': {'preceding': {1: 'nxt_swbd_contexts_preceding_1_filtered.txt',
   2: 'nxt_swbd_contexts_preceding_2_filtered.txt',
   3: 'nxt_swbd_contexts_preceding_3_filtered.txt',
   4: 'nxt_swbd_contexts_preceding_4_filtered.txt'},
  'following': {1: 'nxt_swbd_contexts_following_1_filtered.txt',
   2: 'nxt_swbd_contexts_following_2_filtered.txt',
   3: 'nxt_swbd_contexts_following_3_filtered.txt',
   4: 'nxt_swbd_contexts_following_4_filtered.txt'},
  'bidirectional': 'nxt_swbd_

('C_Buckeye', 'C_NXT_swbd')

{'Buckeye': ('buckeye_contexts_following_1_filtered.txt',
  'buckeye_contexts_following_2_filtered.txt',
  'buckeye_contexts_following_3_filtered.txt',
  'buckeye_contexts_preceding_1_filtered.txt',
  'buckeye_contexts_preceding_2_filtered.txt',
  'buckeye_contexts_preceding_3_filtered.txt',
  'buckeye_contexts_bidirectional_filtered.json'),
 'NXT_swbd': ('nxt_swbd_contexts_following_1_filtered.txt',
  'nxt_swbd_contexts_following_2_filtered.txt',
  'nxt_swbd_contexts_following_3_filtered.txt',
  'nxt_swbd_contexts_preceding_1_filtered.txt',
  'nxt_swbd_contexts_preceding_2_filtered.txt',
  'nxt_swbd_contexts_preceding_3_filtered.txt',
  'nxt_swbd_contexts_bidirectional_filtered.json')}

{'Buckeye': '../buckeye-lm', 'NXT_swbd': '../switchboard-lm'}

In [55]:
context_fns
context_dirs
corpus_to_contexts
context_dir_to_repo_dir

# fisher_lm_dir = 'LM_Fisher'

LM_fns
# LM_fn_stem = 'fisher_utterances_main'
# LM_fns = {
#     '':{l:LM_fn_stem + '_' + str(l) + 'gram.mmap'
#         for l in (2,3,4,5)},
#     'rev':{l:LM_fn_stem + '_' + 'rev' + '_' + str(l) + 'gram.mmap'
#            for l in (2,3,4,5)}
# }


# fisher_lm_vocab_fn = 'fisher_vocabulary_main.txt'

{'Buckeye': {'preceding': {1: 'buckeye_contexts_preceding_1_filtered.txt',
   2: 'buckeye_contexts_preceding_2_filtered.txt',
   3: 'buckeye_contexts_preceding_3_filtered.txt',
   4: 'buckeye_contexts_preceding_4_filtered.txt'},
  'following': {1: 'buckeye_contexts_following_1_filtered.txt',
   2: 'buckeye_contexts_following_2_filtered.txt',
   3: 'buckeye_contexts_following_3_filtered.txt',
   4: 'buckeye_contexts_following_4_filtered.txt'},
  'bidirectional': 'buckeye_contexts_bidirectional_filtered.json'},
 'NXT_swbd': {'preceding': {1: 'nxt_swbd_contexts_preceding_1_filtered.txt',
   2: 'nxt_swbd_contexts_preceding_2_filtered.txt',
   3: 'nxt_swbd_contexts_preceding_3_filtered.txt',
   4: 'nxt_swbd_contexts_preceding_4_filtered.txt'},
  'following': {1: 'nxt_swbd_contexts_following_1_filtered.txt',
   2: 'nxt_swbd_contexts_following_2_filtered.txt',
   3: 'nxt_swbd_contexts_following_3_filtered.txt',
   4: 'nxt_swbd_contexts_following_4_filtered.txt'},
  'bidirectional': 'nxt_swbd_

('C_Buckeye', 'C_NXT_swbd')

{'Buckeye': ('buckeye_contexts_following_1_filtered.txt',
  'buckeye_contexts_following_2_filtered.txt',
  'buckeye_contexts_following_3_filtered.txt',
  'buckeye_contexts_preceding_1_filtered.txt',
  'buckeye_contexts_preceding_2_filtered.txt',
  'buckeye_contexts_preceding_3_filtered.txt',
  'buckeye_contexts_bidirectional_filtered.json'),
 'NXT_swbd': ('nxt_swbd_contexts_following_1_filtered.txt',
  'nxt_swbd_contexts_following_2_filtered.txt',
  'nxt_swbd_contexts_following_3_filtered.txt',
  'nxt_swbd_contexts_preceding_1_filtered.txt',
  'nxt_swbd_contexts_preceding_2_filtered.txt',
  'nxt_swbd_contexts_preceding_3_filtered.txt',
  'nxt_swbd_contexts_bidirectional_filtered.json')}

{'Buckeye': '../buckeye-lm', 'NXT_swbd': '../switchboard-lm'}

{'': {2: 'fisher_utterances_main_2gram.mmap',
  3: 'fisher_utterances_main_3gram.mmap',
  4: 'fisher_utterances_main_4gram.mmap'},
 'rev': {2: 'fisher_utterances_main_rev_2gram.mmap',
  3: 'fisher_utterances_main_rev_3gram.mmap',
  4: 'fisher_utterances_main_rev_4gram.mmap'}}

In [56]:
my_vocab_fn = fisher_lm_vocab_fn
LD_bundles = []

for corpus_name in corpus_to_contexts:
    for context_fn in corpus_to_contexts[corpus_name]:
        if 'bidirectional' in context_fn:
            continue #not supported for now
        elif 'preceding' in context_fn:
            context_direction = 'preceding'
            lm_direction = ''
        else:
            context_direction = 'following'
            lm_direction = 'rev'
        
        context_size = context_fn[-14]
        order = int(context_size) + 1
        
        LD_id = str_join('_', ['LD','Fisher','vocab','in',
                               corpus_name, context_direction, 'contexts',
                               str(order) + 'gram', 'model'])
        
        new_bundle = {
            'corpus':corpus_name,
            'context_fn':context_fn,
            'context_fp':path.join('C_' + corpus_name, context_fn),
            'lm_fn':LM_fns[lm_direction][order],
            'lm_fp':path.join(fisher_lm_dir, LM_fns[lm_direction][order]),
            'LD_dir':LD_id,
            'o_fn_stem':LD_id,
            'o':path.join(LD_id, LD_id),
            'm':path.join(fisher_lm_dir, LM_fns[lm_direction][order]),
            'v':path.join(fisher_lm_dir, my_vocab_fn),
            'c':path.join('C_' + corpus_name, context_fn),
            'nb_fp':path.join(LD_id, 
                              'Producing ' + LD_id.replace('_', ' ')[3:] + ' contextual distributions.ipynb')
        }
        
        LD_bundles.append(new_bundle)
LD_bundles
LDs = LD_bundles

[{'corpus': 'Buckeye',
  'context_fn': 'buckeye_contexts_following_1_filtered.txt',
  'context_fp': 'C_Buckeye/buckeye_contexts_following_1_filtered.txt',
  'lm_fn': 'fisher_utterances_main_rev_2gram.mmap',
  'lm_fp': 'LM_Fisher/fisher_utterances_main_rev_2gram.mmap',
  'LD_dir': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'o_fn_stem': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'm': 'LM_Fisher/fisher_utterances_main_rev_2gram.mmap',
  'v': 'LM_Fisher/fisher_vocabulary_main.txt',
  'c': 'C_Buckeye/buckeye_contexts_following_1_filtered.txt',
  'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Producing Fisher vocab in Buckeye following contexts 2gram model contextual distributions.ipynb'},
 {'corpus': 'Buckeye',
  'context_fn': 'buckeye_contexts_following_2_filtered.txt',
  'context_fp': 'C_Buckeye/buckeye_c

In [57]:
LD_unigram_bundles = [
    {
        'corpus':'Fisher',
        'context_fn':'',
        'context_fp':'',
        'lm_fn':path.basename(main_unigram_counts_fp),
        'lm_fp':main_unigram_counts_fp,
        'LD_dir':'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model',
        'o_fn_stem':'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model',
        'o':path.join('LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model', 
                      'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model'),
        'm':main_unigram_counts_fp,
        'v':path.join(fisher_lm_dir, my_vocab_fn),
        'c':'',
        'nb_fp':path.join('LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model', 
                          'Producing ' + 'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model'.replace('_', ' ')[3:] + ' contextual distributions.ipynb')
    },
    {
        'corpus':'Buckeye',
        'context_fn':'',
        'context_fp':'',
        'lm_fn':path.basename(main_unigram_counts_fp),
        'lm_fp':main_unigram_counts_fp,
        'LD_dir':'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model',
        'o_fn_stem':'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model',
        'o':path.join('LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model', 
                      'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model'),
        'm':main_unigram_counts_fp,
        'v':path.join('LTR_Buckeye', 'buckeye_vocabulary_main.txt'),
        'c':'',
        'nb_fp':path.join('LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model', 
                          'Producing ' + 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model'.replace('_', ' ')[3:] + ' contextual distributions.ipynb')
    },
    {
        'corpus':'NXT_swbd',
        'context_fn':'',
        'context_fp':'',
        'lm_fn':path.basename(main_unigram_counts_fp),
        'lm_fp':main_unigram_counts_fp,
        'LD_dir':'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model',
        'o_fn_stem':'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model',
        'o':path.join('LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model', 
                      'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model'),
        'm':main_unigram_counts_fp,
        'v':path.join('LTR_NXT_swbd_destressed', 'NXT_swbd_vocabulary_main.txt'),
        'c':'',
        'nb_fp':path.join('LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model', 
                          'Producing ' + 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model'.replace('_', ' ')[3:] + ' contextual distributions.ipynb')
    }
]

# for corpus_name in ('Buckeye', 'NXT_swbd'):
#     context_size = 0
#     order = 1
#     LD_id = str_join('_', ['LD', 'Fisher', 'vocab', 'in',
#                            '(empty)', '(NA)','contexts',
#                            str(order) + 'gram', 'model'])
#     new_bundle = {
#         'corpus':corpus_name,
#         'context_fn':'',
#         'context_fp':'',
#         'lm_fn':path.basename(main_unigram_LM_fp),
#         'lm_fp':main_unigram_LM_fp,
#         'LD_dir':LD_id,
#             'o_fn_stem':LD_id,
#             'o':path.join(LD_id, LD_id),
#             'm':main_unigram_LM_fp,
#             'v':path.join(fisher_lm_dir, my_vocab_fn),
#             'c':'',
#             'nb_fp':path.join(LD_id, 
#                               'Producing ' + LD_id.replace('_', ' ')[3:] + ' contextual distributions.ipynb')
#         }
#     new_bundle
#     LD_unigram_bundles.append(new_bundle)
# # LD_bundles
# # LDs = LD_bundles
## LDs.extend(LD_unigram_bundles)
LD_unigram_bundles

[{'corpus': 'Fisher',
  'context_fn': '',
  'context_fp': '',
  'lm_fn': 'fisher_unigram_counts.tsv',
  'lm_fp': 'LM_Fisher/fisher_unigram_counts.tsv',
  'LD_dir': 'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model',
  'o_fn_stem': 'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model',
  'o': 'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model',
  'm': 'LM_Fisher/fisher_unigram_counts.tsv',
  'v': 'LM_Fisher/fisher_vocabulary_main.txt',
  'c': '',
  'nb_fp': 'LD_Fisher_vocab_in_(empty)_(NA)_contexts_1gram_model/Producing Fisher vocab in (empty) (NA) contexts 1gram model contextual distributions.ipynb'},
 {'corpus': 'Buckeye',
  'context_fn': '',
  'context_fp': '',
  'lm_fn': 'fisher_unigram_counts.tsv',
  'lm_fp': 'LM_Fisher/fisher_unigram_counts.tsv',
  'LD_dir': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model',
  'o_fn_stem': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model',
  'o': 'LD_Buckeye_vocab_in_(empty)_

In [58]:
LDs.extend(LD_unigram_bundles)

In [59]:
# lmap(partial(omit, keys=('v', 'nb_fp', 'corpus', 'context_fp', 'lm_fp', 'o_fn_stem')),
#      LDs)

In [60]:
# contexts
# fisher_lm_fps

In [61]:
# LDs = [{'LD_dir':'LD_Fisher_vocab_in_Buckeye_contexts',
#         'o_fn_stem':'LD_fisher_vocab_in_buckeye_contexts',
#         'o':'LD_Fisher_vocab_in_Buckeye_contexts' + '/' + 'LD_fisher_vocab_in_buckeye_contexts',
#         'm':fisher_lm_fps['lm'],
#         'v':fisher_lm_fps['vocab'],
#         'c':buckeye_contexts,
#         'nb_fp':path.join('LD_Fisher_vocab_in_Buckeye_contexts', 
#                           'Producing ' + 'LD_Fisher_vocab_in_Buckeye_contexts'.replace('_', ' ')[3:] + ' contextual distributions.ipynb')},
# #        {'LD_dir':'LD_Fisher_vocab_in_swbd2003_contexts',
# #         'o_fn_stem':'LD_fisher_vocab_in_swbd2003_contexts',
# #         'o':'LD_Fisher_vocab_in_swbd2003_contexts' + '/' + 'LD_fisher_vocab_in_swbd2003_contexts',
# #         'm':fisher_lm_fps['lm'],
# #         'v':fisher_lm_fps['vocab'],
# #         'c':swbd2003_contexts,
# #         'nb_fp':path.join('LD_Fisher_vocab_in_swbd2003_contexts', 
# #                           'Producing ' + 'LD_Fisher_vocab_in_swbd2003_contexts'.replace('_', ' ')[3:] + ' contextual distributions.ipynb')},
#        {'LD_dir':'LD_Fisher_vocab_in_NXT_swbd_contexts',
#         'o_fn_stem':'LD_Fisher_vocab_in_nxt_swbd_contexts',
#         'o':'LD_Fisher_vocab_in_NXT_swbd_contexts' + '/' + 'LD_Fisher_vocab_in_nxt_swbd_contexts',
#         'm':FIXME,
#         'v':,
#         'c':,
#         'nb_fp':path.join('LD_Fisher_vocab_in_NXT_swbd_contexts',
#                           'Producing ' + 'LD_Fisher_vocab_in_NXT_swbd_contexts'.replace('_', ' ')[3:] + ' contextual distributions.ipynb')}
#       ]

In [62]:
if '2b' in permittedSteps:
    # used to take ~80m on wittgenstein
    
    # timing data
    # corpus = Buckeye
    #     preceding contexts:
    #         n = 2
    #             pitts/1.3m
    #         n = 3
    #             montague/7.5m
    #         n = 4
    #             pitts/12m
    #         n = 5
    #             sidious/16m
    #     following contexts:
    #         n = 2r
    #             x
    #         n = 3r
    #             x
    #         n = 4r
    #             x
    #         n = 5r
    #             wittgenstein/14m
    # corpus = NXT_swbd
    #     preceding contexts:
    #         n = 2
    #             sidious/2.66m
    #         n = 3
    #             sidious/13.5m
    #         n = 4
    #             x
    #         n = 5
    #             wittgenstein/34m
    #     following contexts:
    #         n = 2r
    #             montague/3m
    #         n = 3r
    #             montague/16m
    #         n = 4r
    #             wittgenstein/28.5m
    #         n = 5r
    #             wittgenstein/32m
    
    #takes 132m on wittgentesin including trim inputs but no 5gram models
    for ld in LDs:
#         output_dir = path.dirname(ld['LD_dir'])
        output_dir = ld['LD_dir']
        ensure_dir_exists(output_dir)
        
        progress_report(ld['nb_fp'],
                        dict(m = ld['m'],
                        v = ld['v'],
                        c = ld['c'],
                        o = ld['o']))
        
        if not overwrite and path.exists(ld['nb_fp']):
            print('{0} already exists. Skipping...'.format(ld['nb_fp']))
            endNote()
            continue

        nb = pm.execute_notebook(
        'Producing contextual distributions.ipynb',
    #     'Producing ' + ld['LD_dir'].replace('_', ' ')[3:] + ' contextual distributions.ipynb',
        ld['nb_fp'],
        parameters=dict(m = ld['m'],
                        v = ld['v'],
                        c = ld['c'],
                        o = ld['o'])
        )
        endNote()
        print('\n')

In [63]:
%pwd

'/mnt/cube/home/AD/emeinhar/wr'

# Step 3: Creating combinable models

**The basic problem:**

1. **Channel model + transcribed lexicon relation**: Even after the gating data and a transcribed lexicon relation are defined over the same inventory, 
 - the lexicon may contain triphones or diphones that are not in the stimuli triphones/diphones of a channel model.
 - channel distributions will contain triphones/diphones that cannot be found in the transcribed lexicon relation. (While the other steps here are strictly necessary, this is simply a practical step for making downstream computation faster.)
2. **Language model + transcribed lexicon relation**: The orthographic vocabulary of a transcribed lexicon relation may contain wordforms not in an n-gram model's vocabulary. (We *don't* want to use the out-of-vocabulary estimate for those wordforms.)
3.  **Contextual distributions + transcribed lexicon relation**: The contextual distributions from Step 3b above are defined over the *language model's* orthographic vocabulary, which will likely include wordforms that are not in the transcribed lexicon relation. We want to create modified forms of these distributions where we condition on the choice of an orthographic wordform that is in the transcribed lexicon relation.

Once we have
 - a version $l'$ of the transcribed lexicon relation $l$ trimmed with respect to both the triphones of the channel model $c$ and the (orthographic) vocabulary of a language model $m$
 - a version $d'$ of the contextual distributions over $m$'s vocabulary (with respect to some set of n-gram contexts) $d$ trimmed to only define distributions over $l'$
 - a version $c'$ of the channel model $c$ trimmed with respect to a transcribed lexicon relation $l'$
 - a probability distribution over segmental wordforms given an orthographic wordform
 
we will be able to combine everything together to calculate confusability of wordforms in corpus contexts.

## Step 3a: Filter transcription lexicons to only include words that can be modeled by a given channel distribution

In [64]:
#gather relevant LTRs and their associated CMs

In [65]:
aligned_LTRs = lmap(lambda ab: {'LTR_fp':ab['my_ol'],
                                'GD_fp':ab['my_og']},
                    alignment_arg_bundles)
lmap(lambda d: d['LTR_fp'],
     aligned_LTRs)

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_w_GD_AmE-diphones.tsv']

In [66]:
# CM_dirs = list(map(lambda cab: {'CM_fp':path.join(cab['cm_dir'],'pY1X0X1X2.json'),
#                                 'GD_fp':cab['gating_data_fp']},
#                    filter(lambda cab: cab['c'] != 0, cm_arg_bundles)))
# CM_dirs
# CM_dirs[5]
# # listdir(CM_dirs[0][])

aligned_CMs = list(map(lambda cab: {'CM_fp':path.join(cab['cm_dir'],'pY1X0X1X2.json'),
                                    'GD_fp':cab['gating_data_fp']},
                       filter(lambda cab: cab['c'] != 0 and 'aligned' in cab['gating_data_fp'], 
                              cm_arg_bundles)))
aligned_CMs

[{'CM_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
  'GD_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv'},
 {'CM_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
  'GD_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv'},
 {'CM_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
  'GD_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv'},
 {'CM_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
  'GD_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv'},
 {'CM_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
  'GD_fp': 'GD_AmE_destressed_aligned_w_LTR_Buckeye/GD_AmE-diphones_aligned_w_LTR_Buckeye.tsv'},
 {'CM_fp': 'CM_AmE_destressed_ali

In [67]:
def get_aligned_CMs(ltr_bundle):
    matches = [cm_bundle for cm_bundle in aligned_CMs if cm_bundle['GD_fp'] == ltr_bundle['GD_fp']]
    return list(map(lambda d: d['CM_fp'],
                    matches))

In [68]:
aligned_LTRs[0]['LTR_fp']

#NB: all of these will have the same set of stimuli triphones
#    ...which is all we care about here
get_aligned_CMs(aligned_LTRs[0]) 

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv'

['CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_

In [69]:
aligned_LTRs_and_CM = [{'LTR_fp':ltr['LTR_fp'],
                        'matching_CMs':get_aligned_CMs(ltr)}
                       for ltr in aligned_LTRs]
aligned_LTRs_and_CM

[{'LTR_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv',
  'matching_CMs': ['CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
   'CM_Am

In [70]:
for each in aligned_LTRs_and_CM:
    each['c'] = each['matching_CMs'][0]
    each['l'] = each['LTR_fp']
    o_dir = path.dirname(each['LTR_fp'])
    o_fn = path.basename(each['LTR_fp']).split('w_')[0] + 'CM_filtered' + '.tsv'
    each['o'] = path.join(o_dir, o_fn)
    
    nb_fn = 'Filter ' + o_fn.split('_aligned')[0] + ' against channel model.ipynb'
    each['nb_fp'] = path.join(o_dir, nb_fn)
    
    print('c = {0}\nl = {1}\no = {2}\nnb = {3}'.format(each['c'], each['l'], each['o'], each['nb_fp']))
    print(' ')

c = CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json
l = LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv
o = LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv
nb = LTR_Buckeye_aligned_w_GD_AmE_destressed/Filter LTR_Buckeye against channel model.ipynb
 
c = CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pY1X0X1X2.json
l = LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_w_GD_AmE-diphones.tsv
o = LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv
nb = LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Filter LTR_NXT_swbd_destressed against channel model.ipynb
 


In [71]:
if '3a' in permittedSteps:
    # takes about 30s on wittgenstein
    for each in aligned_LTRs_and_CM:
        output_dir = path.dirname(each['o'])
        ensure_dir_exists(output_dir)

        progress_report(each['nb_fp'],
                        dict(l = each['l'],
                             c = each['c'],
                             o = each['o']))

        nb = pm.execute_notebook(
        'Filter transcription lexicon by channel model.ipynb',
        each['nb_fp'],
        parameters=dict(l = each['l'],
                        c = each['c'],
                        o = each['o'])
        )
        endNote()
        print('\n')

## Step 3b: Filter transcription lexicons to only include words that are in a language model's vocabulary

**Dependencies**
 - **Step 3a**: `LTR_..._aligned_CM_filtered....tsv`

In [72]:
# fisher_lm_fps
# fisher_lm_vocab_fp

In [73]:
LTR_fps = list(map(lambda pair: pair['LTR_fp'],
                   aligned_LTRs))
LTR_fps

LTR_CM_filtered = list(map(lambda d: d['o'],
                           aligned_LTRs_and_CM))
LTR_CM_filtered

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_w_GD_AmE-diphones.tsv']

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv']

How many lexicon entries have unmodelable triphones? (We'll next check how many such lexicon entries aren't in the language model vocabulary.)

In [74]:
!wc -l LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_w_GD_AmE-diphones.tsv
!wc -l LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_w_GD_AmE-diphones.tsv
!wc -l LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv
!wc -l LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_w_GD_AmE-diphones.tsv

19529 LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_w_GD_AmE-diphones.tsv
133855 LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_w_GD_AmE-diphones.tsv
7999 LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv
15814 LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_w_GD_AmE-diphones.tsv


In [75]:
!wc -l LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered.tsv
!wc -l LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered.tsv
!wc -l LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv
!wc -l LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv

17079 LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered.tsv
127799 LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered.tsv
7011 LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv
15318 LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv


Word type loss for newdic:
 - $19529 - 17079 = 2450$ word types lost due to unmodelable triphones

Word type loss for CMU_destressed:
 - $133855 - 127799 = 6056$ word types lost due to unmodelable triphones

Word type loss for Buckeye:
 - $7999 - 7011 = 988$ word types lost due to unmodelable triphones
 
Word type loss for NXT_swbd_destressed:
 - $15834 - 15338 = 496$ word types lost due to unmodelable triphones

In [76]:
LTR_CM_filtered

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv']

In [77]:
fisher_lm_vocab_fp = path.join(fisher_lm_dir, fisher_lm_vocab_fn)

In [78]:
LTR_LM_filter_bundles = []
for each_LTR_fp in LTR_CM_filtered:
    bundle = dict()
    LTR_descr = path.basename(each_LTR_fp)[:-4]
    LM_V_descr = path.basename(fisher_lm_vocab_fp)[:-4]
    bundle['l'] = each_LTR_fp
    bundle['v'] = fisher_lm_vocab_fp
    bundle['o'] = each_LTR_fp[:-4] + '_LM_filtered' + '.tsv'
    bundle['nb_fp'] = path.join(path.dirname(each_LTR_fp),
                                f'Filter {LTR_descr} against {LM_V_descr}' + '.ipynb')
    bundle
    LTR_LM_filter_bundles.append(bundle)
    print('')

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'v': 'LM_Fisher/fisher_vocabulary_main.txt',
 'o': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Filter LTR_Buckeye_aligned_CM_filtered against fisher_vocabulary_main.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'v': 'LM_Fisher/fisher_vocabulary_main.txt',
 'o': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'nb_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Filter LTR_NXT_swbd_destressed_aligned_CM_filtered against fisher_vocabulary_main.ipynb'}




In [79]:
if '3b' in permittedSteps:
    # takes about ~1m on wittgenstein
    for each in LTR_LM_filter_bundles:
        output_dir = path.dirname(each['o'])
        ensure_dir_exists(output_dir)

        progress_report(each['nb_fp'],
                        dict(l = each['l'],
                             v = each['v'],
                             o = each['o']))

        nb = pm.execute_notebook(
        'Filter transcription lexicon by language model vocabulary.ipynb',
        each['nb_fp'],
        parameters=dict(l = each['l'],
                        v = each['v'],
                        o = each['o'])
        )
        endNote()
        print('\n')

**A bit less than half** of the triphone channel model-modelable `newdic` lexicon **isn't** in the LM vocab:

In [80]:
!wc -l LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered.tsv
!wc -l LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.tsv

17079 LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered.tsv
9412 LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.tsv


$17079 - 9412 = 7667$ lost

$7667 / 17079 ≈ 0.45$ proportionally

Word type loss for newdic:
 - $19529 - 17079 = 2450$ word types lost due to unmodelable triphones
 - $17079 - 9412 = 7667$ further word types lost due orthographic wordforms not being in the language model vocabulary
 - $\frac{2450+7667}{19529} ≈ 51.8\%$ of word types lost.

**About three quarters** of the triphone channel model-modelable `CMU_destressed` lexicon **isn't** in the LM vocab:

In [81]:
!wc -l LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered.tsv
!wc -l LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered.tsv

127799 LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered.tsv
33125 LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered.tsv


$127799 - 33125 = 94674$ lost

$94674 / 127799 ≈ 0.74$ proportionally

Word type loss for CMU_destressed:
 - $133855 - 127799 = 6056$ word types lost due to unmodelable triphones
 - $127799 - 33125 = 94674$ further word types lost due orthographic wordforms not being in the language model vocabulary
 - $\frac{6056+94674}{133855} ≈ 75.25\%$ of word types lost.

**A bit less than 20%** of the triphone channel model-modelable `Buckeye` lexicon **isn't** in the LM vocab:

In [82]:
!wc -l LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv
!wc -l LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv

7011 LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv
6576 LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv


$7011 - 6576 = 435$ lost

$435 / 7011 ≈ .06$ proportionally

Word type loss for Buckeye:
 - $7999 - 7011 = 988$ word types lost due to unmodelable triphones
 - $7011 - 6576 = 435$ further word types lost due orthographic wordforms not being in the language model vocabulary
 - $\frac{435+988}{7999} ≈ 17.8\%$ of word types lost.

**About 15%** of the triphone channel model-modelable `NXT_swbd_destressed` lexicon **isn't** in the LM vocab:

In [83]:
!wc -l LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv
!wc -l LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv

15318 LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv
13246 LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv


$15338 - 13266 = 2072$ lost

$2072 / 15338 ≈ 0.14$ proportionally

Word type loss for NXT_swbd_destressed:
 - $15834 - 15338 = 496$ word types lost due to unmodelable triphones
 - $15338 - 13266 = 2072$ further word types lost due orthographic wordforms not being in the language model vocabulary
 - $\frac{496+2072}{15834} ≈ 16.2\%$ of word types lost.

Collecting the filtered transcribed lexicon relations...

In [84]:
LTR_CM_filtered_LM_filtered = list(map(lambda bundle: bundle['o'],
                                       LTR_LM_filter_bundles))
LTR_CM_filtered_LM_filtered

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv']

## Step 3c: Filter the conditioning events of channel distributions to only include $k$-factors contained in elements of a transcription lexicon's segmental wordforms

**Dependencies**
 - **Step 3b**: `LTR_..._aligned_CM_filtered_LM_filtered.tsv`

In [85]:
#what channel models might you want to use with what lexicons?
# there are 3x3 triphone CMs that are aligned with one of 3 LTRs and have one of 3 relevant pseudocount levels
# For each of the 3 LTRs aligned with the gating data, there's exactly 1 `LTR...CM_filtered_LM_filtered.tsv` file
# ∴ there are 3x3 triphone channel models to trim

# Also, for each triphone CM, there are preview and postview distributions to trim

In [86]:
for each in aligned_LTRs_and_CM:
    LTR_dir = path.dirname(each['LTR_fp'])
    LTR_trimmed_fn = path.basename(each['LTR_fp']).split('_w_')[0] + '_CM_filtered_LM_filtered.tsv'
    each['LTR_trimmed_fp'] = path.join(LTR_dir, LTR_trimmed_fn)
    each['matching_trimmed_CMs'] = [path.join(path.dirname(fp),
                                              LTR_trimmed_fn[:-4] + '_' + path.basename(fp))
                                    for fp in each['matching_CMs']]
    
#     each['LTR_fp']
    each['LTR_trimmed_fp']
    each['matching_CMs']
    each['matching_trimmed_CMs']
#     each['trimmed LTR_fp']
    print('')

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv'

['CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_

['CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned




'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv'

['CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_al

['CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambd




In [87]:
trimmed_LTR_CM_triples = []
for each in aligned_LTRs_and_CM:
    assert len(each['matching_CMs']) == len(each['matching_trimmed_CMs'])
    
    for i in range(len(each['matching_CMs'])):
        args = dict()
        args['l'] = each['LTR_trimmed_fp']
        args['c'] = each['matching_CMs'][i]
        args['o'] = each['matching_trimmed_CMs'][i]
        args['nb_fp'] = path.join(path.dirname(args['c']),
                                  f"Filter {path.dirname(args['c'])} against {path.basename(args['l'])[:-4]}.ipynb")
        args
        trimmed_LTR_CM_triples.append(args)
        print('')


{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125 against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




In [157]:
aligned_LTRs_and_CM

[{'LTR_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv',
  'matching_CMs': ['CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
   'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
   'CM_Am

In [159]:
len(aligned_LTRs_and_CM)
aligned_LTRs_and_CM[0]['o']
aligned_LTRs_and_CM[1]['o']

2

'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv'

'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv'

In [161]:
#unigram language models currently always includes all words in the corpus, 
# so existing step 3b isn't relevant + doesn't affect them; also they're not in the above list.
# this cell adds them in appropriately

unigram_trimmed_LTR_CM_triples = []

for each_LTR_and_CM_alignment_bundle in aligned_LTRs_and_CM:
    b = each_LTR_and_CM_alignment_bundle
    only_CM_filtered_lexicon_fp = b['o']
    my_l = only_CM_filtered_lexicon_fp
    ltr_fn_base = path.splitext(path.basename(my_l))[0]
    
    for CM_path in b['matching_CMs']:
        my_c = CM_path
        cm_dir, cm_fn = path.dirname(my_c), path.basename(my_c)
        my_o = path.join(cm_dir, ltr_fn_base + '_' + cm_fn)
        my_nb_fp = path.join(cm_dir,
                             f"Filter {cm_dir} against {ltr_fn_base}.ipynb")
        
        new_bundle = {'l':my_l,
                      'c':my_c,
                      'o':my_o,
                      'nb_fp':my_nb_fp}
        new_bundle
        unigram_trimmed_LTR_CM_triples.append(new_bundle)

# trimmed_LTR_CM_triples.extend(unigram_trimmed_LTR_CM_triples)

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125 against LTR_Buckeye_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

{'l': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

In [164]:
if '3c' in permittedSteps:
    # takes about 120s on wittgenstein

    for each in unigram_trimmed_LTR_CM_triples:
#     for each in trimmed_LTR_CM_triples:
        output_dir = path.dirname(each['o'])
        ensure_dir_exists(output_dir)
    #     if not path.exists(output_dir):
    #         print(f"Creating output path '{output_dir}'")
    #         makedirs(output_dir)

        progress_report(each['nb_fp'],
                        dict(l = each['l'],
                             c = each['c'],
                             o = each['o']))

        nb = pm.execute_notebook(
        'Filter channel model by transcription lexicon.ipynb',
        each['nb_fp'],
        parameters=dict(l = each['l'],
                        c = each['c'],
                        o = each['o'])
        )
        endNote()
        print('\n')

Start  @ 14:31:08
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:31:14


Start  @ 14:31:14
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:31:20


Start  @ 14:31:20
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:31:26


Start  @ 14:31:26
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:31:31


Start  @ 14:31:31
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:31:37


Start  @ 14:31:37
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:31:43


Start  @ 14:31:43
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:31:48


Start  @ 14:31:48
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:31:54


Start  @ 14:31:54
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:00


Start  @ 14:32:00
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:06


Start  @ 14:32:06
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:11


Start  @ 14:32:11
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125 against LTR_Buckeye_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125
Arguments:
{
 "l": "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:17


Start  @ 14:32:17
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:24


Start  @ 14:32:24
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:31


Start  @ 14:32:31
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:38


Start  @ 14:32:38
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:46


Start  @ 14:32:46
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:32:53


Start  @ 14:32:53
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:33:00


Start  @ 14:33:00
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:33:07


Start  @ 14:33:07
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:33:14


Start  @ 14:33:14
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:33:22


Start  @ 14:33:22
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:33:29


Start  @ 14:33:29
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:33:36


Start  @ 14:33:36
Running notebook:
	Filter CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125 against LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb
Output directory:
	CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125
Arguments:
{
 "l": "LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv",
 "c": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/pY1X0X1X2.json",
 "o": "CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_pY1X0X1X2.json"
}


HBox(children=(IntProgress(value=0, description='Executing', max=50, style=ProgressStyle(description_width='in…


End  @ 14:33:43




## Step 3d: For each (filtered) transcribed lexicon relation, define the relevant contextual lexicon distributions over orthographic wordforms

**Dependencies**
 - **Step 2b**: `...pV_C`
 - **Step 3b**: `LTR_..._aligned_CM_filtered_LM_filtered.tsv`

In [89]:
LTR_CM_filtered_LM_filtered

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv']

In [90]:
LDs

[{'corpus': 'Buckeye',
  'context_fn': 'buckeye_contexts_following_1_filtered.txt',
  'context_fp': 'C_Buckeye/buckeye_contexts_following_1_filtered.txt',
  'lm_fn': 'fisher_utterances_main_rev_2gram.mmap',
  'lm_fp': 'LM_Fisher/fisher_utterances_main_rev_2gram.mmap',
  'LD_dir': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'o_fn_stem': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'm': 'LM_Fisher/fisher_utterances_main_rev_2gram.mmap',
  'v': 'LM_Fisher/fisher_vocabulary_main.txt',
  'c': 'C_Buckeye/buckeye_contexts_following_1_filtered.txt',
  'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Producing Fisher vocab in Buckeye following contexts 2gram model contextual distributions.ipynb'},
 {'corpus': 'Buckeye',
  'context_fn': 'buckeye_contexts_following_2_filtered.txt',
  'context_fp': 'C_Buckeye/buckeye_c

In [91]:
lmap(partial(omit, keys=('context_fn', 'lm_fn')), LDs)

[{'corpus': 'Buckeye',
  'context_fp': 'C_Buckeye/buckeye_contexts_following_1_filtered.txt',
  'lm_fp': 'LM_Fisher/fisher_utterances_main_rev_2gram.mmap',
  'LD_dir': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'o_fn_stem': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model',
  'm': 'LM_Fisher/fisher_utterances_main_rev_2gram.mmap',
  'v': 'LM_Fisher/fisher_vocabulary_main.txt',
  'c': 'C_Buckeye/buckeye_contexts_following_1_filtered.txt',
  'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Producing Fisher vocab in Buckeye following contexts 2gram model contextual distributions.ipynb'},
 {'corpus': 'Buckeye',
  'context_fp': 'C_Buckeye/buckeye_contexts_following_2_filtered.txt',
  'lm_fp': 'LM_Fisher/fisher_utterances_main_rev_3gram.mmap',
  'LD_dir': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model',
  'o

In [92]:
LD_projection_args = []
for ld in LDs:
    if ld['corpus'] == 'Buckeye':
        if ld['c'] != '':
            l = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv'
        else:
            l = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv'
            lm_dir = 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model'
#             d = path.join(lm_dir,
#                           lm_dir + '.pV.json')
#             o = path.join(lm_dir, lm_dir + '_projected_' + 'FIXME')
    elif ld['corpus'] == 'NXT_swbd':
        if ld['c'] != '':
            l = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv'
        else:
            l = 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv'
            lm_dir = 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model'
#             d = path.join(lm_dir,
#                           lm_dir + '.pV.json')
#             o = path.join(lm_dir, lm_dir + '_projected_' + 'FIXME')
    elif ld['corpus'] == 'Fisher':
        continue
    else:
        raise Exception(f"Corpus = '{ld['corpus']}' not currently supported...\nLD bundle = \n{ld}")
    
    if ld['c'] != '':
        lm_dir = path.dirname(ld['o'])
        lm_fn = path.basename(ld['o'])
        lm_stem = path.splitext(lm_fn)[0]
    
        l_fn = path.basename(l)
        l_stem = path.splitext(l_fn)[0]
    
        o = path.join(lm_dir, lm_dir + '_projected_' + l_stem) + '.pV_C'
    
        projection_ab = {
            'd':ld['o'] + '.pV_C',
            'v':fisher_lm_vocab_fp,
            'c':ld['c'],
            'l':l,
            'o':o,
            'f':'True',
            'nb_fp':path.join(lm_dir, 
                              'Filter ' + lm_stem + ' against ' + l_stem + '.ipynb')
        }
    else:
        l_fn = path.basename(l)
        l_stem = path.splitext(l_fn)[0]
                        
        d = path.join(lm_dir,
                      lm_dir + '.pV.json')
        o = path.join(lm_dir, lm_dir + '_projected_' + l_stem)
        lm_fn = path.basename(o)
        lm_stem = path.splitext(lm_fn)[0]
    
        
        projection_ab = {
            'd':d,
            'v':'',
            'c':'',
            'l':l,
            'o':o,
            'f':'',
            'nb_fp':path.join(lm_dir,
                              'Filter ' + lm_stem + ' against ' + l_stem + '.ipynb')
        }
    LD_projection_args.append(projection_ab)

len(LD_projection_args)
LD_projection_args

14

[{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model.pV_C',
  'v': 'LM_Fisher/fisher_vocabulary_main.txt',
  'c': 'C_Buckeye/buckeye_contexts_following_1_filtered.txt',
  'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
  'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C',
  'f': 'True',
  'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Filter LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'},
 {'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model.pV_C',
  'v': 'LM_Fisher/fisher_vocabulary_main.txt',
  'c': 'C_Buckeye/buckeye_contexts_following_2_filtered.txt',
  'l': 'LTR_Buckeye_aligne

In [93]:
# buckeye_contexts
# swbd2003_contexts

In [94]:
# LD_projection_args = [
#     {'d':'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts.pV_C',
#      'v':fisher_lm_vocab_fp,
#      'c':buckeye_contexts,
#      'l':'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
#      'o':'LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts' + '_projected_' + 'LTR_Buckeye' + '.pV_C',
#      'f':'True',
#      'nb_fp':path.join('LD_Fisher_vocab_in_Buckeye_contexts',
#                        'Filter ' + 'LD_fisher_vocab_in_buckeye_contexts' + ' against ' + 'LTR_Buckeye_aligned_CM_filtered_LM_filtered' + '.ipynb')},
#     {'d':,
#      'v':,
#      'c':,
#      'l':,
#      'o':,
#      'f':,
#      'nb_fp':}
# #     {'d':'LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts.pV_C',
# #      'v':fisher_lm_vocab_fp,
# #      'c':swbd2003_contexts,
# #      'l':'LTR_CMU_destressed_aligned_w_GD_AmE_destressed/LTR_CMU_destressed_aligned_CM_filtered_LM_filtered.tsv',
# #      'o':'LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts' + '_projected_' + 'LTR_CMU_destressed' + '.pV_C',
# #      'f':'True',
# #      'nb_fp':path.join('LD_Fisher_vocab_in_swbd2003_contexts',
# #                        'Filter ' + 'LD_fisher_vocab_in_swbd2003_contexts' + ' against ' + 'LTR_CMU_destressed_aligned_CM_filtered_LM_filtered' + '.ipynb')},
# #     {'d':'LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts.pV_C',
# #      'v':fisher_lm_vocab_fp,
# #      'c':swbd2003_contexts,
# #      'l':'LTR_newdic_destressed_aligned_w_GD_AmE_destressed/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered.tsv',
# #      'o':'LD_Fisher_vocab_in_swbd2003_contexts/LD_fisher_vocab_in_swbd2003_contexts' + '_projected_' + 'LTR_newdic_destressed' + '.pV_C',
# #      'f':'True',
# #      'nb_fp':path.join('LD_Fisher_vocab_in_swbd2003_contexts',
# #                        'Filter ' + 'LD_fisher_vocab_in_swbd2003_contexts' + ' against ' + 'LTR_newdic_destressed_aligned_CM_filtered_LM_filtered' + '.ipynb')}
# ]


In [95]:
from joblib import Parallel, delayed

J = -1
# J = 16
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [96]:
if '3d' in permittedSteps:
    def step3d(each):
        output_dir = path.dirname(each['o'])
        ensure_dir_exists(output_dir)

        progress_report(each['nb_fp'],
                        dict(d = each['d'],
                             v = each['v'],
                             c = each['c'],
                             l = each['l'],
                             o = each['o'],
                             f = each['f']))
        try:
            nb = pm.execute_notebook(
            'Filter contextual lexicon distribution by transcription lexicon.ipynb',
            each['nb_fp'],
            parameters=dict(d = each['d'],
                            v = each['v'],
                            c = each['c'],
                            l = each['l'],
                            o = each['o'],
                            f = each['f'])
            )
        except:
            my_nb_fp = each['nb_fp']
            print('='*40)
            print(f">> Exception in {my_nb_fp}...")
            raise
            print('-'*40)
        endNote()
        print('\n')

    #5m on wittgenstein
    par(delayed(step3d)(each) for each in LD_projection_args)

In [97]:
# lfilter(lambda ab: '1gram' in ab['d'],
#         LD_projection_args)
# unigram_LD_projection_args = lfilter(lambda ab: '1gram' in ab['d'],
#                                      LD_projection_args)

In [98]:
# if '3d' in permittedSteps:
#     # used to take about ~10-15m on wittgenstein:
#     #  ≤30s for Buckeye vocab + Buckeye contexts
#     #  ≈6.5-7m for CMU_destressed vocab in swbd2003 contexts
#     #  ≈3m for newdic_destressed vocab in swbd2003 contexts
    
#     #takes 56.5m on wittgenstein
    
# #     for each in unigram_LD_projection_args:
#     for each in LD_projection_args:
#         output_dir = path.dirname(each['o'])
#         ensure_dir_exists(output_dir)

#         progress_report(each['nb_fp'],
#                         dict(d = each['d'],
#                              v = each['v'],
#                              c = each['c'],
#                              l = each['l'],
#                              o = each['o'],
#                              f = each['f']))

#         nb = pm.execute_notebook(
#         'Filter contextual lexicon distribution by transcription lexicon.ipynb',
#         each['nb_fp'],
#         parameters=dict(d = each['d'],
#                         v = each['v'],
#                         c = each['c'],
#                         l = each['l'],
#                         o = each['o'],
#                         f = each['f'])
#         )
#         endNote()
#         print('\n')

## Step 3e: For each (filtered) transcribed lexicon relation, define a conditional distribution on segmental wordforms given an orthographic wordform

**Dependencies**
 - **Step 3b**: `LTR_..._aligned_CM_filtered_LM_filtered.tsv`

**Note**: this is the step where word edge symbols are added to segmental wordform representations.

In [99]:
LTR_CM_filtered_LM_filtered

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv']

In [100]:
LTR_CM_filtered

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.tsv']

In [101]:
pW_V_fp_bundles = []
for ltr_fp in LTR_CM_filtered_LM_filtered:
    LTR_n = path.basename( ltr_fp )[:-4]
    LTR_dir = path.dirname( ltr_fp )
    pW_V_fp_bundles.append({'LTR_fp':ltr_fp,
                           'pW_V_fp':ltr_fp[:-4],# + '.pW_V',
                           'nb_fp':path.join(LTR_dir,'Define pW_V given {0}.ipynb'.format(LTR_n))})
    pW_V_fp_bundles.append({'LTR_fp':ltr_fp,
                           'pW_V_fp':ltr_fp[:-4] + '_trim',# + '.pW_V',
                           'nb_fp':path.join(LTR_dir,'Define pW_V given {0}'.format(LTR_n) + '_trim' + '.ipynb'),
                           'r':'False'})
pW_V_fp_bundles

[{'LTR_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered',
  'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Define pW_V given LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'},
 {'LTR_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim',
  'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Define pW_V given LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb',
  'r': 'False'},
 {'LTR_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered',
  'nb_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Define 

In [102]:
for ltr_fp in LTR_CM_filtered:
    LTR_n = path.basename( ltr_fp )[:-4]
    LTR_dir = path.dirname( ltr_fp )
    pW_V_fp_bundles.append({'LTR_fp':ltr_fp,
                           'pW_V_fp':ltr_fp[:-4],# + '.pW_V',
                           'nb_fp':path.join(LTR_dir,'Define pW_V given {0}.ipynb'.format(LTR_n))})
    pW_V_fp_bundles.append({'LTR_fp':ltr_fp,
                           'pW_V_fp':ltr_fp[:-4] + '_trim',# + '.pW_V',
                           'nb_fp':path.join(LTR_dir,'Define pW_V given {0}'.format(LTR_n) + '_trim' + '.ipynb'),
                           'r':'False'})
pW_V_fp_bundles

[{'LTR_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered',
  'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Define pW_V given LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'},
 {'LTR_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim',
  'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Define pW_V given LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb',
  'r': 'False'},
 {'LTR_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered',
  'nb_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Define 

In [103]:
if '3e' in permittedSteps:
    # used to take about ~30s on wittgenstein
    
    #takes ≈90s on wittgenstein 
    for each in pW_V_fp_bundles:
        pW_V_fp_output_dir = path.dirname(each['pW_V_fp'])
        ensure_dir_exists(pW_V_fp_output_dir)
        
        if not overwrite and path.exists(each['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(each['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue

        progress_report(each['nb_fp'],
                        dict(l = each['LTR_fp'],
                             o = each['pW_V_fp']))

        nb = pm.execute_notebook(
        'Define a conditional distribution on segmental wordforms given an orthographic one.ipynb',
        each['nb_fp'],
        parameters=dict(l = each['LTR_fp'],
                        o = each['pW_V_fp'])
        )
        endNote()
        print('\n')

# Step 4: Pre-calculate remaining forward model components and meta-data

Note that none of these steps need actually be ordered with respect to each other: the ordering below is arbitrary.

## Step 4a: Generate triphone lexicon distributions for every triphone channel model

**Dependencies**
 - **Step 3c**: `LTR_..._aligned_CM_filtered_LM_filtered_pY1X0X1X2.json`

In [104]:
def get_immediate_subdirectories(a_dir):
    return [name for name in listdir(a_dir)
            if path.isdir(path.join(a_dir, name))]

In [105]:
subdirs = get_immediate_subdirectories('.')
len(subdirs)

129

In [106]:
channel_model_fps = []
for d in subdirs:
    files = listdir(d)
    is_triph_channel_model = lambda fn: 'pY1X0X1X2.json' in fn
    CM_files = list(filter(is_triph_channel_model,
                           files))
    for CM_file in CM_files:
        if 'old' not in d:
            channel_model_fps.append(path.join(d, CM_file))
len(channel_model_fps)
channel_model_fps

111

['CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.05/pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.05/LTR_newdic_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'CM_AmE_destressed_a

In [107]:
all_channel_model_fps = deepcopy(channel_model_fps)

In [108]:
len(lfilter(lambda fp: 'lambda' in fp,
            channel_model_fps))

channel_model_fps = lfilter(lambda fp: 'lambda' in fp, channel_model_fps)

84

In [109]:
triph_lex_bundles = []
for cm_fp in channel_model_fps:
    bundle = dict()
    bundle['c'] = cm_fp
    bundle['output_dir'] = path.dirname(cm_fp)
    bundle['c_fn'] = path.basename(cm_fp)
    bundle['o_fn_prefix'] = bundle['c_fn'].split('pY1X0X1X2')[0] + 'pX0X1X2'
    bundle['o'] = path.join(bundle['output_dir'],
                            bundle['o_fn_prefix'])
    bundle['r'] = 'False' #set to 'False' and rerun when/if new channel model posterior calculation is running at acceptable speed and segmental analyses are practical
    bundle['nb_fp'] = path.join(bundle['output_dir'],
                                f"Generating {bundle['c_fn'].split('pY1X0X1X2.json')[0][:-1]} uniform triphone lexicon dist.ipynb")
    bundle
    print(' ')
    triph_lex_bundles.append(bundle)

{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.25/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.25',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.25/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.25/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125',
 'c_fn': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Generating LTR_Buckeye_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5',
 'c_fn': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'o_fn_prefix': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Generating LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda1.0/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda1.0',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda1.0/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda1.0/Generating  uniform triphone lexicon dist.ipynb'}

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.125/pY1X0X1X2.json',
 'output_dir': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.125',
 'c_fn': 'pY1X0X1X2.json',
 'o_fn_prefix': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.125/pX0X1X2',
 'r': 'False',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.125/Generating  uniform triphone lexicon dist.ipynb'}

 


In [110]:
# if '4a' in permittedSteps:
#     def step4a(bundle):
#         output_dir = bundle['output_dir']
#         if not path.exists(output_dir):
#             print(f"Making output path {output_dir}")
#             makedirs(output_dir)

#         if not overwrite and path.exists(bundle['nb_fp']):
# #         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
#             print('{0} already exists. Skipping...'.format(path.join(bundle['nb_fp'])))
# #             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
#             endNote()
#             continue
            
#         progress_report(bundle['nb_fp'],
#                         dict(c = bundle['c'],
#                              o = bundle['o'],
#                              r = bundle['r']))
#         try:
#             nb = pm.execute_notebook(
#             'Generate triphone lexicon distribution from channel model.ipynb',
#             bundle['nb_fp'],
#             parameters=dict(c = bundle['c'],
#                             o = bundle['o'],
#                             r = bundle['r'])
#             )
#         except:
#             my_nb_fp = each['nb_fp']
#             print('='*40)
#             print(f">> Exception in {my_nb_fp}...")
#             raise
#         endNote()
#         print('\n')


#     #5m on wittgenstein
#     par(delayed(step4a)(bundle) for bundle in triph_lex_bundles)

In [111]:
if '4a' in permittedSteps:
    # used to take about ~1m on wittgenstein
    
    #takes 90s on wittgenstein
    for bundle in tqdm(triph_lex_bundles):
        output_dir = bundle['output_dir']
        if not path.exists(output_dir):
            print(f"Making output path {output_dir}")
            makedirs(output_dir)

        if not overwrite and path.exists(bundle['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(bundle['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue
            
        progress_report(bundle['nb_fp'],
                        dict(c = bundle['c'],
                             o = bundle['o'],
                             r = bundle['r']))

        nb = pm.execute_notebook(
        'Generate triphone lexicon distribution from channel model.ipynb',
        bundle['nb_fp'],
        parameters=dict(c = bundle['c'],
                        o = bundle['o'],
                        r = bundle['r'])
        )
        endNote()
        print('\n')

## Step 4b: Pre-calculate prefix relation, $k$-cousins, and $k$-spheres for each segmental lexicon

**Dependencies**
 - **Step 3e**: `...pW_V.json`

(This is comparable to the `Metadata` generation step in 2a above.)

In [112]:
pW_fps = [each['o'] + '.json' for each in triph_lex_bundles]
pW_fps

['CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5/pX0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pX0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pX0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/pX0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pX0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.25/pX0X1X2.json',
 'CM_AmE_destressed_unaligned_pseudocoun

In [113]:
pW_V_fps = [each['pW_V_fp'] + '.pW_V.json' for each in pW_V_fp_bundles]
pW_V_fps

['LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.pW_V.json',
 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_trim.pW_V.json',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.json',
 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim.pW_V.json']

In [114]:
# pW_V_fps = []
# for d in subdirs:
#     files = listdir(d)
#     is_pW_V = lambda fn: 'pW_V.json' in fn
#     pW_V_files = list(filter(is_pW_V,
#                            files))
#     for pW_V_file in pW_V_files:
#         pW_V_fps.append(path.join(d, pW_V_file))
# len(pW_V_fps)
# pW_V_fps

In [115]:
lexicon_md_bundles = []
for pW_V_fp in pW_V_fps:
    
    
    bundle = dict()
    bundle['p'] = pW_V_fp
    bundle['lex_name'] = path.basename(pW_V_fp).split('.pW_V.json')[0]
    bundle['o'] = path.join(path.dirname(pW_V_fp), path.basename(pW_V_fp).split('.pW_V.json')[0] )
    
    output_dir = path.dirname(pW_V_fp)
    if not path.exists(output_dir):
        print(f"Making output path '{output_dir}'")
        makedirs(output_dir)
    
    bundle['nb_fp'] = path.join(path.dirname(pW_V_fp),
                                f"Calculate word-prefix relation, Hamming distances, and k-cousin relation for {bundle['lex_name']}.ipynb")
#                                 f"Calculate prefix data, k-cousins, and k-spheres for {bundle['lex_name']}.ipynb")
    bundle
    print(' ')
    lexicon_md_bundles.append(bundle)
                                
for pW_fp in pW_fps:
    bundle = dict()
    bundle['p'] = pW_fp
    bundle['lex_name'] = path.basename(pW_fp).split('.json')[0]
    bundle['o'] = path.join(path.dirname(pW_fp), path.basename(pW_fp).split('.json')[0])
    output_dir = path.dirname(pW_fp)
    if not path.exists(output_dir):
        print(f"Making output path '{output_dir}'")
        makedirs(output_dir)
    
    bundle['nb_fp'] = path.join(path.dirname(pW_fp),
                                f"Calculate word-prefix relation, Hamming distances, and k-cousin relation for {bundle['lex_name']}.ipynb")
    bundle
    print(' ')
    lexicon_md_bundles.append(bundle)

{'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered',
 'o': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered',
 'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

 


{'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim',
 'o': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim',
 'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

 


{'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered',
 'o': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered',
 'nb_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

 


{'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim',
 'o': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim',
 'nb_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

 


{'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.pW_V.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered',
 'o': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered',
 'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered.ipynb'}

 


{'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_trim.pW_V.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_trim',
 'o': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_trim',
 'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_trim.ipynb'}

 


{'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered',
 'o': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered',
 'nb_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered.ipynb'}

 


{'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim.pW_V.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_trim',
 'o': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim',
 'nb_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_trim.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.01_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.1_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.01_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.25/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.25/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.25/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.001_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.001_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.01_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_Buckeye_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_unaligned_pseudocount0.1_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.json',
 'lex_name': 'LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda1.0/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda1.0/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1_lambda1.0/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


{'p': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.125/pX0X1X2.json',
 'lex_name': 'pX0X1X2',
 'o': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.125/pX0X1X2',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0.001_lambda0.125/Calculate word-prefix relation, Hamming distances, and k-cousin relation for pX0X1X2.ipynb'}

 


In [116]:
if '4b' in permittedSteps:
    #oldest runtimes...
    # with J=-1, no CPU load, and no .npz export
    #  - newdic CM_filtered_LM_filtered takes ~45m (~168m on Solomonoff)
    #  - CMU CM_filtered_LM_filtered takes ~3.5-3.75h (14-15h? on Quine?)
    # - Buckeye CM_filtered_LM_filtered takes ~20m (~80m on Quine)

    #older runtimes...
    # with J=-1, no CPU load, *and* .npz export
    #  - newdic CM_filtered_LM_filtered takes ~45m
    #  - CMU CM_filtered_LM_filtered takes ~20.5h
    # - Buckeye CM_filtered_LM_filtered takes ~19.5h
    # (.npz representations are up to 20x smaller on disk 
    #  and are significantly smaller in memory when loaded)

    # with J=-1, no CPU load, *and* .npz export
    #  - newdic CM_filtered_LM_filtered takes ~?m
    #  - CMU CM_filtered_LM_filtered takes ~40h+ on wittgenstein
    # - Buckeye CM_filtered_LM_filtered takes ~?h
    # (.npz representations are up to 20x smaller on disk 
    #  and are significantly smaller in memory when loaded)
    
    #current runtime = 80m on wittgenstein w/ no 4a inputs
    # newdic takes 5.5m
    # CMU takes 60m / peak 55GB or 68GB mem usage
    # Buckeye takes 3.16m
    # NXT_swbd takes 9.3m
    #
    #current runtime = 6.5h on wittgenstein w/ 4a inputs
    # newdic/unaligned inputs take 24m
    # newdic/aligned inputs take 100s
    # CMU/unaligned inputs take ?m 
    # CMU/aligned inputs take ?m
    # Buckeye/unaligned inputs take ?m 
    # Buckeye/aligned inputs take ?m
    # NXT_swbd/unaligned inputs take ?m 
    # NXT_swbd/aligned inputs take ?m
    for bundle in lexicon_md_bundles:
        
        output_dir = path.dirname(bundle['o'])
        if not path.exists(output_dir):
            print(f"Making output path {output_dir}")
            makedirs(output_dir)

        if not overwrite and path.exists(bundle['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(bundle['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue
        
        progress_report(bundle['nb_fp'],
                        dict(p = bundle['p'],
                             o = bundle['o']))

        nb = pm.execute_notebook(
        'Calculate word-prefix relation, Hamming distances, and k-cousin relation.ipynb',
        bundle['nb_fp'],
        parameters=dict(p = bundle['p'],
                        o = bundle['o'])
        )
        endNote()
        print('\n')

## Step 4c: Calculate the marginal probability $p(W|C)$ of each segmental wordform $w$ given $n$-gram contexts $C$

**Dependencies**
 - **Step 3e**: `pW_V` matrix
 - **Step 3d**: `LD_fisher_vocab_in...contexts_projected_LTR...pV_C.npy` matrix

In [117]:
#gather pV_C, pW_V fp pairs as numpy arrays

In [118]:
def LTR_to_pW_Vs(LTR_fp):
    matching_bundles = list(filter(lambda bundle: bundle['LTR_fp'] == LTR_fp,
                                   pW_V_fp_bundles))
    matching_pW_V_fps = set(map(lambda bundle: bundle['pW_V_fp'],
                                matching_bundles))
    return matching_pW_V_fps

In [119]:
pW_V_fp_bundles

[{'LTR_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered',
  'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Define pW_V given LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'},
 {'LTR_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim',
  'nb_fp': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/Define pW_V given LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb',
  'r': 'False'},
 {'LTR_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.tsv',
  'pW_V_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered',
  'nb_fp': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/Define 

In [120]:
def LTR_to_LD(LTR_fp):
    matching_bundles = list(filter(lambda bundle: bundle['l'] == LTR_fp,
                                   LD_projection_args))
    matching_pV_C_fps = set(map(lambda bundle: bundle['o'],
                                matching_bundles))
    return matching_pV_C_fps

In [121]:
LD_projection_args

[{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model.pV_C',
  'v': 'LM_Fisher/fisher_vocabulary_main.txt',
  'c': 'C_Buckeye/buckeye_contexts_following_1_filtered.txt',
  'l': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.tsv',
  'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C',
  'f': 'True',
  'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Filter LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model against LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'},
 {'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model.pV_C',
  'v': 'LM_Fisher/fisher_vocabulary_main.txt',
  'c': 'C_Buckeye/buckeye_contexts_following_2_filtered.txt',
  'l': 'LTR_Buckeye_aligne

In [122]:
def get_matched_pW_V_LD_pairs(LTR_fp):
    matching_pW_V_fps = LTR_to_pW_Vs(LTR_fp)
    matching_pV_C_fps = LTR_to_LD(LTR_fp)
    
    return set(product(matching_pW_V_fps,
                       matching_pV_C_fps))

my_LTR_fps = list(map(lambda bundle: bundle['LTR_fp'],
                      pW_V_fp_bundles))

matched_pW_V_LD_pairs = union(map(get_matched_pW_V_LD_pairs,
                                  my_LTR_fps))
matched_pW_V_LD_pairs

{('LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered',
  'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_Buckeye_aligned_CM_filtered'),
 ('LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered',
  'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C'),
 ('LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered',
  'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C'),
 ('LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered',
  'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model_projected_LTR_Bucke

In [123]:
WD_bundles = []
for w,d in matched_pW_V_LD_pairs:
    bundle = dict()
    
    LTR_key = path.basename(w)
    LD_key = path.basename(d)
    if 'LD_Fisher_vocab' in LD_key:
        C_key = LD_key.split('LD_Fisher_vocab')[1].split('_projected_')[0]
    else:
        C_key = '(empty)_(NA)_contexts'
    
    output_dir = path.dirname(d)
    output_prefix = LTR_key + C_key + '.pW_C'
    
    bundle['d'] = d + '.npy'
    bundle['w'] = w + '.pW_V.npz'
    
    bundle['o'] = path.join(output_dir, output_prefix)
    
    bundle['nb_fp'] = path.join(output_dir, f"Calculate segmental wordform distribution for {LTR_key}{C_key.replace('_', ' ')}.ipynb")
    
    bundle
                                
    trim_bundle = deepcopy(bundle)
    trim_bundle['w'] = w + '_trim' + '.pW_V.npz'
    trim_bundle
    
    WD_bundles.append(bundle)

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd preceding contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd preceding contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye following contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye following contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye preceding contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye preceding contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd following contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd following contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 4gram model.ipynb'}

{'d': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim.pW_V.npz',
 'o': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_trim(empty) (NA) contexts.ipynb'}

{'d': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_trim(empty) (NA) contexts.ipynb'}

{'d': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.npz',
 'o': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered(empty) (NA) contexts.ipynb'}

{'d': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim.pW_V.npz',
 'o': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered(empty) (NA) contexts.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye preceding contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye preceding contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd following contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd following contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd following contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd following contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 4gram model.ipynb'}

{'d': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_Buckeye_aligned_CM_filtered.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.pW_V.npz',
 'o': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered(empty) (NA) contexts.ipynb'}

{'d': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_Buckeye_aligned_CM_filtered.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_trim.pW_V.npz',
 'o': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered(empty) (NA) contexts.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye following contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye following contexts 3gram model.ipynb'}

{'d': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_Buckeye_aligned_CM_filtered.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_trim.pW_V.npz',
 'o': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered_trim(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_trim(empty) (NA) contexts.ipynb'}

{'d': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_Buckeye_aligned_CM_filtered.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered_trim(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_trim(empty) (NA) contexts.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye preceding contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye preceding contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd preceding contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd preceding contexts 3gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd preceding contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim in NXT swbd preceding contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye following contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim in Buckeye following contexts 2gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 4gram model.ipynb'}

{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 4gram model.ipynb'}

In [124]:
if '4c' in permittedSteps:
    # used to take ≈5-10m on wittgenstein with no background load
    #  ≈8-10m for filtered LTR_CMU_destressed in swbd2003 contexts
    #  ≤30s for filtered LTR_Buckeye in buckeye contexts
    #  ≈2-3m for filtered LTR_newdic_destressed in swbd2003 contexts
    
    #now takes 18m on wittgenstein
    for bundle in WD_bundles:
        ensure_dir_exists(path.dirname(bundle['o']))

        if not overwrite and path.exists(bundle['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(bundle['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue
        
        progress_report(bundle['nb_fp'],
                        dict(d = bundle['d'],
                             w = bundle['w'],
                             o = bundle['o']))

        nb = pm.execute_notebook(
        'Calculate segmental wordform distribution given corpus contexts.ipynb',
        bundle['nb_fp'],
        parameters=dict(d = bundle['d'],
                        w = bundle['w'],
                        o = bundle['o'])
        )
        endNote()
        print('\n')

## Step 4d: Define observation distributions

**Dependencies**
 - **Step 3c**: `LTR_..._aligned_CM_filtered_LM_filtered_pY1X0X1X2.json`
 - **Step 3c**: `LTR_..._aligned_CM_filtered_LM_filtered_p3Y1X01.json`
 - **Step 3c**: `LTR_..._aligned_CM_filtered_LM_filtered_p6Y0X01.json`

In [125]:
# identify trimmed center (i.e. triphone) channel model fps defined earlier
trimmed_CM_bundles = [{'center':bundle['o']} for bundle in trimmed_LTR_CM_triples]
trimmed_CM_bundles

[{'center': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'},
 {'center': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'},
 {'center': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'},
 {'center': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'},
 {'center': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'},
 {'center': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'},
 {'center': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json'},
 {'ce

In [126]:
# add inferred (err, hardcoded-in-step-3c) fps for preview and postview distributions
for bundle in trimmed_CM_bundles:
    dirpath = path.dirname(bundle['center'])
    processing_prefix = path.basename(bundle['center']).split('pY1X0X1X2.json')[0]
    bundle['preview'] = path.join(dirpath, processing_prefix + 'p3Y1X01.json')
    bundle['postview'] = path.join(dirpath, processing_prefix + 'p6Y0X01.json')

In [127]:
observation_bundles = []
for bundle in trimmed_CM_bundles:
    new_bundle = dict()
    
    dirpath = path.dirname(bundle['center'])
    processing_prefix = path.basename(bundle['center']).split('pY1X0X1X2.json')[0]
    
    new_bundle['l'] = bundle['postview']
    new_bundle['c'] = bundle['center']
    new_bundle['r'] = bundle['preview']
    new_bundle['o'] = path.join(dirpath, processing_prefix + 'pC1X012')
    
#     pprintable_proc_pref = processing_prefix[:-1]
    new_bundle['nb_fp'] = path.join(dirpath, f"Calculate {processing_prefix[:-1]} observation distribution given channel models.ipynb")
    
    new_bundle
    observation_bundles.append(new_bundle)

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Calculate LTR_Buckeye_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

{'l': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p6Y0X01.json',
 'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'r': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X01.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Calculate LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered observation distribution given channel models.ipynb'}

In [128]:
if '4d' in permittedSteps:
    # takes 5-6h on old sidious, iirc
    # takes 2h 40m on new sidious 
    
    # takes ≈100m on wittgenstein
    # (peak mem usage is ≈100+GB but <120GB on cmu)
    # ≈7.6m per newdic input
    # ≈11.3m per CMU input
    # ≈6m per Buckeye input
    # ≈8.2m per NXT_swbd input
    for bundle in observation_bundles:
        ensure_dir_exists(path.dirname(bundle['o']))

        if not overwrite and path.exists(bundle['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(bundle['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue
        
        progress_report(bundle['nb_fp'],
                        dict(l = bundle['l'],
                             c = bundle['c'],
                             r = bundle['r'],
                             o = bundle['o']))

        nb = pm.execute_notebook(
        'Calculate observation distribution given channel models.ipynb',
        bundle['nb_fp'],
        parameters=dict(l = bundle['l'],
                        c = bundle['c'],
                        r = bundle['r'],
                        o = bundle['o'])
        )
        endNote()
        print('\n')

## Step 4e: Define channel distributions on a set of segmental wordforms(+prefixes)

**NOTE:** Support for 4a inputs has not been added to this step due to the large number of lexicons ($L*2*3$ = the number of lexicons * 2 * one for each noise level; why the 2??? 'why the 2', indeed...) and the lack of urgency at the moment.

**Dependencies**
 - **Step 3c**: `LTR_..._aligned_CM_filtered_LM_filtered_pY1X0X1X2.json`
 - **Step 3e**: `...pW_V.json`

In [129]:
# gather paired (....pW_V.json, ...pY1X0X1X2.json) p(W|V), p(Y_i|X_{i-1},X_i;X_{i+1}) distributions
# output complete wordform channel models into the channel model directory with the same prefix that's on the triphone channel distribution file

In [130]:
# def LTR_to_pW_Vs(LTR_fp):
#     matching_bundles = list(filter(lambda bundle: bundle['LTR_fp'] == LTR_fp,
#                                    pW_V_fp_bundles))
#     matching_pW_V_fps = set(map(lambda bundle: bundle['pW_V_fp'],
#                                 matching_bundles))
#     return matching_pW_V_fps

def LTR_to_TCMs(LTR_fp):
    matching_bundles = list(filter(lambda bundle: bundle['l'] == LTR_fp,
                                   trimmed_LTR_CM_triples))
    matching_TCM_fps = set(map(lambda bundle: bundle['o'],
                               matching_bundles))
    return matching_TCM_fps

def matched_pW_Vs_and_TCMs(LTR_fp):
    matching_pW_V_fps = LTR_to_pW_Vs(LTR_fp)
    matching_TCM_fps = LTR_to_TCMs(LTR_fp)
    return {'LW_V_fps':matching_pW_V_fps,
            'TCM_fps':matching_TCM_fps}

def get_matched_pairs(LTR_fp):
    matching_TCM_fps = LTR_to_TCMs(LTR_fp)
    matching_pW_V_fps = LTR_to_pW_Vs(LTR_fp)
    matched_pairs = set(product(matching_TCM_fps,
                                matching_pW_V_fps))
    return matched_pairs

my_LTR_fps = list(map(lambda bundle: bundle['LTR_fp'],
                      pW_V_fp_bundles))

LCM_bundles = []
for c,w in union(map(get_matched_pairs,
                     my_LTR_fps)):
#     if '_trim' in w:
#         print(f'Skipping w = {w}')
#         continue
    
    bundle = dict()
    bundle['c'] = c
    if '_trim' in w:
        bundle['b'] = c.split('pY1X0X1X2.json')[0] + 'pC1X012.npy'
    bundle['w'] = w + '.pW_V.json'
    output_dir = path.dirname(c)
    output_suffix = 'trim_' if '_trim' in w else ''
    output_prefix = path.basename(c).split('pY1X0X1X2.json')[0] + output_suffix
    bundle['o'] = path.join(output_dir, output_prefix)
    
    bundle['nb_fp'] = path.join(output_dir, f'Calculate wordform channel matrices for {path.basename(w)}.ipynb')
    
    bundle
    LCM_bundles.append(bundle)

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'b': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pC1X012.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.ipynb'}

In [131]:
skip_trim

True

In [132]:
trim_bundles = lfilter(lambda b: '_trim' in b['w'], LCM_bundles)

In [133]:
# skip_trim = False

if '4e' in permittedSteps:
    #old runtime info
    # takes ~2m on wittgenstein with no background load and no .npz files
    # takes ~12m on wittgenstein with no background load *and* .npz files
    # takes ~15m on (new) sidious with no background load *and* .npz files *and* assertion checking
    # takes ~20m on (new) sidious with no background load *and* .npz files *and* assertion checking *and* 'exact wordform' calculations
    
    #current runtime info
    # 30m on untrimmed inputs (tarski), including various lambda levels
    #   ≈?m on Buckeye
    #   ≈?m on NXT_swbd
    # ≈160m on trimmed inputs (tarski)
    #   ≈10m on newdic inputs
    #   ≈6m on Buckeye inputs
    #   ≈43m on CMU inputs
    #   ≈14m on NXT_swbd inputs
#     for bundle in trim_bundles:
    for bundle in LCM_bundles:
        ensure_dir_exists(path.dirname(bundle['o']))

        if '_trim' in bundle['w'] and skip_trim:
            bundle_w = bundle['w']
            print(f'Skipping bundle containing w = {bundle_w}')
            continue
        
        if not overwrite and path.exists(bundle['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(bundle['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue
        
        
        if '_trim' in bundle['w']:
            if  'lambda1.0' not in bundle['o']:
                continue
            progress_report(bundle['nb_fp'], 
                            dict(c = bundle['c'],
                                 b = bundle['b'],
                                 w = bundle['w'],
                                 o = bundle['o']))
            nb = pm.execute_notebook(
                'Calculate segmental wordform and prefix channel matrices - OD.ipynb',
                bundle['nb_fp'],
                parameters=dict(c = bundle['c'],
                                b = bundle['b'],
                                w = bundle['w'],
                                o = bundle['o'])
            )
        else:
            progress_report(bundle['nb_fp'], 
                            dict(c = bundle['c'],
                                 w = bundle['w'],
                                 o = bundle['o']))
            nb = pm.execute_notebook(
                'Calculate segmental wordform and prefix channel matrices.ipynb',
                bundle['nb_fp'],
                parameters=dict(c = bundle['c'],
                                w = bundle['w'],
                                o = bundle['o'])
            )
        endNote()
        print('\n')

# Step 5: Calculate posterior probabilities

## Step 5a: Calculate $p(V|W, C)$

**Dependencies**
 - **Step 4c**: `pW_C` matrix
 - **Step 3e**: `pW_V` matrix
 - **Step 3d**: `LD_fisher_vocab_in...contexts_projected_LTR...pV_C.npy` matrix

$p(\hat{V} = v^*|\hat{X}_0^f = x_0^{'f}, c) = \frac{p(x_0^{'f}|v^*)p(v^*|c)}{p(x_0^{'f}|c)}$ 

In [134]:
#gather aligned triples of filepaths defining
# - $p(V|C)$
# - $p(W|C)$
# - $p(W|V)$
#plus associated (and crucially appropriately ordered!) metadata detailing 
# - C
# - V
# - W
# - the mapping between V and W
# construct the output filename and location (probably in LD?)
# construct output notebook filepaths

In [135]:
posterior_WD_bundles = []
for bundle in WD_bundles:
    output_dir = path.dirname(bundle['d'])
    LTR_key = path.basename(bundle['w']).split('.pW_V.npz')[0]
    if '1gram' not in bundle['o']:
        contexts_key = path.basename(bundle['o']).split('LM_filtered')[1].split('.pW_C')[0].replace('_', ' ')
    else:
        contexts_key = path.basename(bundle['o']).split('CM_filtered')[1].split('.pW_C')[0].replace('_', ' ')
    output_base_prefix = LTR_key.replace('_trim', '') + contexts_key.replace(' ', '_') + '.pV_WC'
    
    
    new_bundle = dict()
    if '(empty)' in bundle['d']:
        new_bundle['d'] = path.splitext(bundle['d'])[0] + '.pV' + path.splitext(bundle['d'])[1]
    else:
        new_bundle['d'] = bundle['d']          #p(V|C) as .npy
    new_bundle['w'] = bundle['w']          #p(W|V) as .npz
    new_bundle['m'] = bundle['o'] + '.npy' #p(W|C) as .npy
    new_bundle['x'] = overwrite
    
    # c = arg pointing to file specifying C
#     LM_dir = path.dirname(new_bundle['d'])
#     LM_name = '_'.join(LM_dir.lower().split('_')[-2:])
#     contexts_ext = '.txt'
#     contexts_fn = 'LM_filtered_' + LM_name + contexts_ext
#     contexts_fp = path.join(LM_dir, contexts_fn)
#     new_bundle['c'] = contexts_fp
    
#     # v = arg pointing to file specifying V
#     # l = arg pointing to file specifying W
#     vlt_prefix = bundle['w'].split('.pW_V.npz')[0]
#     vocabulary_fp = vlt_prefix + '_Orthographic_Wordforms' + '.txt'
#     lexicon_fp = vlt_prefix + '_Transcriptions' + '.txt'
#     LTR_fp = vlt_prefix + '.tsv'
#     new_bundle['v'] = vocabulary_fp
#     new_bundle['l'] = lexicon_fp
#     new_bundle['t'] = LTR_fp
    
    new_bundle['o'] = path.join(output_dir, output_base_prefix)
    new_bundle['nb_fp'] = path.join(output_dir, f'Calculate orthographic posterior given segmental wordform + context for {LTR_key.replace("_trim", "")}{contexts_key}' + '.ipynb')
    new_bundle
    posterior_WD_bundles.append(new_bundle)
    print('\n')

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_2gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_2gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 2gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_4gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_4gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 4gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_2gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_2gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered trim in NXT swbd preceding contexts 2gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_3gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_3gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 3gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_3gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_3gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 3gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_4gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_4gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered trim in Buckeye following contexts 4gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_3gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_3gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 3gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_2gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_2gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered trim in Buckeye preceding contexts 2gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_2gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_2gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered trim in NXT swbd following contexts 2gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 4gram model.ipynb'}





{'d': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.pV.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim.pW_V.npz',
 'm': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim(empty)_(NA)_contexts.pW_C.npy',
 'x': True,
 'o': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_trim(empty)_(NA)_contexts.pV_WC',
 'nb_fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered trim(empty) (NA) contexts.ipynb'}





{'d': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.pV.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.npz',
 'm': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered(empty)_(NA)_contexts.pW_C.npy',
 'x': True,
 'o': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered(empty)_(NA)_contexts.pV_WC',
 'nb_fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered(empty) (NA) contexts.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_3gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_3gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 3gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_2gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_2gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 2gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_3gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_3gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered trim in Buckeye preceding contexts 3gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_4gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered trim in NXT swbd following contexts 4gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_3gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_following_contexts_3gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered trim in NXT swbd following contexts 3gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_2gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_2gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 2gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_2gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_2gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 2gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_4gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_4gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 4gram model.ipynb'}





{'d': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_Buckeye_aligned_CM_filtered.pV.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.pW_V.npz',
 'm': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered(empty)_(NA)_contexts.pW_C.npy',
 'x': True,
 'o': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered(empty)_(NA)_contexts.pV_WC',
 'nb_fp': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered(empty) (NA) contexts.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_3gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_3gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered trim in Buckeye following contexts 3gram model.ipynb'}





{'d': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_Buckeye_aligned_CM_filtered.pV.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_trim.pW_V.npz',
 'm': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered_trim(empty)_(NA)_contexts.pW_C.npy',
 'x': True,
 'o': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered_trim(empty)_(NA)_contexts.pV_WC',
 'nb_fp': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered trim(empty) (NA) contexts.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_4gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_preceding_contexts_4gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered trim in Buckeye preceding contexts 4gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_3gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_3gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered trim in NXT swbd preceding contexts 3gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_4gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_trim_in_NXT_swbd_preceding_contexts_4gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered trim in NXT swbd preceding contexts 4gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_2gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_trim_in_Buckeye_following_contexts_2gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_Buckeye_aligned_CM_filtered_LM_filtered trim in Buckeye following contexts 2gram model.ipynb'}





{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'm': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_4gram_model.pW_C.npy',
 'x': True,
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_4gram_model.pV_WC',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/Calculate orthographic posterior given segmental wordform + context for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 4gram model.ipynb'}





In [136]:
#NB for reasons I haven't tried to figure out, this cell dumps a bunch of notebook metadata
# into the cell output...

if '5a' in permittedSteps:
    # takes ~360m for old sidious w/ no background load and J=-1
    
    # takes 135m for wittgenstein w/ no background load and J=-1 and no trim inputs
    # takes 120m for wittgenstein w/ no background load and J=-1 and only trim inputs
    for bundle in posterior_WD_bundles:
        output_dir = path.dirname(bundle['o'])
        ensure_dir_exists(output_dir)

        if not overwrite and path.exists(bundle['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(bundle['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue
        
        progress_report(bundle['nb_fp'],
                        dict(d = bundle['d'],
                             w = bundle['w'],
                             m = bundle['m'],
                             o = bundle['o'],
                             x = bundle['x']))

        pm.execute_notebook(
            'Calculate orthographic posterior given segmental wordform + context.ipynb',
            bundle['nb_fp'],
            parameters=dict(d = bundle['d'],
                            w = bundle['w'],
                            m = bundle['m'],
#                             c = bundle['c'],
#                             v = bundle['v'],
#                             l = bundle['l'],
#                             t = bundle['t'],
                            o = bundle['o'],
                            x = bundle['x'])
        )
        endNote()
        print('\n')

## Step 5b: Calculate $p(\hat{X}_0^f|X_0^f, C)$

**Dependencies**
 - **Step 4e**: `CM_AmE_destressed_aligned_w_LTR_..._pseudocount0.01/LTR_..._aligned_CM_filtered_LM_filtered_CMs_by_length_by_prefix_index.pickle` list of matrices
 - **Step 4e**: `CM_AmE_destressed_aligned_w_LTR_..._pseudocount0.01/LTR_..._aligned_CM_filtered_LM_filtered_CMs_by_length_by_prefix_index.pickle` list of matrices
 - **Step 4c**: `LD_Fisher_vocab_in_..._contexts/LTR_..._aligned_CM_filtered_LM_filtered_in_..._contexts.pW_C.npy` matrix
 - **Step 4b**: `LTR_..._aligned_w_GD_AmE_destressed` metadata directory
 - **Step 3e**: `LTR_..._aligned_w_GD_AmE_destressed/LTR_..._aligned_CM_filtered_LM_filtered.pW_V.json` dist (sanity check)
 - **Step 4e**: `CM_AmE_destressed_aligned_w_LTR_..._pseudocount0.01/LTR_..._aligned_CM_filtered_LM_filtered_p3Y1X0X12.json` dist (sanity check)
 - **Step 3d**: `LD_Fisher_vocab_in_..._contexts/LM_filtered_..._contexts_..._..._filtered.txt` (sanity check)


Given a choice of parameters $\epsilon$ and $n$, and given
 - wordform/prefix channel matrices $p(Y_0^f|X_0^k)$
 - a contextual distribution on segmental wordforms $p(X_0^f|C)$
 - segmental lexicon metadata pre-calculating $k$-cousin information
 
Calculate one of three things, in order from most to least general and least to most practical:

1. (somewhat/most) general incremental case:

$$\hat{p}(\hat{X}_0^f = x_0^{'f}|X_0^k = x_0^{*k}, c) = \frac{1}{n} \sum\limits_{y_0^k \in S} p(\hat{X}_0^f = x_0^{'f}|y_0^k, c)$$
 where 
  - cousin...distance $d(x_0^{'f}, x_0^{*k}) \leq 4$
  - $S = $ a set of $n$ samples from $p(Y_0^f|x_0^{*k})$. In practice an $n \approx 200$ seems to result in a reasonable level of accuracy. 
  - $p(\hat{X}_0^f = x_0^{'f}|Y_0^k = y_0^k, c) = \frac{p(y_0^k|x_0^{'f})p(x_0^{'f}|c)}{p(y_0^k | c)}$
  - $p(y_0^k| c) = \sum\limits_{v', x_0^{''f}} p(y_0^k|x_0^{''f})p(x_0^{''f}|v')p(v'|c) = \sum\limits_{x_0^{''f}} p(y_0^k|x_0^{''f})p(x_0^{''f}|c)$
  
2. (somewhat/most) general exact wordform case:

$$\hat{p}(\hat{X}_0^f = x_0^{'f}|X_0^f = x_0^{*f}, c) = \frac{1}{n} \sum\limits_{y_0^f \in S} p(\hat{X}_0^f = x_0^{'f}|y_0^f, c)$$
 where 
  - Hamming distance $d(x_0^{'f}, x_0^{*f}) \leq 4$
  - $S = $ a set of $n$ samples from $p(Y_0^f|x_0^{*f})$. In practice an $n \approx 200$ seems to result in a reasonable level of accuracy. 
  - $p(\hat{X}_0^f = x_0^{'f}|Y_0^f = y_0^f, c) = \frac{p(y_0^f|x_0^{'f})p(x_0^{'f}|c)}{p(y_0^f | c)}$
  - $p(y_0^f| c) = \sum\limits_{v', x_0^{''f}} p(y_0^f|x_0^{''f})p(x_0^{''f}|v')p(v'|c) = \sum\limits_{x_0^{''f}} p(y_0^f|x_0^{''f})p(x_0^{''f}|c)$
  
  
3. least general exact wordform case:

$$\hat{p}(\hat{X}_0^f = x_0^{*f}|X_0^f = x_0^{*f}, c) = \frac{1}{n} \sum\limits_{y_0^f \in S} p(\hat{X}_0^f = x_0^{*f}|y_0^f, c)$$
 where 
  - $S = $ a set of $n$ samples from $p(Y_0^f|x_0^{*f})$. In practice an $n \approx 200$ seems to result in a reasonable level of accuracy. 
  - $p(\hat{X}_0^f = x_0^{*f}|Y_0^f = y_0^f, c) = \frac{p(y_0^f|x_0^{*f})p(x_0^{*f}|c)}{p(y_0^f | c)}$
  - $p(y_0^f| c) = \sum\limits_{v', x_0^{''f}} p(y_0^f|x_0^{''f})p(x_0^{''f}|v')p(v'|c) = \sum\limits_{x_0^{''f}} p(y_0^f|x_0^{''f})p(x_0^{''f}|c)$


In [137]:
desiredCM_fp = lambda fp: 'lambda' in fp
# desiredCM_fp = lambda fp: '0.01' in fp
hasSmoothingParam = lambda fp: '0.' in fp
nxt_swbd_fp = lambda fp: 'NXT_swbd' in fp or 'nxt_swbd' in fp
buckeye_fp = lambda fp: 'Buckeye' in fp or 'buckeye' in fp
is_not_trim_fp = lambda fp: 'trim' not in fp

# desired_fp = lambda fp: (nxt_swbd_fp(fp)) and (desiredCM_fp(fp) or not hasSmoothingParam(fp)) and (is_not_trim_fp(fp) or not skip_trim)
desired_fp = lambda fp: (buckeye_fp(fp) or nxt_swbd_fp(fp)) and (desiredCM_fp(fp) or not hasSmoothingParam(fp)) and (is_not_trim_fp(fp) or not skip_trim)

len(LCM_bundles)
desired_LCM_bundles = [deepcopy(LCM_bundle) for LCM_bundle in LCM_bundles if all(walk_values(desired_fp, LCM_bundle).values())]
len(desired_LCM_bundles)
for bundle in desired_LCM_bundles:
    bundle
    print('')

48

24

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/Calculate wordform channel matrices for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 'o': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
 'nb_fp': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'}




In [138]:
set(map(lambda d: tuple(d.values()), sorted(desired_LCM_bundles,
                                            key=lambda d: d['c'])))

## len(segpost_CM_bundles)
## len(set(map(lambda d: tuple(d.values()), segpost_CM_bundles)))
## set(map(lambda d: tuple(d.values()), segpost_CM_bundles))

{('CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
  'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
  'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
  'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_filtered_LM_filtered.ipynb'),
 ('CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_pY1X0X1X2.json',
  'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
  'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_',
  'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/Calculate wordform channel matrices for LTR_Buckeye_aligned_CM_f

In [139]:
segpost_CM_bundles = []
for bundle in desired_LCM_bundles:
    new_bundle = dict()
    new_bundle['c'] = bundle['o'] + 'CMs_by_length_by_wordform_index.pickle'
    new_bundle['f'] = bundle['o'] + 'exact_CMs_by_length_by_wordform_index.pickle'
#     new_bundle['w'] = ''
    new_bundle['m'] = path.dirname(bundle['w'])
#     new_bundle['o'] = ''
    new_bundle['p'] = bundle['w']
    new_bundle['t'] = deepcopy(bundle['c']).replace('pY1X0X1X2', 'p3Y1X012')
#     new_bundle['s'] = ''
    new_bundle
    segpost_CM_bundles.append(new_bundle)

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.5/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda1.0/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.1_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01_lambda0.125/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

{'c': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'p': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.001_lambda0.25/LTR_Buckeye_aligned_CM_filtered_LM_filtered_p3Y1X012.json'}

In [140]:
# len(WD_bundles)
# len(lfilter(lambda d: not any(walk_values(lambda v: 'trim' in v, d).values()),
#         WD_bundles))
# lfilter(lambda d: not any(walk_values(lambda v: 'trim' in v, d).values()),
#         WD_bundles)

In [141]:
# nxt_swbd_fp = lambda fp: 'NXT_swbd' in fp or 'nxt_swbd' in fp
# buckeye_fp = lambda fp: 'Buckeye' in fp or 'buckeye' in fp
# is_not_trim_fp = lambda fp: 'trim' not in fp

desired_WD_fp = lambda fp: (buckeye_fp(fp) or nxt_swbd_fp(fp)) and (is_not_trim_fp(fp) or not skip_trim)

len(WD_bundles)
desired_WD_bundles = [deepcopy(WD_bundle) for WD_bundle in WD_bundles if all(walk_values(desired_WD_fp, WD_bundle).values())]
len(desired_WD_bundles)
for bundle in desired_WD_bundles:
    bundle
    print('')

28

14

{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 2gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 4gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 3gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 3gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 3gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 4gram model.ipynb'}




{'d': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered.pW_V.npz',
 'o': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_NXT_swbd_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered(empty) (NA) contexts.ipynb'}




{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_3gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_3gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 3gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye preceding contexts 2gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 2gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_2gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_2gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd following contexts 2gram model.ipynb'}




{'d': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model_projected_LTR_Buckeye_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/LTR_Buckeye_aligned_CM_filtered_LM_filtered_in_Buckeye_following_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_Buckeye_following_contexts_4gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered_LM_filtered in Buckeye following contexts 4gram model.ipynb'}




{'d': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model_projected_LTR_Buckeye_aligned_CM_filtered.npy',
 'w': 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.pW_V.npz',
 'o': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/LTR_Buckeye_aligned_CM_filtered(empty)_(NA)_contexts.pW_C',
 'nb_fp': 'LD_Buckeye_vocab_in_(empty)_(NA)_contexts_1gram_model/Calculate segmental wordform distribution for LTR_Buckeye_aligned_CM_filtered(empty) (NA) contexts.ipynb'}




{'d': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model_projected_LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pV_C.npy',
 'w': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.npz',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_preceding_contexts_4gram_model.pW_C',
 'nb_fp': 'LD_Fisher_vocab_in_NXT_swbd_preceding_contexts_4gram_model/Calculate segmental wordform distribution for LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered in NXT swbd preceding contexts 4gram model.ipynb'}




In [142]:
has_NXT_swbd = lambda b: 'NXT_swbd' in b['d']
is_4_gram = lambda b: '4gram' in b['d']
is_1_gram = lambda b: '1gram' in b['d']
is_forward = lambda b: 'preceding' in b['d']
is_backward = lambda b: 'following' in b['d']

wittgenstein_filter = lambda b: is_forward(b)
tarski_filter = lambda b: is_backward(b)
my_filter = lambda b: has_NXT_swbd(b) and is_4_gram(b) and tarski_filter(b)

len(desired_WD_bundles)
filtered_desired_WD_bundles = lfilter(my_filter,
                                      desired_WD_bundles)
len(filtered_desired_WD_bundles)

14

1

In [143]:
import re

In [144]:
def lexicon_name(fp):
    if 'buckeye' in fp or 'Buckeye' in fp:
        return 'Buckeye'
    elif 'nxt_swbd' in fp or 'NXT_swbd' in fp:
        return 'NXT_swbd'
    elif 'newdic' in fp:
        return 'newdic'
    elif 'cmu' in fp or 'CMU' in fp:
        return 'CMU'
    
def corpus_contexts_name(fp):
    if 'buckeye' in fp or 'Buckeye' in fp:
        return 'Buckeye'
    elif 'nxt_swbd' in fp or 'NXT_swbd' in fp:
        return 'NXT_swbd'
    else:
        raise Exception(f"corpus context in fp {fp} is neither buckeye nor nxt_swbd")
    
def get_contexts_direction(fp):
    if 'preceding' in fp:
        return 'preceding'
    elif 'following' in fp:
        return 'following'
    else:
        raise Exception(f"corpus context direction in fp {fp} is neither 'preceding' nor 'following'")
        
def get_contexts_order(fp):
#     direction = get_contexts_direction(fp)
    ngram_tokens = re.findall(r"[0-5]gram", fp)
    if len(ngram_tokens) < 1:
        raise Exception(f"No instance of substring matching '[0-5]gram' in {fp}; order could not be extracted.")
    orders = [token[0] for token in ngram_tokens]
    unique_orders = set(orders)
    if len(unique_orders) != 1:
        raise Exception(f"More than one 'n' for all substring tokens matching '[0-5]gram' in {fp}; no unique order could be extracted.")
    return list(unique_orders)[0]
    
def get_pseudocount(fp):
    pc_tokens = re.findall(r"pseudocount0.[01]*", fp)
    if len(pc_tokens) < 1:
        raise Exception(f"No instance of substring matching 'pseudocount0.[01]*' in {fp}; pseudocount could not be extracted.")
    pcs = [token.split('pseudocount')[1] for token in pc_tokens]
    unique_pcs = set(pcs)
    if len(unique_pcs) != 1:
        raise Exception(f"More than one pseudocount for all substring tokens matching 'pseudocount0.[01]*' in {fp}; no unique pseudocount could be extracted.")
    return list(unique_pcs)[0]

def get_lambda(fp):
    l_tokens = re.findall(r"lambda[01][.][0-9]*", fp)
    if len(l_tokens) < 1:
        raise Exception(f"No instance of substring matching 'lambda[01][.][0-9]*' in {fp}; lambda could not be extracted.")
    ls = [token.split('lambda')[1] for token in l_tokens]
    unique_ls = set(ls)
    if len(unique_ls) != 1:
        raise Exception(f"More than one lambda for all substring tokens matching 'lambda[01][.][0-9]*' in {fp}; no unique lambda could be extracted.")
    return list(unique_ls)[0]
    
segpost_bundles = []
for WD_bundle in filtered_desired_WD_bundles:
# for WD_bundle in desired_WD_bundles:
    LD_dir = path.dirname(WD_bundle['d'])
    
    my_p = path.splitext(WD_bundle['w'])[0] + '.json'
    
    corpus_name = corpus_contexts_name(LD_dir)
    corpus_name_lc = corpus_name.lower()
    
    direction = get_contexts_direction(LD_dir)
    order = get_contexts_order(LD_dir)
    context_size = str(int(order) - 1)
    
    matching_segpost_CM_bundles = [b for b in segpost_CM_bundles if b['p'] == my_p]
    assert len(matching_segpost_CM_bundles) != 0, f"No matching segpost CM bundle with p = {my_p} for WD bundle:\n\t{WD_bundle}\n in segpost_CM_bundles\n\t{segpost_CM_bundles}"
#     assert len(matching_segpost_CM_bundles) == 1, f"Multiple matching segpost CM bundles with p = {my_p} for WD bundle:\n\t{WD_bundle}\n in segpost_CM_bundles\n\t{matching_segpost_CM_bundles}"
    if len(matching_segpost_CM_bundles) > 1:
        print(f'!! # matching CM bundles = {len(matching_segpost_CM_bundles)}!')
        for each_matching_segpost_CM_bundle in matching_segpost_CM_bundles:
            new_bundle = deepcopy(each_matching_segpost_CM_bundle)
            
            my_pc = get_pseudocount(each_matching_segpost_CM_bundle['c'])
            my_l = get_lambda(each_matching_segpost_CM_bundle['c'])

            new_bundle['w'] = WD_bundle['o'] + '.npy'
            new_bundle['o'] = path.join(LD_dir,
                                        new_bundle['m'].split('aligned')[0] + 'aligned' + f"_pc{my_pc}_l{my_l}")
#             new_bundle['o'] = path.join(LD_dir, new_bundle['m']) + f"_pc{my_pc}_l{my_l}"
            new_bundle['s'] = path.join(LD_dir, 'LM_filtered_' + corpus_name_lc + '_contexts_' + direction + '_' + context_size + '_filtered.txt')

            descr = path.basename( WD_bundle['o'].split('.pW_C')[0] )
            new_bundle['nb_fp'] = f"Calculate segmental posterior given segmental wordform + context - {descr} - pc={my_pc}, λ={my_l}.ipynb"

            new_bundle
            print(' ')
            segpost_bundles.append(new_bundle)
            
#         assert len(matching_segpost_CM_bundles) == 1, f"Multiple matching segpost CM bundles with p = {my_p} for WD bundle:\n\t{WD_bundle}\n in segpost_CM_bundles\n\t{matching_segpost_CM_bundles}"
    else:
        matching_segpost_CM_bundle = matching_segpost_CM_bundles[0]
        new_bundle = deepcopy(matching_segpost_CM_bundle)
        
        my_pc = get_pseudocount(matching_segpost_CM_bundle['c'])
        my_l = get_lambda(matching_segpost_CM_bundle['c'])

        new_bundle['w'] = WD_bundle['o'] + '.npy'
        new_bundle['o'] = path.join(LD_dir, new_bundle['m'])
        new_bundle['s'] = path.join(LD_dir, 'LM_filtered_' + corpus_name_lc + '_contexts_' + direction + '_' + context_size + '_filtered.txt')

        descr = path.basename( WD_bundle['o'].split('.pW_C')[0] )
        new_bundle['nb_fp'] = f"Calculate segmental posterior given segmental wordform + context - {descr} - pc={my_pc}, λ={my_l}.ipynb"

        new_bundle
        print(' ')
        segpost_bundles.append(new_bundle)

!! # matching CM bundles = 12!


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_de

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destr

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_de

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_de

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destr

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE

 


In [145]:
len(segpost_bundles)

12

Default parameters that remain to be (optionally) set via command-line here:
 - `x` 'benchmark' flag
 - `d` "don't overwrite any existing calculations" flag
 - `n` samples per (reconstructed target wordform, input prefix/wordform, context) triple
 - `k` maximum edit distance for cousin calculations
 - `b` default batch size to start from
 - `l` parallelize calculations?
 - `g` use gpu if available? (overwrites 'l')
 - `u` gpu number to use if `g` is true
 - `r` 'include calculations for source pRefixes?' flag (if False, only calculations involving source sequences that are exact wordforms will be done)
 - `e` 'Exact-match target wordform calculations only?' flag (if r is False and e is True, then only calculations involving target sequences that are exact matches of the input source wordform will be done)
 - `wStart` the index of the wordform to start or resume doing calculations on, if not 0
 - `wEnd` the index of the last wordform to calculate, starting from `wStart`
 
See `Calculate segmental posterior given segmental wordform + context.ipynb` for more on these and other parameters.

In [146]:
get_pseudocount(segpost_bundles[0]['c'])

'0.1'

In [147]:
# segpost_bundles = sorted(segpost_bundles, key=lambda b: b['nb_fp'])
segpost_bundles = sorted(segpost_bundles, key=lambda b: get_pseudocount(b['c']))
len(segpost_bundles)
segpost_bundles[0]

12

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_

In [148]:
segpost_bundles_immutable = [tuple(sorted(bundle.items(), key=lambda pair:pair[0]))
                             for bundle in segpost_bundles]
len(segpost_bundles_immutable)
segpost_bundles_unique = set(segpost_bundles_immutable)
len(segpost_bundles_unique)
segpost_bundles_deduped = [dict(bundle_immutable)
                           for bundle_immutable in segpost_bundles_unique]
segpost_bundles_deduped[0]

12

12

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'nb_fp': 'Calculate segmental posterior given segmental wordform + context - LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model - pc=0.1, λ=0.125.ipynb',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed_pc0.1_l0.125',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 's': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LM_filtered_nxt_swbd_contexts_followin

In [149]:
len(segpost_bundles)
segpost_bundles

12

[{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
  'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
  'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
  'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
  't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
  'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
  'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligne

In [150]:
hostname_to_batch_size_default = {
    'tarski':500,
    'wittgenstein':300
}

# def get_gpu_number(direction):
def get_gpu_number(bundle_idx):
    if my_hostname in {'pitts', 'tarski', 'wittgenstein'}:
        return bundle_idx % 2
#         if direction == 'preceding':
#             return 0
#         else:
#             return 1
    else:
        return 0

In [151]:
full_segpost_bundles = []
for idx, each_b in enumerate(segpost_bundles):
    b = deepcopy(each_b)
    
    direction = get_contexts_direction(b['o'])
    
    b['a'] = 'nxt_swbd_word_analysis_relation_filtered.json' if 'NXT_swbd' in b['c'] else 'buckeye_word_analysis_relation_filtered.json'
    b['x'] = False
    b['d'] = False #*do* overwrite
    b['n'] = 1000
    b['k'] = 2
    b['b'] = hostname_to_batch_size_default.get(my_hostname, 300)
    b['l'] = True
    b['g'] = True
#     b['u'] = get_gpu_number(direction)
    b['u'] = get_gpu_number(idx)
    b['r'] = False
    b['e'] = True
    b['q'] = True
    b['wStart'] = ''
    b['wEnd'] = ''
    
    full_segpost_bundles.append(b)
    b
    print(' ')

{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_de

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_de

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.01_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_de

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.5/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destr

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda0.125/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE

 


{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
 'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
 'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
 'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
 't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.1_lambda1.0/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
 'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
 'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destr

 


In [152]:
has_corpus = lambda corpus_string: lambda b: corpus_contexts_name(b['nb_fp']) == corpus_string
has_direction = lambda direction_string: lambda b: get_contexts_direction(b['nb_fp']) == direction_string
is_n_gram_bundle = lambda n_string: lambda b: f"{n_string}gram_model" in b['nb_fp']

desired_segpost_bundle = lambda corpus, n, direction: lambda b: has_corpus(corpus)(b) and has_direction(direction)(b) and is_n_gram_bundle(n)(b)

In [153]:
isNXTswbd3gram = lambda b: has_corpus('NXT_swbd')(b) and is_n_gram_bundle('3')(b)
isBuckeye2gramFollowing = desired_segpost_bundle('Buckeye','2','following')

len(full_segpost_bundles)

is_GPU_n = lambda u: lambda b: b['u'] == u
bundles_with_GPU_0 = lfilter(is_GPU_n(0), full_segpost_bundles)
bundles_with_GPU_1 = lfilter(is_GPU_n(1), full_segpost_bundles)

len(bundles_with_GPU_0)
len(bundles_with_GPU_1)

12

12

0

In [154]:
bundles_with_GPU_0

[{'c': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_CMs_by_length_by_wordform_index.pickle',
  'f': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_exact_CMs_by_length_by_wordform_index.pickle',
  'm': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed',
  'p': 'LTR_NXT_swbd_destressed_aligned_w_GD_AmE_destressed/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered.pW_V.json',
  't': 'CM_AmE_destressed_aligned_w_LTR_NXT_swbd_destressed_pseudocount0.001_lambda0.25/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_p3Y1X012.json',
  'w': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligned_CM_filtered_LM_filtered_in_NXT_swbd_following_contexts_4gram_model.pW_C.npy',
  'o': 'LD_Fisher_vocab_in_NXT_swbd_following_contexts_4gram_model/LTR_NXT_swbd_destressed_aligne

In [155]:
bundles_with_GPU_1

[]

In [156]:
if '5b' in permittedSteps:
    # takes ? 
    # wittgenstein/gpu/b300/buckeye/preceding/2 ≈7-8s per exact wordform
    # tarski/gpu/b500/nxt_swbd/preceding/2 ≈3-5s per exact wordform
    # tarski/gpu/b500/nxt_swbd/following/2 ≈3-5s per exact wordform
#     for bundle in bundles_with_GPU_0:
    for bundle in full_segpost_bundles:
        output_dir = path.dirname(bundle['o'])
        ensure_dir_exists(output_dir)

        if not overwrite and path.exists(bundle['nb_fp']):
#         if not overwrite and path.exists(path.join(ab['cm_dir'], ab['nb_output_name'])):
            print('{0} already exists. Skipping...'.format(path.join(bundle['nb_fp'])))
#             print('{0} already exists. Skipping...'.format(path.join(ab['cm_dir'], ab['nb_output_name'])))
            endNote()
            continue
        
        progress_report(bundle['nb_fp'],
                        dict(c = bundle['c'],
                             f = bundle['f'],
                             m = bundle['m'],
                             p = bundle['p'],
                             t = bundle['t'],
                             w = bundle['w'],
                             a = bundle['a'],
                             o = bundle['o'],
                             s = bundle['s'],
                             x = bundle['x'],
                             d = bundle['d'],
                             n = bundle['n'],
                             k = bundle['k'],
                             b = bundle['b'],
                             l = bundle['l'],
                             g = bundle['g'],
                             u = bundle['u'],
                             r = bundle['r'],
                             e = bundle['e'],
                             q = bundle['q'],
                             wStart = bundle['wStart'],
                             wEnd = bundle['wEnd']))
        
        pm.execute_notebook(
            'Calculate segmental posterior given segmental wordform + context.ipynb',
            bundle['nb_fp'],
            parameters=dict(c = bundle['c'],
                            f = bundle['f'],
                            m = bundle['m'],
                            p = bundle['p'],
                            t = bundle['t'],
                            w = bundle['w'],
                            a = bundle['a'],
                            o = bundle['o'],
                            s = bundle['s'],
                            x = bundle['x'],
                            d = bundle['d'],
                            n = bundle['n'],
                            k = bundle['k'],
                            b = bundle['b'],
                            l = bundle['l'],
                            g = bundle['g'],
                            u = bundle['u'],
                            r = bundle['r'],
                            e = bundle['e'],
                            q = bundle['q'],
                            wStart = bundle['wStart'],
                            wEnd = bundle['wEnd'])
        )
        endNote()
        print('\n')

## Step 5c: Calculate $p(\hat{V} = v^*| V = v^*, C)$

In [None]:
#FIXME without anything close to a distribution on W-hat, there is nothing to do here

# Step 6: Analysis

## Step 6a: Ensure copy of word analysis relations from corpus repositories is present here...

In [None]:
overwrite = True
# overwrite = False
overwrite

In [None]:
switchboard_lm_repo_path = '../switchboard-lm'
switchboard_word_analysis_relation_fn = 'nxt_swbd_word_analysis_relation_filtered.json'
    
switchboard_word_analysis_relation_fp = path.join(switchboard_lm_repo_path, switchboard_word_analysis_relation_fn)
switchboard_word_analysis_relation_fp

assert path.exists(switchboard_word_analysis_relation_fp)

In [None]:
buckeye_lm_repo_path = '../buckeye-lm'
buckeye_word_analysis_relation_fn = 'buckeye_word_analysis_relation_filtered.json'

buckeye_word_analysis_relation_fp = path.join(buckeye_lm_repo_path, buckeye_word_analysis_relation_fn)
buckeye_word_analysis_relation_fp

assert path.exists(buckeye_word_analysis_relation_fp)

In [None]:
if '6a' in permittedSteps:
    if path.exists(switchboard_word_analysis_relation_fn):
        print(f"{switchboard_word_analysis_relation_fn} already found in repository directory...")
        if overwrite:
            print(f"Overwriting with copy from corpus repository.")
            copy(switchboard_word_analysis_relation_fp,
                 path.join(repo_dir, switchboard_word_analysis_relation_fn))
        else:
            print("'overwrite' flag is False. Leaving copy currently in repository as is.")
    else:
        print(f"Copying word analysis relation into repository directory...")
        copy(switchboard_word_analysis_relation_fp,
         path.join(repo_dir, switchboard_word_analysis_relation_fn))

In [None]:
if '6a' in permittedSteps:
    if path.exists(buckeye_word_analysis_relation_fn):
        print(f"{buckeye_word_analysis_relation_fn} already found in repository directory...")
        if overwrite:
            print(f"Overwriting with copy from corpus repository.")
            copy(buckeye_word_analysis_relation_fp,
                 path.join(repo_dir, buckeye_word_analysis_relation_fn))
        else:
            print("'overwrite' flag is False. Leaving copy currently in repository as is.")
    else:
        print(f"Copying word analysis relation into repository directory...")
        copy(buckeye_word_analysis_relation_fp,
         path.join(repo_dir, buckeye_word_analysis_relation_fn))

## Step 6b: Add desired probability annotations...

**NB:** Due to the expensive and time-consuming nature of the segmental posterior calculations, it's desirable to add probability annotations incrementally as they roll in. Because of the number of annotations (2 directions * 4 orders per corpus-lexicon combination), *this processing step involves stateful and destructive updates* of inputs: 
 - an `..._annotated.json` version of each word analysis relation is created (if it doesn't already exist).
 - from a potential pool of posterior distribution filepaths that could exist, the extant ones are collected and divided into two piles: `Buckeye`-related ones and `NXT_swbd` related ones.
 - for each `..._annotated.json`-pile pair, the update notebook is called once for each distribution in the pile and `..._annotated.json` is updated to reflect information in the distribution.

In [None]:
annotated_buckeye_word_analysis_relation_fn = 'buckeye_word_analysis_relation_filtered_annotated.json'
annotated_switchboard_word_analysis_relation_fn = 'nxt_swbd_word_analysis_relation_filtered_annotated.json'

In [None]:
annotated_buckeye_word_analysis_relation_fn = buckeye_word_analysis_relation_fn.split('.json')[0] + '_annotated.json'
annotated_switchboard_word_analysis_relation_fn = switchboard_word_analysis_relation_fn.split('.json')[0] + '_annotated.json'

if '6b' in permittedSteps:
    if path.exists(annotated_buckeye_word_analysis_relation_fn) and not overwrite:
        print("Existing 'annotated' copy of the Buckeye word analysis relation found...")
    else:
        print("Creating fresh 'annotated' copy of the Buckeye word analysis relation...")
        copy(path.join(repo_dir, buckeye_word_analysis_relation_fn),
             path.join(repo_dir, annotated_buckeye_word_analysis_relation_fn))


    if path.exists(annotated_switchboard_word_analysis_relation_fn) and not overwrite:
        print("Existing 'annotated' copy of the NXT_swbd word analysis relation found...")
    else:
        print("Creating fresh 'annotated' copy of the NXT_swbd word analysis relation...")
        copy(path.join(repo_dir, switchboard_word_analysis_relation_fn),
             path.join(repo_dir, annotated_switchboard_word_analysis_relation_fn))

In [None]:
#gather and/or construct all segmental posterior filepaths in the repo
#associate with metadata / sanity check files
# check for done-ness via metadata file / something else? or leave that up to the update notebook?

In [None]:
seg_post_dist_fp_stems = lpluck('o', full_segpost_bundles)
seg_post_dist_fp_stems

In [None]:
listdir(path.dirname(seg_post_dist_fp_stems[0]))

In [None]:
is_seg_post_dist_dir = lambda base_name: 'pW_WC' in base_name and not 'calc' in base_name and not 'metadata' in base_name
is_seg_post_dist_md_fn = lambda base_name: 'pW_WC' in base_name and 'metadata' in base_name

def get_seg_post_dist_dirnames(spd_fp_stem):
    LM_dir = path.dirname(spd_fp_stem)
    seg_post_dist_dirs = lfilter(is_seg_post_dist_dir, listdir(LM_dir))
    return seg_post_dist_dirs

def seg_post_dist_started(spd_fp_stem):
    seg_post_dist_dirs = get_seg_post_dist_dirnames(spd_fp_stem)
    return len(seg_post_dist_dirs) > 0

def get_seg_post_metadata_fns(spd_fp_stem):
    LM_dir = path.dirname(spd_fp_stem)
    matching_seg_post_md_fns = lfilter(is_seg_post_dist_md_fn, listdir(LM_dir))
    return matching_seg_post_md_fns

def seg_post_dist_finished(spd_fp_stem):
    if not seg_post_dist_started(spd_fp_stem):
        return False
#     LM_dir = path.dirname(spd_fp_stem)
#     matching_seg_post_md_fns = lfilter(is_seg_post_dist_md_fn, listdir(LM_dir))
    matching_seg_post_md_fns = get_seg_post_metadata_fns(spd_fp_stem)
    return len(matching_seg_post_md_fns) > 0

In [None]:
has_seg_post_dist_started = lfilter(seg_post_dist_started, seg_post_dist_fp_stems)
has_seg_post_dist_started

In [None]:
# started_seg_post_dist_fns = lmap(lambda fp: get_seg_post_dists(fp)[0], 
#                                  has_seg_post_dist_started)
# started_seg_post_dist_fns

In [None]:
seg_post_dist_finished_md_fps = lmap(lambda fp: path.join(path.dirname(fp),
                                                          get_seg_post_metadata_fns(fp)[0]), 
                                     lfilter(seg_post_dist_finished, 
                                             has_seg_post_dist_started))
seg_post_dist_finished_md_fps

In [None]:
#divide finished dists into two piles, based on corpus...

In [None]:
seg_post_dist_finished_md_fps_grouped = walk_values(set, group_by(corpus_contexts_name, 
# seg_post_dist_finished_md_fps_grouped = walk_values(frozenset, group_by(corpus_contexts_name, 
                                                                        seg_post_dist_finished_md_fps))
seg_post_dist_finished_md_fps_grouped

# seg_post_dist_finished_md_fps_grouped = flip(seg_post_dist_finished_md_fps_grouped)
# seg_post_dist_finished_md_fps_grouped

In [None]:
from boilerplate import importDict

In [None]:
SPDFMDs = {fp:merge(importDict(fp), {'corpus':corpus_contexts_name(fp)})
           for fp in seg_post_dist_finished_md_fps}
SPDFMDs[seg_post_dist_finished_md_fps[0]]

In [None]:
#FIXME actually check that they're finished somehow:
# for type = pW_WC_E , assert that all values are valid probabilities and check that wStart = 0 and wEnd = W.size - 1

In [None]:
#move through each pile (separate cells) and update the relevant analysis relation accordingly

In [None]:
#build annotation bundles
annotation_update_bundles = []
for b in SPDFMDs.values():
    new_bundle = dict()
    
    my_corpus = b['corpus']
    direction = get_contexts_direction(b['matrix fp'])
    order = get_contexts_order(b['matrix fp'])
#     context_size = str(int(order) - 1)
    
    new_bundle['corpus'] = my_corpus
    
    if b['corpus'] == 'Buckeye':
        my_analysis_relation_fp = annotated_buckeye_word_analysis_relation_fn
    elif b['corpus'] == 'NXT_swbd':
        my_analysis_relation_fp = annotated_switchboard_word_analysis_relation_fn
    else:
        corpus_type = b['corpus']
        raise Exception(f"Unknown corpus type = {corpus_type}")
    
    new_bundle['a'] = my_analysis_relation_fp
    new_bundle['p'] = b['matrix fp']
    new_bundle['m'] = path.join(path.dirname(b['matrix fp']), 
                                path.basename(b['w']).split('.pW_V')[0] )
    new_bundle['w'] = b['p'] #b['W']['from fp']
    new_bundle['c'] = b['s'] #b['C']['from fp']
    
    new_bundle['nb_fp'] = f'Word analysis relation annotation update - {my_corpus}_{direction}_contexts_{order}gram_model.ipynb'
    
    new_bundle
    annotation_update_bundles.append(new_bundle)
    print(' ')

In [None]:
# has_corpus = lambda corpus_string: lambda b: corpus_contexts_name(b['nb_fp']) == corpus_string

In [None]:
lfilter(has_corpus('NXT_swbd'), annotation_update_bundles)

In [None]:
lfilter(has_corpus('Buckeye'), annotation_update_bundles)

In [None]:
if '6b' in permittedSteps:
    # takes about 60s per update, on average
    #
#     for bundle in lfilter(has_corpus('Buckeye'), annotation_update_bundles):
#     for bundle in lfilter(has_corpus('NXT_swbd'), annotation_update_bundles):

    # takes ≈15.1m on kotoba for all buckeye+NXT_swbd bigram and trigram updates + NXT_swbd 4-gram updates
    for bundle in annotation_update_bundles:
        
        progress_report(bundle['nb_fp'],
                        dict(a = bundle['a'],
                             p = bundle['p'],
                             w = bundle['w'],
                             m = bundle['m'],
                             c = bundle['c']))
        
        pm.execute_notebook('Word analysis relation annotation.ipynb',
                            bundle['nb_fp'],
                            parameters=dict(a = bundle['a'],
                                            p = bundle['p'],
                                            w = bundle['w'],
                                            m = bundle['m'],
                                            c = bundle['c']))
        endNote()
        print('\n')

## Step 6c: Export as a dataframe

In [None]:
#do that here with funcy and pandas...

if '6c' in permittedSteps:
    pass