In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span><ul class="toc-item"><li><span><a href="#Papermill---command-line" data-toc-modified-id="Papermill---command-line-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Papermill - command line</a></span></li><li><span><a href="#Old-School" data-toc-modified-id="Old-School-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Old School</a></span></li></ul></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Main-calculation" data-toc-modified-id="Main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Main calculation</a></span></li><li><span><a href="#Write-to-file" data-toc-modified-id="Write-to-file-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Write to file</a></span></li></ul></div>

# Overview

Given 
 - a file $l$ describing a relation $L$ between orthographic wordforms $V$ and transcribed (segmental) wordforms $W$
 - an output filepath $o$

this notebook creates a probability distribution $p(V|W)$ and writes it to $o$ (as a `.json` file).

For a given $v$, the distribution is uniform over $\{w | (v,w) \in L \}$.

## Requirements

 - `joblib` *greatly* accelerates the search for all the segmental wordforms associated with a given orthographic wordform

## Usage

### Papermill - command line

This notebook is intended to be used with the [`papermill`](https://papermill.readthedocs.io/en/latest/) package.

**Example:**

```
papermill "Define a conditional distribution on segmental wordforms given an orthographic one.ipynb" "Define pW_V given LTR_CMU_destressed.ipynb" -p l "/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.tsv" -p o "/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.pW_V.json"
```
will 
 - create a new notebook `Define pW_V given LTR_CMU_destressed.ipynb`

...and output `/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.pW_V.json`.

### Old School

If you don't have or want to use this notebook as intended, edit the filenames/paths in the cell below with the top comment `# parameters`.

# Parameters

In [4]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
# parameters

l = ''
# l = '/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.tsv'

o = ''
# o = '/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.pW_V.json'

In [5]:
output_dir = path.dirname(o)
if not path.exists(output_dir):
    print('Making output directory {0}'.format(output_dir))
    makedirs(output_dir)

# Imports / load data

In [2]:
import csv
from probdist import *
from boilerplate import *

In [28]:
from joblib import Parallel, delayed

J = 30
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [40]:
lexicon = []

with open(l) as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    for row in my_reader:
        #print(row)
        lexicon.append(row)
len(lexicon)
lexicon[:5]

133854

[OrderedDict([('Orthographic_Wordform', '!exclamation-point'),
              ('Transcription', 'ɛ.k.s.k.l.ʌ.m.eɪ.ʃ.ʌ.n.p.ɔɪ.n.t')]),
 OrderedDict([('Orthographic_Wordform', '"close-quote'),
              ('Transcription', 'k.l.oʊ.z.k.w.oʊ.t')]),
 OrderedDict([('Orthographic_Wordform', '"double-quote'),
              ('Transcription', 'd.ʌ.b.ʌ.l.k.w.oʊ.t')]),
 OrderedDict([('Orthographic_Wordform', '"end-of-quote'),
              ('Transcription', 'ɛ.n.d.ʌ.v.k.w.oʊ.t')]),
 OrderedDict([('Orthographic_Wordform', '"end-quote'),
              ('Transcription', 'ɛ.n.d.k.w.oʊ.t')])]

# Main calculation

In [8]:
from random import choice

In [29]:
vocabulary = set(map(lambda e: e['Orthographic_Wordform'],
                     lexicon))
len(vocabulary)

133854

In [17]:
random_v = choice(lexicon)['Orthographic_Wordform']
random_v

'projectors'

In [18]:
def entries_with_orthword(v):
    return [row for row in lexicon if row['Orthographic_Wordform'] == v]

entries_with_orthword(random_v)

[OrderedDict([('Orthographic_Wordform', 'projectors'),
              ('Transcription', 'p.ɹ.ɑ.dʒ.ɛ.k.t.ɚ.z')])]

In [19]:
def orthword_to_phonword(v):
    matching_entries = entries_with_orthword(v)
    phonwords = list(map(lambda e: e['Transcription'],
                        matching_entries))
    return phonwords

In [31]:
# orth_to_phons = {v:orthword_to_phonword(v)
#                  for v in vocabulary}

#takes ~4.1m on wittgenstein with J=30 and stuff going on in the background

def foo(v):
    return (v, orthword_to_phonword(v))

orth_to_phons = dict(par(delayed(foo)(v)
                         for v in vocabulary))

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0558s.) Setting batch_size=6.
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    0.2s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    0.2s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    0.2s
[Parallel(n_jobs=30)]: Done 108 tasks      | elapsed:    0.5s
[Parallel(n_jobs=30)]: Done 210 tasks      | elapsed:    0.6s
[Parallel(n_jobs=30)]: Done 312 tasks      | elapsed:    0.8s
[Parallel(n_jobs=30)]: Done 426 tasks      | elapsed:    1.0s
[Parallel(n_jobs=30)]: Done 540 tasks      | elapsed:    1.3s
[Parallel(n_jobs=30)]: Done 666 tasks      | elapsed:    1.5s
[Parallel(n_jobs=30)]: Done 792 tasks      | elapsed:    1.8s
[Parallel(n_jobs=30)]: Done 930 tasks      | elapsed:    2.0s
[Parallel(n_jobs=30)]: 

[Parallel(n_jobs=30)]: Done 56472 tasks      | elapsed:  1.7min
[Parallel(n_jobs=30)]: Done 57306 tasks      | elapsed:  1.8min
[Parallel(n_jobs=30)]: Done 58140 tasks      | elapsed:  1.8min
[Parallel(n_jobs=30)]: Done 58986 tasks      | elapsed:  1.8min
[Parallel(n_jobs=30)]: Done 59832 tasks      | elapsed:  1.8min
[Parallel(n_jobs=30)]: Done 60690 tasks      | elapsed:  1.9min
[Parallel(n_jobs=30)]: Done 61548 tasks      | elapsed:  1.9min
[Parallel(n_jobs=30)]: Done 62418 tasks      | elapsed:  1.9min
[Parallel(n_jobs=30)]: Done 63288 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Done 64170 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Done 65052 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Done 65946 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Done 66840 tasks      | elapsed:  2.1min
[Parallel(n_jobs=30)]: Done 67746 tasks      | elapsed:  2.1min
[Parallel(n_jobs=30)]: Done 68652 tasks      | elapsed:  2.1min
[Parallel(n_jobs=30)]: Done 69570 tasks 

In [39]:
orth_to_num_phonwords = {v:len(orth_to_phons[v])
                         for v in vocabulary}
v_with_multiple_phonwords = {v for v in vocabulary if orth_to_num_phonwords[v] > 1}
len(vocabulary)
len(v_with_multiple_phonwords)

133854

0

In [41]:
list(v_with_multiple_phonwords)[:5]

[]

In [34]:
def pW_v(v):
    return ProbDist(orth_to_phons[v])

pW_V = condDistsAsProbDists({v:pW_v(v) for v in vocabulary})
assert areNormalized(pW_V)

# Write to file

In [55]:
exportProbDist(o, condProbDistAsDicts_for_export(pW_V))

In [58]:
output_dir

'/home/AD/emeinhar/wr/LTR_CMU_destressed'

In [56]:
listdir(output_dir)

['Making a Transcribed Lexicon Relation - CMU_destressed.ipynb',
 'LTR_CMU_destressed.pW_V.json',
 'LTR_CMU_destressed.tsv',
 '.ipynb_checkpoints',
 'cmudict-0.7b_IPA_destressed.tsv']

In [59]:
# !cat -n /home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.pW_V.json | head -10

     1	{
     2	    "harmonious": {
     3	        "h.ɑ.ɹ.m.oʊ.n.i.ʌ.s": 1.0
     4	    },
     5	    "itzhak": {
     6	        "ɪ.t.s.ɑ.k": 1.0
     7	    },
     8	    "maxilla": {
     9	        "m.æ.k.s.ɪ.l.ʌ": 1.0
    10	    },
cat: write error: Broken pipe
