In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span><ul class="toc-item"><li><span><a href="#Papermill---command-line" data-toc-modified-id="Papermill---command-line-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Papermill - command line</a></span></li><li><span><a href="#Old-School" data-toc-modified-id="Old-School-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Old School</a></span></li></ul></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-load-data" data-toc-modified-id="Imports-/-load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / load data</a></span></li><li><span><a href="#Main-calculation" data-toc-modified-id="Main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Main calculation</a></span></li><li><span><a href="#Write-to-file" data-toc-modified-id="Write-to-file-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Write to file</a></span></li><li><span><a href="#Create-and-export-numpy-version-+-metadata" data-toc-modified-id="Create-and-export-numpy-version-+-metadata-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Create and export <code>numpy</code> version + metadata</a></span></li></ul></div>

# Overview

Given 
 - a file $l$ describing a relation $L$ between orthographic wordforms $V$ and transcribed (segmental) wordforms $W$
 - an output filepath $o$

this notebook creates a probability distribution $p(V|W)$ and writes it to $o$ as a `.json` file $o$.pW_V.json and as a sparse `npz` array $o$.pW_V.npz.

For a given $v$, the distribution is uniform over $\{w | (v,w) \in L \}$.

## Requirements

 - `more_itertools`
 - `joblib` *greatly* accelerates the search for all the segmental wordforms associated with a given orthographic wordform
 - `numpy`
 - `sparse`

## Usage

### Papermill - command line

This notebook is intended to be used with the [`papermill`](https://papermill.readthedocs.io/en/latest/) package.

**Example:**

```
papermill "Define a conditional distribution on segmental wordforms given an orthographic one.ipynb" "Define pW_V given LTR_CMU_destressed.ipynb" -p l "/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.tsv" -p o "/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed"
```
will 
 - create a new notebook `Define pW_V given LTR_CMU_destressed.ipynb`

...and output 
 - `/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.pW_V.json`
 - `/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.pW_V.npz`, a sparse `numpy` array where columns are distributions and where the two text files `LTR_CMU_destressed_Orthographic_Wordforms.txt` and `LTR_CMU_destressed_Transcriptions.txt` indicate the ordering and interpretation of rows and columns, respectively.

### Old School

If you don't have or want to use this notebook as intended, edit the filenames/paths in the cell below with the top comment `# parameters`.

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
# parameters

# l = ''
l = '/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.tsv'

# o = ''
o = '/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed'

In [4]:
output_dir = path.dirname(o)
if not path.exists(output_dir):
    print('Making output directory {0}'.format(output_dir))
    makedirs(output_dir)

# Imports / load data

In [32]:
import csv
from probdist import *
from boilerplate import *
from more_itertools import unique_everseen
from itertools import starmap, chain
import numpy as np
import sparse

In [6]:
from joblib import Parallel, delayed

J = 30
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def identity(x):
    return x

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

In [7]:
lexicon = []

with open(l) as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    for row in my_reader:
        #print(row)
        lexicon.append(row)
len(lexicon)
lexicon[:5]

133854

[OrderedDict([('Orthographic_Wordform', '!exclamation-point'),
              ('Transcription', 'ɛ.k.s.k.l.ʌ.m.eɪ.ʃ.ʌ.n.p.ɔɪ.n.t')]),
 OrderedDict([('Orthographic_Wordform', '"close-quote'),
              ('Transcription', 'k.l.oʊ.z.k.w.oʊ.t')]),
 OrderedDict([('Orthographic_Wordform', '"double-quote'),
              ('Transcription', 'd.ʌ.b.ʌ.l.k.w.oʊ.t')]),
 OrderedDict([('Orthographic_Wordform', '"end-of-quote'),
              ('Transcription', 'ɛ.n.d.ʌ.v.k.w.oʊ.t')]),
 OrderedDict([('Orthographic_Wordform', '"end-quote'),
              ('Transcription', 'ɛ.n.d.k.w.oʊ.t')])]

In [8]:
orthographic_wordforms = [r['Orthographic_Wordform'] for r in lexicon]
segmental_wordforms = [r['Transcription'] for r in lexicon]

In [9]:
len(orthographic_wordforms)
len(set(orthographic_wordforms))

133854

133854

In [10]:
len(segmental_wordforms)
len(set(segmental_wordforms))

133854

113745

In [11]:
orthographic_wordforms = tuple(unique_everseen([r['Orthographic_Wordform'] 
                                                for r in lexicon]))
segmental_wordforms = tuple(unique_everseen([r['Transcription'] 
                                             for r in lexicon]))

In [12]:
len(orthographic_wordforms)
len(segmental_wordforms)

133854

113745

# Main calculation

In [13]:
from random import choice

In [14]:
vocabulary = set(map(lambda e: e['Orthographic_Wordform'],
                     lexicon))
len(vocabulary)

133854

In [15]:
random_v = choice(lexicon)['Orthographic_Wordform']
random_v

'cales'

In [16]:
def entries_with_orthword(v):
    return [row for row in lexicon if row['Orthographic_Wordform'] == v]

entries_with_orthword(random_v)

[OrderedDict([('Orthographic_Wordform', 'cales'),
              ('Transcription', 'k.eɪ.l.z')])]

In [17]:
def orthword_to_phonword(v):
    matching_entries = entries_with_orthword(v)
    phonwords = list(map(lambda e: e['Transcription'],
                        matching_entries))
    return phonwords

In [18]:
# orth_to_phons = {v:orthword_to_phonword(v)
#                  for v in vocabulary}

#takes ~4.1m on wittgenstein with J=30 and stuff going on in the background

def foo(v):
    return (v, orthword_to_phonword(v))

orth_to_phons = dict(par(delayed(foo)(v)
                         for v in vocabulary))

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0442s.) Setting batch_size=8.
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    0.2s
[Parallel(n_jobs=30)]: Done 124 tasks      | elapsed:    0.5s
[Parallel(n_jobs=30)]: Done 260 tasks      | elapsed:    0.6s
[Parallel(n_jobs=30)]: Done 396 tasks      | elapsed:    0.8s
[Parallel(n_jobs=30)]: Done 548 tasks      | elapsed:    1.1s
[Parallel(n_jobs=30)]: Done 700 tasks      | elapsed:    1.3s
[Parallel(n_jobs=30)]: Done 868 tasks      | elapsed:    1.5s
[Parallel(n_jobs=30)]: Done 1036 tasks      | elapsed:    1.8s
[Parallel(n_jobs=30)]: Done 1220 tasks      | elapsed:    2.1s
[Parallel(n_jobs=30)]

[Parallel(n_jobs=30)]: Done 75276 tasks      | elapsed:  1.9min
[Parallel(n_jobs=30)]: Done 76388 tasks      | elapsed:  1.9min
[Parallel(n_jobs=30)]: Done 77500 tasks      | elapsed:  1.9min
[Parallel(n_jobs=30)]: Done 78628 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Done 79756 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Done 80900 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Done 82044 tasks      | elapsed:  2.0min
[Parallel(n_jobs=30)]: Done 83204 tasks      | elapsed:  2.1min
[Parallel(n_jobs=30)]: Done 84364 tasks      | elapsed:  2.1min
[Parallel(n_jobs=30)]: Done 85540 tasks      | elapsed:  2.1min
[Parallel(n_jobs=30)]: Done 86716 tasks      | elapsed:  2.2min
[Parallel(n_jobs=30)]: Done 87908 tasks      | elapsed:  2.2min
[Parallel(n_jobs=30)]: Done 89100 tasks      | elapsed:  2.2min
[Parallel(n_jobs=30)]: Done 90308 tasks      | elapsed:  2.2min
[Parallel(n_jobs=30)]: Done 91516 tasks      | elapsed:  2.3min
[Parallel(n_jobs=30)]: Done 92740 tasks 

In [19]:
orth_to_num_phonwords = {v:len(orth_to_phons[v])
                         for v in vocabulary}
v_with_multiple_phonwords = {v for v in vocabulary if orth_to_num_phonwords[v] > 1}
len(vocabulary)
len(v_with_multiple_phonwords)

133854

0

In [20]:
list(v_with_multiple_phonwords)[:5]

[]

In [21]:
def pW_v(v):
    return ProbDist(orth_to_phons[v])

pW_V = condDistsAsProbDists({v:pW_v(v) for v in vocabulary})
assert areNormalized(pW_V)

# Write to file

In [22]:
exportProbDist(o + '.pW_V.json', condProbDistAsDicts_for_export(pW_V))

In [23]:
output_dir

'/home/AD/emeinhar/wr/LTR_CMU_destressed'

In [24]:
listdir(output_dir)

['Making a Transcribed Lexicon Relation - CMU_destressed.ipynb',
 'LTR_CMU_destressed.pW_V.json',
 'LTR_CMU_destressed.tsv',
 '.ipynb_checkpoints',
 'cmudict-0.7b_IPA_destressed.tsv']

In [25]:
# !cat -n /home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed.pW_V.json | head -10

# Create and export `numpy` version + metadata

In [30]:
my_shape = (len(segmental_wordforms), len(orthographic_wordforms))
my_shape
my_cells = np.prod(my_shape)
"{:,} cells".format(my_cells)
my_GB = my_cells * 64 / 8 / 1e9
my_GB
if my_GB > 50:
    print('Consider using a sparse format.')
if my_GB > 100:
    print('The array may not fit in memory.')

(113745, 133854)

'15,225,223,230 cells'

121.80178584

Consider using a sparse format
The array may not fit in memory.


In [112]:
def pW_v_np(v_idx):
    v = orthographic_wordforms[v_idx]
    pW = pW_V[v]

    #results in next cell taking ?m for the destressed CMU dict, J=30, other stuff in background
#     return np.array([float(pW[w]) for w in segmental_wordforms])

    pW_np = np.zeros(shape=(len(segmental_wordforms),))
    for w in pW:
        w_idx = segmental_wordforms.index(w)
        pW_np[w_idx] = pW[w]
    return pW_np

def pW_v_sparseArgs(v_idx):
    pW_np = pW_v_np(v_idx)
    non_zero_row_coords = pW_np.nonzero()[0]
    non_zero_col_coords = np.array([v_idx for each in non_zero_row_coords])
    coords = (non_zero_row_coords, non_zero_col_coords)
    
#     iter_coords = list(zip(*coords))
    data = [pW_np[i] for i in non_zero_row_coords]
#     shape = (len(segmental_wordforms),1)
#     shape = my_shape
#     sparse_args = (coords, data, shape)
    sparse_args = (list(non_zero_row_coords), list(non_zero_col_coords), list(data))
    return sparse_args

def concat(list_a, list_b):
    return list_a + list_b

def combine_sparseArgs(sa_triple_a, sa_triple_b):
#     non_zero_row_coords_a, non_zero_col_coords_a, data_a = sa_triple_a
#     non_zero_row_coords_b, non_zero_col_coords_b, data_b = sa_triple_b
#     non_zero_row_coords = concat(non_zero_row_coords_a, non_zero_row_coords_b)
#     non_zero_col_coords = concat(non_zero_col_coords_a, non_zero_col_coords_b)
#     data = concat(data_a, data_b)
#     new_triple_old = (non_zero_row_coords, non_zero_col_coords, data)
#     new_triple = tuple(starmap(concat, zip(sa_triple_a, sa_triple_b)))
#     print(new_triple_old)
#     print(new_triple)
#     assert new_triple == new_triple_old
    return tuple(starmap(concat, zip(sa_triple_a, sa_triple_b)))

# from itertools import chain

def concat_(lists):
    return list(chain.from_iterable(lists))

def union_sparseArgs(sa_triples):
    return list(map(concat_,
                    zip(*sa_triples)))

def sparseArgs_to_sparse(sparseArgs, shape=my_shape):
    non_zero_row_coords, non_zero_col_coords, data = sparseArgs
    coords = (np.array(non_zero_row_coords), np.array(non_zero_col_coords))
    shape = shape
    return sparse.COO(coords, data, shape)

In [117]:
combine_sparseArgs(pW_v_sparseArgs(0), pW_v_sparseArgs(1))
union_sparseArgs( [pW_v_sparseArgs(0), pW_v_sparseArgs(1), pW_v_sparseArgs(2)] )
sparseArgs_to_sparse(union_sparseArgs( [pW_v_sparseArgs(0), pW_v_sparseArgs(1), pW_v_sparseArgs(2)] ),
                     (len(segmental_wordforms),3))
sparseArgs_to_sparse(union_sparseArgs( [pW_v_sparseArgs(0), pW_v_sparseArgs(1), pW_v_sparseArgs(2)] ),
                     (len(segmental_wordforms),3)).todense()
np.array_equal( sparseArgs_to_sparse(union_sparseArgs( [pW_v_sparseArgs(0), pW_v_sparseArgs(1), pW_v_sparseArgs(2)] ),
                                     (len(segmental_wordforms),3)).todense(),
                np.vstack([pW_v_np(0), pW_v_np(1), pW_v_np(2)]).T)

([0, 1], [0, 1], [1.0, 1.0])

[[0, 1, 2], [0, 1, 2], [1.0, 1.0, 1.0]]

<COO: shape=(113745, 3), dtype=float64, nnz=3, fill_value=0.0>

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

True

In [108]:
my_shape

(113745, 133854)

In [109]:
len(segmental_wordforms)

113745

In [84]:
pW_v_np(0)
pW_v_np(0).nonzero()

array([1., 0., 0., ..., 0., 0., 0.])

(array([0]),)

In [73]:
pW_v_sparseArgs(0)[0]
pW_v_sparseArgs(1)[0]


[0]

[1]

In [74]:
pW_v_np(0)
sparse.COO.from_numpy(pW_v_np(0))
pW_v_sparseArgs(0)
# sparse.COO(*pW_v_sparseArgs(0))
pW_v_sparseArgs(1)
# sparse.COO

array([1., 0., 0., ..., 0., 0., 0.])

<COO: shape=(113745,), dtype=float64, nnz=1, fill_value=0.0>

([0], [0], [1.0])

([1], [1], [1.0])

In [90]:
from itertools import starmap, chain

In [96]:
A = ([0,3], [0,4], [1.0,5.0])
B = ([1], [1], [2.0])
C = ([10], [10], [-3])
A
B
C
list(zip(A,B))
list(starmap(lambda a,b: a + b,
             list(zip(A,B))))
list(map(np.array,
        list(starmap(lambda a,b: a + b,
             list(zip(A,B))))))
' '
list(zip(*[A,B]))
list(map(lambda ls: list(chain.from_iterable(ls)),
         list(zip(*[A,B]))))
list(map(lambda ls: list(chain.from_iterable(ls)),
         list(zip(*[A,B,C]))))

([0, 3], [0, 4], [1.0, 5.0])

([1], [1], [2.0])

([10], [10], [-3])

[([0, 3], [1]), ([0, 4], [1]), ([1.0, 5.0], [2.0])]

[[0, 3, 1], [0, 4, 1], [1.0, 5.0, 2.0]]

[array([0, 3, 1]), array([0, 4, 1]), array([1., 5., 2.])]

' '

[([0, 3], [1]), ([0, 4], [1]), ([1.0, 5.0], [2.0])]

[[0, 3, 1], [0, 4, 1], [1.0, 5.0, 2.0]]

[[0, 3, 1, 10], [0, 4, 1, 10], [1.0, 5.0, 2.0, -3]]

In [75]:
A = np.array([0,0,1]); A
B = np.array([1,0,0]); B
C = np.vstack([A,B]); C


array([0, 0, 1])

array([1, 0, 0])

array([[0, 0, 1],
       [1, 0, 0]])

In [43]:
A.nonzero()
C.nonzero()
list(zip(*C.nonzero()))

(array([2]),)

(array([0, 1]), array([2, 0]))

[(0, 2), (1, 0)]

In [123]:
# #takes ?m on wittgenstein with J=30 and other stuff going on in the background
# # also takes >120GB of memory!!!
# if my_GB < 100:
#     pW_V_np = np.vstack(par(delayed(pW_v_np)(v_idx)
#                             for v_idx in range(len(orthographic_wordforms)))).T
# else:
my_sparseArgs = par(delayed(pW_v_sparseArgs)(v_idx)
                    for v_idx in range(len(orthographic_wordforms)))
pW_V_sparse = sparseArgs_to_sparse(union_sparseArgs(my_sparseArgs))
print('Density: {0}'.format( pW_V_sparse.density ))
print('Size in GB: {0}'.format( pW_V_sparse.nbytes / 1e9 ))
sparse.save_npz(o + '.pW_V', pW_V_sparse)

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0060s.) Setting batch_size=66.
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0699s.) Setting batch_size=376.
[Parallel(n_jobs=30)]: Done 588 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done 1710 tasks      | elapsed:    0.2s
[Parallel(n_jobs=30)]: Done 2832 tasks      | elapsed:    0.3s
[Parallel(n_jobs=30)]: Done 4396 tasks      | elapsed:    0.6s
[Parallel(n_jobs=30)]: Done 11540 tasks      | elapsed:    0.9s
[Parallel(n_jobs=30)]: Done 19436 tasks      | elapsed:    1.4s
[Parallel(n_jobs=30)]: Done 27332 tasks      | elapsed

Density: 8.791595234955383e-06
Size in GB: 0.003212496


In [127]:
exportSeqs(o + '_Orthographic_Wordforms.txt', orthographic_wordforms)
exportSeqs(o + '_Transcriptions.txt', segmental_wordforms)

In [128]:
o

'/home/AD/emeinhar/wr/LTR_CMU_destressed/LTR_CMU_destressed'

In [129]:
listdir(path.dirname(o))

['Making a Transcribed Lexicon Relation - CMU_destressed.ipynb',
 'LTR_CMU_destressed.pW_V.json',
 'LTR_CMU_destressed.tsv',
 '.ipynb_checkpoints',
 'LTR_CMU_destressed.pW_V.npz',
 'cmudict-0.7b_IPA_destressed.tsv',
 'LTR_CMU_destressed_Orthographic_Wordforms.txt',
 'LTR_CMU_destressed_Transcriptions.txt']