In [1]:
# Display the value of every top-level expression in a cell, not just the last one.
# Later cells rely on this to show intermediate values (e.g. a bare `len(...)` that is
# not the final statement of its cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview-and-requirements" data-toc-modified-id="Overview-and-requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview and requirements</a></span><ul class="toc-item"><li><span><a href="#Usage" data-toc-modified-id="Usage-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Usage</a></span><ul class="toc-item"><li><span><a href="#Papermill---command-line" data-toc-modified-id="Papermill---command-line-1.1.1"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>Papermill - command line</a></span></li><li><span><a href="#Old-School" data-toc-modified-id="Old-School-1.1.2"><span class="toc-item-num">1.1.2&nbsp;&nbsp;</span>Old School</a></span></li></ul></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Filter-the-lexicon" data-toc-modified-id="Filter-the-lexicon-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Filter the lexicon</a></span></li><li><span><a href="#Export-the-lexicon" data-toc-modified-id="Export-the-lexicon-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export the lexicon</a></span></li></ul></div>

# Overview and requirements

This notebook is intended to take 
 - a transcribed lexicon relation `.tsv`
 - a triphone channel distribution `.json` file 
 
and produce 
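 - a new version of the transcribed lexicon relation that only contains entries that can be aligned with the triphone channel distribution, i.e. entries whose boundary-padded transcription consists only of triphones that are keys of the channel distribution.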

## Usage

### Papermill - command line

This notebook is intended to be used with the [`papermill`](https://papermill.readthedocs.io/en/latest/) package.

**Example:**

```
papermill "Filter transcription lexicon by channel model.ipynb" "Filter LTR_Buckeye against channel model.ipynb" -p l "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv" -p c "CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/pY1X0X1X2.json" -p o "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv"
```
will 
 - create a new notebook `Filter LTR_Buckeye against channel model.ipynb`

...and output `LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv`.

### Old School

If you don't have `papermill` or don't want to use this notebook through it, edit the filenames/paths in the cell below whose top comment is `# Parameters`.

# Parameters

In [4]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [2]:
# Parameters
# These are the values papermill overrides with `-p l ...`, `-p c ...` and `-p o ...`;
# edit them here to run the notebook by hand.

# l: path of the input transcribed lexicon relation (`.tsv`), read in "Load data".
# l = ''
l = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_w_GD_AmE-diphones.tsv'
 
# c: path of the input triphone channel distribution (`.json`).
# c = ''
c = 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0.01/pY1X0X1X2.json'

# o: path the filtered lexicon (`.tsv`) is written to in "Export the lexicon".
# o = ''
o = 'LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_Buckeye_aligned_CM_filtered.tsv'

# Load data

In [3]:
import csv

In [14]:
# Read the lexicon TSV named by parameter `l` into a list of row mappings
# (one mapping per data row, keyed by the header row's column names).
#  - encoding='utf-8': the transcriptions contain IPA characters, so decoding must not
#    depend on the platform's default locale encoding.
#  - newline='': the csv module asks for this so the reader does its own line-ending handling.
#  - QUOTE_NONE: no quote processing is applied to the tab-separated fields.
with open(l, encoding='utf-8', newline='') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    lexicon_rows_in = list(my_reader)

# Preview the first few rows.
lexicon_rows_in[:5]

[OrderedDict([('Orthographic_Wordform', 'i'), ('Transcription', 'aɪ')]),
 OrderedDict([('Orthographic_Wordform', 'uh'), ('Transcription', 'ʌ')]),
 OrderedDict([('Orthographic_Wordform', 'grew'), ('Transcription', 'g.ɹ.u')]),
 OrderedDict([('Orthographic_Wordform', 'up'), ('Transcription', 'ʌ.p')]),
 OrderedDict([('Orthographic_Wordform', 'in'), ('Transcription', 'ɪ.n')])]

In [10]:
from probdist import *

In [11]:
# Load the triphone channel distribution from the `.json` path in parameter `c`.
# NOTE(review): `importProbDist` is not defined in this notebook; presumably it is provided
# by the `from probdist import *` cell — confirm. Later cells call `len()` and `.keys()` on the result.
channel_model = importProbDist(c)

In [12]:
# Number of entries in the channel model (shown as cell output).
len(channel_model)
# Collect the channel model's keys into a set; the filtering step below tests each
# lexicon entry's triphones for membership in this set.
stimuli_triphones = set(channel_model.keys())

46860

# Filter the lexicon

In [25]:
def padInputSequenceWithBoundaries(inputSeq):
    """Return ``inputSeq`` with the edge symbols added at both ends.

    The dotted string is split into its symbols with ``dottedStringToTuple``,
    ``leftEdge`` is placed in front and ``rightEdge`` behind, and the padded
    tuple is turned back into a dotted string with ``tupleToDottedString``.
    """
    symbols = dottedStringToTuple(inputSeq)
    padded = (leftEdge, *symbols, rightEdge)
    return tupleToDottedString(padded)

def trimBoundariesFromSequence(seq):
    """Return ``seq`` without a leading ``leftEdge`` and without a trailing ``rightEdge``.

    The dotted string is split with ``dottedStringToTuple``; the first symbol is
    dropped if it equals ``leftEdge``, the last symbol is dropped if it equals
    ``rightEdge``, and the remainder is re-joined with ``tupleToDottedString``.
    A sequence that carries neither edge symbol is returned unchanged in content.
    """
    symbols = list(dottedStringToTuple(seq))
    # Guard both checks: the sequence may be empty to begin with, or become empty
    # once a lone left edge has been removed, and indexing an empty list raises
    # IndexError.
    # NOTE(review): an empty remainder is passed on to tupleToDottedString as an
    # empty tuple — confirm that helper accepts it.
    if symbols and symbols[0] == leftEdge:
        symbols = symbols[1:]
    if symbols and symbols[-1] == rightEdge:
        symbols = symbols[:-1]
    return tupleToDottedString(tuple(symbols))

In [26]:
def rowToTriphones(row):
    """Return the three-factors of a lexicon row's boundary-padded transcription.

    ``row`` is a mapping with a 'Transcription' entry holding a dotted string.
    The transcription is padded with the edge symbols and handed to
    ``dsTo3factors``; in this notebook's recorded output the result is a set of
    dotted triphone strings such as '⋊.ʌ.p'.
    """
    padded_transcription = padInputSequenceWithBoundaries(row['Transcription'])
    return dsTo3factors(padded_transcription)

In [27]:
# Sanity check: show the triphones computed for the first ten lexicon rows.
[rowToTriphones(row) for row in lexicon_rows_in[:10]]

[{'⋊.aɪ.⋉'},
 {'⋊.ʌ.⋉'},
 {'g.ɹ.u', 'ɹ.u.⋉', '⋊.g.ɹ'},
 {'ʌ.p.⋉', '⋊.ʌ.p'},
 {'ɪ.n.⋉', '⋊.ɪ.n'},
 {'ɪ.z.⋉', '⋊.ɪ.z'},
 {'ð.ɪ.s', 'ɪ.s.⋉', '⋊.ð.ɪ'},
 {'ɪ.z.⋉', '⋊.ɪ.z'},
 {'ð.ɪ.s', 'ɪ.s.⋉', '⋊.ð.ɪ'},
 {'b.ɪ.k', 'k.ʌ.z', 'ɪ.k.ʌ', 'ʌ.z.⋉', '⋊.b.ɪ'}]

In [23]:
def modelableEntry(row):
    """Return True iff every triphone of ``row`` is in ``stimuli_triphones``.

    ``row`` is a lexicon row as accepted by ``rowToTriphones``. A row whose
    triphone collection is empty counts as modelable, since ``all`` of an
    empty iterable is True.
    """
    # A generator (rather than a list) lets `all` stop at the first triphone
    # that is not covered by the channel model.
    return all(factor in stimuli_triphones for factor in rowToTriphones(row))

In [29]:
# Keep only the rows whose triphones are all covered by the channel model.
lexicon_out = list(filter(modelableEntry, lexicon_rows_in))

# Summary of how much of the lexicon the filter removed.
n_in = len(lexicon_rows_in)
n_out = len(lexicon_out)
n_removed = n_in - n_out
# An empty input lexicon would otherwise raise ZeroDivisionError here.
pct_removed = n_removed / n_in * 100.0 if n_in else 0.0

print('|Lexicon in| = {0}'.format(n_in))
# The original format string ended in a stray '|', which showed up in the printed line.
print('|Lexicon out| = {0}'.format(n_out))
print('|words| removed = {0}'.format(n_removed))
print('% words removed = {0}'.format(pct_removed))

|Lexicon in| = 216062
|Lexicon out| = 205173|
|words| removed = 10889
% words removed = 5.039757106756395


# Export the lexicon

In [None]:
# Write the filtered lexicon to the path in parameter `o` as a tab-separated file
# with a header row.

# Create the output directory if it is missing; a bare filename has no directory part,
# and makedirs('') would raise, hence the check.
out_dir = path.dirname(o)
if out_dir:
    makedirs(out_dir, exist_ok=True)

#  - encoding='utf-8': the transcriptions contain IPA characters, so encoding must not
#    depend on the platform's default locale encoding.
#  - newline='': the form the csv module asks for; like the previous newline='\n' it
#    disables newline translation, so the bytes written are unchanged.
with open(o, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['Orthographic_Wordform', 'Transcription'], delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')

    writer.writeheader()
    writer.writerows(lexicon_out)