In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span><ul class="toc-item"><li><span><a href="#Papermill---command-line" data-toc-modified-id="Papermill---command-line-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Papermill - command line</a></span></li><li><span><a href="#Old-School" data-toc-modified-id="Old-School-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Old School</a></span></li></ul></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Import-data" data-toc-modified-id="Import-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Import data</a></span><ul class="toc-item"><li><span><a href="#Import-gating-data" data-toc-modified-id="Import-gating-data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Import gating data</a></span></li><li><span><a href="#Import-lexicon" data-toc-modified-id="Import-lexicon-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Import lexicon</a></span></li></ul></li><li><span><a href="#Compare-inventories" data-toc-modified-id="Compare-inventories-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Compare inventories</a></span></li><li><span><a href="#Projection-function:-gating-data" data-toc-modified-id="Projection-function:-gating-data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Projection function: gating data</a></span></li><li><span><a href="#Projection-function:-transcribed-lexicon" data-toc-modified-id="Projection-function:-transcribed-lexicon-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Projection function: transcribed lexicon</a></span></li><li><span><a href="#Export-projections" data-toc-modified-id="Export-projections-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Export projections</a></span></li></ul></div>

In [2]:
from os import getcwd, chdir, listdir, path

import json
import csv

# Overview

## Motivation

Let 
 - $g$ be a gating data file
 - $l$ be a transcribed lexicon relation file

This notebook will import both, calculate the segmental inventories $\Sigma_g$, $\Sigma_l$ used in each, and 
 - calculate the portion $\overline{g}$ of $g$'s inventory $\Sigma_g$ unique to $g$ relative to $l$
 - calculate the portion $\overline{l}$ of $l$'s inventory $\Sigma_l$ unique to $l$ relative to $g$

*Your job* is to then define dictionaries (for export) for mapping 
 - $\overline{g} \rightarrow \Sigma_g \cap \Sigma_l$
 - $\overline{l} \rightarrow \Sigma_g \cap \Sigma_l$

These dictionaries will then be used by a notebook for creating 
- a version of the gating data in $g$ aligned with $l$
- a version of the transcribed lexicon relation in $l$ aligned with $g$

(To be abundantly clear: each such dictionary is a projection function on the underlying file's inventory $\Sigma$ and aligned strings are created by applying the relevant projection function.)

## Usage

### Papermill - command line

This notebook is intended to be used with the [`papermill`](https://papermill.readthedocs.io/en/latest/) package.

**Example:**

If `g` = `AmE-diphones-IPA-annotated-columns.csv` and `l` = `LTR_newdic.tsv`, then at the command line
```
papermill "Gating Data - Transcription Lexicon Alignment Maker.ipynb" "AmE-diphones - LTR_newdic alignment.ipynb" -p g AmE-diphones-IPA-annotated-columns.csv -p l LTR_newdic.tsv
```
will create a new notebook `AmE-diphones - LTR_newdic alignment.ipynb` ready for you to run and define an alignment between `g` and `l`.

If you want to create/work with the stressed inventory of the gating data file, add `-p s stressed`.

### Old School

If you don't have or want to use this notebook as intended, edit the filenames/paths in the cell below with the top comment #"PARAMETERS CELL".

# Parameters

In [3]:
# PARAMETERS CELL
#
# This is the Paremeters Cell that Papermill looks at and modifies
# 
# go to View->Cell Toolbar->Tags to see what's going on

g = ""
# g = "AmE-diphones-IPA-annotated-columns.csv"

l = ""
# l = "./LTR_newdic/LTR_newdic.tsv"

s = ""
# s = 'destressed'
# s = 'stressed'

In [4]:
# Parameters
g = "GD_AmE_destressed_aligned_w_LTR_Buckeye/AmE-diphones-IPA-annotated-columns.csv"
l = "LTR_Buckeye_aligned_w_GD_AmE_destressed/LTR_buckeye.tsv"


In [5]:
getcwd()

'/mnt/cube/home/AD/emeinhar/wr'

In [6]:
listdir()

['boilerplate.py',
 'LTR_Buckeye',
 '.gitignore',
 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 '__pycache__',
 'AmE-diphones - LTR_Buckeye alignment.ipynb',
 'LTR_CMU_destressed',
 'AmE-diphones - LTR_CMU_destressed alignment.ipynb',
 'Gating Data - Transcription Lexicon Alignment Maker.ipynb',
 'old',
 'alignment_paths_and_cmds.sh',
 '.ipynb_checkpoints',
 'GD_AmE_destressed_aligned_w_LTR_newdic_destressed',
 'string_utils.py',
 'GD_AmE_destressed_aligned_w_LTR_CMU_destressed',
 'LTR_CMU_stressed',
 'AmE-diphones - LTR_newdic_destressed alignment.ipynb',
 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed',
 '.git',
 'LTR_newdic_destressed',
 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed']

In [7]:
# diphoneDataInFilename = g
diphoneDataInFilepath = g
diphoneDataInFilename = path.basename(diphoneDataInFilepath)
diphoneDataInDirname = path.dirname(diphoneDataInFilepath)
diphoneDataInDirname
diphoneDataInFilename

print(' ')

# lexiconDataInFilename = l
lexiconDataInFilepath = l
lexiconDataInFilename = path.basename(lexiconDataInFilepath)
lexiconDataInDirname = path.dirname(lexiconDataInFilepath)
lexiconDataInDirname
lexiconDataInFilename

print(' ')
if s == '':
    s = 'destressed'

'GD_AmE_destressed_aligned_w_LTR_Buckeye'

'AmE-diphones-IPA-annotated-columns.csv'

 


'LTR_Buckeye_aligned_w_GD_AmE_destressed'

'LTR_buckeye.tsv'

 


# Import data

In [8]:
import csv

In [9]:
from boilerplate import *

In [10]:
from string_utils import *

## Import gating data

In [11]:
def getDiphoneGatingTrials(filename, print_fields = True):
    '''
    Opens filename in the current working directory and returns the trials as a 
    list of dictionaries, plus the fieldnames in the order present in the file.
    '''
    diphone_fields = []
    diphoneTrials = []
    diphoneDataInFilename = filename
    with open(diphoneDataInFilename, newline='') as csvfile:
        my_reader = csv.DictReader(csvfile, delimiter='\t')
        diphone_fields = my_reader.fieldnames
        if print_fields:
            print("fieldnames: {0}".format(diphone_fields))
        for row in my_reader:
            #print(row)
            diphoneTrials.append(row)
    return {'trials': diphoneTrials, 'fields':diphone_fields}

def writeProcessedDataToCSV(theTrials, theFieldnames, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter='\t',fieldnames=theFieldnames)
        writer.writeheader()
        writer.writerows(theTrials)

In [12]:
def getDestressedDiphone(row):
    return row['diphoneInSeg']

def getStressedDiphone(row):
    return row['diphoneInWStress']

In [13]:
sound_fields = ['Prec_context', 'CorrAns1', 'CorrAns2', 'Resp1', 'Resp2',
                'diphoneInSeg', 'diphoneInWStress', 'diphoneOutSeg',
                'prefixSeg', 'prefixWStress',
                'suffixSeg', 'suffixWStress',
                'stimulusSeg', 'stimulusWProsody']
diphone_fields = ['CorrAns1', 'CorrAns2', 'Resp1', 'Resp2',
                  'diphoneInSeg', 'diphoneInWStress', 'diphoneOutSeg']
#                 'stimulusSeg', 'stimulusWProsody']

def getSoundFields(row):
    return project_dict(row, sound_fields)

def getDiphoneFields(row, include_full_stimulus_column = False):
    if not include_full_stimulus_column:
        return project_dict(row, diphone_fields)
    return project_dict(row, diphone_fields + ['stimulusSeg', 'stimulusWProsody'])

core_sound_fields = ['Prec_context', 'CorrAns1', 'CorrAns2', 'Resp1', 'Resp2']

def getSounds(row):
    return set(project_dict(row, core_sound_fields).values())

In [14]:
def getStimSeg1(row, which_stress):
    seg = row['CorrAns1']
    if which_stress == 'destressed':
        return seg
    elif which_stress == 'stressed':
        s = row['seg1_stress']
        if s == '2' or s == 2:
            return seg
        else:
            return seg + str(s)
    else:
        assert which_stress in ['destressed', 'stressed'], '{0} is an invalid choice about stress representations'.format(which_stress)

def getStimSeg2(row, which_stress):
    seg = row['CorrAns2']
    if which_stress == 'destressed':
        return seg
    elif which_stress == 'stressed':
        s = row['seg2_stress']
        if s == '2' or s == 2:
            return seg
        else:
            return seg + str(s)
    else:
        assert which_stress in ['destressed', 'stressed'], '{0} is an invalid choice about stress representations'.format(which_stress)
        
def removeConsStress(stringRep):
    return ''.join([c for c in stringRep if c != "2"])

def removeStress(stringRep):
    return ''.join([c for c in stringRep if c != "0" and c != "1" and c != "2"])

def replaceSyllableBoundaries(stringRep):
    return stringRep.replace('-','.')

def justSegments(stringRep):
    return replaceSyllableBoundaries(removeStress(stringRep))

def getDiphonesInAsStr(row, which_stress):
    if which_stress == 'destressed':
        return row['diphoneInSeg']
    elif which_stress == 'stressed': 
        #we remove consonant stress annotations because there are none in IPhOD (and probably none in Hammond's newdic, either)
        assert removeStress(row['diphoneInWStress']) == row['diphoneInSeg'], '{0} and {1} have segmental mismatch'.format(row['diphoneIn'], row['diphoneInWStress'])
        return removeConsStress(row['diphoneInWStress'])
    else:
        assert which_stress in ['destressed', 'stressed'], '{0} is an invalid choice about stress representations'.format(which_stress)
        
def getDiphonesOutAsStr(row):
    return row['diphoneOutSeg']

In [15]:
def mergeXintoY(sound_x,sound_y,the_dict, exact_match = True):
    '''
    Replace every instance of sound X with one of sound Y 
    in all sound fields of the_dict.
    
    If exact_match is True, then a field's value must be exactly
    and entirely equal to sound_X; otherwise, this function will
    substitute any instance (substring) of sound_X in the sound
    fields of the_dict.
    '''
    for key in the_dict.keys():
        if exact_match:
            if sound_x == the_dict[key] and key in sound_fields:
#                 if key != 'Prec_context':
#                     print("{2}:{0}⟶{1}.".format(the_dict[key], sound_y, key))
                the_dict.update({key: sound_y})
        else: #use carefully...
            if sound_x in the_dict[key] and key in sound_fields:
                old_str = the_dict[key]
                new_str = old_str.replace(sound_x, sound_y)
#                 if key != 'Prec_context':
#                     print("{2}:{0}⟶{1}.".format(old_str, new_str, key))
                the_dict.update({key: new_str})
    return the_dict

In [16]:
%pwd

'/mnt/cube/home/AD/emeinhar/wr'

In [17]:
%ls diphones-*

ls: cannot access 'diphones-*': No such file or directory


In [18]:
# diphoneDataInFilename = "diphones-IPA-annotated-columns.csv"

file_data = getDiphoneGatingTrials(diphoneDataInFilepath)
rows_in = file_data['trials']
the_fields = file_data['fields']

fieldnames: ['Subject', 'Diph_num', 'Diph_name', 'Sylltype', 'SoundFile', 'Prec_context', 'gate', 'four_gate', 'seg1_stress', 'seg2_stress', 'CorrAns1', 'CorrAns2', 'Resp1', 'Resp2', 'Seg1Accur', 'Seg2Accur', 'Prec_context_binary', 'wrong_preccontext', 'replacedSeg1Data', 'replacedSeg2Data', 'diphoneInWStress', 'diphoneInSeg', 'diphoneOutSeg', 'stimulusWProsody', 'stimulusSeg', 'prefixWStress', 'prefixSeg', 'suffixWStress', 'suffixSeg']


## Import lexicon

In [19]:
lexicon_in = []
with open(lexiconDataInFilepath, 'r', newline='', encoding='utf-8') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    for row in my_reader:
        #print(row)
        lexicon_in.append(row)

len(lexicon_in)
lexicon_in[0].keys()
lexicon_in[0]

216062

odict_keys(['Orthographic_Wordform', 'Transcription'])

OrderedDict([('Orthographic_Wordform', 'i'), ('Transcription', 'aɪ')])

In [20]:
lexicon_in[0]

OrderedDict([('Orthographic_Wordform', 'i'), ('Transcription', 'aɪ')])

# Compare inventories

In [21]:
# GD_inventory = union(list(map(getSounds, rows_in)))
if s == 'destressed':
    GD_inventory = lexiconToInventory(list(map(getDestressedDiphone,
                                               rows_in)))
if s == 'stressed':
    GD_inventory = lexiconToInventory(list(map(lambda r: removeConsStress(getStressedDiphone(r)),
                                           rows_in)))
if s == '':
    raise Exception("Must choose either s = 'stressed' or s = 'destressed'")
len(GD_inventory)
GD_inventory

41

{'aɪ',
 'aʊ',
 'b',
 'd',
 'dʒ',
 'eɪ',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'l̩',
 'm',
 'n',
 'oʊ',
 'p',
 's',
 't',
 'tʃ',
 'u',
 'v',
 'w',
 'z',
 'æ',
 'ð',
 'ŋ',
 'ɑ',
 'ɔɪ',
 'ə',
 'ɚ',
 'ɛ',
 'ɪ',
 'ɹ',
 'ɾ',
 'ʃ',
 'ʊ',
 'ʌ',
 'ʒ',
 'θ'}

In [22]:
transcriptions = list(map(lambda row: row['Transcription'], lexicon_in))
LTR_inventory = lexiconToInventory(transcriptions)
len(LTR_inventory)
LTR_inventory

41

{'aɪ',
 'aʊ',
 'b',
 'd',
 'dʒ',
 'eɪ',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'l̩',
 'm',
 'm̩',
 'n',
 'n̩',
 'oʊ',
 'p',
 's',
 't',
 'tʃ',
 'u',
 'v',
 'w',
 'z',
 'æ',
 'ð',
 'ŋ',
 'ɑ',
 'ɔɪ',
 'ɚ',
 'ɛ',
 'ɪ',
 'ɹ',
 'ʃ',
 'ʊ',
 'ʌ',
 'ʒ',
 'θ'}

In [23]:
unique_to_GD = GD_inventory - LTR_inventory
len(unique_to_GD)
unique_to_GD

2

{'ə', 'ɾ'}

In [24]:
unique_to_LTR = LTR_inventory - GD_inventory
len(unique_to_LTR)
unique_to_LTR

2

{'m̩', 'n̩'}

# Projection function: gating data

In [25]:
unique_to_GD

{'ə', 'ɾ'}

In [26]:
def makeIdentityDict(keys):
    return {k:k for k in keys}

In [27]:
projection_GD = dict()

In [28]:
# projection_GD.update({' ':' '})
# projection_GD.update({'ɑb':'ɑb'})
# projection_GD.update({'ɾ':'ɾ'})

# makeIdentityDict(unique_to_GD)
# projection_GD.update( makeIdentityDict(unique_to_GD) )

In [29]:
projection_GD

{}

# Projection function: transcribed lexicon

In [30]:
unique_to_LTR

{'m̩', 'n̩'}

In [31]:
projection_LTR = dict()

In [35]:
# projection_LTR.update({'ɔ':'ɑ'})
projection_LTR.update({'ṃ':'m'})
projection_LTR.update({'ṇ':'n'})

In [36]:
projection_LTR

{'ṃ': 'm', 'ṇ': 'n'}

# Export projections

In [37]:
export = True

In [38]:
getcwd()

'/mnt/cube/home/AD/emeinhar/wr'

In [39]:
projection_GD_fn = 'alignment_of_' + diphoneDataInFilename[:-4] + '_w_' + lexiconDataInFilename[:-4] + '.json'
projection_GD_fn

projection_GD_fp = path.join(diphoneDataInDirname, projection_GD_fn)
projection_GD_fp

projection_GD

'alignment_of_AmE-diphones-IPA-annotated-columns_w_LTR_buckeye.json'

'GD_AmE_destressed_aligned_w_LTR_Buckeye/alignment_of_AmE-diphones-IPA-annotated-columns_w_LTR_buckeye.json'

{}

In [40]:
projection_LTR_fn = 'alignment_of_' + lexiconDataInFilename[:-4] + '_w_' + diphoneDataInFilename[:-4] + '.json'
projection_LTR_fn

projection_LTR_fp = path.join(lexiconDataInDirname, projection_LTR_fn)
projection_LTR_fp

projection_LTR

'alignment_of_LTR_buckeye_w_AmE-diphones-IPA-annotated-columns.json'

'LTR_Buckeye_aligned_w_GD_AmE_destressed/alignment_of_LTR_buckeye_w_AmE-diphones-IPA-annotated-columns.json'

{'ṃ': 'm', 'ṇ': 'n'}

In [41]:
if export:
    with codecs.open(projection_GD_fp, 'w', encoding='utf-8') as f:
            json.dump(projection_GD, f, ensure_ascii = False, indent = 4)

In [42]:
if export:
    with codecs.open(projection_LTR_fp, 'w', encoding='utf-8') as f:
            json.dump(projection_LTR, f, ensure_ascii = False, indent = 4)

In [43]:
listdir()

['boilerplate.py',
 'LTR_Buckeye',
 '.gitignore',
 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 '__pycache__',
 'AmE-diphones - LTR_Buckeye alignment.ipynb',
 'LTR_CMU_destressed',
 'AmE-diphones - LTR_CMU_destressed alignment.ipynb',
 'Gating Data - Transcription Lexicon Alignment Maker.ipynb',
 'old',
 'alignment_paths_and_cmds.sh',
 '.ipynb_checkpoints',
 'GD_AmE_destressed_aligned_w_LTR_newdic_destressed',
 'string_utils.py',
 'GD_AmE_destressed_aligned_w_LTR_CMU_destressed',
 'LTR_CMU_stressed',
 'AmE-diphones - LTR_newdic_destressed alignment.ipynb',
 'LTR_newdic_destressed_aligned_w_GD_AmE_destressed',
 '.git',
 'LTR_newdic_destressed',
 'LTR_CMU_destressed_aligned_w_GD_AmE_destressed']