In [1]:
# Display the value of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Notebook author:** emeinhardt@ucsd.edu

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Requirements" data-toc-modified-id="Requirements-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Requirements</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span><ul class="toc-item"><li><span><a href="#Papermill---command-line" data-toc-modified-id="Papermill---command-line-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Papermill - command line</a></span></li><li><span><a href="#Old-School" data-toc-modified-id="Old-School-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Old School</a></span></li></ul></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports</a></span><ul class="toc-item"><li><span><a href="#Accessing-and-manipulating-gating-data-fields" data-toc-modified-id="Accessing-and-manipulating-gating-data-fields-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Accessing and manipulating gating data fields</a></span></li><li><span><a href="#Calculating,-exporting-and-importing-licit-and-illicit-n-phones" data-toc-modified-id="Calculating,-exporting-and-importing-licit-and-illicit-n-phones-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Calculating, exporting and importing licit and illicit n-phones</a></span></li></ul></li><li><span><a href="#Calculate,-export,-import-licit-and-illicit-uniphones,-diphones,-and-(constructible)-triphones" data-toc-modified-id="Calculate,-export,-import-licit-and-illicit-uniphones,-diphones,-and-(constructible)-triphones-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Calculate, 
export, import licit and illicit uniphones, diphones, and (constructible) triphones</a></span><ul class="toc-item"><li><span><a href="#Import-data" data-toc-modified-id="Import-data-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Uniphones" data-toc-modified-id="Uniphones-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Uniphones</a></span></li><li><span><a href="#Diphones" data-toc-modified-id="Diphones-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Diphones</a></span></li><li><span><a href="#Triphones" data-toc-modified-id="Triphones-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Triphones</a></span></li></ul></li></ul></div>

# Overview

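This notebook calculates, from the provided gating data, (i) the stressed and destressed stimuli uniphones and diphones, (ii) the destressed response uniphones and diphones, and (iii) the stressed and destressed stimuli triphones and destressed response triphones that can be constructed by stitching overlapping diphones together. For diphones and triphones it also calculates the corresponding "illegal" sets, i.e. the sequences over the same uniphone inventory that are not among the licit ones. Each set of $n$-phones is exported in its own `.txt` file, for a total of 15 `.txt` output files (3 uniphone, 6 diphone, and 6 triphone files).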

## Requirements

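Apart from `papermill` (see immediately below), the notebook does not depend on any third-party packages. It does import the project-local modules `boilerplate` and `string_utils`, which must be importable when the notebook runs.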

## Usage

### Papermill - command line

This notebook is intended to be used with the [`papermill`](https://papermill.readthedocs.io/en/latest/) package.

**Example:**

```
papermill "Run n-phone analysis of gating data.ipynb" "./GD_AmE/GD_AmE-diphones n-phone analysis.ipynb" -p g "./GD_AmE/AmE-diphones-IPA-annotated-columns.csv" -p o "./GD_AmE"
```
will create a new notebook `GD_AmE-diphones n-phone analysis.ipynb` that records the data processing, and will write all 15 `.txt` output files to the directory `./GD_AmE`. If it runs successfully, no action or intervention is required from you.

### Old School

If you don't have `papermill`, or don't want to use this notebook as intended (i.e. via `papermill`), edit the filenames/paths in the cell below whose top comment is `#PARAMETERS`.

# Parameters

In [11]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [13]:
#PARAMETERS
#
# This is the Parameters cell that papermill looks at and modifies
# 
# go to View->Cell Toolbar->Tags to see the cell "parameters" tag

# g: path of the gating data file that is read in the "Import data" section
g = ""
# g = "./GD_AmE/AmE-diphones-IPA-annotated-columns.csv"

# o: directory that the n-phone .txt files are written to and read back from
o = ""
# o = "./GD_AmE"
# o = "./GD_AmE_destressed_aligned_w_LTR_Buckeye/"

# Imports

In [2]:
from boilerplate import *

In [3]:
from string_utils import *

In [4]:
import csv

In [5]:
def getDiphoneGatingTrials(filename, print_fields = True):
    '''
    Read a tab-delimited gating data file and return its trials and field names.

    Parameters
    ----------
    filename : path of the file to read; a relative path is resolved against
        the current working directory.
    print_fields : if True, print the field names found in the header row.

    Returns
    -------
    A dictionary with two keys:
        'trials' : a list holding one dictionary per data row, keyed by field name
        'fields' : the field names in the order present in the file

    The file is decoded as UTF-8 (the encoding writeProcessedDataToCSV writes),
    so IPA symbols are read correctly whatever the platform's default encoding is.
    '''
    # newline='' lets the csv module do its own newline handling.
    with open(filename, newline='', encoding='utf-8') as csvfile:
        my_reader = csv.DictReader(csvfile, delimiter='\t')
        fields = my_reader.fieldnames
        if print_fields:
            print("fieldnames: {0}".format(fields))
        trials = list(my_reader)
    return {'trials': trials, 'fields': fields}

def writeProcessedDataToCSV(theTrials, theFieldnames, filename):
    '''
    Write theTrials to filename as a UTF-8 encoded, tab-delimited file.

    theTrials is an iterable of dictionaries (one per row) and theFieldnames
    gives the columns and their order. A header row is written first.
    '''
    with open(filename, mode='w', newline='', encoding='utf-8') as out_file:
        tsv_writer = csv.DictWriter(out_file, fieldnames=theFieldnames, delimiter='\t')
        tsv_writer.writeheader()
        tsv_writer.writerows(theTrials)

## Accessing and manipulating gating data fields

In [6]:
def project_dict(the_dict, keys_to_keep):
    '''
    Return a new dictionary holding only the entries of the_dict whose key
    is in keys_to_keep. Members of keys_to_keep that are not keys of the_dict
    are ignored; the_dict itself is left untouched.
    '''
    return {key: value for key, value in the_dict.items() if key in keys_to_keep}
project_dict({'Name':'Joe','ID':123,'Job':'clerk'},['Job','ID'])


{'ID': 123, 'Job': 'clerk'}

In [7]:
# Keys of a trial row that getSoundFields projects out; mergeXintoY only
# rewrites values stored under these keys.
sound_fields = ['Prec_context', 'CorrAns1', 'CorrAns2', 'Resp1', 'Resp2',
                'diphoneInSeg', 'diphoneInWStress', 'diphoneOutSeg',
                'prefixSeg', 'prefixWStress',
                'suffixSeg', 'suffixWStress',
                'stimulusSeg', 'stimulusWProsody']
# Keys that getDiphoneFields projects out by default; the two full-stimulus
# columns are appended there on request rather than listed here.
diphone_fields = ['CorrAns1', 'CorrAns2', 'Resp1', 'Resp2',
                  'diphoneInSeg', 'diphoneInWStress', 'diphoneOutSeg']
#                 'stimulusSeg', 'stimulusWProsody']

def getSoundFields(row):
    '''Return a new dictionary with just the entries of row whose key is in sound_fields.'''
    sound_entries = project_dict(row, sound_fields)
    return sound_entries

def getDiphoneFields(row, include_full_stimulus_column = False):
    '''
    Return a new dictionary with just the entries of row whose key is in
    diphone_fields. When include_full_stimulus_column is true, the
    'stimulusSeg' and 'stimulusWProsody' entries are kept as well.
    '''
    if include_full_stimulus_column:
        wanted_fields = diphone_fields + ['stimulusSeg', 'stimulusWProsody']
    else:
        wanted_fields = diphone_fields
    return project_dict(row, wanted_fields)

# Keys whose values getSounds collects.
core_sound_fields = ['Prec_context', 'CorrAns1', 'CorrAns2', 'Resp1', 'Resp2']

def getSounds(row):
    '''Return the set of distinct values stored in row under the core_sound_fields keys.'''
    core_values = project_dict(row, core_sound_fields).values()
    return set(core_values)

In [8]:
def getStimSeg1(row, which_stress):
    '''
    Return the first stimulus segment of a trial row, i.e. row['CorrAns1'].

    which_stress selects the representation:
        'destressed' : the bare segment
        'stressed'   : the segment followed by row['seg1_stress'], except
                       that a stress value of 2 (as int or as str) yields
                       the bare segment

    Any other value of which_stress fails an assertion.
    '''
    segment = row['CorrAns1']
    if which_stress == 'destressed':
        return segment
    if which_stress != 'stressed':
        assert which_stress in ['destressed', 'stressed'], '{0} is an invalid choice about stress representations'.format(which_stress)
        return None
    stress = row['seg1_stress']
    is_unmarked = stress == '2' or stress == 2
    return segment if is_unmarked else segment + str(stress)

def getStimSeg2(row, which_stress):
    '''
    Return the second stimulus segment of a trial row, i.e. row['CorrAns2'].

    which_stress selects the representation:
        'destressed' : the bare segment
        'stressed'   : the segment followed by row['seg2_stress'], except
                       that a stress value of 2 (as int or as str) yields
                       the bare segment

    Any other value of which_stress fails an assertion.
    '''
    segment = row['CorrAns2']
    if which_stress == 'destressed':
        return segment
    if which_stress != 'stressed':
        assert which_stress in ['destressed', 'stressed'], '{0} is an invalid choice about stress representations'.format(which_stress)
        return None
    stress = row['seg2_stress']
    is_unmarked = stress == '2' or stress == 2
    return segment if is_unmarked else segment + str(stress)
        
def removeConsStress(stringRep):
    '''Return stringRep with every "2" character dropped.'''
    kept = filter(lambda ch: ch != "2", stringRep)
    return ''.join(kept)

def removeStress(stringRep):
    '''Return stringRep with every "0", "1" and "2" character dropped.'''
    stress_marks = ("0", "1", "2")
    return ''.join(ch for ch in stringRep if ch not in stress_marks)

def replaceSyllableBoundaries(stringRep):
    '''Return stringRep with every '-' replaced by '.'.'''
    pieces = stringRep.split('-')
    return '.'.join(pieces)

def justSegments(stringRep):
    '''
    Return stringRep with stress digits removed (via removeStress) and
    every '-' then replaced by '.' (via replaceSyllableBoundaries).
    '''
    without_stress = removeStress(stringRep)
    return replaceSyllableBoundaries(without_stress)

def getDiphonesInAsStr(row, which_stress):
    '''
    Return the 'diphoneIn' value of a trial row as a string.

    which_stress selects the representation:
        'destressed' : row['diphoneInSeg']
        'stressed'   : row['diphoneInWStress'] with every "2" removed
                       (via removeConsStress)

    Raises AssertionError if which_stress is neither of those, or if, in the
    'stressed' case, row['diphoneInWStress'] stripped of stress digits does
    not equal row['diphoneInSeg'].
    '''
    if which_stress == 'destressed':
        return row['diphoneInSeg']
    elif which_stress == 'stressed': 
        #we remove consonant stress annotations because there are none in IPhOD (and probably none in Hammond's newdic, either)
        # The failure message reports the two columns actually being compared;
        # rows have no 'diphoneIn' key, so formatting that key would raise
        # KeyError and hide the mismatch.
        assert removeStress(row['diphoneInWStress']) == row['diphoneInSeg'], '{0} and {1} have segmental mismatch'.format(row['diphoneInSeg'], row['diphoneInWStress'])
        return removeConsStress(row['diphoneInWStress'])
    else:
        assert which_stress in ['destressed', 'stressed'], '{0} is an invalid choice about stress representations'.format(which_stress)
        
def getDiphonesOutAsStr(row):
    '''Return the value stored under 'diphoneOutSeg' in row.'''
    diphone_out = row['diphoneOutSeg']
    return diphone_out

In [9]:
def mergeXintoY(sound_x,sound_y,the_dict, exact_match = True):
    '''
    Replace every instance of sound_x with sound_y in all sound fields
    (the keys listed in sound_fields) of the_dict.

    If exact_match is True, a field's value must be exactly and entirely
    equal to sound_x to be replaced; otherwise every occurrence of sound_x
    as a substring of a sound field's value is replaced.

    the_dict is modified in place and also returned. Keys that are not in
    sound_fields are never inspected, and in substring mode values that are
    not strings are left untouched.
    '''
    for key in the_dict.keys():
        # Decide on the key first, so that the substring test below is never
        # run against the value of a column that would not be rewritten anyway.
        if key not in sound_fields:
            continue
        value = the_dict[key]
        if exact_match:
            if sound_x == value:
                the_dict[key] = sound_y
        else: #use carefully...
            # A non-string value (e.g. None for a missing column) cannot
            # contain sound_x; skip it instead of failing on `in`.
            if isinstance(value, str) and sound_x in value:
                the_dict[key] = value.replace(sound_x, sound_y)
    return the_dict

## Calculating, exporting and importing licit and illicit n-phones

In [30]:
#get ASAP to a set of strings

def getStimuliDiphones(rows, which_stress):
    '''
    Return the set of stimulus diphones found in rows.

    For every row the pair (getStimSeg1(row, which_stress),
    getStimSeg2(row, which_stress)) is passed through t2ds, a helper that is
    not defined in this notebook (presumably it turns a tuple into a dotted
    string - confirm against its definition).
    '''
    segment_pairs = ((getStimSeg1(row, which_stress), getStimSeg2(row, which_stress))
                     for row in rows)
    return {t2ds(pair) for pair in segment_pairs}

def getResponseDiphones(rows, which_stress):
    '''
    Return the set of response diphones found in rows.

    For every row the pair (row['Resp1'], row['Resp2']) is passed through
    t2ds, a helper that is not defined in this notebook. which_stress is
    accepted for symmetry with getStimuliDiphones but is not used.
    '''
    response_pairs = ((row['Resp1'], row['Resp2']) for row in rows)
    return {t2ds(pair) for pair in response_pairs}

# Names of the three analyses; they are the keys of the dictionary built by
# getDiphones and are iterated over by importNphoneAnalysis.
diphone_analyses = ('destressed stimuli', 'stressed stimuli', 'destressed response')

def getDiphones(rows):
    '''
    Return a dictionary mapping each analysis name to its set of diphones:
    destressed stimuli, stressed stimuli and destressed responses, in that order.
    '''
    analysis = dict()
    analysis['destressed stimuli'] = getStimuliDiphones(rows, 'destressed')
    analysis['stressed stimuli'] = getStimuliDiphones(rows, 'stressed')
    analysis['destressed response'] = getResponseDiphones(rows, 'destressed')
    return analysis


def overlap(diphoneA, diphoneB):
    """
    Tell whether diphone A 'overlaps' diphone B, i.e. whether the second
    segment of A equals the first segment of B:
        overlap('a.x', 'x.b') == True
        overlap('x.b', 'a.x') == False
        overlap(('u','t'), ('t','i')) == True
        overlap(('i','t'), ('t','i')) == True
        overlap(('i','t'), ('t','a')) == True

    An argument containing '.' is first converted with dottedStringToTuple;
    any other argument is indexed as it is.
    """
    left = dottedStringToTuple(diphoneA) if '.' in diphoneA else diphoneA
    right = dottedStringToTuple(diphoneB) if '.' in diphoneB else diphoneB
    return left[1] == right[0]

def glueIntoTriphone(diphoneA, diphoneB):
    '''
    Return the triphone, as a 3-tuple, obtained by stitching diphoneA and
    diphoneB together at their shared segment: (A[0], A[1], B[1]).

    The two diphones must overlap (see overlap); otherwise an assertion
    fails. An argument containing '.' is first converted with
    dottedStringToTuple.
    '''
    assert(overlap(diphoneA, diphoneB))
    first = dottedStringToTuple(diphoneA) if '.' in diphoneA else diphoneA
    second = dottedStringToTuple(diphoneB) if '.' in diphoneB else diphoneB
    return (first[0], first[1], second[1])

def getConstructibleTriphones(diphoneSet):
    '''
    Return the set of triphones (3-tuples) obtained by gluing together every
    ordered pair of diphones in diphoneSet that overlap, including a diphone
    paired with itself.
    '''
    triphones = set()
    for left in diphoneSet:
        for right in diphoneSet:
            if overlap(left, right):
                triphones.add(glueIntoTriphone(left, right))
    return triphones

def licitNphones(rows, N):
    '''
    Return, for each analysis produced by getDiphones(rows), its set of
    licit N-phones.

    N = 1 or 2 : lexiconToKfactors(diphones, N), a helper not defined in
                 this notebook
    N = 3      : the constructible triphones of the diphones, each passed
                 through t2ds

    Raises Exception for any other N.
    '''
    diphoneAnalysis = getDiphones(rows)
    if N in (1, 2):
        return {name: lexiconToKfactors(diphones, N)
                for name, diphones in diphoneAnalysis.items()}
    if N != 3:
        raise Exception('N must be in {1,2,3}')
    return {name: set(map(t2ds, getConstructibleTriphones(diphones)))
            for name, diphones in diphoneAnalysis.items()}

def allPossibleNphones(rows, N):
    '''
    Return, for each analysis, the N-phones generated from its licit
    uniphones.

    N = 1 : the licit uniphones themselves
    N > 1 : sigmaK(alphabet, N) with every element passed through t2ds
            (sigmaK and t2ds are helpers not defined in this notebook)

    Raises Exception when N is neither 1 nor greater than 1.
    '''
    alphabets = licitNphones(rows, 1)
    if N == 1:
        return alphabets
    if not N > 1:
        raise Exception('N must be ≥ 1.')
    possible = dict()
    for name, alphabet in alphabets.items():
        possible[name] = set(map(t2ds, sigmaK(alphabet, N)))
    return possible

def illicitNphones(rows, N):
    '''
    Return, for each analysis, the set of illicit N-phones: everything in
    allPossibleNphones(rows, N) that is not in licitNphones(rows, N).

    N must be 2 or 3. Raises Exception('N must be > 1.') for N == 1 and
    Exception('N must be in {2,3}') for any other value.
    '''
    # N is validated with explicit membership tests: a condition written as
    # `N == 2 or 3` is always true and would let N == 1 through.
    if N == 1:
        raise Exception('N must be > 1.')
    if N not in (2, 3):
        raise Exception('N must be in {2,3}')
    nPhoneAnalysis = licitNphones(rows, N)
    sigmaNanalysis = allPossibleNphones(rows, N)
    return {k:sigmaNanalysis[k] - nPhoneAnalysis[k] for k in nPhoneAnalysis}

def analyzeNphones(rows, N):
    '''
    Return the N-phone analysis of rows as a dictionary.

    N == 1 : {'licit': licitNphones(rows, N)}
    N > 1  : the same plus 'illicit': illicitNphones(rows, N)

    Raises Exception when N is neither 1 nor greater than 1.
    '''
    if N != 1 and not N > 1:
        raise Exception('N must be ≥ 1.')
    analysis = {'licit': licitNphones(rows, N)}
    if N != 1:
        analysis['illicit'] = illicitNphones(rows, N)
    return analysis

def exportSeqs(seq_fn, seqs):
    '''
    Write the strings in seqs to the file seq_fn, one per line.

    The file is written as UTF-8, the encoding importSeqs reads, so the
    IPA symbols in the sequences do not depend on the platform's default
    encoding. An existing file is overwritten.
    '''
    with open(seq_fn, 'w', encoding='utf-8') as the_file:
        the_file.writelines(seq + '\n' for seq in seqs)

def importSeqs(seq_fn):
    '''
    Read the file seq_fn (UTF-8, one sequence per line, as written by
    exportSeqs) and return its lines as a set of strings with trailing
    carriage returns and newlines stripped.
    '''
    with open(seq_fn, 'r', encoding='utf-8') as the_file:
        return {row.rstrip('\r\n') for row in the_file}

def exportNphoneAnalysis(analysis, N):
    '''
    Write every set of N-phones in analysis to its own .txt file in the
    directory named by the notebook parameter o.

    analysis maps 'licit' / 'illicit' to a dictionary from analysis name
    (e.g. 'destressed stimuli') to a collection of strings. The file name is
    the analysis name, an infix ('diphone-based' for N = 3), a licitness word
    ('illegal' or, for N = 3, 'constructible'; none for licit uniphones and
    diphones) and 'uniphones' / 'diphones' / 'triphones', joined by spaces
    with empty parts left out. N must be 1, 2 or 3.
    '''
    assert N in {1,2,3}

    infix_by_n = {1: '', 2: '', 3: 'diphone-based'}
    suffixes_by_n = {1: {'licit': '', 'illicit': ''},
                     2: {'licit': '', 'illicit': 'illegal'},
                     3: {'licit': 'constructible', 'illicit': 'illegal'}}
    noun_by_n = {1: 'uniphones', 2: 'diphones', 3: 'triphones'}
    infix, suffixes, noun = infix_by_n[N], suffixes_by_n[N], noun_by_n[N]

    for licitness, seqs_by_analysis in analysis.items():
        for analysis_name, seqs in seqs_by_analysis.items():
            name_parts = [analysis_name, infix, suffixes[licitness], noun]
            analysis_fn = ' '.join(part for part in name_parts if part != '') + '.txt'
            analysis_fp = path.join(o, analysis_fn)
            print('Exporting: ' + analysis_fp)
            exportSeqs(analysis_fp, seqs)

def importNphoneAnalysis(N):
    '''
    Read back the N-phone files that exportNphoneAnalysis writes to the
    directory named by the notebook parameter o, and return them as a
    dictionary shaped like the output of analyzeNphones:
    {'licit': {analysis name: set of strings}, 'illicit': {...}}.

    For N = 1 only the 'licit' files are read; for N = 2 and N = 3 both the
    'licit' and the 'illicit' files are. The analysis names are taken from
    diphone_analyses. N must be 1, 2 or 3.
    '''
    assert N in {1,2,3}

    infix_by_n = {1: '', 2: '', 3: 'diphone-based'}
    suffixes_by_n = {1: {'licit': '', 'illicit': ''},
                     2: {'licit': '', 'illicit': 'illegal'},
                     3: {'licit': 'constructible', 'illicit': 'illegal'}}
    noun_by_n = {1: 'uniphones', 2: 'diphones', 3: 'triphones'}
    licitness_by_n = {1: ('licit',),
                      2: ('licit', 'illicit'),
                      3: ('licit', 'illicit')}
    infix, suffixes, noun = infix_by_n[N], suffixes_by_n[N], noun_by_n[N]

    analysis = dict()
    for licitness in licitness_by_n[N]:
        imported = dict()
        analysis[licitness] = imported
        for analysis_name in diphone_analyses:
            name_parts = [analysis_name, infix, suffixes[licitness], noun]
            analysis_fn = ' '.join(part for part in name_parts if part != '') + '.txt'
            analysis_fp = path.join(o, analysis_fn)
            print('Importing: ' + analysis_fp)
            imported[analysis_name] = importSeqs(analysis_fp)
    return analysis

# Calculate, export, import licit and illicit uniphones, diphones, and (constructible) triphones

## Import data

In [14]:
%pwd

'/mnt/cube/home/AD/emeinhar/wr'

In [15]:
# Load the gating trials from the file named by the notebook parameter g.
# diphoneDataInFilename = "diphones-IPA-annotated-columns.csv"
diphoneDataInFilepath = g

file_data = getDiphoneGatingTrials(diphoneDataInFilepath)
rows_in, the_fields = file_data['trials'], file_data['fields']

fieldnames: ['Subject', 'Diph_num', 'Diph_name', 'Sylltype', 'SoundFile', 'Prec_context', 'gate', 'four_gate', 'seg1_stress', 'seg2_stress', 'CorrAns1', 'CorrAns2', 'Resp1', 'Resp2', 'Seg1Accur', 'Seg2Accur', 'Prec_context_binary', 'wrong_preccontext', 'replacedSeg1Data', 'replacedSeg2Data', 'diphoneInWStress', 'diphoneInSeg', 'diphoneOutSeg', 'stimulusWProsody', 'stimulusSeg', 'prefixWStress', 'prefixSeg', 'suffixWStress', 'suffixSeg']


## Uniphones

In [32]:
# Compute the licit uniphones of every analysis and write each set to its own file under o.
uniphone_analysis = analyzeNphones(rows_in, 1)
exportNphoneAnalysis(uniphone_analysis, 1)

Exporting: ./GD_AmE/destressed stimuli uniphones.txt
Exporting: ./GD_AmE/stressed stimuli uniphones.txt
Exporting: ./GD_AmE/destressed response uniphones.txt


In [33]:
# Inspect the result: its top-level keys, the analysis names, one example set,
# and the number of uniphones in every set.
set(uniphone_analysis.keys())
set(uniphone_analysis['licit'].keys())
print(uniphone_analysis['licit']['destressed stimuli'])
{each_licit:{each_type:len(uniphone_analysis[each_licit][each_type]) 
             for each_type in uniphone_analysis[each_licit]} 
 for each_licit in uniphone_analysis}

{'licit'}

{'destressed response', 'destressed stimuli', 'stressed stimuli'}

{'ɪ', 'u', 'θ', 'z', 'ʌ', 'f', 'ʃ', 'ɹ', 'ɚ', 'b', 'ɑ', 'i', 'w', 'æ', 'h', 'd', 't', 'oʊ', 'j', 'p', 'eɪ', 'aɪ', 'ə', 'l̩', 'ʊ', 'g', 'v', 'ɔɪ', 'ŋ', 'ʒ', 'm', 'tʃ', 'ɾ', 'aʊ', 'ɛ', 'ð', 'n', 'k', 'l', 'dʒ', 's'}


{'licit': {'destressed stimuli': 41,
  'stressed stimuli': 54,
  'destressed response': 38}}

In [35]:
# Round-trip check: the sets read back from disk must equal the ones just computed.
uniphone_analysis_in = importNphoneAnalysis(1)
assert uniphone_analysis_in == uniphone_analysis

Importing: ./GD_AmE/destressed stimuli uniphones.txt
Importing: ./GD_AmE/stressed stimuli uniphones.txt
Importing: ./GD_AmE/destressed response uniphones.txt


## Diphones

In [36]:
# Compute the licit and illicit diphones of every analysis and write each set to its own file under o.
diphone_analysis = analyzeNphones(rows_in, 2)
exportNphoneAnalysis(diphone_analysis, 2)

Exporting: ./GD_AmE/destressed stimuli diphones.txt
Exporting: ./GD_AmE/stressed stimuli diphones.txt
Exporting: ./GD_AmE/destressed response diphones.txt
Exporting: ./GD_AmE/destressed stimuli illegal diphones.txt
Exporting: ./GD_AmE/stressed stimuli illegal diphones.txt
Exporting: ./GD_AmE/destressed response illegal diphones.txt


In [37]:
# Inspect the result: its top-level keys, the analysis names, and the number
# of diphones in every set.
set(diphone_analysis.keys())
set(diphone_analysis['licit'].keys())
# print(diphone_analysis['licit']['destressed stimuli'])

{each_licit:{each_type:len(diphone_analysis[each_licit][each_type]) 
             for each_type in diphone_analysis[each_licit]} 
 for each_licit in diphone_analysis}

{'illicit', 'licit'}

{'destressed response', 'destressed stimuli', 'stressed stimuli'}

{'licit': {'destressed stimuli': 1323,
  'stressed stimuli': 2284,
  'destressed response': 1421},
 'illicit': {'destressed stimuli': 358,
  'stressed stimuli': 632,
  'destressed response': 23}}

In [39]:
# Round-trip check: the sets read back from disk must equal the ones just computed.
diphone_analysis_in = importNphoneAnalysis(2)
assert diphone_analysis_in == diphone_analysis

Importing: ./GD_AmE/destressed stimuli diphones.txt
Importing: ./GD_AmE/stressed stimuli diphones.txt
Importing: ./GD_AmE/destressed response diphones.txt
Importing: ./GD_AmE/destressed stimuli illegal diphones.txt
Importing: ./GD_AmE/stressed stimuli illegal diphones.txt
Importing: ./GD_AmE/destressed response illegal diphones.txt


## Triphones

In [40]:
# Compute the constructible (licit) and illicit triphones of every analysis and write each set to its own file under o.
triphone_analysis = analyzeNphones(rows_in, 3)
exportNphoneAnalysis(triphone_analysis, 3)

Exporting: ./GD_AmE/destressed stimuli diphone-based constructible triphones.txt
Exporting: ./GD_AmE/stressed stimuli diphone-based constructible triphones.txt
Exporting: ./GD_AmE/destressed response diphone-based constructible triphones.txt
Exporting: ./GD_AmE/destressed stimuli diphone-based illegal triphones.txt
Exporting: ./GD_AmE/stressed stimuli diphone-based illegal triphones.txt
Exporting: ./GD_AmE/destressed response diphone-based illegal triphones.txt


In [41]:
# Inspect the result: its top-level keys, the analysis names, and the number
# of triphones in every set.
set(triphone_analysis.keys())
set(triphone_analysis['licit'].keys())
# print(triphone_analysis['licit']['destressed stimuli'])

{each_licit:{each_type:len(triphone_analysis[each_licit][each_type]) 
             for each_type in triphone_analysis[each_licit]} 
 for each_licit in triphone_analysis}

{'illicit', 'licit'}

{'destressed response', 'destressed stimuli', 'stressed stimuli'}

{'licit': {'destressed stimuli': 44173,
  'stressed stimuli': 99516,
  'destressed response': 53133},
 'illicit': {'destressed stimuli': 24748,
  'stressed stimuli': 57948,
  'destressed response': 1739}}

In [42]:
# Round-trip check: the sets read back from disk must equal the ones just computed.
triphone_analysis_in = importNphoneAnalysis(3)
assert triphone_analysis_in == triphone_analysis

Importing: ./GD_AmE/destressed stimuli diphone-based constructible triphones.txt
Importing: ./GD_AmE/stressed stimuli diphone-based constructible triphones.txt
Importing: ./GD_AmE/destressed response diphone-based constructible triphones.txt
Importing: ./GD_AmE/destressed stimuli diphone-based illegal triphones.txt
Importing: ./GD_AmE/stressed stimuli diphone-based illegal triphones.txt
Importing: ./GD_AmE/destressed response diphone-based illegal triphones.txt
