In [1]:
import pandas as pd # pandas package for importing files (e.g. from Excel)
import numpy as np
import difflib # package for SequenceMatcher
from IPython.display import Markdown, display # Use Markdown

############################
# WORD LIST
############################
# Requires columns:
#Index - (important to be unique - if the index is repeated, the code gives an error)
#Word1 - English word (or other reference language)
#Phon1 - phonetic transcription of English word / reference language (can be IPA or SAMPA)
#Word2 - AL word
#Phon2 - phonetic transcription of AL word

#fnameW      = 'word_lists.xlsx'
#fnameW      = 'OCDI100_word_lists.xlsx'
#wordsheet  = 'English-French'
#wordsheet  = 'English-Dutch'
#wordsheet  = 'English-German'
#wordsheet  = 'English-Italian'
#wordsheet  = 'English-Spanish'
#wordsheet  = 'English-Welsh'
#wordsheet  = 'Test'

fnameW      = 'TranElicit_word_lists.xlsx'
wordsheet  = 'English-Spanish.Task'
#wordsheet  = 'English-Catalan.Task'
#wordsheet  = 'Spanish-Catalan.Task'
#wordsheet  = 'FalseFriends'

#fnameW      = 'OCDI_word_lists.xlsx'
#wordsheet  = 'OCDI'

# read the word list from the file; the 'Index' column becomes the row label
wordlist = pd.read_excel(fnameW, sheet_name=wordsheet, index_col='Index')

# Fail fast with a readable message instead of an obscure error further down:
# the per-word processing looks rows up by label and needs these four columns.
missing_cols = [col for col in ('Word1', 'Phon1', 'Word2', 'Phon2')
                if col not in wordlist.columns]
if missing_cols:
    raise KeyError(f"sheet {wordsheet!r} of {fnameW!r} is missing required column(s): {missing_cols}")

# A repeated index label makes .loc return several rows instead of one,
# which breaks the per-word processing, so reject it here.
if not wordlist.index.is_unique:
    dupes = wordlist.index[wordlist.index.duplicated()].unique().tolist()
    raise ValueError(f"'Index' values must be unique in sheet {wordsheet!r}; repeated: {dupes}")

#print(wordlist)


############################
# PHONEME LIST
############################
# Requires columns:
# IPA (and/or) SAMPA - List of IPA / SAMPA phonetic transcriptions
# Code - Lists of associated phoneme codes

# 3 digit code for each phoneme: voicing/place/manner for consonants, height/backness/roundedness for vowels
# Consonants have codes starting 1 or 2, vowels have codes starting 7,8,9
# consonants [0] - voicing, [1] - place, [2] - manner
# vowels [0] - height, [1] - backness, [2] - roundedness

# Workbook holding the phoneme-to-code table, and the sheet to read from it.
fnameP      = 'Phon_list.xlsx'
#phonesheet = 'PhoneCoding'
phonesheet = 'PhoneCoding.broad' # collapses phoneme features into broader categories

# Load the phoneme table, using the transcription column as the row label so a
# phoneme symbol can be looked up directly with .loc.
# Keep exactly one of the two lines below active: 'IPA' when the word list is
# transcribed in IPA, 'SAMPA' when it is transcribed in SAMPA.
phncodes = pd.read_excel(fnameP, sheet_name=phonesheet, index_col='IPA')
#phncodes = pd.read_excel(fnameP, sheet_name=phonesheet, index_col='SAMPA')


# Updates 26/06/2020
# - Now takes SAMPA as input (big achievement!)
# - automatically formats the input (removing separators, diacritics, etc), reducing need for manual editing
# - calculates phoneme count, so there is no need to calculate phoneme count in source file

In [3]:
  
# Symbols removed from a transcription before phoneme lookup, applied in this order.
_STRIP_SYMBOLS = (
    # non-informative elements (spaces, long vowel marker, phoneme separator, stress markers)
    " ", ":", ".", "'", '"', 'ˈ',
    # diacritics (centralised, pharyngealized, nasalised, dental, aspirated):
    # not of interest, and each would otherwise be treated as a separate unit
    "_0", "_?", "~", '̃', "_d", '̪', "_h", "ʰ",
)
# NOTE(review): only ":" is stripped as a length marker; the IPA length marker "ː" and
# secondary stress "ˌ" are not in the list -- confirm whether IPA input can contain them.


def _clean_transcription(phon, label=None):
    """Return the transcription ``phon`` with every symbol in ``_STRIP_SYMBOLS`` removed.

    Parameters
    ----------
    phon : str
        Phonetic transcription (IPA or SAMPA) of one word.
    label : object, optional
        Word-list index of the entry; only used to make the error message useful.

    Raises
    ------
    TypeError
        If ``phon`` is not a string (e.g. an empty spreadsheet cell read as NaN).
    """
    if not isinstance(phon, str):
        raise TypeError(f"transcription for word list entry {label!r} is not text: {phon!r}")
    for symbol in _STRIP_SYMBOLS:
        phon = phon.replace(symbol, "")
    return phon


def _to_phoneme_codes(transcription, code_table):
    """Convert a cleaned transcription into a list of phoneme codes.

    Each character of ``transcription`` is treated as one phoneme and looked up
    in the index of ``code_table``; the matching 'Code' value is collected.

    Raises
    ------
    KeyError
        If a character has no entry in ``code_table``.
    """
    codes = []
    for symbol in transcription:
        # Digit symbols must be looked up as integers: .loc[9] works, .loc['9'] does not
        # (presumably because the spreadsheet stores those labels as numbers -- confirm).
        # isdecimal() rather than isdigit(): isdigit() also accepts characters such as
        # superscript digits, which int() cannot convert.
        key = int(symbol) if symbol.isdecimal() else symbol
        if key not in code_table.index:
            raise KeyError(f"phoneme {symbol!r} (in {transcription!r}) has no entry in the phoneme table")
        codes.append(code_table.loc[key, 'Code'])
    return codes


# NOTE: every iteration overwrites the variables below, so after the loop they
# describe the LAST entry of the word list only.
for wrd in wordlist.index:
    ort1 = wordlist.loc[wrd, 'Word1'] # orthographic form of L1 word
    ort2 = wordlist.loc[wrd, 'Word2']

    # IPA or SAMPA transcriptions, stripped of separators, stress marks and diacritics
    wrd1 = _clean_transcription(wordlist.loc[wrd, 'Phon1'], wrd)
    wrd2 = _clean_transcription(wordlist.loc[wrd, 'Phon2'], wrd)

    #print(wrd1+', '+wrd2)

    # identify the phoneme count of the longer word
    len1 = len(wrd1)
    len2 = len(wrd2)
    length = max(len1, len2)

    # If there are separators
    #phnlist1 = wrd1.split(".") # this would be useful if we want to treat diphthongs as a single unit
    #phnlist2 = wrd2.split(".") # if we go for this route, need to add specific codes for diphthongs to phonlist

    # convert each word into its list of phoneme CODES (one per character)
    phnlist1 = _to_phoneme_codes(wrd1, phncodes)
    phnlist2 = _to_phoneme_codes(wrd2, phncodes)

In [9]:
# Align the two phoneme-code sequences (no junk filter) and list the edit
# operations that turn phnlist1 into phnlist2.
s = difflib.SequenceMatcher(None, phnlist1, phnlist2)
simEdits = s.get_opcodes() # show edit operation: equal/replace/delete/insert

In [10]:
# Each opcode is (tag, i1, i2, j1, j2): the operation `tag` relates
# phnlist1[i1:i2] to phnlist2[j1:j2].
print(simEdits)

[('equal', 0, 1, 0, 1), ('delete', 1, 2, 1, 1), ('equal', 2, 3, 1, 2), ('replace', 3, 5, 2, 5)]
