## Study of word similarities

In [53]:
import pandas as pd # pandas package for importing files (e.g. from Excel)
import numpy as np
import difflib # package for SequenceMatcher
from IPython.display import Markdown, display # Use Markdown

# --- Input selection ---------------------------------------------------------
# One workbook (fname) and one word-list sheet inside it (wordsheet) are active
# at a time. Combinations used so far, for reference:
#
#   workbook                        word-list sheets
#   'word_lists.xlsx'               'English-French', 'English-Dutch', 'English-Spanish',
#                                   'English-German', 'English-Italian', 'Test'
#   'TranElicit_word_lists.xlsx'    'English-Spanish.Task', 'English-Catalan.Task',
#                                   'Spanish-Catalan.Task', 'FalseFriends'
#   'OCDI_word_lists.xlsx'          'OCDI', 'Test'
fname, wordsheet = 'TranElicit_word_lists.xlsx', 'FalseFriends'

# Phone-coding sheet: 'PhoneCoding' or 'PhoneCoding.broad'
# ('PhoneCoding.broad' collapses phoneme features into broader categories).
phonesheet = 'PhoneCoding.broad'


In [54]:
# --- Load the word pairs -----------------------------------------------------
# Columns expected in the word-list sheet:
#   Index    numeric row label (important: a repeated index makes the code fail)
#   Word1    English word (or other reference language)
#   IPA1     IPA transcription of Word1
#   Word2    AL word
#   IPA2     IPA transcription of Word2
#   IPA1Len  length (phoneme count) of the English/reference word
#   IPA2Len  length (phoneme count) of the AL word
wordlist = pd.read_excel(fname, sheet_name=wordsheet, index_col='Index')

# Phoneme count of the longer word of each pair (IPA1Len wins ties).
ipa1_len = wordlist['IPA1Len']
ipa2_len = wordlist['IPA2Len']
wordlist['LongerLen'] = np.where(ipa1_len >= ipa2_len, ipa1_len, ipa2_len)

# --- Load the phone coding ---------------------------------------------------
# Columns expected in the phone-coding sheet:
#   Phone  IPA phoneme (used as the row label)
#   Code   3-digit feature code for the phoneme
#          consonants (codes starting 1 or 2): [0] voicing, [1] place,    [2] manner
#          vowels     (codes starting 7, 8, 9): [0] height,  [1] backness, [2] roundedness
phncodes = pd.read_excel(fname, sheet_name=phonesheet, index_col='Phone')

In [55]:
### SIMILARITY ALGORITHM USING DIFFLIB.SEQUENCEMATCHER
# https://docs.python.org/3.5/library/difflib.html#difflib.SequenceMatcher.get_matching_blocks
#
# Uses SequenceMatcher's get_opcodes instead of python-levenshtein's editops.
#
# Changes relative to the earlier version of this cell:
# - tags and offsets are compared with `==` instead of `is` (identity comparison
#   with literals only worked by accident and warns on Python 3.8+)
# - the manner-change rule now compares the manner digit [2]; it used to compare
#   the voicing digit [0] right after requiring it to be equal, so the 0.2 weight
#   was never applied. Scores of pairs containing such a replacement may increase.
# - the unused 'EnglishIPA' lookup was dropped
# - a phoneme missing from the coding sheet is reported by name
# - rows are visited with iterrows(), so a repeated index label no longer breaks
#   the row lookup

# Scoring weights
ONSET_BONUS = 0.5             # first phoneme identical or functionally identical
VOWEL_FEATURE_WEIGHT = 1/3    # per shared vowel feature (height/backness/roundedness)
VOICING_CHANGE_WEIGHT = 0.6   # consonants differing only in voicing, e.g. [k] vs [g]
PLACE_CHANGE_WEIGHT = 0.4     # consonants differing by one place step, e.g. [n] vs [ŋ]
MANNER_CHANGE_WEIGHT = 0.2    # consonants differing by one manner step, e.g. [b] vs [β]
FIRST_VOWEL_DIGIT = 7         # codes whose first digit is >= 7 are vowels, below are consonants


def _phone_codes(phones, phncodes):
    """Return the feature code of every phoneme in `phones`.

    phones   -- string (or list) of single IPA phonemes
    phncodes -- phone-coding table indexed by phoneme, with a 'Code' column

    Raises KeyError naming the phoneme if it is not in the table.
    """
    codes = []
    for phone in phones:
        if phone not in phncodes.index:
            raise KeyError('phoneme {!r} is not listed in the phone coding sheet'.format(phone))
        codes.append(phncodes.loc[phone, 'Code'])
    return codes


def _code_digits(code):
    """Split a feature code such as 144 into its list of digits, [1, 4, 4]."""
    return [int(digit) for digit in str(code)]


def _feature_overlap_score(codes1, codes2, shared):
    """Score non-identical phonemes of a replaced chunk by their shared features.

    codes1, codes2 -- feature codes of the replaced chunk in word 1 / word 2
    shared         -- codes present in both chunks; these were already credited
                      in full, so they are blanked out here to avoid inflating
                      the score

    Phonemes are compared position by position up to the length of the shorter
    chunk. Returns the summed partial score for the chunk.
    """
    digits1 = [None if code in shared else _code_digits(code) for code in codes1]
    digits2 = [None if code in shared else _code_digits(code) for code in codes2]

    score = 0
    for d1, d2 in zip(digits1, digits2):  # zip stops at the shorter chunk
        if d1 is None or d2 is None:
            continue

        if d1[0] >= FIRST_VOWEL_DIGIT and d2[0] >= FIRST_VOWEL_DIGIT:
            # Vowel replacement: [0] height, [1] backness, [2] roundedness.
            # Every feature is weighted equally; each shared feature adds 1/3.
            for feature1, feature2 in zip(d1, d2):
                if feature1 == feature2:
                    score += VOWEL_FEATURE_WEIGHT

        elif d1[0] < FIRST_VOWEL_DIGIT and d2[0] < FIRST_VOWEL_DIGIT:
            # Consonant replacement: [0] voicing, [1] place, [2] manner.
            # Only close, single-feature changes earn a partial score.
            same_voicing = d1[0] == d2[0]
            same_place = d1[1] == d2[1]
            same_manner = d1[2] == d2[2]
            if same_place and same_manner:        # voicing change
                score += VOICING_CHANGE_WEIGHT
            elif same_voicing and same_manner:    # place change
                if abs(d1[1] - d2[1]) == 1:
                    score += PLACE_CHANGE_WEIGHT
            elif same_voicing and same_place:     # manner change
                if abs(d1[2] - d2[2]) == 1:
                    score += MANNER_CHANGE_WEIGHT
        # A vowel replaced by a consonant (or vice versa) earns nothing.

    # Possible other ways to manipulate weights:
    #   - distance of feature change (front vs mid vowel is more similar than front vs back)
    #   - a consonant match adds less score than a vowel match
    return score


def phoneme_similarity(ipa1, ipa2, longer_len, phncodes):
    """Return the phoneme-weighted similarity of two IPA transcriptions.

    ipa1, ipa2 -- IPA transcriptions, one character per phoneme
    longer_len -- phoneme count of the longer word, used to standardise the
                  refinement score
    phncodes   -- phone-coding table indexed by phoneme, with a 'Code' column

    The result is SequenceMatcher.ratio() plus a refinement for shared onsets
    and for replaced phonemes that are functionally identical or similar,
    capped at 1 (the onset boost can otherwise push it above 1; the cap is
    returned as the int 1).
    """
    matcher = difflib.SequenceMatcher(None, ipa1, ipa2)
    # Basic score. Note that ratio() is 2*M/T, with M the number of matching
    # phonemes and T the total number of phonemes in BOTH words; it is not
    # "matches / length of the longer word".
    similarity = matcher.ratio()

    nsim = 0
    # Each opcode is (tag, i1, i2, j1, j2) with tag equal/replace/delete/insert.
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        at_onset = (i1 == 0 and j1 == 0)

        if tag == 'equal':
            # Step 1: boost the score if the first phoneme is the same.
            if at_onset:
                nsim += ONSET_BONUS

        elif tag == 'replace':
            # Step 2a/2b: the replaced chunks, as lists of feature codes.
            codes1 = _phone_codes(ipa1[i1:i2], phncodes)
            codes2 = _phone_codes(ipa2[j1:j2], phncodes)

            # Step 2c: codes present in both chunks are functionally identical
            # phonemes (like [r] and [ɹ]); each one scores 1.
            shared = [code for code in codes1 if code in codes2]
            nsim += len(shared)

            # Step 2d: boost if the first phoneme is functionally identical.
            if at_onset and codes1[0] == codes2[0]:
                nsim += ONSET_BONUS

            # Step 2e: partial credit for the remaining, non-identical phonemes.
            nsim += _feature_overlap_score(codes1, codes2, shared)

    # Step 3: standardise the refinement by the length of the longer word.
    replace_ratio = nsim / longer_len

    # Step 4: add the basic and refined scores; min() caps the sum at 1.
    return min(1, similarity + replace_ratio)


for wrd, pair in wordlist.iterrows():
    wrd1 = pair['IPA1']        # IPA transcription of L1 word
    wrd2 = pair['IPA2']
    ort1 = pair['Word1']       # orthographic form of L1 word
    ort2 = pair['Word2']
    length = pair['LongerLen'] # length of longer word, used to standardise the refinement

    similarity_phoneme = phoneme_similarity(wrd1, wrd2, length, phncodes)

    print(ort1+' , '+ort2+' , '+wrd1+' , '+wrd2+', '+str(similarity_phoneme))
   
    
    #print()
    
# POINTS TO BE IMPROVED
# - Currently short words are more strongly affected by the score multiplier, which doesn't match up to real life
#   In some cases, a long word with half of its phonemes overlapping is more perceptually recognisable as a cognate
#   than a 3-phoneme word with 2 overlapping phonemes, because of cohort effects (the long word has fewer cohort competitors to select from)

# - recode long vowels and diphthongs as a single unit, instead of two units as it is now
#   particularly in the case of long vowels, which are probably not so perceptually different from the short equivalent

# - certain types of phoneme changes are likely more salient than others
#   and this goes beyond simple differences between manner/place/voicing
#   More likely it is language-dependent: If a language does not have a minimal pair between a particular pair of phonemes,
#   the salience of that change would likely be perceptually negligible to the speaker
#   Example: Japanese doesn't differentiate between /r/ and /l/
#   Example2: English speakers typically pronounce R as the approximant [ɹ],
#   so they are less likely to recognise the difference between approximant [ɹ], tap [ɾ], trill [r]
#   Frequency of the minimal pair also likely makes a difference

coche , coach , koʧe , kəʊʧ, 0.7083333333333333
lengua , language , leɲgwa , læŋgwɪʤ, 0.8091575091575092
moneda , money , moneða , mʌni, 0.7055555555555555
gat , god , gat , gɒd, 0.8111111111111111
bol , ball , bol , bɔɔl, 0.9464285714285714
bol , ball , bɔɫ , bɔɔl, 0.9464285714285714
aixeta , bread , əʃɛtə , brɛd, 0.3422222222222222
camiseta , camisole , kamiseta , kæmɪsəʊl, 0.7708333333333333
grifo , grief , gɾifo , griif, 0.8999999999999999
martillo , martini , martiʎo , mɑɑtiini, 0.5458333333333334
pastel , pastel , pastel , pæstɛl, 1
perro , pear , pero , peə, 0.6964285714285714
puerta , port , pwerta , pɔɔt, 0.5388888888888889
caixa , cash , kaʃə , kæʃ, 0.9464285714285714
cama , camera , kamə , kæmərə, 0.85
cuc , cook , kuk , kʊk, 1
