## Study of word similarities

In [6]:
import pandas as pd # pandas package for importing files (e.g. from Excel)
import numpy as np
import difflib # package for SequenceMatcher
from IPython.display import Markdown, display # Use Markdown

############################
# WORD LIST
############################
# Selects the Excel workbook (fnameW) and worksheet (wordsheet) holding the
# word pairs, then loads that worksheet into the DataFrame `wordlist`.
#
# The worksheet requires these columns:
#   Index - row label, used as the DataFrame index
#           (important to be unique - if the index is repeated, the code gives an error)
#   Word1 - English word (or other reference language)
#   Phon1 - phonetic transcription of English word / reference language (can be IPA or SAMPA)
#   Word2 - AL word  (NOTE(review): "AL" is never expanded in this file - confirm the meaning)
#   Phon2 - phonetic transcription of AL word

# Alternative inputs, kept commented out for quick switching.
#fnameW      = 'word_lists.xlsx'
#fnameW      = 'OCDI100_word_lists.xlsx'
#wordsheet  = 'English-French'
#wordsheet  = 'English-Dutch'
#wordsheet  = 'English-German'
#wordsheet  = 'English-Italian'
#wordsheet  = 'English-Spanish'
#wordsheet  = 'English-Welsh'
#wordsheet  = 'Test'

# Active input: workbook and worksheet that are actually read below.
fnameW      = 'TranElicit_word_lists.xlsx'
wordsheet  = 'English-Spanish.Task'
#wordsheet  = 'English-Catalan.Task'
#wordsheet  = 'Spanish-Catalan.Task'
#wordsheet  = 'FalseFriends'

#fnameW      = 'OCDI_word_lists.xlsx'
#wordsheet  = 'OCDI'

# Read the word list from the file; rows are addressed by the 'Index' column.
wordlist = pd.read_excel(fnameW, sheet_name=wordsheet, index_col='Index')

#print(wordlist)


############################
# PHONEME LIST
############################
# Selects the workbook (fnameP) and worksheet (phonesheet) holding the phoneme
# coding table, then loads it into the DataFrame `phncodes`.
#
# The worksheet requires these columns:
#   IPA (and/or) SAMPA - list of IPA / SAMPA phonetic symbols
#   Code               - the phoneme code associated with each symbol
#
# Each phoneme has a 3 digit code; the digits are read left to right as:
#   consonants: [0] - voicing, [1] - place,    [2] - manner
#   vowels:     [0] - height,  [1] - backness, [2] - roundedness
# Consonants have codes starting 1 or 2, vowels have codes starting 7,8,9

fnameP      = 'Phon_list.xlsx'
#phonesheet = 'PhoneCoding'
phonesheet = 'PhoneCoding.broad' # collapses phoneme features into broader categories

# Read the coding table, indexed by the symbol set used in Phon1/Phon2.
# Keep exactly one of the two lines active: 'IPA' for IPA transcriptions,
# 'SAMPA' for SAMPA transcriptions.
phncodes = pd.read_excel(fnameP, sheet_name=phonesheet, index_col='IPA')
#phncodes = pd.read_excel(fnameP, sheet_name=phonesheet, index_col='SAMPA')


# Updates 26/06/2020
# - Now takes SAMPA as input (big achievement!)
# - automatically formats the input (removing separators, diacritics, etc), reducing need for manual editing
# - calculates phoneme count, so there is no need to calculate phoneme count in source file


In [7]:
########################################################
# SIMILARITY ALGORITHM
########################################################
# For every row of `wordlist`, compares the two phonetic transcriptions
# (Phon1 / Phon2) and prints one comma-separated result line.
#
# Steps per word pair:
#   1. strip non-informative marks and diacritics from both transcriptions
#   2. translate every remaining character into its feature code via `phncodes`
#   3. run difflib.SequenceMatcher on the two code lists (base score)
#   4. add a bonus when both words start with the same phoneme code(s)
#   5. cap the final score at 1 and print the result
#
# Based on difflib.SequenceMatcher
# https://docs.python.org/3/library/difflib.html
#
# The results are only printed. To export them, the printed fields could be
# collected in a list and written out in one go (e.g. with the csv module).

# Non-informative elements removed from a transcription:
# spaces, long vowel marker, phoneme separator, stress markers
_NON_INFORMATIVE = (" ", ":", ".", "'", '"', 'ˈ')

# Diacritics removed from a transcription
# (centralised, pharyngealized, nasalised, dental, aspirated).
# They are not of interest, and the algorithm would otherwise treat each of
# them as a separate unit, which is problematic.
_DIACRITICS = ("_0", "_?", "~", '̃', "_d", '̪', "_h", "ʰ")


def _strip_marks(phon):
    """Return transcription `phon` without non-informative marks and diacritics.

    The marks are removed one after another, non-informative elements first,
    diacritics second.
    """
    for mark in _NON_INFORMATIVE + _DIACRITICS:
        phon = phon.replace(mark, "")
    return phon


def _to_codes(phon):
    """Convert a cleaned transcription into a list of phoneme codes.

    Every character of `phon` is treated as one phoneme and looked up in the
    index of `phncodes`; the value of its 'Code' column is collected.

    Digit characters are converted to int before the lookup, because .loc
    does not accept numbers as strings for this table:
        phncodes.loc[9]['Code']    works
        phncodes.loc['9']['Code']  doesn't work
        phncodes.loc['a']['Code']  works

    Raises KeyError if a symbol is missing from the index of `phncodes`.
    """
    codes = []
    for symbol in phon:
        key = int(symbol) if symbol.isdigit() else symbol
        codes.append(phncodes.loc[key]['Code'])
    return codes

    # If there are separators, splitting on them instead would allow treating
    # diphthongs as a single unit:
    #   phon.split(".")
    # If we go for this route, specific codes for diphthongs need to be added
    # to the phoneme list.


def _close_phoneme_score(codes1, codes2):
    """Score non-identical but very similar phonemes in two replaced chunks.

    `codes1` and `codes2` are the chunks of phoneme codes that SequenceMatcher
    reported as replaced by one another. They are compared position by
    position, up to the length of the shorter chunk. Each code is split into
    its digits so that the features can be compared one by one.

    Vowel pairs (both codes start with 7 or higher;
    [0] - height, [1] - backness, [2] - roundedness):
        every feature change is weighted equally, 1/3 is added per shared
        feature.

    Consonant pairs (both codes start with 6 or lower;
    [0] - voicing, [1] - place, [2] - manner):
        only close changes of a single feature are accepted
        0.6  voicing change only                       allows [k] to [g]
        0.4  place change only, places 1 step apart    allows [n] to [ŋ]
        0.2  manner change only, manners 1 step apart  allows [b] to [β]

    Returns the summed score of all compared positions.
    """
    score = 0
    for code1, code2 in zip(codes1, codes2):
        feat1 = [int(digit) for digit in str(code1)]
        feat2 = [int(digit) for digit in str(code2)]

        if feat1[0] >= 7 and feat2[0] >= 7:  # vowel replacement
            for digit1, digit2 in zip(feat1, feat2):
                if digit1 == digit2:
                    score += 1/3

        elif feat1[0] <= 6 and feat2[0] <= 6:  # consonant replacement
            same_voicing = feat1[0] == feat2[0]
            same_place = feat1[1] == feat2[1]
            same_manner = feat1[2] == feat2[2]

            if same_place and same_manner:  # voicing change
                score += 0.6
            elif same_voicing and same_manner:  # place change
                if abs(feat1[1] - feat2[1]) == 1:
                    score += 0.4
            elif same_voicing and same_place:  # manner change
                # The distance has to be measured on the manner digit [2].
                # Measuring it on the voicing digit [0], which is known to be
                # equal in this branch, would always give 0.
                if abs(feat1[2] - feat2[2]) == 1:
                    score += 0.2
    return score


for wrd in wordlist.index:
    entry = wordlist.loc[wrd]  # look the row up once, not once per column
    ort1 = entry['Word1']  # orthographic form of L1 word
    ort2 = entry['Word2']
    wrd1 = _strip_marks(entry['Phon1'])  # IPA or SAMPA transcription of L1 word
    wrd2 = _strip_marks(entry['Phon2'])

    #print(wrd1+', '+wrd2)

    # phoneme count of the longer word (one character = one phoneme)
    length = max(len(wrd1), len(wrd2))
    if length == 0:
        # Without this check the standardisation in step 2 would fail with a
        # bare ZeroDivisionError that does not say which row is affected.
        raise ValueError(
            "Row {!r}: Phon1 and Phon2 are both empty after cleaning, "
            "so no similarity can be computed".format(wrd)
        )

    # convert each word into a list of phoneme CODES
    phnlist1 = _to_codes(wrd1)
    phnlist2 = _to_codes(wrd2)

    #print(phnlist1, phnlist2)

    # Note: By applying the algorithm on phoneme codes instead of phonemes,
    # we allow the algorithm to match up functionally identical phonemes,
    # which otherwise would have been treated as different items
    # e.g. [l] and [ɫ] (Voiced Alveolar Approximant, code 213)
    # The sensitivity of phoneme differentiation can therefore be fine-tuned
    # in the code specification

    ############################################
    # STEP 1: Apply SequenceMatcher Base Algorithm
    ############################################

    # Use SequenceMatcher to identify similarity between phoneme code lists
    # (base algorithm only matches identical elements)
    s = difflib.SequenceMatcher(None, phnlist1, phnlist2)

    simEdits = s.get_opcodes()  # edit operations: equal/replace/delete/insert
    #print(simEdits)

    matchblock = s.get_matching_blocks()  # details of each matching block
    #print(matchblock)
    matchcount = sum(block.size for block in matchblock)  # total number of identical elements
    #print(matchcount)

    similarity = s.ratio()  # proportion of overlapping elements
    #print(similarity)

    # longest CONTINUOUS string of identical elements
    longmatch = s.find_longest_match(0, len(phnlist1), 0, len(phnlist2))
    #print(longmatch.size)

    ############################################
    # STEP 2: Refine calculation
    ############################################

    nsim = 0
    # Set for every word pair, so that a value from the previous pair can
    # never leak into the current output line.
    onset = 'different'
    onsetlong = 0

    for tag, i1, i2, j1, j2 in simEdits:

        #################################
        # Common onset facilitation effect
        # boost similarity score if the first phoneme is the same
        #################################
        # Compared with == rather than `is`: identity of equal strings/ints
        # is an implementation detail and triggers a SyntaxWarning.
        if tag == 'equal' and i1 == 0 and j1 == 0:
            onset = 'same'
            onsetlong = i2 - i1  # number of consecutive matched phonemes at onset
            # The onset boost is multiplied by the length of the onset match:
            # cohort effect, where a large number of consecutive matched
            # elements at onset reduces the possible referents
            nsim += 0.5 * onsetlong

        #################################
        # Identify non-identical but very similar phonemes and add weighted score
        #################################
        if tag == 'replace':
            # SequenceMatcher reports the chunks that were replaced in
            # between matched elements
            codesim1 = _close_phoneme_score(phnlist1[i1:i2], phnlist2[j1:j2])
            #print(codesim1)

            ##############################
            #IMPORTANT
            # Temporary measure, don't add score for close phonemes
            ##############################
            #nsim += codesim1 # total 'refinement' score

    # calculate standardised refinement score to be added to initial similarity score
    replace_ratio = nsim / length

    #print(nsim)
    #print(replace_ratio)

    ############################################
    # STEP 3: Add the base score and refinement score together
    ############################################

    similarity_phoneme = similarity + replace_ratio

    # Note: because of the onset phoneme boost, the final score can go above 1
    # for very similar words. min() keeps the smaller of the two values, so
    # anything above 1 is cut back to 1.
    similarity_phoneme = min(1, similarity_phoneme)
    #print(similarity_phoneme) # Final similarity score

    # Lang1_ortho, Lang2_ortho, Lang1_phon, Lang2_phon, final_similarity_score,
    # total_identical_matches, shared_onset,
    # number of consecutive phonemes shared at onset, longest_continuous_match
    fields = (ort1, ort2, wrd1, wrd2, similarity_phoneme, matchcount,
              onset, onsetlong, longmatch.size)
    print(','.join(str(field) for field in fields))


bee,abeja,bi,aβexa,0.0,0,different,0,0
water,agua,wɔtə,ajɣwə,0.4444444444444444,2,different,0,1
tap,grifo,tæp,gɾifo,0.0,0,different,0,0
salad,ensalada,sæləd,ensalada,0.6153846153846154,4,different,0,3
bottle,botella,bɒtəl,boteʎa,0.44696969696969696,2,same,1,1
snack,merienda,snæk,meɾjenda,0.3333333333333333,2,different,0,1
bottle,biberón,bɒtəl,biβeɾon,0.40476190476190477,2,same,1,1
bike,bici,baɪk,biθi,0.625,2,same,1,1
mouth,boca,maʊθ,boka,0.25,1,different,0,1
bowl,bol,bəʊl,bol,0.6964285714285714,2,same,1,1
mushroom,seta,mʌʃrʊm,seta,0.2,1,different,0,1
nappy,pañal,næpɪ,paɲal,0.2222222222222222,1,different,0,1
button,botón,bʌtən,boton,0.7,3,same,1,1
bull,buey,bʊl,bwei,0.4107142857142857,1,same,1,1
box,caja,bɒks,kaxa,0.25,1,different,0,1
bed,pierna,bɛd,pjerna,0.2222222222222222,1,different,0,1
shirt,camisa,ʃɛt,kamisa,0.2222222222222222,1,different,0,1
house,casa,haʊs,kasa,0.5,2,different,0,1
horse,caballo,hɔs,kaβaʎo,0.2222222222222222,1,different,0,1
cherry,cereza,ʧɛrɪ,θeɾeθa,0.85,3,same,3

In [10]:
############################################
# POINTS TO BE IMPROVED
############################################
        # Next step 27/06/2020
        # check translation elicitation task if phon neighbour density is a significant predictor
            
        
        # additional (less strong) multiplier on non-onset longest match (at rhyme)

# - Currently short words are more strongly affected by the score multiplier, which doesn't match up to real life
#   In some cases, a long word with half overlapping phonemes is more perceptually recognisable as a cognate 
#   than a 3-phoneme word with 2 overlapping phonemes, because of cohort effects (the long word has fewer cohort competitors to select from)

# Can look into possible other ways to manipulate weights:
    # Distance of feature change (i.e. front vs mid vowel is more similar than front vs back)
    # A consonant match adds less score than a vowel match

# - certain types of phoneme changes are likely more salient than others
#   and this goes beyond simple differences between manner/place/voicing
#   More likely it is language-dependent: If a language does not have a minimal pair between a particular pair of phonemes,
#   the salience of that change would likely be perceptually negligible to the speaker
#   Example: Japanese doesn't differentiate between /r/ and /l/
#   Example 2: English speakers typically pronounce R as the approximant [ɹ],
#   so they are less likely to notice the difference between approximant [ɹ], tap [ɾ] and trill [r]
#   Frequency of the minimal pair also likely makes a difference
