In [15]:
from __future__ import unicode_literals
from typing import List, Tuple
import pandas as pd

In [16]:
# read the file
def read_file(path: str) -> List[List[str]]:
    '''
    Read an Excel sheet and split every row into whitespace-separated tokens.

    The sheet is read without a header row. For each row the non-empty cells
    are joined with single spaces, every occurrence of the marker 'LB' is
    removed, and the result is split on whitespace. Rows that contain no
    non-empty cell are skipped.

    :param path: path to the Excel file
    :rtype: list of lists of strings (one inner list per non-empty row)
    '''
    sheet = pd.read_excel(path, header=None)

    rows = []
    for cells in sheet.itertuples(index=False, name=None):
        # Empty cells (NaN/None) are left out; str() keeps ' '.join from
        # failing when a cell holds a number instead of text.
        present = [str(cell) for cell in cells if pd.notna(cell)]
        if not present:
            continue
        rows.append(' '.join(present).replace('LB', '').split())
    return rows  # a list of token lists

In [18]:
# Load the alignment sheet as a list of token lists (one list per row);
# the path is relative to the current working directory.
f = read_file('./alignments.xlsx')

In [22]:
class Converter:
    '''
    Maps phone symbols to single characters.

    Every single-character phone is mapped to itself, while phones with
    length > 1 are mapped to digits so that each phone occupies exactly one
    character in the converted output:
    'tʃ':'1'
    'dʒ':'2'
    'ɑɪ':'3'
    'ɑi':'4'
    'ɔɪ':'5'
    'ɑʊ':'6'
    '''
    def __init__(self):
        self.table = {}
        self.phones_dictionary()

    def phones_dictionary(self):
        '''(Re)build ``self.table``, the phone -> single character mapping.'''
        self.table = {
            # ----- consonants -----#
            'tʃ':'1', 'dʒ':'2', 'p':'p', 'b':'b', 't':'t', 'd':'d', 'ʈ':'ʈ', 'ɖ':'ɖ',
            'c':'c', 'ɟ':'ɟ', 'k':'k','g':'g', 'ɡ':'ɡ', 'q':'q','ɢ':'ɢ','ʔ':'ʔ', 'm':'m',
            'ɱ':'ɱ','n':'n','ɳ':'ɳ','ɲ':'ɲ','ŋ':'ŋ','ɴ':'ɴ','N':'N','ʙ':'ʙ','r':'r','ʀ':'ʀ',
            'R':'R', 'ɾ':'ɾ', 'ɽ':'ɽ','ɸ':'ɸ','β':'β','f':'f','v':'v','θ':'θ','ð':'ð','s':'s',
            'z':'z','ʃ':'ʃ','ʒ':'ʒ','ʂ':'ʂ','ʐ':'ʐ','ç':'ç','ʝ':'ʝ','x':'x','ɣ':'ɣ','χ':'χ',
            'ʁ':'ʁ','ħ':'ħ','ʕ':'ʕ','h':'h','ɦ':'ɦ','ɬ':'ɬ','ɮ':'ɮ','ʋ':'ʋ','ɹ':'ɹ','ɻ':'ɻ',
            'j':'j','ɰ':'ɰ','l':'l','w':'w',
            # ------ vowels -------#
            'ɑ':'ɑ', 
            'ɪ':'ɪ',
            'i':'i',
            'y':'y',
            'e':'e',
            'E':'E',
            'ø':'ø',
            'ɛ':'ɛ',
            'œ':'œ',
            'æ':'æ',
            'a':'a',
            'A':'A',
            'ɨ':'ɨ',
            'ʉ':'ʉ',
            'ə':'ə',
            'u':'u',
            'U':'U',
            'o':'o',
            'O':'O',
            'ɔ':'ɔ',
            'ɒ':'ɒ',
            'I':'I',
            'ʌ':'ʌ',
            'ʊ':'ʊ',
            'ɜ':'ɜ',
            #-------- diphthongs -----#
            'ɑɪ':'3',
            'ɑi':'4',
            'ɔɪ':'5',
            'ɑʊ':'6',
            
        }

    def convert(self, l: list) -> List[str]:
        '''
        Translate a sequence of phones into their single-character codes.

        :param l: sequence of phone symbols (e.g. one row of read_file output)
        :rtype: list of strings, one entry per phone found in ``self.table``;
                phones that are not in the table are left out
        '''
        # One output item per recognised phone. A plain lookup per item (not
        # ``list += str``) keeps this correct even if a table value were ever
        # longer than one character.
        return [self.table[phone] for phone in l if phone in self.table]

In [25]:
# Keep the converter under its own name so that `s` is only ever bound to the
# converted rows (not first to a Converter instance and then to a list).
converter = Converter()
s = [converter.convert(ff) for ff in f]

In [26]:
# Collapse every converted row into one string.
s = [''.join(d) for d in s]

# Pair consecutive rows: (row 0, row 1), (row 2, row 3), ...
# An unpaired trailing row is reported explicitly instead of surfacing as a
# bare StopIteration from inside the pairing expression.
assert len(s) % 2 == 0, 'expected an even number of rows to pair up'
v = iter(s)
list_of_tuples = list(zip(v, v))
list_of_tuples[0]

('ædvænsbʌtsætəpil', 'ædvænsbʌtsɛtəpil')

In [33]:
def iterative_levenshtein(s: str, t: str, **weight_dict) -> int:
    """ 
        iterative_levenshtein(s, t) -> ldist
        ldist is the (weighted) Levenshtein distance between the strings 
        s and t.
        For all i and j, dist[i][j] will contain the Levenshtein 
        distance between the first i characters of s and the 
        first j characters of t
        
        weight_dict: keyword parameters setting the costs for characters,
                     e.g. ɛ=(1, 1, 2). Each value is a 3-tuple
                     (deletion cost, insertion cost, substitution cost).
                     Any character without an entry costs (1, 1, 1).
                     Substituting one character for a different one costs
                     the larger of the two characters' substitution costs;
                     identical characters cost 0.

        Returns the distance; empty strings are valid inputs.
        Side effect: prints the full dist table, one row per line.
                     
        Source: https://www.python-course.eu/levenshtein_distance.php
    """
    # Positions inside a per-character weight tuple.
    DELETE, INSERT, SUBSTITUTE = 0, 1, 2
    default_cost = (1, 1, 1)

    def cost(char, kind):
        # Fall back to unit cost for every character that has no explicit
        # weight, so characters outside any fixed alphabet are accepted.
        return weight_dict.get(char, default_cost)[kind]

    rows = len(s) + 1
    cols = len(t) + 1

    dist = [[0] * cols for _ in range(rows)]

    # source prefixes can be transformed into empty strings 
    # by deletions:
    for row in range(1, rows):
        dist[row][0] = dist[row-1][0] + cost(s[row-1], DELETE)

    # target prefixes can be created from an empty source string
    # by inserting the characters
    for col in range(1, cols):
        dist[0][col] = dist[0][col-1] + cost(t[col-1], INSERT)

    for col in range(1, cols):
        for row in range(1, rows):
            src, tgt = s[row-1], t[col-1]
            if src == tgt:
                subs = 0
            else:
                subs = max(cost(src, SUBSTITUTE), cost(tgt, SUBSTITUTE))

            dist[row][col] = min(dist[row-1][col] + cost(src, DELETE),
                                 dist[row][col-1] + cost(tgt, INSERT),
                                 dist[row-1][col-1] + subs) # substitution

    # Print the table row by row (the recorded notebook outputs show it).
    for line in dist:
        print(line)

    # Index the last cell by size rather than by leftover loop variables,
    # which are unbound when s or t is empty.
    return dist[rows-1][cols-1]

#iterative_levenshtein('ædvænsbʌtsætəpil', 'ædvænsbʌtsɛtəpil')

# # default:
# print(iterative_levenshtein("Cat", 
#                             "har", 
#                             h=(1, 2, 8),
#                             a=(4, 5, 4),
#                             r=(1,1,1)))

# print(iterative_levenshtein("cat", 
#                             "hat"))
#print(iterative_levenshtein("abc", "xyz", costs=(1,1,substitution_costs)))

In [29]:
# Take the first pair of strings: ('ædvænsbʌtsætəpil', 'ædvænsbʌtsɛtəpil')
n = list_of_tuples[0]

# Distance between the two strings of the pair, with no custom weights passed
compute_distance_1 = iterative_levenshtein(*n)
compute_distance_1

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
[1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
[3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
[4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
[5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
[6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
[9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7]
[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6]
[11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5, 6]
[12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 2, 1, 2, 3, 4, 5]
[13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 3, 2, 1, 2, 3, 4]
[14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 4, 3, 2, 1, 2, 3]
[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 5, 4, 3, 2, 1, 2]
[16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 1]


1

In [32]:
# Distance for the same pair, this time with a custom weight for one character.
# ɛ gets (1, 1, 2); iterative_levenshtein reads the tuple as
# (deletion, insertion, substitution) costs, in that order.
compute_distance_2 = iterative_levenshtein(*n, ɛ=(1, 1, 2))
compute_distance_2

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
[1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
[3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
[4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
[5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
[6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
[9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7]
[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6]
[11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 2, 2, 3, 4, 5, 6]
[12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 3, 2, 3, 4, 5, 6]
[13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 4, 3, 2, 3, 4, 5]
[14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 5, 4, 3, 2, 3, 4]
[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 3]
[16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2]


2