In [2]:
import panphon.distance

In [3]:
# Shared panphon Distance instance; closest_phones() calls its feature,
# weighted-feature and Hamming-feature edit-distance methods.
dst = panphon.distance.Distance()

In [4]:
def closest_phones(target: str, options: list, dist_type: str) -> list:
    """Return the option(s) with the smallest distance to ``target``.

    Distances are computed with the module-level ``dst`` object.

    Args:
        target: Phone to find a replacement for.
        options: Candidate phones to compare against ``target``.
        dist_type: Which ``dst`` metric to use: ``'feature'``
            (``feature_edit_distance``), ``'weighted'``
            (``weighted_feature_edit_distance``) or ``'hamming'``
            (``hamming_feature_edit_distance``).

    Returns:
        All options tied for the minimum distance, in the order they appear
        in ``options``. Empty if ``options`` is empty.

    Raises:
        ValueError: If ``dist_type`` is not one of the supported names.
    """
    method_names = {
        'feature': 'feature_edit_distance',
        'weighted': 'weighted_feature_edit_distance',
        'hamming': 'hamming_feature_edit_distance',
    }
    # Fail fast: previously an unknown dist_type only printed a message and
    # then crashed (or silently reused a stale score) on the comparison below.
    if dist_type not in method_names:
        raise ValueError(
            f'Give valid dist_type: {{feature, weighted, hamming}}, got {dist_type!r}'
        )
    # Resolve the metric once instead of re-dispatching for every option.
    score = getattr(dst, method_names[dist_type])

    # Start from infinity so that no real distance is ever excluded by an
    # arbitrary cap.
    min_score = float('inf')
    best_phones = []
    for option in options:
        option_score = score(target, option)
        if option_score < min_score:
            best_phones = [option]
            min_score = option_score
        elif option_score == min_score:
            best_phones.append(option)

    return best_phones

In [5]:
# Load the English and Russian phone inventories, one entry per line with
# surrounding whitespace removed. The phones are IPA symbols, so the encoding
# is pinned instead of being left to the platform default.
# NOTE(review): assumes both files are UTF-8 encoded — confirm.
with open('data/map/english_mfa_phones.txt', 'r', encoding='utf-8') as f:
    eng_phones = [line.strip() for line in f]

with open('data/map/russian_mfa_phones.txt', 'r', encoding='utf-8') as f2:
    rus_phones = [line.strip() for line in f2]

In [6]:
# Hand-picked phones to be mapped onto English because they are taken to be
# absent from the English inventory:
#   urum_phones          - Urum phones that do not exist in English
#   russ_not_eng_phones  - Russian phones that do not exist in English
# NOTE(review): the recorded Russian-vs-English output shows 'ɫ' matching
# itself, so 'ɫ' may actually be present in eng_phones — confirm.
urum_phones = ['dː', 'lː', 'mː', 'r', 'sː', 'tː', 'x', 'y', 'œ', 'ɣ', 'ɯ']
russ_not_eng_phones = ['ɫ', 'tɕ', 'ɨ', 'ʂ', 'ʐ']

In [7]:
# Build the full Urum inventory from a tab-separated table, keeping the second
# column of every row (the IPA symbol, going by the file name).
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('data/map/urum_xsampa-to-ipa.txt', 'r', encoding='utf-8') as f3:
    all_urum_phones = [
        line.strip().split('\t')[1]
        for line in f3
        if line.strip()  # blank lines have no second column and would raise IndexError
    ]

In [45]:
# For each Urum-only phone, print the closest English phone(s) under the
# weighted, Hamming and plain feature distances (in that order).
for urum_phone in urum_phones:
    print(urum_phone,
          *(closest_phones(urum_phone, eng_phones, metric)
            for metric in ('weighted', 'hamming', 'feature')))

dː ['d'] ['d'] ['d']
lː ['l'] ['l'] ['l']
mː ['m'] ['m'] ['m']
r ['ɾ'] ['ɾ'] ['ɾ']
sː ['s'] ['s'] ['s']
tː ['t'] ['t'] ['t']
x ['ç'] ['k', 'ç'] ['k', 'ç']
y ['ʉ'] ['u', 'ʉ'] ['ʉ']
œ ['ɔ', 'ɛ'] ['ɔ', 'ɛ'] ['ɔ', 'ɛ']
ɣ ['ç'] ['ɡ'] ['ɡ']
ɯ ['ə', 'ɪ', 'ʊ'] ['ə', 'ɪ', 'ʊ'] ['ə', 'ɪ', 'ʊ']


In [9]:
# For each Russian phone listed as missing from English, print the closest
# English phone(s) under the weighted, Hamming and plain feature distances
# (in that order).
for russ_phone in russ_not_eng_phones:
    print(russ_phone,
          *(closest_phones(russ_phone, eng_phones, metric)
            for metric in ('weighted', 'hamming', 'feature')))

ɫ ['ɫ'] ['ɫ'] ['ɫ']
tɕ ['tʃ'] ['tʃ'] ['tʃ']
ɨ ['i', 'ɐ', 'ɜ'] ['i', 'ɐ', 'ɜ'] ['i', 'ɐ', 'ɜ']
ʂ ['ʃ'] ['s', 'ʃ', 'ʈ'] ['s', 'ʃ', 'ʈ']
ʐ ['ʒ'] ['z', 'ɖ', 'ʒ'] ['z', 'ɖ', 'ʒ']


In [18]:
# For each Russian phone listed as missing from English, print the closest
# Urum phone(s) under the weighted, Hamming and plain feature distances
# (in that order).
for russ_phone in russ_not_eng_phones:
    print(russ_phone,
          *(closest_phones(russ_phone, all_urum_phones, metric)
            for metric in ('weighted', 'hamming', 'feature')))

ɫ ['ɫ'] ['ɫ'] ['ɫ']
tɕ ['tʃ'] ['tʃ'] ['tʃ']
ɨ ['ɯ'] ['ɯ', 'i'] ['ɯ', 'i']
ʂ ['ʃ'] ['ʃ', 's'] ['ʃ', 's']
ʐ ['ʒ'] ['ʒ', 'z'] ['ʒ', 'z']


In [16]:
# Emit a tab-separated Urum -> Russian mapping for every Urum phone that is not
# already in the Russian inventory. Only the weighted distance is used, and
# only the first of any tied candidates is printed.
for urum_phone in all_urum_phones:
    if urum_phone not in rus_phones:
        print(urum_phone, closest_phones(urum_phone, rus_phones, 'weighted')[0], sep='\t')

ɾ	r
œ	ɛ
ɯ	ɨ
ʃ	ʂ
ʒ	ʐ
d	d̪
dː	d̪ː
dʒ	dʐː
l	ɫ
lː	ɫː
n	n̪
s	s̪
sː	s̪ː
t	t̪
tː	t̪ː
tʃ	tʂ
y	ʉ
z	z̪


In [15]:
# Sanity check: print, for each phone in russ_not_eng_phones, whether it is
# present in the loaded Russian inventory (rus_phones).
for russ_phone in russ_not_eng_phones:
    print(russ_phone in rus_phones)

True
True
True
True
True
