In [1]:
import numpy as np
import re
import os
import maps
from maps import *
import math
import sys

In [2]:
class Pinyin:
    """One pinyin syllable split into consonant (initial), vowel (final) and tone.

    The constructor expects a string whose last character is the tone digit,
    e.g. ``"mei3"``. The part before the digit is lower-cased, split into
    consonant and vowel, and then normalised by :meth:`pinyinRewrite`.

    Raises:
        ValueError: if the last character is not a digit (from ``int``) or if
            the syllable cannot be split into a known consonant/vowel.
        IndexError: if ``pinyinstr`` is empty.
    """

    # Known initials and finals, taken from the project-local ``maps`` module.
    consonantList = maps.consonantList
    vowelList = maps.vowelList

    def __init__(self, pinyinstr):
        self.tone = int(pinyinstr[-1])
        self.locp = pinyinstr[0:-1].lower()
        self.consonant, self.vowel = self.parseConsonant(self.locp)
        if self.vowel is None:
            # parseConsonant already printed a diagnostic; fail with a clear
            # error instead of an opaque TypeError inside pinyinRewrite.
            raise ValueError("Invalid Pinyin: {!r}".format(pinyinstr))
        self.pinyinRewrite()

    def parseConsonant(self, pinyin):
        """Split ``pinyin`` (without tone) into ``(consonant, vowel)``.

        Returns the first entry of ``consonantList`` that prefixes ``pinyin``
        together with the remainder; ``(None, vowel)`` when the whole string is
        a known vowel; ``(None, None)`` (after printing a message) otherwise.
        """
        for consonant in self.consonantList:
            if pinyin.startswith(consonant):
                return (consonant, pinyin[len(consonant):])
        # it's a vowel without consonant
        if pinyin in self.vowelList:
            return None, pinyin.lower()

        print("Invalid Pinyin, please check!")
        return None, None

    def toStringNoTone(self):
        """Return consonant + vowel without the tone digit."""
        return "{}{}".format(self.consonant, self.vowel)

    def toStringWithTone(self):
        """Return consonant + vowel + tone digit."""
        return "{}{}{}".format(self.consonant, self.vowel, self.tone)

    def toString(self):
        """Alias of :meth:`toStringWithTone`."""
        return self.toStringWithTone()

    def pinyinRewrite(self):
        """Normalise spelling conventions in place.

        * ``v`` is rewritten to ``u:``.
        * A leading ``y`` / ``w`` is dropped and folded into the vowel
          (``y`` + u-type vowel becomes the ``u:`` form, otherwise an ``i`` is
          prefixed; ``w`` prefixes a ``u``), collapsing doubled letters.
        * After ``j``, ``q`` or ``x`` a bare ``u`` becomes ``u:``.
        * ``iou`` / ``uei`` / ``uen`` are shortened to ``iu`` / ``ui`` / ``un``.

        A syllable without consonant only gets the ``v`` rewrite and an empty
        string as consonant.
        """
        yVowels = {"u", "ue", "uan", "un", "u:", "u:e", "u:an", "u:n"}
        # Initials after which a written "u" stands for "u:". The original set
        # contained "g" instead of "q", which turned "gu" into "gu:" and left
        # "qu" untouched.
        tconsonant = {"j", "q", "x"}

        if 'v' in self.vowel:
            self.vowel = self.vowel.replace("v", "u:")

        if self.consonant is None or self.consonant == "":
            self.consonant = ""
            return

        if self.consonant == "y":
            if self.vowel in yVowels:
                if "u:" not in self.vowel:
                    self.vowel = self.vowel.replace("u", "u:")
            else:
                self.vowel = "i" + self.vowel
                # Collapse the doubled/tripled "i" created by the prefix.
                self.vowel = self.vowel.replace("iii", "i")
                self.vowel = self.vowel.replace("ii", "i")
            self.consonant = ""

        if self.consonant == "w":
            self.vowel = "u" + self.vowel
            # Collapse the doubled/tripled "u" created by the prefix.
            self.vowel = self.vowel.replace("uuu", "u")
            self.vowel = self.vowel.replace("uu", "u")
            self.consonant = ""

        # Every "v" was already replaced above, so only the bare "u" case
        # remains to be handled here.
        if self.consonant in tconsonant and self.vowel == "u":
            self.vowel = "u:"

        if self.vowel == "iou":
            self.vowel = "iu"

        if self.vowel == "uei":
            self.vowel = "ui"

        if self.vowel == "uen":
            self.vowel = "un"

In [3]:
def get_distance_2d_code(X, Y):
    """Return the Euclidean distance between two 2-component codes.

    Both ``X`` and ``Y`` must unpack into exactly two numeric components.
    """
    (first_x, second_x), (first_y, second_y) = X, Y

    gap_first = abs(first_x - first_y)
    gap_second = abs(second_x - second_y)

    squared_sum = gap_first**2 + gap_second**2
    return math.sqrt(squared_sum)

def get_sim_dis_from_hardcod_map(a, b):
    """Return the hard-coded distance between two syllables, if any.

    ``hardcodeMap`` is consulted in both directions using the tone-less
    spelling of ``a`` and ``b``. When either direction maps one syllable to
    the other, the fixed distance ``2.0`` is returned; otherwise
    ``sys.float_info.max`` is returned so that a ``min`` with another distance
    ignores this value.

    Args:
        a, b: objects exposing ``toStringNoTone()``; ``None`` yields "no match".

    Returns:
        float: ``2.0`` on a match, ``sys.float_info.max`` otherwise.
    """
    no_match = sys.float_info.max
    if a is None or b is None:
        return no_match

    key_a = a.toStringNoTone()
    key_b = b.toStringNoTone()

    # Compare by value: toStringNoTone() builds a fresh string on every call,
    # so an identity (`is`) comparison would practically never succeed.
    # NOTE(review): assumes hardcodeMap maps spelling -> spelling (str -> str).
    if hardcodeMap.get(key_a) == key_b or hardcodeMap.get(key_b) == key_a:
        return 2.0
    return no_match
    
    
def get_edit_distance_close_2d_code(a, b):
    """Return the phonetic distance between two parsed syllables.

    The distance is the smaller of
      * the sum of the 2-D code distances of the consonants and of the vowels
        (looked up in ``consonantMap_TwoDCode`` / ``vowelMap_TwoDCode``), and
      * the hard-coded distance from ``get_sim_dis_from_hardcod_map``,
    plus a tone penalty of ``abs(a.tone - b.tone) / 10``.

    Args:
        a, b: syllable objects exposing ``consonant``, ``vowel``, ``tone`` and
            ``toString()``.

    Returns:
        The distance, or ``0`` (after printing a message) when either argument
        is ``None``.

    Raises:
        Exception: when the distance cannot be computed (for example a
            consonant or vowel missing from the code maps). The underlying
            error is attached as ``__cause__``.
    """
    if (a is None) or (b is None):
        # Do not call methods on a missing syllable: the original guard did and
        # therefore failed with an AttributeError instead of reporting.
        print("Error:pinyin({},{})".format(
            None if a is None else a.toString(),
            None if b is None else b.toString()))
        return 0

    try:
        cDis = get_distance_2d_code(consonantMap_TwoDCode[a.consonant],
                                    consonantMap_TwoDCode[b.consonant])
        vDis = get_distance_2d_code(vowelMap_TwoDCode[a.vowel],
                                    vowelMap_TwoDCode[b.vowel])
        hcDis = get_sim_dis_from_hardcod_map(a, b)

        return min((cDis + vDis), hcDis) + 1.0 * abs(a.tone - b.tone) / 10
    except Exception as err:
        # Keep the original exception type and message for callers, but chain
        # the cause and let KeyboardInterrupt/SystemExit propagate untouched.
        raise Exception("Error pinyin {}{}".format(a.toString(), b.toString())) from err

In [4]:
def get_distance(utterance1, utterance2):
    """Return the phonetic distance between two utterances.

    Each utterance is a sequence of pinyin strings with a trailing tone digit
    (e.g. ``['zhuo2', 'mei3']``). Every syllable of the first utterance is
    compared with every syllable of the second; each pairwise distance from
    ``get_edit_distance_close_2d_code`` is weighted by how many of
    consonant / vowel / tone differ. For each syllable the closest counterpart
    is kept, and the per-utterance means of those minima are combined:
    the smaller mean when the lengths differ, the average of both otherwise.

    Args:
        utterance1, utterance2: non-empty sequences of pinyin strings.

    Returns:
        The distance as a float (0.0 means every syllable of at least one
        utterance has an identical counterpart in the other).

    Raises:
        ValueError: if either utterance is empty.
    """
    # Weight applied per syllable when normalising the difference count.
    PER_SYLLABLE_WEIGHT = 2.1
    # A tone mismatch counts far less than a consonant or vowel mismatch.
    TONE_DIFF_WEIGHT = 0.01

    n = len(utterance1)
    m = len(utterance2)
    if n == 0 or m == 0:
        # Without this check numpy fails later with an opaque
        # "zero-size array to reduction operation" ValueError.
        raise ValueError("get_distance requires two non-empty utterances")

    la = [Pinyin(py) for py in utterance1]
    lb = [Pinyin(py) for py in utterance2]

    tot = (n + m) / 2 * PER_SYLLABLE_WEIGHT
    # Every cell is assigned in the loop below, so no fill value is needed.
    distance_matrix = np.empty((n, m))

    for i, apy in enumerate(la):
        for j, bpy in enumerate(lb):
            res = get_edit_distance_close_2d_code(apy, bpy)

            numDiff = 0
            if apy.consonant != bpy.consonant:
                numDiff += 1
            if str(apy.vowel) != str(bpy.vowel):
                numDiff += 1
            if apy.tone != bpy.tone:
                numDiff += TONE_DIFF_WEIGHT
            diffRatio = numDiff / tot

            distance_matrix[i][j] = res * diffRatio

    # Closest counterpart for each syllable, seen from either utterance.
    p1_mean = np.mean(np.min(distance_matrix, axis=1))
    p2_mean = np.mean(np.min(distance_matrix, axis=0))

    if n != m:
        diss = np.min([p1_mean, p2_mean])
    else:
        diss = np.mean([p1_mean, p2_mean])
    return diss * tot / 2

In [73]:
def pinyin_similarity(p1, p2):
    """Return ``(distance, similarity)`` for two pinyin utterances.

    The similarity is the hyperbolic mapping ``alpha / (distance + alpha)``
    with ``alpha = 13``: a distance of 0 gives 1 and the value decreases as
    the distance grows.

    NOTE(review): this notebook defines ``pinyin_similarity`` in three cells;
    whichever cell ran last wins. The outputs recorded further down are
    consistent with this definition.
    """
    damping = 13
    distance = get_distance(p1, p2)
    similarity = 1 / (distance + damping) * damping
    return distance, similarity

In [6]:
def pinyin_similarity(p1, p2):
    """Return ``(distance, similarity)`` using a shifted logistic curve.

    The similarity is ``1.5 - 1 / (1 + exp(-distance))``: 1.0 at distance 0,
    approaching 0.5 as the distance grows.

    NOTE(review): this notebook defines ``pinyin_similarity`` in three cells;
    in a top-to-bottom run this definition replaces the one above it and is
    itself replaced by the one below.
    """
    distance = get_distance(p1, p2)
    logistic = 1 / (1 + np.exp(-distance))
    return distance, 1.5 - logistic

In [7]:
def pinyin_similarity(p1, p2):
    """Return ``(distance, similarity)`` blending two decreasing curves.

    A distance of exactly 0 yields similarity 1. Otherwise the similarity is
    ``0.05 * s1 + 0.95 * s2`` where ``s1 = 1 - 1 / (1 + exp(-log(d)))`` and
    ``s2 = 1 / (d + 2.5) + (1 - 1 / 2.5)``.

    NOTE(review): this notebook defines ``pinyin_similarity`` in three cells;
    in a top-to-bottom run this last definition is the one left in effect,
    while the recorded outputs below do not match it.
    """
    distance = get_distance(p1, p2)
    if distance == 0:
        return distance, 1

    logistic_weight = 0.05
    offset = 2.5

    logistic_term = 1 - 1 / (1 + np.exp(-np.log(distance)))
    hyperbolic_term = 1 / (distance + offset) + (1 - 1 / offset)

    blended = logistic_weight * logistic_term + (1 - logistic_weight) * hyperbolic_term
    return distance, blended

In [74]:
# Single syllable, identical spelling, tones differ (3 vs. 4).
pinyin_similarity(['mei3'], ['mei4'])

(0.0005, 0.9999615399407714)

In [75]:
# Two 4-syllable utterances that differ only in the last syllable
# ('huang2' vs. 'he2').
pinyin_similarity(['zhuo2', 'mei3', 'ya4', 'huang2',],
['zhuo2', 'mei3', 'ya4', 'he2'])

(4.878604436721632, 0.7271261046135539)

In [76]:
# Two 9-syllable utterances: the first syllable differs in tone only
# ('zhuo3' vs. 'zhuo2'); 'huang2' appears only in the first utterance and
# 'he2' only in the second.
pinyin_similarity(['zhuo3', 'mei3', 'ya4', 'gu3', 'bao3', 'jiu3', 'huang2', 'dian4', 'gong1'],
['zhuo2', 'mei3', 'ya4', 'gu3', 'bao3', 'jiu3', 'dian4', 'he2', 'gong1'])

(1.522577728734868, 0.8951578874511895)

In [77]:
# Utterances of unequal length (8 vs. 9 syllables): the second one contains an
# extra 'he2', and the first syllable differs in tone only.
pinyin_similarity(['zhuo3', 'mei3', 'ya4', 'gu3', 'bao3', 'jiu3', 'dian4', 'gong1'],
['zhuo2', 'mei3', 'ya4', 'gu3', 'bao3', 'jiu3', 'dian4', 'he2', 'gong1'])

(6.25e-05, 0.9999951923308061)

In [78]:
# Two 4-syllable utterances that share no syllable.
pinyin_similarity(['zhuo2', 'mei3', 'ya4', 'huang2',],
['bao3', 'jiu3', 'dian4', 'he2'])

(17688.032553020777, 0.0007344204334442331)

In [79]:
# The second utterance is a strict prefix of the first (3 of its 4 syllables).
pinyin_similarity(['zhuo2', 'mei3', 'ya4', 'huang2',],
['zhuo2', 'mei3', 'ya4'])

(0.0, 1.0)

In [None]:
# u1 = ['zhuo2', 'mei3', 'ya4', 'huang2',]
# u2 = ['zhuo2', 'mei3', 'ya4', 'he2']

# utterance1 = u1
# utterance2 = u2

# la = []
# lb = []
# for py in u1:
#     la.append(Pinyin(py))
# for py in u2:
#     lb.append(Pinyin(py))


# res = 0.0
# numDiff = 0        
# tot = len(utterance1)*2.1
# for i in range (len(utterance1)):
#     apy = la[i]
#     bpy = lb[i]

#     if (apy is None) or (bpy is None):
#         raise Exception("!Empty Pinyin {},{}".format(la, lb))
#     res += get_edit_distance_close_2d_code(apy, bpy)

#     if apy.consonant != bpy.consonant:
#         numDiff+=1

#     if not(str(apy.vowel) == str(bpy.vowel)):
#         numDiff+=1

#     if apy.tone != bpy.tone:
#         numDiff+=0.01

# diffRatio = (numDiff)/tot
# print(res*diffRatio)