In [5]:
import phonetics
import fuzzy
import cmudict
import eng_to_ipa as ipa
import re
from tqdm import tqdm_notebook
import subprocess

In [6]:
cmu_dict = dict(cmudict.dict())

In [7]:
keep_list = ["ʌ","ɑ","æ","e","ə","ɜ","ɪ","i","ɒ","ɔ","ʊ","u","aɪ","aʊ","eɪ","oʊ","ɔɪ","eə","ɪə","ʊə","b","d","f","g","h","j","k","l","m","n","ŋ","p","r","ɹ", "s","ʃ","t","tʃ","θ","ð","v","w","z","ʒ","dʒ"]

def preprocess(word):
    """Returns a string of words stripped of punctuation"""
    punct_str = '!"#$%&\'()*+,-./:;<=>/?@[\\]^_`{|}~«» '
    return word.strip(punct_str).lower()

def get_ipa(word, core='espeak'):
    word = preprocess(word)
#     print(word)
    
    if core == "espeak":
        cmd = "espeak {} --ipa=1".format(word)
        result = subprocess.run(cmd.split(), stdout=subprocess.PIPE)
        result = result.stdout.split()[0].decode('utf-8')
        print(result)
    else:
        result = ipa.convert(word)
    
    ret = ""
    for i in result:
        if (i in keep_list) or (i in consonant_list):
            ret += i
    return ret

def ipa_to_viet(ipa_form):
    ret_= ""
    curren_idx = 0
    while curren_idx < len(ipa_form):
        if ipa_form[curren_idx:curren_idx+2] in map_dict:
            ret_ += map_dict[ipa_form[curren_idx:curren_idx+2]]
            curren_idx += 2
        elif ipa_form[curren_idx] in map_dict:
            ret_ += map_dict[ipa_form[curren_idx]]
            curren_idx += 1
        else:
            ret_ += ipa_form[curren_idx]
            curren_idx += 1
    return ret_


def remove_spec_punc(word):
    ret_= []
    for i in result:
        tmp = ""
        for j in i:
            if j != spec_pun:
                tmp += j
        ret_.append(tmp)

In [8]:
def get_ipa_q(word, core='espeak'):
    word = preprocess(word)
    if core == "espeak":
        cmd = "espeak {} --ipa=1".format(word)
        result = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
        result = result.stdout.split()[0].decode('utf-8')
    else:
        result = ipa.convert(word)
    return result

In [9]:
ipa_chars = set()

In [10]:
words = "English is a West Germanic language that was first spoken in early medieval England and is now the third most widespread native language in the world after Standard Chinese and Spanish as well as the most widely spoken Germanic language Welcome to the WikiWikiWeb, also known as. A lot of people had their first wiki experience here. This community has been around since 1995 and consists of many people. We always accept newcomers with valuable contributions. If you haven't used a wiki before, be prepared for a bit of CultureShock. The usefulness of Wiki is in the freedom, simplicity, and power it offers."

In [11]:
words

"English is a West Germanic language that was first spoken in early medieval England and is now the third most widespread native language in the world after Standard Chinese and Spanish as well as the most widely spoken Germanic language Welcome to the WikiWikiWeb, also known as. A lot of people had their first wiki experience here. This community has been around since 1995 and consists of many people. We always accept newcomers with valuable contributions. If you haven't used a wiki before, be prepared for a bit of CultureShock. The usefulness of Wiki is in the freedom, simplicity, and power it offers."

In [12]:
words = preprocess(word=words).split(' ')

In [13]:
words

['english',
 'is',
 'a',
 'west',
 'germanic',
 'language',
 'that',
 'was',
 'first',
 'spoken',
 'in',
 'early',
 'medieval',
 'england',
 'and',
 'is',
 'now',
 'the',
 'third',
 'most',
 'widespread',
 'native',
 'language',
 'in',
 'the',
 'world',
 'after',
 'standard',
 'chinese',
 'and',
 'spanish',
 'as',
 'well',
 'as',
 'the',
 'most',
 'widely',
 'spoken',
 'germanic',
 'language',
 'welcome',
 'to',
 'the',
 'wikiwikiweb,',
 'also',
 'known',
 'as.',
 'a',
 'lot',
 'of',
 'people',
 'had',
 'their',
 'first',
 'wiki',
 'experience',
 'here.',
 'this',
 'community',
 'has',
 'been',
 'around',
 'since',
 '1995',
 'and',
 'consists',
 'of',
 'many',
 'people.',
 'we',
 'always',
 'accept',
 'newcomers',
 'with',
 'valuable',
 'contributions.',
 'if',
 'you',
 "haven't",
 'used',
 'a',
 'wiki',
 'before,',
 'be',
 'prepared',
 'for',
 'a',
 'bit',
 'of',
 'cultureshock.',
 'the',
 'usefulness',
 'of',
 'wiki',
 'is',
 'in',
 'the',
 'freedom,',
 'simplicity,',
 'and',
 'power

In [14]:
import re
import os
import json
from eng_to_ipa import transcribe
import numpy as np
import unicodedata
import eng_to_ipa

In [15]:
eng_to_ipa.syllable_count('medieval')

3

In [16]:
cmu_dict['spoken']

[['S', 'P', 'OW1', 'K', 'AH0', 'N']]

In [95]:
for idx, word in enumerate(words):
    cmd = "espeak {} --ipa=1".format(word)
    result = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    result = result.stdout.read().decode('utf-8').split("\n")[0].split("_")
    
    ret = "".join([i for i in mapping(remove_spec_punc(result))])
    
    print(idx, word, remove_spec_punc(result), ret, ipa_to_viet(ret))


0 in False
1 nɡ False
2 ɡl False
3 li False
4 is False
5 s False
0 english ['ɪ', 'ŋ', 'ɡ', 'l', 'ɪ', 'ʃ'] inɡlis ['i', 'n', 'ɡ', 'l', 'i', 's']
0 id False
1 d False
1 is ['ɪ', 'z'] id ['i', 'd']
0 ây False
1 y False
2 a ['eɪ'] ây ['â', 'y']
0 qu True
2 es False
3 st False
4 t False
3 west ['w', 'ɛ', 's', 't'] quest ['qu', 'e', 's', 't']
0 ch True
2 ơm False
3 ma False
4 an False
5 ni False
6 ic False
7 c False
4 germanic ['dʒ', 'ɜː', 'm', 'a', 'n', 'ɪ', 'k'] chơmanic ['ch', 'ơ', 'm', 'a', 'n', 'i', 'c']
0 la False
1 an False
2 nɡ False
3 ɡq False
4 qu True
6 ic False
7 ch True
5 language ['l', 'a', 'ŋ', 'ɡ', 'w', 'ɪ', 'dʒ'] lanɡquich ['l', 'a', 'n', 'ɡ', 'qu', 'i', 'ch']
0 đa False
1 at False
2 t False
6 that ['ð', 'a', 't'] đat ['đ', 'a', 't']
0 qu True
2 od False
3 d False
7 was ['w', 'ɒ', 'z'] quod ['qu', 'o', 'd']
0 ph True
2 ơs False
3 st False
4 t False
8 first ['f', 'ɜː', 's', 't'] phơst ['ph', 'ơ', 's', 't']
0 sp False
1 pâ False
2 âu False
3 uc False
4 cơ False
5 ơn False
6 n 

0 ip False
1 ph True
76 if ['ɪ', 'f'] iph ['i', 'ph']
0 du False
1 u False
77 you ['j', 'uː'] du ['d', 'u']
0 ha False
1 av False
2 vơ False
3 ơn False
4 nt False
5 t False
78 haven't ['h', 'a', 'v', 'ə', 'n', 't'] havơnt ['h', 'a', 'v', 'ơ', 'n', 't']
0 du False
1 ud False
2 dđ False
3 đ False
79 used ['j', 'uː', 'z', 'd'] dudđ ['d', 'u', 'd', 'đ']
0 ây False
1 y False
80 a ['eɪ'] ây ['â', 'y']
0 qu True
2 ic False
3 ci False
4 i False
81 wiki ['w', 'ɪ', 'k', 'i'] quici ['qu', 'i', 'c', 'i']
0 bi False
1 ip False
2 ph True
4 o False
82 before, ['b', 'ɪ', 'f', 'ɔː'] bipho ['b', 'i', 'ph', 'o']
0 bi False
1 i False
83 be ['b', 'iː'] bi ['b', 'i']
0 pr False
1 ri False
2 ip False
3 pe False
4 eđ False
5 đ False
84 prepared ['p', 'ɹ', 'ɪ', 'p', 'eə', 'd'] pripeđ ['p', 'r', 'i', 'p', 'e', 'đ']
0 ph True
2 o False
85 for ['f', 'ɔː'] pho ['ph', 'o']
0 ây False
1 y False
86 a ['eɪ'] ây ['â', 'y']
0 bi False
1 it False
2 t False
87 bit ['b', 'ɪ', 't'] bit ['b', 'i', 't']
0 ov False
1 v False
8

In [18]:
removed_pun_ipa_list = [712, 716]
# , 720]

In [19]:
def remove_spec_punc(word):
    ret_= []
    for i in result:
        tmp = ""
        for j in i:
            if ord(j) not in removed_pun_ipa_list:
                tmp += j
        ret_.append(tmp)
    return ret_

def mapping(word):
    ret_= []
    for w in word:
        ret_.append(map_dict[w] if w in map_dict else w)
    return ret_

In [48]:
word = "metal"
word = "lanɡquich"
cmd = "espeak {} --ipa=1".format(word)
result = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
result = result.stdout.read().decode('utf-8').split("\n")[0].split("_")
print(idx, word, result, remove_spec_punc(result), "".join([i for i in mapping(remove_spec_punc(result))]))


102 lanɡquich ['ˈɛ', 'l ɐ ˈɛ', 'n l', 'ˌɛ', 't', 'ə', '', 't', 'ˈuː', '', 's', 'ˈɪ', 'k', 's', '', 'w', 'ˈɒ', 'n', ' k', 'j', 'ˈuː j', 'ˈuː ˈaɪ s', 'ˈiː', ' ˈeɪ', 'tʃ'] ['ɛ', 'l ɐ ɛ', 'n l', 'ɛ', 't', 'ə', '', 't', 'uː', '', 's', 'ɪ', 'k', 's', '', 'w', 'ɒ', 'n', ' k', 'j', 'uː j', 'uː aɪ s', 'iː', ' eɪ', 'tʃ'] el ɐ ɛn letơtusicsquon kduː juː aɪ si eɪch


In [98]:
viet_consonant_apha = [
 'b',
 'c',
 'd',
 'đ',
 'g',
 'h',
 'k',
 'l',
 'n',
 'p',
 'q',
 'r',
 's',
 't',
 'v',
 'x']

viet_vowel_alpha = ['a','ă', 'â', 'e', 'ê', 'o', 'ô', 'ơ', 'u', 'ư', 'i', 'y']

viet_compound_alpha = ['ch', 'gh', 'ph', 'tr', 'th', 'qu', 'kh', 'gi', 'ng', 'ng']

In [99]:
viet_compound_alpha

['ch', 'gh', 'ph', 'tr', 'th', 'qu', 'kh', 'gi', 'ng', 'ng']

In [100]:
def ipa_to_viet(ipa_form):
    ret_= []
    current_idx = 0
    while current_idx < len(ipa_form):
        print(current_idx, ipa_form[current_idx:current_idx+2], ipa_form[current_idx:current_idx+2] in viet_compound_alpha)
        
        if ipa_form[current_idx:current_idx+2] in viet_compound_alpha:
            ret_.append(ipa_form[current_idx:current_idx+2])
            current_idx += 2
        else:
            ret_.append(ipa_form[current_idx])
            current_idx += 1
    return ret_

In [101]:
ipa_to_viet('lanɡquich')

0 la False
1 an False
2 nɡ False
3 ɡq False
4 qu True
6 ic False
7 ch True


['l', 'a', 'n', 'ɡ', 'qu', 'i', 'ch']

In [92]:
'ng' in viet_compound_alpha

True

In [24]:
# http://www.antimoon.com/how/pronunc-soundsipa.htm
map_dict = {
    "iə": "i ơ",
    "ɐ": "a",
    "əʊ": "âu",
    "ʌ":    "â",
    "ɑː": "a",
    "ɑ":    "o",
    "æ":    "a",
    "e":    "e",
    "ə":    "ơ",
    "ɜ":    "ơ",
    "ɪ":    "i",
    "i":    "i",
    "ɒ":    "o",
    "ɔ":    "ô",
    "ʊ":    "u",
    "u":    "u",
    "aɪ":   "ai",
    "aʊ":   "ao",
    "eɪ":   "ây",
    "oʊ":   "âu", # go >< home
    "ɔɪ":   "oi",
    "eə":   "e",
    "ɪə":   "ia",
    "ʊə":   "ua",
    "b":    "b",
    "d":    "đ",
    "f":    "ph",
    "g":    "g",
    "h":    "h",
    "j":    "d",
    "k":    "c",
    "l":    "l",
    "m":    "m",
    "n":    "n",
    "ŋ":    "n",
    "p":    "p",
    "r":    "r",
    "s":    "s",
    "ʃ":    "s",
    "t":    "t",
    "tʃ":   "ch",
    "θ":    "th",
    "ð":    "đ",
    "v":    "v",
    "w":    "qu",
    "z":    "d",
    "ʒ":    "d",
    "dʒ":   "ch", 
    "ju":   "iu", # exception
    "ɔk":   "óc",
    "ɑʊ": "ao",
    "tɹ": "tr",
    "ɒs" : "ot",
    "ɪl": "iu", 
    "ɪv": "i",
    "əl": "ồ",
    "ɛ": "e",
    "ɹ" : "r",
    "ɜː": "ơ",
    "iː": "i",
    "uː": "u",
    "ɔː": "o"
}

### Rules

1. 

In [27]:
word = "metal"
word = "language"
cmd = "espeak {} --ipa=1".format(word)
result = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
result = result.stdout.read().decode('utf-8').split("\n")[0].split("_")
print(idx, word, result, remove_spec_punc(result), "".join([i for i in mapping(remove_spec_punc(result))]))

0 language ['l', 'ˈa', 'ŋ', 'ɡ', 'w', 'ɪ', 'dʒ'] ['l', 'a', 'ŋ', 'ɡ', 'w', 'ɪ', 'dʒ'] lanɡquich


In [28]:
def is_consonant(char):
    if char in viet_consonant_apha:
        return True
    else:
        return False

def is_vowel(char):
    if char in viet_vowel_alpha:
        return True
    else:
        return False

    

In [29]:
def ipa_to_viet(ipa_form):
    ret_= []
    curren_idx = 0
    while curren_idx < len(ipa_form):
        if ipa_form[curren_idx:curren_idx+2] in viet_compound_alpha:
            ret_.append(ipa_form[curren_idx:curren_idx+2])
            curren_idx += 2
        elif ipa_form[curren_idx] in map_dict:
            ret_.append(ipa_form[curren_idx:curren_idx+1])
            curren_idx += 1
        else:
            ret_.append(ipa_form[curren_idx:curren_idx])
            curren_idx += 1
    return ret_