In [1]:
# 引用套件並縮寫為 pd  
import pandas as pd
#讀入 EHowNet
from ehownet_python3 import *

def read_databases():
    try:
        #讀入字典
        cdict = pd.read_csv('dictionary.csv', sep=",")
        cdict.drop_duplicates(subset = None, keep = "last", inplace = True)
        #print(dict)
        
        EHowTree=EHowNetTree("db/ehownet_ontology.sqlite")
        #tree=EHowNetTree("db/ehownet_ontology_sim.sqlite")
        #print(dict)
    
        # 讀入 SAMPA 
        sampadict = pd.read_csv('sampa.csv')
        #print(sampadict)
        return(cdict, EHowTree, sampadict)
    except Exception as e:
        print(e)
        exit(-1)

In [2]:
import chardet
from opencc import OpenCC

def file_translation(fn):
    try:
        infile = fn
        rawdata = open(infile, "rb").read()
        result = chardet.detect(rawdata)
        charenc = result['encoding']
        #print("The file encoding:", charenc)
        if charenc != 'UTF-16':
            print("Suspecious File format: ", charenc)
            exit(-1)
            
        # Here we assume that the original contains simplifed chinese, the conversion result is UTF-8 tranditional chinese
        cc = OpenCC('s2t')
        source = open(infile, 'r', encoding = charenc)
        result = open(infile+".utf8", 'w', encoding = 'utf-8')
        #source就放純文字檔，轉完就放進去result
        count = 0
        while True:
            line = source.readline()
            line = cc.convert(line)
            if not line:  #readline會一直讀下去，這邊做的break
                break
            #print(line)
            count = count +1
            result.write(line) 
            #print('===已處理'+str(count)+'行===')
        source.close()        
        result.close()
        print("Chinese conversion complete")
    except Exception as e:        
        print(e)
        exit(-1)
        


In [3]:
from __future__ import print_function, division
import os
import sys
import tgt

EXTENSION = 'TextGrid'

def print_tiernames(filenames):
    for filename in filenames:
        try:
            tg = tgt.io.read_textgrid(filename)
            print(filename)
            for tiername in tg.get_tier_names():
                print('\t' + tiername)
        except err:
            print(filename + ' caused a problem.')
            sys.stderr.write('ERROR: %s\n' % str(err))

def read_txtgrid(filename):
    tg = tgt.io.read_textgrid(filename)
    return tg

def add_text_tier(tg, NNM=""):
    new_tier = tgt.core.PointTier(name=NNM)
    tg.add_tier(new_tier)
    return new_tier

def add_interval_tier(tg, NNM="", STT=0, ETT=0):
    new_tier = tgt.core.IntervalTier(name=NNM, start_time = STT, end_time = ETT)
    tg.add_tier(new_tier)
    return new_tier

def print_tier(tg):
    for tiername in tg.get_tier_names():
        print('\t' + tiername)   

def write_txtgrid(tg, NNM):
    tgt.io.write_to_file(tg, filename=NNM )
    
def remove_old_tier(tg, name):
    if (tg.has_tier(name)):
        tg.delete_tier(name)

In [4]:
def contain_chinese(check_str):
    """
    判断字符串中是否包含中文
    :param check_str: {str} 需要检测的字符串
    :return: {bool} 包含返回True， 不包含返回False
    """
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False

In [5]:
# Major program starts here

(cdict, EHowTree, sampadict) = read_databases()
file_translation("step1.TextGrid")

Chinese conversion complete


In [6]:
tg = read_txtgrid('step1.TextGrid.utf8')
ophonetier = tg.get_tier_by_name('IU/phone')
owordtier =  tg.get_tier_by_name('Word')

phonetier = ophonetier.get_copy_with_gaps_filled()
wordtier  = owordtier.get_copy_with_gaps_filled()

pst = phonetier.start_time
pet = phonetier.end_time

wst = wordtier.start_time
wet = wordtier.end_time

st = pst
et = pet

In [7]:
print(wordtier)
print("---")
print(phonetier)

#ann = wordtier.get_annotation_by_start_time(0)
#print(ann)
#ann = phonetier.get_annotation_by_start_time(0)
#print(ann)

print(st, et)

IntervalTier(start_time=0.0, end_time=20.59631224126035, name="Word", objects=[Interval(0.0, 0.07, "sp"), Interval(0.07, 0.38, "到"), Interval(0.38, 0.57, "sp"), Interval(0.57, 0.94, "這個"), Interval(0.94, 1.1964528587272878, "sp"), Interval(1.1964528587272878, 2.09, "富士山"), Interval(2.09, 2.31, "我們"), Interval(2.31, 2.53, "念"), Interval(2.53, 2.7821661521009964, "sp"), Interval(2.7821661521009964, 3.81, "富士山"), Interval(3.81, 4.014285687135808, "他們"), Interval(4.014285687135808, 4.43, "念成"), Interval(4.43, 4.6, "sp"), Interval(4.6, 5.15056920266588, "fuji"), Interval(5.15056920266588, 5.45338301549331, "sp"), Interval(5.45338301549331, 6.005216359436273, "fuji"), Interval(6.005216359436273, 6.886760347082612, "fuji"), Interval(6.886760347082612, 7.227598588929736, "所以"), Interval(7.227598588929736, 7.73, "β"), Interval(7.73, 7.826039376265833, "你"), Interval(7.826039376265833, 7.96, "可以"), Interval(7.96, 8.34, "發現"), Interval(8.34, 8.5, "sp"), Interval(8.5, 8.7, "兩"), Interval(8.7, 8.78

In [8]:
# Extract all tha annotations from word layer

timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)
annlist = []
while (timeptr < et):
    if ann is not None:
        if (ann.text != 'sp'): 
            #print(ann.text)
            annlist.append(ann)
        timeptr = timeptr + ann.duration()
        #print(timeptr)
        ann = wordtier.get_annotation_by_start_time(timeptr)
    else:
        break

print(annlist)

[Interval(0.07, 0.38, "到"), Interval(0.57, 0.94, "這個"), Interval(1.1964528587272878, 2.09, "富士山"), Interval(2.09, 2.31, "我們"), Interval(2.31, 2.53, "念"), Interval(2.7821661521009964, 3.81, "富士山"), Interval(3.81, 4.014285687135808, "他們"), Interval(4.014285687135808, 4.43, "念成"), Interval(4.6, 5.15056920266588, "fuji"), Interval(5.45338301549331, 6.005216359436273, "fuji"), Interval(6.005216359436273, 6.886760347082612, "fuji"), Interval(6.886760347082612, 7.227598588929736, "所以"), Interval(7.227598588929736, 7.73, "β"), Interval(7.73, 7.826039376265833, "你"), Interval(7.826039376265833, 7.96, "可以"), Interval(7.96, 8.34, "發現"), Interval(8.5, 8.7, "兩"), Interval(8.7, 8.78, "個"), Interval(8.78, 9.064960599449725, "雙脣"), Interval(9.064960599449725, 9.23, "會"), Interval(9.23, 9.7, "震動"), Interval(9.86, 10.04, "那"), Interval(10.173421945409295, 10.600281267341645, "如果"), Interval(10.600281267341645, 10.710647936130238, "這"), Interval(10.710647936130238, 10.959178004535149, "是"), Interval(10.9

In [9]:
# 處理第一層中文層

ctext = ""

for a in annlist:
    if (contain_chinese(a.text)):
        ctext = ctext + a.text
        # print(a.text)
    else:
        if (ctext[-1] == " "):
            ctext = ctext + a.text + " "
        else:
            ctext = ctext + " " + a.text + " "
            
#print(ctext)
ctier = tgt.core.PointTier(st, et, u"IU/逐字稿")
#add_text_tier(newtg, u"IU/逐字稿")
cann = tgt.core.Point((et-st)/2.0, ctext)
ctier.add_annotation(cann)
print(ctext)

到這個富士山我們念富士山他們念成 fuji fuji fuji 所以 β 你可以發現兩個雙脣會震動那如果這是無聲有聲的話是像那個嗯西班牙文 vuey  那個西班牙文裏面就有那個這個 b β的這個音  


In [10]:
# 處理第二層英文層

from googletrans import Translator
translator = Translator()
tr = translator.translate(ctext, dest = 'en')
print (tr.text)

etier = tgt.core.PointTier(st, et, "English")
#add_text_tier(newtg, "English")
eann = tgt.core.Point((et-st)/2.0,tr.text)
etier.add_annotation(eann)

To this Mount Fuji, we read Mount Fuji and they read it as fuji fuji fuji so β ​​you can find that the two lips will vibrate. If this is silent, it is like that um. Spanish vuey. There is this b β in Spanish.


In [11]:
# 處理第三層 加入詞意層
#postier = add_interval_tier(newtg, "IU/Word", st, et)
postier = tgt.core.IntervalTier(st, et, "IU/Word")
timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)
while (timeptr < et):
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=" ")
    if (contain_chinese(ann.text)): 
        newannstr = ""
        #去 EHowNet 查詞性
        posp=EHowTree.searchWord(ann.text)
        #print(ann.text, posp)
        if posp:
            for w in posp:
                #print(w.pos)
                newannstr = newannstr + w.pos + " "
            #print(node)
            newann.text = newannstr.rstrip()
    elif ann.text == 'sp':
        newann.text = 'sp'
    #print(newannstr)
    #print(c, "[", newann.start_time, newann.end_time, "]", ss)
    postier.add_annotation(newann)
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)
    if ann == None:
        break
    
print(postier)

IntervalTier(start_time=0.0, end_time=20.59631224126035, name="IU/Word", objects=[Annotation(0.0, 0.07, "sp"), Annotation(0.07, 0.38, "Caa VCL Di P P P"), Annotation(0.38, 0.57, "sp"), Annotation(0.57, 0.94, ""), Annotation(0.94, 1.1964528587272878, "sp"), Annotation(1.1964528587272878, 2.09, "Nc"), Annotation(2.09, 2.31, "Nh"), Annotation(2.31, 2.53, "VK VC VC"), Annotation(2.53, 2.7821661521009964, "sp"), Annotation(2.7821661521009964, 3.81, "Nc"), Annotation(3.81, 4.014285687135808, "Nh"), Annotation(4.014285687135808, 4.43, ""), Annotation(4.43, 4.6, "sp"), Annotation(4.6, 5.15056920266588, ""), Annotation(5.15056920266588, 5.45338301549331, "sp"), Annotation(5.45338301549331, 6.005216359436273, ""), Annotation(6.005216359436273, 6.886760347082612, ""), Annotation(6.886760347082612, 7.227598588929736, "Cbb"), Annotation(7.227598588929736, 7.73, ""), Annotation(7.73, 7.826039376265833, "Nh"), Annotation(7.826039376265833, 7.96, "D VH D D"), Annotation(7.96, 8.34, "VE Na"), Annotatio

In [20]:
#第四層等於外面的 phone layer

print(phonetier)

IntervalTier(start_time=0.0, end_time=20.59631224126035, name="IU/phone", objects=[Interval(0.0, 0.07, "sp"), Interval(0.07, 0.12029973919034785, "t"), Interval(0.12029973919034785, 0.2485198396947419, "ɑ"), Interval(0.2485198396947419, 0.38, "w"), Interval(0.38, 0.57, "sp"), Interval(0.57, 0.625064944973469, "ts"), Interval(0.625064944973469, 0.7159551427993686, "ɤ"), Interval(0.7159551427993686, 0.7695154379467737, "k"), Interval(0.7695154379467737, 0.94, "ɤ"), Interval(0.94, 1.1964528587272878, ""), Interval(1.1964528587272878, 1.248390114627802, "f"), Interval(1.248390114627802, 1.3684950188977407, "u"), Interval(1.3684950188977407, 1.4950920801552439, "ʂ"), Interval(1.4950920801552439, 1.6784955150539345, "ɨ"), Interval(1.6784955150539345, 1.7628935558922698, "ʂ"), Interval(1.7628935558922698, 1.918705323593812, "a"), Interval(1.918705323593812, 2.09, "n"), Interval(2.09, 2.14, "w"), Interval(2.14, 2.19, "ɔ"), Interval(2.19, 2.25, "m"), Interval(2.25, 2.28, "ə"), Interval(2.28, 2.

In [14]:
#處理第五層 IU/Syllable (CGVN)層

cgvntier = tgt.core.IntervalTier(st, et, "IU/Syllable")
timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    #print(ann.text)
    if contain_chinese(ann.text):
        #詞含有中文，抓出音來
        
        pholist = phonetier.get_annotations_between_timepoints(ann.start_time, ann.end_time)
        print(ann.text)
        cgvnstr = ''
        for p in pholist:
            ptext = p.text
            #print(ptext, end =" ")
            cgvn = sampadict.loc[sampadict['IPA'] == ptext, 'CGVN']
            if not cgvn.empty:
                cstring = cgvn.to_string(index=False).replace('\n','')
                #print(cstring)
                cgvnstr = cgvnstr + cstring
            #cgvnstr = cgvnstr.replace(" ","")
            print(">>> ", ptext, cgvnstr)
    elif ann.text == "sp":
        cgvnstr = "sp"
    else:
        cgvnstr = " "
    cgvnstr = cgvnstr.replace(" ","")
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, cgvnstr)
    cgvntier.add_annotation(newann)
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)
    if not ann:
        break

print(cgvntier)

到
>>>  t  C
>>>  ɑ  C V
>>>  w  C V G
這個
>>>  ts  C
>>>  ɤ  C V
>>>  k  C V C
>>>  ɤ  C V C V
富士山
>>>  f  C
>>>  u  C V V
>>>  ʂ  C V V C
>>>  ɨ  C V V C V
>>>  ʂ  C V V C V C
>>>  a  C V V C V C V
>>>  n  C V V C V C V C
我們
>>>  w  G
>>>  ɔ  G V V
>>>  m  G V V C
>>>  ə  G V V C
>>>  n  G V V C C
念
>>>  n  C
>>>  j  C G
>>>  ɛ  C G V
>>>  n  C G V C
富士山
>>>  f  C
>>>  u  C V V
>>>  ʂ  C V V C
>>>  ɨ  C V V C V
>>>  ʂ  C V V C V C
>>>  a  C V V C V C V
>>>  n  C V V C V C V C
他們
>>>  t  C
>>>  a  C V
>>>  m  C V C
>>>  ə  C V C
>>>  n  C V C C
念成
>>>  n  C
>>>  j  C G
>>>  ɛ  C G V
>>>  n  C G V C
>>>  tʂʰ  C G V C C
>>>  ə  C G V C C
>>>  n  C G V C C C
所以
>>>  s  C
>>>  w  C G
>>>  ɔ  C G V V
>>>  i  C G V V V
你
>>>  n  C
>>>  i  C V
可以
>>>  k  C
>>>  &  C
>>>  i  C V
發現
>>>  f  C
>>>  a  C V
>>>  ɕ  C V C
>>>  j  C V C G
>>>  ɛ  C V C G V
>>>  n  C V C G V C
兩
>>>  l  C
>>>  j  C G
>>>  ɑ  C G V
>>>  ŋ  C G V
個
>>>  g 
>>>  ɤ  V
雙脣
>>>  ʂ  C
>>>  w  C G
>>>  ŋ  C G
>>>  tʂʰ  C G C
>

In [19]:
#處理第五層 IU/Syllable (CGVN)層

cgvntier = tgt.core.IntervalTier(st, et, "IU/Syllable")
timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    #print(ann.text)
    if contain_chinese(ann.text):
        #詞含有中文，抓出音來
        
        pholist = phonetier.get_annotations_between_timepoints(ann.start_time, ann.end_time)
        print(ann.text)
        cgvnstr = ''
        tlist = ""
        for p in pholist:
            tlist = tlist + p.text
        print(tlist)
    elif ann.text == "sp":
        cgvnstr = "sp"
    else:
        cgvnstr = " "
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, cgvnstr)
    cgvntier.add_annotation(newann)
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)
    if not ann:
        break

#print(cgvntier)

到
tɑw
這個
tsɤkɤ
富士山
fuʂɨʂan
我們
wɔmən
念
njɛn
富士山
fuʂɨʂan
他們
tamən
念成
njɛntʂʰən
所以
swɔi
你
ni
可以
k&i
發現
faɕjɛn
兩
ljɑŋ
個
gɤ
雙脣
ʂwŋtʂʰwən
會
xwej
震動
tʂəntoŋ
那
na
如果
ʐukwɔ
這
tʂɤ
是
ʂɨ
無聲
uʂəŋ
有聲
jowʂəŋ
的
tɤ
話
xwa
是
ʂɨ
像
ɕjɑŋ
那
na
個
kɤ
嗯
ən
西班牙文
ɕɨpanjawən
那個
nakɤ
西班牙文
ɕipanjawən
裏面
imjan
就
tɕjow
有
jow
那
na
個
kɤ
這
tʂɤ
個
gɤ
β的
tɤ
這
tʂɤ
個
kɤ
音
iŋ


In [15]:
#第六層處理四聲層

to1tier = tgt.core.IntervalTier(st, et,"IU/Tone")

timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)
while (timeptr < et):
    #print(ann.text)
    if (contain_chinese(ann.text)): 
        newannstr = ''
        duration = ann.end_time - ann.start_time
        intv = duration / len(ann.text)
        cstart = ann.start_time
        for c in ann.text:
            newann = tgt.core.Annotation(cstart, cstart+intv, text='')
            cstart = cstart + intv
            pho = cdict.loc[cdict['character'] == c].phone
            ss = pho.to_string(index=False).replace('\n','')
            newann.text = ss.lstrip()
            print(c, ss)
    elif ann.text == "sp":
        newann = tgt.core.Annotation(ann.start_time, ann.end_time, text="sp")
    else:
        newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=" ")
        
    to1tier.add_annotation(newann)
    #print(ann.text, "[", newann.start_time, newann.end_time, "]", "...")
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)
    if not ann:
        break
    
print(to1tier)

到  51
這  51
個  21 51  0
富  51
士  51
山  55
我  21
們  35  0
念  51
富  51
士  51
山  55
他  55
們  35  0
念  51
成  35
所  21
以  21
你  21
可  21 51
以  21
發  55
現  51
兩  21
個  21 51  0
雙  55 51
脣  35
會  21 51
震  51
動  51
那  55 21 51 35
如  35
果  21
這  51
是  51
無  35
聲  55
有  21 51
聲  55
的   0 35 51
話  51
是  51
像  51
那  55 21 51 35
個  21 51  0
嗯  0
西  55
班  55
牙  35
文  35 51
那  55 21 51 35
個  21 51  0
西  55
班  55
牙  35
文  35 51
裏 Series([], )
面  51
就  51
有  21 51
那  55 21 51 35
個  21 51  0
這  51
個  21 51  0
β Series([], )
的   0 35 51
這  51
個  21 51  0
音  55 51
IntervalTier(start_time=0.0, end_time=20.59631224126035, name="IU/Tone", objects=[Annotation(0.0, 0.07, "sp"), Annotation(0.07, 0.38, "51"), Annotation(0.38, 0.57, "sp"), Annotation(0.7549999999999999, 0.94, "21 51  0"), Annotation(0.94, 1.1964528587272878, "sp"), Annotation(1.792150952909096, 2.09, "55"), Annotation(2.2, 2.3100000000000005, "35  0"), Annotation(2.31, 2.53, "51"), Annotation(2.53, 2.7821661521009964, "sp"), Annotation(3.46738871

In [16]:
# Two tiers from original textgrid file
import copy

newtg = tgt.core.TextGrid()

newtg.add_tier(ctier)
newtg.add_tier(etier)
newtg.add_tier(wordtier)
newtg.add_tier(postier)
newtg.add_tier(ipatier)
newtg.add_tier(cgvntier)
newtg.add_tier(to1tier)

newipatier = copy.copy(ipatier)
newipatier.name = 'EU/Phone'
newtg.add_tier(newipatier)

newcgvntier = copy.copy(cgvntier)
newcgvntier.name = 'EU/Syllable'
newtg.add_tier(newcgvntier)

newtonetier = copy.copy(to1tier)
newtonetier.name = 'EU/Tone'
newtg.add_tier(newtonetier)


typetier = tgt.core.IntervalTier(st, et, "EU/Type")
newtg.add_tier(typetier)

subjecttier = tgt.core.IntervalTier(st, et, "Subject")
newtg.add_tier(subjecttier)

print_tier(newtg)
write_txtgrid(newtg, "A0303.textgrid")

	IU/逐字稿
	English
	Word
	IU/Word
	Word
	IU/Syllable
	IU/Tone
	EU/Phone
	EU/Syllable
	EU/Tone
	EU/Type
	Subject


In [None]:
#加上原來兩層

newtg.add_tier(phonetier)


In [None]:
print(ctier)
print(etier)

In [None]:
import sys

def main():
    print('Number of arguments:', len(sys.argv), 'arguments.')
    if (len(sys.argv) != 2):
        print("Usage: sys.argv[0]: textfile, texgridfile")
    print('Argument List:', str(sys.argv))
    print("Hello World!")

if __name__ == "__main__":
    main()
    

In [None]:
from opencc import OpenCC

def translate():
    cc = OpenCC('s2t')
    source = open('03052019_1_009_conversion完.TextGrid.orig', 'r', encoding = 'utf-16')
    result = open('test_translated.txt', 'w', encoding = 'utf-8')
    # source就放純文字檔，轉完就放進去result
    count = 0
    while True:
        line = source.readline()
        line = cc.convert(line)
        if not line:  #readline會一直讀下去，這邊做的break
            break
        #print(line)
        count = count +1
        result.write(line) 
        #print('===已處理'+str(count)+'行===')
    source.close()        
    result.close()
    
translate()

In [None]:
str = "那"
for c in str:
    p = dict.loc[dict['character'] == c].phone
    ss = p.to_string(index=False).replace('\n','')
    #print(ss)
    print(c,ss)
    
#print(dict.phone)

In [None]:
#處理第五層 IU/Syllable (CGVN)層
remove_old_tier(newtg, "IU/Syllable")

cgvntier = add_interval_tier(newtg, "IU/Syllable", st, et)
timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    #print(ann.text)
    if contain_chinese(ann.text):
        intv = (ann.end_time - ann.start_time) / len(c)
        cgvnstart =ann.start_time
        cgvnend = cvgnstart + intv
        for c in ann.text:
            newann = tgt.core.Annotation(cgvnstart, cgvnend, text="")
            
        pholist = phonetier.get_annotations_between_timepoints(ann.start_time, ann.end_time)
        
        cgvnstr = ''
        for p in pholist:
            cgvn = sampadict.loc[sampadict['SAMPA'] == p.text, 'CGVN']
            if cgvn.empty:
                cgvnstr = " "
                break
            else:
                #print(p)
                cgvnstr = cgvnstr + cgvn.to_string(index=False).replace('\n','')
            cgvnstr = cgvnstr.replace(" ","")
    else:
        cgvnstr = " "
        
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=cgvnstr)
    cvgntier.add_annotation(newann)
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)

print(cgvntier)

In [None]:
#處理第五層 IU/Syllable (CGVN)層

cgvntier = add_interval_tier(newtg, "IU/Syllable", st, et)
timeptr = st
ann = phonetier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    #print(ann.text)
    cgvn = sampadict.loc[sampadict['SAMPA'] == ann.text, 'CGVN']
    
    if cgvn.empty:
        cgvnstr = ann.text
    else:
        cgvnstr = cgvn.to_string(index=False).replace('\n','')
    
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=cgvnstr)
    cgvntier.add_annotation(newann)
    timeptr = ann.end_time
    ann = phonetier.get_annotation_by_start_time(timeptr)
    if not ann:
        break

print(cgvntier)

In [13]:
#處理第四層 IU/Phone (IPA)層

ipatier = tgt.core.IntervalTier(st, et, "Word")
timeptr = st
ann = phonetier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    if ann.text == "sp":
        newann = tgt.core.Annotation(ann.start_time, ann.end_time, "sp")
    else:
        print(ann.text)
        ipaa = sampadict.loc[sampadict['SAMPA'] == ann.text, 'IPA']
    
        if ipaa.empty:
            ipaastr = ann.text
        else:
            ipaastr = ipaa.to_string(index=False).replace('\n','')
    
        newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=ipaastr)
    
    ipatier.add_annotation(newann)
    timeptr = ann.end_time
    ann = phonetier.get_annotation_by_start_time(timeptr)
    if not ann:
        break

print(ipatier)

t
ɑ
w
ts
ɤ
k
ɤ

f
u
ʂ
ɨ
ʂ
a
n
w
ɔ
m
ə
n
n
j
ɛ
n
f
u
ʂ
ɨ
ʂ
a
n
t
a
m
ə
n
n
j
ɛ
n
tʂʰ
ə
n


s
w
ɔ
i

n
i
k
&
i
f
a
ɕ
j
ɛ
n
l
j
ɑ
ŋ
g
ɤ
ʂ
w
ŋ
tʂʰ
w
ə
n
x
w
e
j
tʂ
ə
n
t
o
ŋ

n
a

ʐ
u
k
w
ɔ
tʂ
ɤ
ʂ
ɨ
u
ʂ
ə
ŋ
j
o
w
ʂ
ə
ŋ
t
ɤ
x
w
a
ʂ
ɨ
ɕ
j
ɑ
ŋ

n
a
k
ɤ
ə
n
ɕ
ɨ
p
a
n
j
a
w
ə
n

n
a
k
ɤ
ɕ
i
p
a
n
j
a
w
ə
n
l
i
m
j
a
n
tɕ
j
o
w
j
o
w
n
a
k
ɤ
tʂ
ɤ
g
ɤ

t
ɤ
tʂ
ɤ
k
ɤ
i
ŋ

IntervalTier(start_time=0.0, end_time=20.59631224126035, name="Word", objects=[Annotation(0.0, 0.07, "sp"), Annotation(0.07, 0.12029973919034785, "tʰ"), Annotation(0.12029973919034785, 0.2485198396947419, "ɑ"), Annotation(0.2485198396947419, 0.38, "w"), Annotation(0.38, 0.57, "sp"), Annotation(0.57, 0.625064944973469, "ts"), Annotation(0.625064944973469, 0.7159551427993686, "ɤ"), Annotation(0.7159551427993686, 0.7695154379467737, "kʰ"), Annotation(0.7695154379467737, 0.94, "ɤ"), Annotation(0.94, 1.1964528587272878, ""), Annotation(1.1964528587272878, 1.248390114627802, "f"), Annotation(1.248390114627802, 1.3684950188977407, "u"), 