In [2]:
import pandas as pd

def read_dict():
    dicts=["cdict1.xls", "cdict2.xls", "cdict3.xls"]
    cdict = pd.DataFrame()
    for d in dicts:
        print("Reading", d)
        cdp = pd.read_excel(d)
        cdict = cdict.append(cdp)
    cdict = cdict.drop(columns=["字詞屬性", "字詞號", "部首字", "部首外筆劃數", "總筆劃數","漢語拼音", "相似詞", "相反詞", "釋義", "編按", "多音參見訊息", "異體字"])
    cdict = cdict[~cdict['字詞名'].str.contains('gif')]
    cdict = cdict[~cdict['字詞名'].str.contains('png')]
    cdict = cdict.replace(to_replace ='ㄦ', value = ' ㄦ', regex = True)
    cdict = cdict.replace(to_replace =',', value = '', regex = True)
    cdict = cdict.replace(to_replace ="[\（\(\[].*?[\）\)\]]", value = "", regex = True)
    cdict = cdict.dropna()
    #print(cdict)
    return cdict

cdx = read_dict()

def four_sounds(str):
    if 'ˋ' in str:
        return 4 # can return 51
    elif 'ˊ' in str:
        return 2 # can return 35
    elif 'ˇ' in str:
        return 3 # can return 21
    elif '˙' in str:
        return 0 # can return 0
    else:
        return 1 # can return 55
    
def contain_chinese(check_str):
    """
    判断字符串中是否包含中文
    :param check_str: {str} 需要检测的字符串
    :return: {bool} 包含返回True， 不包含返回False
    """
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False

Reading cdict1.xls
Reading cdict2.xls
Reading cdict3.xls
        字詞名               注音一式
0         八                 ㄅㄚ
1         扒                 ㄅㄚ
2         叭                 ㄅㄚ
3         巴                 ㄅㄚ
4         吧                 ㄅㄚ
5         芭                 ㄅㄚ
6         疤                 ㄅㄚ
7         羓                 ㄅㄚ
8         粑                 ㄅㄚ
9         笆                 ㄅㄚ
10        豝                 ㄅㄚ
11        鈀                 ㄅㄚ
12        捌                 ㄅㄚ
13        拔                ㄅㄚˊ
14        茇                ㄅㄚˊ
15        胈                ㄅㄚˊ
16        軷                ㄅㄚˊ
17        菝                ㄅㄚˊ
18        跋                ㄅㄚˊ
19        鈸                ㄅㄚˊ
20        魃                ㄅㄚˊ
21        鼥                ㄅㄚˊ
22        把                ㄅㄚˇ
23        鈀                ㄅㄚˇ
24        靶                ㄅㄚˇ
25        把                ㄅㄚˋ
26        弝                ㄅㄚˋ
27        爸                ㄅㄚˋ
28        耙                ㄅㄚˋ
29        伯  

In [1]:
# 引用套件並縮寫為 pd  
import pandas as pd
#讀入 EHowNet
from ehownet_python3 import *
import re

def read_ehownet():
    try:
        EHowTree=EHowNetTree("db/ehownet_ontology.sqlite")
        #tree=EHowNetTree("db/ehownet_ontology_sim.sqlite")
        #print(dict)
        
        return EHowTree
    except Exception as e:
        print(e)
        exit(-1)

def build_sampa_table():
    try:
        sampadict = pd.read_excel("SAMPA_revised.xlsx")
        #ssampa = sampadict.sort_values(by=SAMPA.str.len())
        s=sampadict.SAMPA.str.len().sort_values(ascending=False).index
        smp = sampadict.reindex(s)
        smp = smp.reset_index(drop=True)
        #print(smp)
    
        ptnstr = smp['SAMPA'].to_string(index=False).replace("\n","|").replace(" ","")
        #print(ptnstr)
        #newidx = sampadict.SAMPA.str.len().sort_values(ascending=False)
        ptn = re.compile(ptnstr)
    
        return (ptn, smp)
    except Exception as e:
        print(e)
        exit(-1)

In [2]:
import chardet
from opencc import OpenCC

def file_translation(fn):
    try:
        infile = fn
        rawdata = open(infile, "rb").read()
        result = chardet.detect(rawdata)
        charenc = result['encoding']
        #print("The file encoding:", charenc)
        if charenc != 'UTF-16':
            print("Suspecious File format: ", charenc)
            exit(-1)
            
        # Here we assume that the original contains simplifed chinese, the conversion result is UTF-8 tranditional chinese
        cc = OpenCC('s2t')
        source = open(infile, 'r', encoding = charenc)
        result = open(infile+".utf8", 'w', encoding = 'utf-8')
        #source就放純文字檔，轉完就放進去result
        count = 0
        while True:
            line = source.readline()
            line = cc.convert(line)
            if not line:  #readline會一直讀下去，這邊做的break
                break
            #print(line)
            count = count +1
            result.write(line) 
            #print('===已處理'+str(count)+'行===')
        source.close()        
        result.close()
        print("Chinese conversion complete")
    except Exception as e:        
        print(e)
        exit(-1)
        


In [3]:
from __future__ import print_function, division
import os
import sys
import tgt

EXTENSION = 'TextGrid'

def print_tiernames(filenames):
    for filename in filenames:
        try:
            tg = tgt.io.read_textgrid(filename)
            print(filename)
            for tiername in tg.get_tier_names():
                print('\t' + tiername)
        except err:
            print(filename + ' caused a problem.')
            sys.stderr.write('ERROR: %s\n' % str(err))

def read_txtgrid(filename):
    tg = tgt.io.read_textgrid(filename)
    return tg

def add_text_tier(tg, NNM=""):
    new_tier = tgt.core.PointTier(name=NNM)
    tg.add_tier(new_tier)
    return new_tier

def add_interval_tier(tg, NNM="", STT=0, ETT=0):
    new_tier = tgt.core.IntervalTier(name=NNM, start_time = STT, end_time = ETT)
    tg.add_tier(new_tier)
    return new_tier

def print_tier(tg):
    for tiername in tg.get_tier_names():
        print('\t' + tiername)   

def write_txtgrid(tg, NNM):
    tgt.io.write_to_file(tg, filename=NNM )
    
def remove_old_tier(tg, name):
    if (tg.has_tier(name)):
        tg.delete_tier(name)

In [4]:
def contain_chinese(check_str):
    """
    判断字符串中是否包含中文
    :param check_str: {str} 需要检测的字符串
    :return: {bool} 包含返回True， 不包含返回False
    """
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False

In [5]:
import numpy as np
import math

def remove_noise(tr):
    for o in tr._objects:
        if o.duration() < 0.01:
            tr.delete_annotation_by_start_time(o.start_time)

def filling_gaps(tr):
    tr.start_time = 0
    o = tr._objects[0]
    if o.start_time != 0:
        o.start_time = 0
    o = tr._objects[-1]
    if o.end_time != tr.end_time:
        o.end_time = tr.end_time
    print(tr)
    idx = len(tr._objects)
    for i in range(1,idx):
        if tr._objects[i].start_time != tr._objects[i-1].end_time:
            tr._objects[i].start_time = tr._objects[i-1].end_time

def align_tiers(wtr, ptr):
    for w in wtr._objects:
        wst = w.start_time
        wet = w.end_time
        pann = ptr.get_annotations_between_timepoints(w.start_time, w.end_time, False, False)
        #print(wst, wet, pann[0].start_time, pann[-1].end_time)
        
        #print("Wst:", math.isclose(wst, pann[0].start_time), wst, pann[0].start_time)
        if not math.isclose(wst, pann[0].start_time):
            pann2 = ptr.get_annotations_between_timepoints(w.start_time, w.end_time, True, False)
            pst1 = pann[0].start_time
            pst2 = pann2[0].start_time
            #print("S:", w.start_time, pann[0].start_time, pann2[0].start_time)
            if abs(wst - pst1) < abs(wst - pst2):
                pann[0].start_time = wst
            else:
                pann2[0].start_time = wst
        
        #print("Wet:", math.isclose(wet, pann[-1].end_time), wet, pann[-1].end_time)
        if  not math.isclose(wet, pann[-1].end_time):
            pann2 = ptr.get_annotations_between_timepoints(w.start_time, w.end_time, False, True)
            pet1 = pann[-1].end_time
            pet2 = pann2[-1].end_time
            #print("E: ", wet, pann[-1].end_time, pann2[-1].end_time)
            if abs(wet - pet1) < abs(wet - pet2):
                pann[-1].end_time = wet
            else:
                pann2[-1].end_time = wet

In [6]:
def sampa_to_cgvn(sa1, smp, ptn):
    idx = 0
    ipastr = ""
    while idx < len(sa1):
        ma = re.match(ptn, sa1[idx:])
        ipatoken = smp[smp['SAMPA']==ma.group()].Syllable.to_string(index=False)
        ipastr = ipastr + ipatoken + " "
        idx += len(ma.group())
    return ipastr


In [7]:
# Major program starts here

(cdict, EHowTree) = read_databases()
(sampapattern, sampa) = build_sampa_table()

In [11]:
filename = "testv4/01.TextGrid"
file_translation(filename)

Chinese conversion complete


In [12]:
tg = read_txtgrid(filename+'.utf8')

otiernames =  tg.get_tier_names()
tiernames = [x.lower() for x in otiernames]

idx = [i for i, x in enumerate("phone" in x for x in tiernames) if x]
phonetiername = otiernames[idx[0]]

idx = [i for i, x in enumerate("word" in x for x in tiernames) if x]
wordtiername = otiernames[idx[0]]

print(phonetiername, wordtiername)

EU/phone Word


In [13]:
ophonetier = tg.get_tier_by_name(phonetiername)
owordtier =  tg.get_tier_by_name(wordtiername)

phonetier = ophonetier.get_copy_with_gaps_filled()
wordtier  = owordtier.get_copy_with_gaps_filled()

nwordtier  = owordtier.get_copy_with_gaps_filled()
nphonetier = ophonetier.get_copy_with_gaps_filled()

remove_noise(nwordtier)
remove_noise(nphonetier)

print(nwordtier)

filling_gaps(nwordtier)
filling_gaps(nphonetier)

align_tiers(nwordtier, nphonetier)

pst = phonetier.start_time
pet = phonetier.end_time

wst = wordtier.start_time
wet = wordtier.end_time

st = pst
et = pet

IntervalTier(start_time=0.0, end_time=3.26, name="Word", objects=[Interval(0.0, 0.15000000000000036, "這"), Interval(0.15000000000000036, 0.3800000000000008, "是"), Interval(0.3800000000000008, 0.5199999999999996, "sp"), Interval(0.5199999999999996, 0.7000000000000011, "它"), Interval(0.7000000000000011, 1.17, "裏面"), Interval(1.17, 1.3399999999999999, "sp"), Interval(1.3399999999999999, 1.4800000000000004, "不"), Interval(1.4800000000000004, 1.7400000000000002, "大"), Interval(1.7400000000000002, 1.92, "最"), Interval(1.92, 2.0199999999999996, "不"), Interval(2.0199999999999996, 2.24, "大"), Interval(2.24, 2.2699999999999996, "sp"), Interval(2.2699999999999996, 2.5299999999999994, "一樣"), Interval(2.5299999999999994, 2.620000000000001, "的"), Interval(2.620000000000001, 3.08, "地方"), Interval(3.08, 3.26, "sp")])
IntervalTier(start_time=0.0, end_time=3.26, name="Word", objects=[Interval(0.0, 0.15000000000000036, "這"), Interval(0.15000000000000036, 0.3800000000000008, "是"), Interval(0.3800000000000

In [14]:
print(wordtier)
print("---")
print(phonetier)

#ann = wordtier.get_annotation_by_start_time(0)
#print(ann)
#ann = phonetier.get_annotation_by_start_time(0)
#print(ann)

print(st, et)

IntervalTier(start_time=0.0, end_time=3.26, name="Word", objects=[Interval(0.0, 0.15000000000000036, "這"), Interval(0.15000000000000036, 0.3800000000000008, "是"), Interval(0.3800000000000008, 0.5199999999999996, "sp"), Interval(0.5199999999999996, 0.7000000000000011, "它"), Interval(0.7000000000000011, 1.17, "裏面"), Interval(1.17, 1.3399999999999999, "sp"), Interval(1.3399999999999999, 1.4800000000000004, "不"), Interval(1.4800000000000004, 1.7400000000000002, "大"), Interval(1.7400000000000002, 1.92, "最"), Interval(1.92, 2.0199999999999996, "不"), Interval(2.0199999999999996, 2.24, "大"), Interval(2.24, 2.2699999999999996, "sp"), Interval(2.2699999999999996, 2.5299999999999994, "一樣"), Interval(2.5299999999999994, 2.620000000000001, "的"), Interval(2.620000000000001, 3.08, "地方"), Interval(3.08, 3.26, "sp")])
---
IntervalTier(start_time=0.0, end_time=3.26, name="EU/phone", objects=[Interval(0.0, 0.07000000000000028, "Z"), Interval(0.07000000000000028, 0.15000000000000036, "&"), Interval(0.1500

In [15]:
# Extract all tha annotations from word layer

timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)
annlist = []
while (timeptr < et):
    if ann is not None:
        if (ann.text != 'sp'): 
            #print(ann.text)
            annlist.append(ann)
        timeptr = timeptr + ann.duration()
        #print(timeptr)
        ann = wordtier.get_annotation_by_start_time(timeptr)
    else:
        break

print(annlist)

[Interval(0.0, 0.15000000000000036, "這"), Interval(0.15000000000000036, 0.3800000000000008, "是"), Interval(0.5199999999999996, 0.7000000000000011, "它"), Interval(0.7000000000000011, 1.17, "裏面"), Interval(1.3399999999999999, 1.4800000000000004, "不"), Interval(1.4800000000000004, 1.7400000000000002, "大"), Interval(1.7400000000000002, 1.92, "最"), Interval(1.92, 2.0199999999999996, "不"), Interval(2.0199999999999996, 2.24, "大"), Interval(2.2699999999999996, 2.5299999999999994, "一樣"), Interval(2.5299999999999994, 2.620000000000001, "的"), Interval(2.620000000000001, 3.08, "地方")]


In [16]:
# 處理第一層中文層

ctext = ""

for a in annlist:
    if (contain_chinese(a.text)):
        ctext = ctext + a.text
        # print(a.text)
    else:
        if (ctext[-1] == " "):
            ctext = ctext + a.text + " "
        else:
            ctext = ctext + " " + a.text + " "
            
#print(ctext)
ctier = tgt.core.PointTier(st, et, u"IU/逐字稿")
#add_text_tier(newtg, u"IU/逐字稿")
cann = tgt.core.Point((et-st)/2.0, ctext)
ctier.add_annotation(cann)
print(ctext)

這是它裏面不大最不大一樣的地方


In [17]:
# 處理第二層英文層

from googletrans import Translator
translator = Translator()
tr = translator.translate(ctext, dest = 'en')
print (tr.text)

etier = tgt.core.PointTier(st, et, "English")
#add_text_tier(newtg, "English")
eann = tgt.core.Point((et-st)/2.0,tr.text)
etier.add_annotation(eann)

This is not the same as inside it is not the best place


In [18]:
# 處理第三層 加入詞意層
#postier = add_interval_tier(newtg, "IU/Word", st, et)
postier = tgt.core.IntervalTier(st, et, "IU/Word")
timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)
while (timeptr < et):
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=" ")
    if (contain_chinese(ann.text)): 
        newannstr = ""
        #去 EHowNet 查詞性
        posp=EHowTree.searchWord(ann.text)
        #print(ann.text, posp)
        if posp:
            for w in posp:
                #print(w.pos)
                newannstr = newannstr + w.pos + " "
            #print(node)
            newann.text = newannstr.rstrip()
    elif ann.text == 'sp':
        newann.text = 'sp'
    #print(newannstr)
    #print(c, "[", newann.start_time, newann.end_time, "]", ss)
    postier.add_annotation(newann)
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)
    if ann == None:
        break
    
print(postier)

IntervalTier(start_time=0.0, end_time=3.26, name="IU/Word", objects=[Annotation(0.0, 0.15000000000000036, "Nep"), Annotation(0.15000000000000036, 0.3800000000000008, "SHI,SHI D"), Annotation(0.3800000000000008, 0.5199999999999996, "sp"), Annotation(0.5199999999999996, 0.7000000000000011, "Nh"), Annotation(0.7000000000000011, 1.17, "Ncd"), Annotation(1.17, 1.3399999999999999, "sp"), Annotation(1.3399999999999999, 1.4800000000000004, "D"), Annotation(1.4800000000000004, 1.7400000000000002, "VH Dfa"), Annotation(1.7400000000000002, 1.92, "Dfa"), Annotation(1.92, 2.0199999999999996, "D"), Annotation(2.0199999999999996, 2.24, "VH Dfa"), Annotation(2.24, 2.2699999999999996, "sp"), Annotation(2.2699999999999996, 2.5299999999999994, "VH Ng"), Annotation(2.5299999999999994, 2.620000000000001, "Na T"), Annotation(2.620000000000001, 3.08, "A Na"), Annotation(3.08, 3.26, "sp")])


In [19]:
#處理第五層 IU/Syllable (CGVN)層

cgvntier = tgt.core.IntervalTier(st, et, "IU/Syllable")
timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    #print(ann.text)
    if contain_chinese(ann.text):
        #詞含有中文，抓出音來
        
        pholist = phonetier.get_annotations_between_timepoints(ann.start_time, ann.end_time)
        
        pstr = ""
        for p in pholist:
            pstr = pstr + p.text
        
        print(pstr, end = " ")
        
        cvgnstr = sampa_to_cgvn(pstr, sampa, sampapattern)
        print(cvgnstr)
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)
    if not ann:
        break

print(cgvntier)

Z&  C  V 
S%  C  V 
ta  C  V 
limyEn  C  V  N  G  V  N 
bu  C  V 
da  C  V 
zwey  C  G  V  G 
bu  C  V 
d@y  C  V  G 
iyaN  V  G  V  N 
d&  C  V 
difaN  C  V  C  V  N 
IntervalTier(start_time=0.0, end_time=3.26, name="IU/Syllable", objects=[])


In [None]:
#處理第五層 IU/Syllable (CGVN)層

cgvntier = tgt.core.IntervalTier(st, et, "IU/Syllable")
timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    #print(ann.text)
    if contain_chinese(ann.text):
        #詞含有中文，抓出音來
        
        pholist = phonetier.get_annotations_between_timepoints(ann.start_time, ann.end_time)
        print(ann.text)
        cgvnstr = ''
        for p in pholist:
            ptext = p.text
            #print(ptext, end =" ")
            cgvn = sampadict.loc[sampadict['IPA'] == ptext, 'CGVN']
            if not cgvn.empty:
                cstring = cgvn.to_string(index=False).replace('\n','')
                #print(cstring)
                cgvnstr = cgvnstr + cstring
            #cgvnstr = cgvnstr.replace(" ","")
            print(">>> ", ptext, cgvnstr)
    elif ann.text == "sp":
        cgvnstr = "sp"
    else:
        cgvnstr = " "
    cgvnstr = cgvnstr.replace(" ","")
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, cgvnstr)
    cgvntier.add_annotation(newann)
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)
    if not ann:
        break

print(cgvntier)

In [None]:
#第六層處理四聲層

to1tier = tgt.core.IntervalTier(st, et,"IU/Tone")

timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)
while (timeptr < et):
    #print(ann.text)
    if (contain_chinese(ann.text)): 
        newannstr = ''
        duration = ann.end_time - ann.start_time
        intv = duration / len(ann.text)
        cstart = ann.start_time
        for c in ann.text:
            newann = tgt.core.Annotation(cstart, cstart+intv, text='')
            cstart = cstart + intv
            pho = cdict.loc[cdict['character'] == c].phone
            ss = pho.to_string(index=False).replace('\n','')
            newann.text = ss.lstrip()
            print(c, ss)
    elif ann.text == "sp":
        newann = tgt.core.Annotation(ann.start_time, ann.end_time, text="sp")
    else:
        newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=" ")
        
    to1tier.add_annotation(newann)
    #print(ann.text, "[", newann.start_time, newann.end_time, "]", "...")
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)
    if not ann:
        break
    
print(to1tier)

In [None]:
# Two tiers from original textgrid file
import copy

newtg = tgt.core.TextGrid()

newtg.add_tier(ctier)
newtg.add_tier(etier)
newtg.add_tier(wordtier)
newtg.add_tier(postier)
newtg.add_tier(ipatier)
newtg.add_tier(cgvntier)
newtg.add_tier(to1tier)

newipatier = copy.copy(ipatier)
newipatier.name = 'EU/Phone'
newtg.add_tier(newipatier)

newcgvntier = copy.copy(cgvntier)
newcgvntier.name = 'EU/Syllable'
newtg.add_tier(newcgvntier)

newtonetier = copy.copy(to1tier)
newtonetier.name = 'EU/Tone'
newtg.add_tier(newtonetier)


typetier = tgt.core.IntervalTier(st, et, "EU/Type")
newtg.add_tier(typetier)

subjecttier = tgt.core.IntervalTier(st, et, "Subject")
newtg.add_tier(subjecttier)

print_tier(newtg)
write_txtgrid(newtg, "A0303.textgrid")

In [None]:
#加上原來兩層

newtg.add_tier(phonetier)


In [None]:
print(ctier)
print(etier)

In [None]:
import sys

def main():
    print('Number of arguments:', len(sys.argv), 'arguments.')
    if (len(sys.argv) != 2):
        print("Usage: sys.argv[0]: textfile, texgridfile")
    print('Argument List:', str(sys.argv))
    print("Hello World!")

if __name__ == "__main__":
    main()
    

In [None]:
from opencc import OpenCC

def translate():
    cc = OpenCC('s2t')
    source = open('03052019_1_009_conversion完.TextGrid.orig', 'r', encoding = 'utf-16')
    result = open('test_translated.txt', 'w', encoding = 'utf-8')
    # source就放純文字檔，轉完就放進去result
    count = 0
    while True:
        line = source.readline()
        line = cc.convert(line)
        if not line:  #readline會一直讀下去，這邊做的break
            break
        #print(line)
        count = count +1
        result.write(line) 
        #print('===已處理'+str(count)+'行===')
    source.close()        
    result.close()
    
translate()

In [None]:
str = "那"
for c in str:
    p = dict.loc[dict['character'] == c].phone
    ss = p.to_string(index=False).replace('\n','')
    #print(ss)
    print(c,ss)
    
#print(dict.phone)

In [None]:
#處理第五層 IU/Syllable (CGVN)層
remove_old_tier(newtg, "IU/Syllable")

cgvntier = add_interval_tier(newtg, "IU/Syllable", st, et)
timeptr = st
ann = wordtier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    #print(ann.text)
    if contain_chinese(ann.text):
        intv = (ann.end_time - ann.start_time) / len(c)
        cgvnstart =ann.start_time
        cgvnend = cvgnstart + intv
        for c in ann.text:
            newann = tgt.core.Annotation(cgvnstart, cgvnend, text="")
            
        pholist = phonetier.get_annotations_between_timepoints(ann.start_time, ann.end_time)
        
        cgvnstr = ''
        for p in pholist:
            cgvn = sampadict.loc[sampadict['SAMPA'] == p.text, 'CGVN']
            if cgvn.empty:
                cgvnstr = " "
                break
            else:
                #print(p)
                cgvnstr = cgvnstr + cgvn.to_string(index=False).replace('\n','')
            cgvnstr = cgvnstr.replace(" ","")
    else:
        cgvnstr = " "
        
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=cgvnstr)
    cvgntier.add_annotation(newann)
    timeptr = ann.end_time
    ann = wordtier.get_annotation_by_start_time(timeptr)

print(cgvntier)

In [None]:
#處理第五層 IU/Syllable (CGVN)層

cgvntier = add_interval_tier(newtg, "IU/Syllable", st, et)
timeptr = st
ann = phonetier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    #print(ann.text)
    cgvn = sampadict.loc[sampadict['SAMPA'] == ann.text, 'CGVN']
    
    if cgvn.empty:
        cgvnstr = ann.text
    else:
        cgvnstr = cgvn.to_string(index=False).replace('\n','')
    
    newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=cgvnstr)
    cgvntier.add_annotation(newann)
    timeptr = ann.end_time
    ann = phonetier.get_annotation_by_start_time(timeptr)
    if not ann:
        break

print(cgvntier)

In [None]:
#處理第四層 IU/Phone (IPA)層

ipatier = tgt.core.IntervalTier(st, et, "Word")
timeptr = st
ann = phonetier.get_annotation_by_start_time(timeptr)

while (timeptr < et):
    if ann.text == "sp":
        newann = tgt.core.Annotation(ann.start_time, ann.end_time, "sp")
    else:
        print(ann.text)
        ipaa = sampadict.loc[sampadict['SAMPA'] == ann.text, 'IPA']
    
        if ipaa.empty:
            ipaastr = ann.text
        else:
            ipaastr = ipaa.to_string(index=False).replace('\n','')
    
        newann = tgt.core.Annotation(ann.start_time, ann.end_time, text=ipaastr)
    
    ipatier.add_annotation(newann)
    timeptr = ann.end_time
    ann = phonetier.get_annotation_by_start_time(timeptr)
    if not ann:
        break

print(ipatier)