In [2]:
import os
import sys
from collections import Counter, defaultdict
import tgt
from copy import deepcopy
from re import match, sub, findall, finditer
import glob

In [3]:
root_dir = '../'
word_level_timing = root_dir + 'word_level_timings'
motion_label = root_dir + 'motion_labels' 
original_annotation = root_dir + 'transcriptions_annotations'
lang = 'de'
target_dir = "./DUEL/{}".format(lang)

In [4]:
task_index = {
    1 : "dream_apartment",
    2: "film_script",
    3: "border_control"
             }

legal_tiers = {"A-utts" : [u"A", u"A-utts;"], 
               "B-utts" : [u"B", u"B-utts;", u"B_utts"], 
               "A-turns" : [u"A-turns;","A_turns"], 
               "B-turns" : [ u"B-turns;",u"B_turns", u"B-turns    "],
               "A-laughter" : [], 
               "B-laughter" : [u"B−laughter"],
               "A-en" : [u"A-eng", u"A-english",
                         u"A-fr_en", u"A-fr-en",
                         u"A-fr_en;",u"Translation A",
                         u"translation A", u"A translation", u"A Translation"], 
               "B-en" : [u"B-eng", u"B-english",
                         u"B-fr_en", u"B-fr_en;",
                         u"B_fr-en", u"Translation B", 
                         u"translation B", u"B translation",
                         u"B Translation", u"B-fr-en"],
               "Comments" : [u"Comments & questions",
                             u"comments", u"Problems"], 
               "Part" : [u"part"], 
               "O" : [u"E"]
              }

c = Counter()
missing_c = defaultdict(list)
global_tag_count = Counter()
log_file = open("{}_errors.log".format(lang), "w")

In [5]:
"""
Read textgrid function
"""
# simply : tg = tgt.read_textgrid(tg_path)

'\nRead textgrid function\n'

In [6]:
def clean_utt(utt, literal=False):
    if not literal:
        #replace variants, partial and misspoken words with standard spelling
        utt = sub("""<[vpm]="(.+?)">.+?</[vpm]>""", lambda m:m.group(1), utt)
        #remove fillers like "{F aehm}" entirely
        utt = sub("""{.*?}""", "", utt)
        
        #TO DO: resolve complex replacements like "(der + der) + die) Katze"
        
    else:
        #remove brackets from fillers, i.e. "{F aehm}" becomes "aehm"
        utt = sub("""{(.*?)}""",lambda m:m.group(1),utt)
    #remove all remaining xml-style tags    
    utt = sub("""<.*?>""","",utt)
    #remove open tags at the end of an utterance (can be removed once problems with the TextGrids are fixed)
    utt = sub("""<.*$""","",utt)
    #remove all remaining punctuation and brackets
    utt = sub("""[\.:;,\(\)\+\$]""","",utt)
    #remove whitespace at the beginning and end of an utterance
    utt = utt.strip()
    #replace any amount of whitespace with a single space
    utt = sub("""\s+"""," ",utt)
    return utt

In [7]:
"""
Methods to consume textgrids and convert to the disfluency
corpus style for consistency across different possible raw formats.

This file is distributed as part of DUEL corpus.
"""

# corpus, start_time deleted as parameters
# how to do the basic version? e rps and f
def disfluency_tags(utt):
    """returns the list of tags for each word (simply defined by split)
    and also the list of tags for boundaries (one more than the utt length) 
    for repair points and laughter bouts. NB problem is: the laughter bout itself is a word
    may in fact instead need to do this after we establish which words are proper words"""
    utt = utt.split()
    labels = ["",] * len(utt)
    boundaries = ["",] * (len(utt)+1) # where do we use this?
    inReparandum = 0
    inRepair = 0
    inFP = False # why does this start with True, changed to False
    inLS = False
    for i in range(0,len(utt)):
        word = utt[i]
        word_clean = clean_utt(word) # this is added
        if word_clean == "-": # this was "-"
            continue
        
        if "<laughter>" in word or "<laughter/>" in word:
            inLS = True
        if "<p" in word:
            labels[i] = "<p/>"
        for j in range(0,len(word)):
            filled_pause_begin = False
            c = word[j]
            if c=="(":
                inReparandum+=1
            if c == "{":
                if j == len(word)-1:
                    pass #edit term (non-fp)
                elif word[j+1] == "F":
                    inFP = True
                    filled_pause_begin = True
                else:
                    pass
                    
        if inFP or filled_pause_begin: # using and instead of or removed all edit tags in {F Ahm
            labels[i] = "<e/>"
        elif inReparandum>0 and inFP==False:
            labels[i] = "<rm/>"
        elif inRepair>0 and inFP==False:
            labels[i] = "<rp/>" # = instead of += for only one tag. however, open and close </rm> </rm> should be +=
            
        if inLS==True:
            labels[i] = "<ls/>"
        elif "</laughter>" in word:
            inLS=False
            
        for j in range(0,len(word)):
            c = word[j]
            if c == ")": inRepair-=1 # for now counting interegnum within the repairs
            if c == "+": 
                inRepair += 1
                inReparandum -= 1
            if c =="}": #out of the filled pause
                inFP=False
                
        # fluent terms
        if labels[i] == "":
            labels[i] = "<f/>"               
    #if inLS == True:
    #    print "WARNING NO LS END", corpus, start_time
        #raw_input()
        
        # labels[i-1] + utt[i] + labels[i]
       # sandwiched_labels = labels[0] + utt + labels[1]
    return labels

In [8]:
def textgrid_to_dict(tgfile):
    """Returns a dict with the tier names as keys and a list of
    intervals of (start_time, end_time, text) as values.

    :param tgfile: path to textgrid file"""

    textgrid = tgt.read_textgrid(textgrid_file_name)
    
    tgdict = dict()
    for tiername in textgrid.get_tier_names():
        tgdict[tiername] = []
        for textinterval in textgrid.get_tier_by_name(tiername):
            if textinterval.text != '<sil>':
                tgdict[tiername].append((float(textinterval.start_time),
                                         float(textinterval.end_time),
                                         str(textinterval.text
                                             .encode("utf-8").decode("utf-8"))))
    return tgdict

In [9]:
transcription_dir = original_annotation

tgsdict = dict()

for experiment_name in sorted(os.listdir(transcription_dir)):
    
    if ".DS_Store" in experiment_name:
        continue
        
    tgsdict[experiment_name] = []
    print(experiment_name) # r1, r2, r3...
        
    textgrid_file_name = transcription_dir + os.sep + experiment_name + os.sep + experiment_name + ".TextGrid"
    textgrid_file_name_target = target_dir + os.sep + experiment_name + os.sep + experiment_name + ".TextGrid"
    
    # read textgrids
    textgrid_dict = textgrid_to_dict(textgrid_file_name)
    
    for utts in textgrid_dict['A-utts']:
        print(utts[2])
        print(disfluency_tags(utts[2]))
    
    # tgt.io.write_to_file(textgrid, './disf_tags/'+str(f.split('/')[3].split('.')[0])+".textgrid")

    # for uttsB in textgrid_dict['B-utts']:
         
    tgsdict[experiment_name].append(textgrid_dict)
    # print(textgrid_file_name) # ./transcriptions_annotations/r1/r1.TextGrid


.ipynb_checkpoints


FileNotFoundError: [Errno 2] No such file or directory: '../transcriptions_annotations/.ipynb_checkpoints/.ipynb_checkpoints.TextGrid'

In [43]:
tg = tgsdict['r2']
tg[0]['B-utts'][0][2]

'Kann man ja einiges mit anfangen ne'