In [1]:
import os
import glob
import tgt
import sys
import pandas as pd
from re import match, sub, findall, finditer


root_dir = '../'
word_level_timing = root_dir+'word_level_timings'
motion_label = root_dir+'motion_labels' 
original_annotation = root_dir + 'transcriptions_annotations'
# word_level     = root_dir + 'word_level'
# video          = root_dir + 'video' 
# audio          = root_dir +'audio' 
# documents      = root_dir + 'documents', 
# metadata       = root_dir+ 'metadata', 
# transcriptions = root_dir+'transcriptions_annotations'

In [2]:
def get_all_textgrid_files(path):
    filenames = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".TextGrid"):
                filenames.append(os.path.join(root, file))
    return filenames
annotations = get_all_textgrid_files(original_annotation)
print(annotations)      

['../transcriptions_annotations/r19/r19.TextGrid', '../transcriptions_annotations/r18/r18.TextGrid', '../transcriptions_annotations/r13/r13.TextGrid', '../transcriptions_annotations/r9/r9.TextGrid', '../transcriptions_annotations/r10/r10.TextGrid', '../transcriptions_annotations/r1/r1.TextGrid', '../transcriptions_annotations/r4/r4.TextGrid', '../transcriptions_annotations/r15/r15.TextGrid', '../transcriptions_annotations/r7/r7.TextGrid', '../transcriptions_annotations/r5/r5.TextGrid', '../transcriptions_annotations/r17/r17.TextGrid', '../transcriptions_annotations/r12/r12.TextGrid', '../transcriptions_annotations/r2/r2.TextGrid', '../transcriptions_annotations/r3/r3.TextGrid', '../transcriptions_annotations/r6/r6.TextGrid', '../transcriptions_annotations/r14/r14.TextGrid', '../transcriptions_annotations/r16/r16.TextGrid', '../transcriptions_annotations/r11/r11.TextGrid', '../transcriptions_annotations/r8/r8.TextGrid']


In [3]:
def clean_utt(utt, literal=False):
    if not literal:
        #replace variants, partial and misspoken words with standard spelling
        utt = sub("""<[vpm]="(.+?)">.+?</[vpm]>""", lambda m:m.group(1), utt)
        #remove fillers like "{F aehm}" entirely
        utt = sub("""{.*?}""", "", utt)
        
        #TO DO: resolve complex replacements like "(der + der) + die) Katze"
        
    else:
        #remove brackets from fillers, i.e. "{F aehm}" becomes "aehm"
        utt = sub("""{(.*?)}""",lambda m:m.group(1),utt)
    #remove all remaining xml-style tags    
    utt = sub("""<.*?>""","",utt)
    #remove open tags at the end of an utterance (can be removed once problems with the TextGrids are fixed)
    utt = sub("""<.*$""","",utt)
    #remove all remaining punctuation and brackets
    utt = sub("""[\.:;,\(\)\+\$]""","",utt)
    #remove whitespace at the beginning and end of an utterance
    utt = utt.strip()
    #replace any amount of whitespace with a single space
    utt = sub("""\s+"""," ",utt)
    return utt


In [29]:
"""
Methods to consume textgrids and convert to the disfluency
corpus style for consistency across different possible raw formats.

This file is distributed as part of DUEL corpus.
"""

# corpus, start_time deleted as parameters
# how to do the basic version? e rps and f
def disfluency_tags(utt):
    """returns the list of tags for each word (simply defined by split)
    and also the list of tags for boundaries (one more than the utt length) 
    for repair points and laughter bouts. NB problem is: the laughter bout itself is a word
    may in fact instead need to do this after we establish which words are proper words"""
    utt = utt.split()
    labels = ["",] * len(utt)
    boundaries = ["",] * (len(utt)+1) # where do we use this?
    inReparandum = 0
    inRepair = 0
    inFP = False # why does this start with True, changed to False
    inLS = False
    for i in range(0,len(utt)):
        word = utt[i]
        if i == 6: print(word)
        if i == 8 : print(word, labels)
        word_clean = clean_utt(word) # this is added
        if word_clean == "-": # this was "-"
            continue
        if "<laughter>" in word or "<laughter/>" in word:
            inLS = True
        if "<p" in word:
            labels[i] = "<p/>"
        for j in range(0,len(word)):
            filled_pause_begin = False
            c = word[j]
            if c=="(":
                inReparandum+=1
            if c == "{":
                if j == len(word)-1:
                    pass #edit term (non-fp)
                elif word[j+1] == "F":
                    inFP = True
                    filled_pause_begin = True
                else:
                    pass
        if i == 6: print(inLS, inFP, filled_pause_begin, inReparandum)
        if i == 8: print(inLS, inFP, filled_pause_begin, inReparandum)   
  
        if inFP or filled_pause_begin: # using and instead of or removed all edit tags in {F Ahm
            labels[i] = "<e/>"
        elif inReparandum>0 and inFP==False:
            labels[i] = "<rm/>"
        elif inRepair>0 and inFP==False:
            labels[i] = "<rp/>" # = instead of += for only one tag. however, open and close </rm> </rm> should be +=
            
        if inLS==True:
            labels[i] = "<ls/>"
        elif "</laughter>" in word:
            inLS=False
            
        for j in range(0,len(word)):
            c = word[j]
            if c == ")": inRepair-=1 # for now counting interegnum within the repairs
            if c == "+": 
                inRepair += 1
                inReparandum -= 1
            if c =="}": #out of the filled pause
                inFP=False
                
        # fluent terms
        if labels[i] == "":
            labels[i] = "<f/>"               
    #if inLS == True:
    #    print "WARNING NO LS END", corpus, start_time
        #raw_input()
        
        # labels[i-1] + utt[i] + labels[i]
       # sandwiched_labels = labels[0] + utt + labels[1]
    print(len(labels), len(utt))
    return labels

In [30]:
print(disfluency_tags("doch man kann ja noch mal <v='ein'>'n</v> paar ( <p='Durchgänge'>D-</p> + Durchgänge ) bauen"))

<v='ein'>'n</v>
False False False 0
( ['<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '', '', '', '', '', '']
False False False 1
14 14
['<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<rm/>', '<rm/>', '<rm/>', '<rp/>', '<rp/>', '<f/>']


In [49]:
# for index, f in enumerate(annotations):
#     clear_text = list()
#     textgrid = tgt.read_textgrid(f)
#     tier_names = textgrid.get_tier_names()
#     print(tier_names)
#     for names in tier_names:
#         text_tier = textgrid.get_tier_by_name(names)
#         for annotation in text_tier.annotations:
#             annotation.text = clean_utt(annotation.text)
#     tgt.io.write_to_file(textgrid, './clean_text/'+str(f.split('/')[3].split('.')[0])+".textgrid")
    # dataframe = pd.DataFrame(clear_text, columns=['file_name', 'tier_name', 'text', 'clean_text', 'start_time', 'end_time'])
    # # print(f.split('/')[3].split('.')[0])
    # dataframe.to_csv('./clean_text/'+str(f.split('/')[3].split('.')[0])+'.csv')

In [52]:
# for index, f in enumerate(annotations):
#     print("file name is ", f)
#     if index == 1:
#         break
#     clear_text = list()
#     textgrid = tgt.read_textgrid(f)
#     tier_names = textgrid.get_tier_names()
#     print("Tier names are", tier_names)
#     for names in tier_names:
#         text_tier = textgrid.get_tier_by_name(names)
#         for annotation in text_tier.annotations:
#             # print(annotation)
#             if len(annotation.text) > 1:
#                 print(annotation.text, annotation.start_time, annotation.end_time, disfluency_tags(annotation.text))
#             # print(annotation.text, convert_to_disfluency_word_tag_tuples_from_raw(annotation.text))


['doch', 'man', 'kann', 'ja', 'noch', 'mal', "<v='ein'>'n</v>", 'paar', '(', "<p='Durchgänge'>D-</p>", '+', 'Durchgänge', ')', 'bauen']
['', '', '', '', '', '', '', '', '', '', '', '', '', '']
word clean doch
stage two ['', '', '', '', '', '', '', '', '', '', '', '', '', '']
word clean man
stage two ['<f/>', '', '', '', '', '', '', '', '', '', '', '', '', '']
word clean kann
stage two ['<f/>', '<f/>', '', '', '', '', '', '', '', '', '', '', '', '']
word clean ja
stage two ['<f/>', '<f/>', '<f/>', '', '', '', '', '', '', '', '', '', '', '']
word clean noch
stage two ['<f/>', '<f/>', '<f/>', '<f/>', '', '', '', '', '', '', '', '', '', '']
word clean mal
stage two ['<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '', '', '', '', '', '', '', '', '']
word clean 'n
stage two ['<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '', '', '', '', '', '', '', '']
word clean paar
stage two ['<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '<f/>', '', '', '', '', '', '', '']
word clean 
stage two ['<f/>', '<f/>'