In [1]:
from __future__ import division
%matplotlib inline
import pandas as pd
import numpy as np
import sys
from collections import defaultdict
import matplotlib.pyplot as plt
from copy import deepcopy

In [2]:
# Extend the module search path so that the `deep_disfluency` imports below
# resolve. The path is relative to the notebook's working directory
# (presumably it points at the repository root -- confirm).
sys.path.append("../../../..")

from deep_disfluency.evaluation.disf_evaluation import incremental_output_disfluency_eval_from_file
from deep_disfluency.evaluation.disf_evaluation import final_output_disfluency_eval_from_file
from deep_disfluency.evaluation.disf_evaluation import ACCURACY_HEADER
from deep_disfluency.evaluation.disf_evaluation import FINAL_OUTPUT_DISFLUENCY_ACCURACY_HEADER
from deep_disfluency.evaluation.disf_evaluation import FINAL_OUTPUT_TTO_ACCURACY_HEADER
from deep_disfluency.evaluation.eval_utils import get_tag_data_from_corpus_file
from deep_disfluency.evaluation.eval_utils import sort_into_dialogue_speakers

In [11]:
# Configuration: locations of every file the evaluation needs.
# The incremental output of each system is assumed to exist already.

# Systems (sub-directories of the experiment directory) to evaluate.
allsystemsfinal = ["021/epoch_40"]

experiment_dir = "../../../experiments"

# Partial words were removed in these experiments, so the file names carry
# no '_partial' suffix.
partial_words = False
partial = '_partial' if partial_words else ''

# Gold-standard evaluation files (text files): heldout first, test second.
disf_dir = "../../../data/disfluency_detection/switchboard"
disfluency_files = [
    disf_dir + "/swbd_disf_heldout{}_data_timings.csv".format(partial),
    disf_dir + "/swbd_disf_test{}_data_timings.csv".format(partial),
]

# Incremental Evaluation

In [12]:
# create final output files for the final output evaluation (and do incremental evaluation first:
DO_INCREMENTAL_EVAL = True
if DO_INCREMENTAL_EVAL:
    all_incremental_results = {}
    all_incremental_error_dicts = {}
    for system in allsystemsfinal:
        print system
        #if 'complex' in system: break
        hyp_dir = experiment_dir + "/" + system
        #hyp_dir = experiment_dir
        for division, disf_file in zip(["heldout","test"], disfluency_files):
            print "*" * 30, division, "*" * 30
            IDs, timings, words, pos_tags, labels = get_tag_data_from_corpus_file(disf_file)
            gold_data = {} #map from the file name to the data
            for dialogue,a,b,c,d in zip(IDs, timings, words, pos_tags, labels):
                # if "asr" in division and not dialogue[:4] in good_asr: continue
                gold_data[dialogue] = (a,b,c,d)
            inc_filename = hyp_dir + "/swbd_disf_{0}{1}_data_output_increco".format(division, partial) + ".text"
            final_output_name = inc_filename.replace("_increco", "_final")
            results, error_analysis = incremental_output_disfluency_eval_from_file(
                                                                             inc_filename,
                                                                             gold_data,
                                                                             utt_eval=True,
                                                                             error_analysis=True,
                                                                             word=True,
                                                                             interval=False,
                                                                             outputfilename=final_output_name)
            for k,v in results.items():
                print k,v
            all_incremental_results[division + "_" + system] = deepcopy(results)
            if "heldout" in division:
                # only do the error analyses on the heldout data
                all_incremental_error_dicts[division + "_" + system] = deepcopy(error_analysis)


021/epoch_40
heldout
loading data ../../../data/disfluency_detection/switchboard/swbd_disf_heldout_data_timings.csv
loaded 102 sequences
102 speakers
incremental output disfluency evaluation
word= True interval= False utt_eval= True




writing final output to file ../../../experiments/021/epoch_40/swbd_disf_heldout_data_output_final.text
edit_overhead_rel_<rm None
delayed_acc_<rm_mean_word None
delayed_acc_<rm_3_word None
delayed_acc_<rm_2_word None
t_t_detection_<e_word 0.173286991063
delayed_acc_<rm_4_word None
t_t_detection_<rms_word 2.51349380015
delayed_acc_<rm_1_word None
edit_overhead_rel_word 5.94697924544
t_t_detection_final_t/>_word None
delayed_acc_<rm_5_word None
edit_overhead_rel_tto None
t_t_detection_t/>_word nan
t_t_detection_<rps_word 1.20744356315
delayed_acc_<rm_6_word None
processing_overhead_word None
test
loading data ../../../data/disfluency_detection/switchboard/swbd_disf_test_data_timings.csv
loaded 100 sequences
100 speakers
incremental output disfluency evaluation
word= True interval= False utt_eval= True
writing final output to file ../../../experiments/021/epoch_40/swbd_disf_test_data_output_final.text
edit_overhead_rel_<rm None
delayed_acc_<rm_mean_word None
delayed_acc_<rm_3_word None
d

# Final output evaluation

In [13]:
all_results = {}
all_error_dicts = {}
for system in allsystemsfinal:
    print system
    #if 'complex' in system: break
    hyp_dir = experiment_dir
    for division, disf_file in zip(["heldout", "test"],disfluency_files):
        #if not division == "heldout": continue
        print "*" * 30, division, "*" * 30
        IDs, timings, words, pos_tags, labels = get_tag_data_from_corpus_file(disf_file)
        gold_data = {} #map from the file name to the data
        for dialogue,a,b,c,d in zip(IDs, timings, words, pos_tags, labels):
            # if "asr" in division and not dialogue[:4] in good_asr: continue
            gold_data[dialogue] = (a,b,c,d)


        #dialogue_speakers.extend(sort_into_dialogue_speakers(IDs,mappings,utts, pos_tags, labels))
        # if "heldout" in division:
        #    good_asr = good_asr_heldout
        #else:
        #    good_asr = good_asr_test

        if "asr" in division:
            error = False
            word = False
        else:
            error = True
            word = True
        #the below does just the final output evaluation, assuming a final output file, faster
        hyp_file = hyp_dir + '/' + system + "/" + "swbd_disf_{0}{1}_data_output_final.text".format(division,
                                                                                                        partial)
        #inc_filename = hyp_dir + "/" + system + "/swbd_disf_heldout_partial_data.final.txt"
        #inc_filename = hyp_dir + "/" + system + ".text"
        #stir_tag_file = inc_filename.replace("_inc_", "_final_")
        
        results,speaker_rate_dict,error_analysis = final_output_disfluency_eval_from_file(
                                                        hyp_file,
                                                        gold_data,
                                                        utt_eval=False,
                                                        error_analysis=error,
                                                        word=word,
                                                        interval=False,
                                                        outputfilename=None
                                                    )
        #the below does incremental and final output in one, also outputting the final outputs
        #derivable from the incremental output, takes quite a while
        for k,v in results.items():
            print k,v
        all_results[division + "_" + system] = deepcopy(results)
        if "heldout" in division:
            # only do the error analyses on the heldout data
            all_error_dicts[division + "_" + system] = deepcopy(error_analysis)


021/epoch_40
****************************** heldout ******************************
loading data ../../../data/disfluency_detection/switchboard/swbd_disf_heldout_data_timings.csv
loaded 102 sequences
102 speakers
final output disfluency evaluation
word= True interval= False utt_eval= False
4519A
No end found for repair beginning at 152 in: 
0	<f/>
1	<f/>
2	<e/>
3	<f/>
4	<f/>
5	<f/>
6	<f/>
7	<f/>
8	<f/>
9	<f/>
10	<f/>
11	<f/>
12	<f/>
13	<f/>
14	<f/>
15	<f/>
16	<f/>
17	<f/>
18	<e/>
19	<f/>
20	<f/>
21	<f/>
22	<e/>
23	<f/>
24	<f/>
25	<f/>
26	<f/>
27	<f/>
28	<f/>
29	<e/>
30	<f/>
31	<f/>
32	<f/>
33	<f/>
34	<f/>
35	<f/>
36	<f/>
37	<f/>
38	<f/>
39	<f/>
40	<f/>
41	<f/>
42	<f/>
43	<f/>
44	<f/>
45	<f/>
46	<f/>
47	<e/>
48	<f/>
49	<f/>
50	<f/>
51	<e/>
52	<f/>
53	<f/>
54	<e/>
55	<f/>
56	<f/>
57	<f/>
58	<f/>
59	<f/>
60	<f/>
61	<f/>
62	<e/>
63	<f/>
64	<f/>
65	<f/>
66	<f/>
67	<f/>
68	<f/>
69	<f/>
70	<f/>
71	<f/>
72	<f/>
73	<f/>
74	<f/>
75	<f/>
76	<f/>
77	<f/>
78	<f/>
79	<f/>
80	<f/>
81	<f/>
82	<f/>
83	<

In [14]:
from collections import OrderedDict
from deep_disfluency.evaluation.disf_evaluation import ACCURACY_HEADER
from deep_disfluency.evaluation.disf_evaluation import FINAL_OUTPUT_TTO_ACCURACY_HEADER
from deep_disfluency.evaluation.disf_evaluation import INCREMENTAL_OUTPUT_TTO_ACCURACY_HEADER


# Maps a final-output result key (a header measure with its evaluation level
# filled in) to the LaTeX column heading used for it in the results table.
#
# The utterance segmentation (TTO) keys are spelled with "t/>", matching the
# header measures as printed by convert_to_latex (e.g. "f1_t/>_{0}"). The
# earlier spelling "t>" could never match a header measure, so those columns
# were always skipped.
final_result_to_latex_dict = OrderedDict([
    ("f1_<rm_word", """$F_{rm}$ (per word)"""),
    ("f1_<rps_word", """$F_{rps}$ (per word)"""),
    ("f1_<e_word", """$F_{e}$ (per word)"""),
    ("f1_t/>_word", """$F_{TTO}$ (per word)"""),
    ("f1_<rps_relaxed_interval", """$F_{rps}$ (per 10s window)"""),
    ("f1_<e_relaxed_interval", """$F_{e}$ (per 10s window)"""),
    ("f1_t/>_relaxed_interval", """$F_{TTO}$ (per 10s window)"""),
    ("pearson_r_correl_rps_rate_per_utt",
     "$rps$ per utterance per speaker correlation"),
    ("NIST_SU_word", "NIST SU (word)"),
    ("DSER_word", "DSER (word)"),
])



def convert_to_latex(results, eval_level=["word", "interval"],
                     inc=False, utt_seg=False,
                    only_include=None):
    """Returns a latex style tabular from results dict.
    Also displays the pandas data frame.
    """
    if not inc:
        result_to_latex_dict = final_result_to_latex_dict
    else:
        result_to_latex_dict = incremental_result_to_latex_dict
    header = []
    system_results = {sys: [] for sys in results.keys()}
    utt_seg_measures = FINAL_OUTPUT_TTO_ACCURACY_HEADER.split(',') + \
                            INCREMENTAL_OUTPUT_TTO_ACCURACY_HEADER.split(',')
    for raw in ACCURACY_HEADER.split(","):
        # print raw
        if not utt_seg and raw in utt_seg_measures:
            print "skipping 1", raw
            continue
        if "NIST_SU" in raw or "DSER" in raw or "edit_overhead" in raw:
            raw += "_{0}"
        for t in eval_level:
            if only_include and raw not in only_include:
                print "skipping 2", raw
                continue
            # if "rps_rate_per_utt" in raw and t == "word":
            #    continue
            r = raw.format(t)
            if not r in result_to_latex_dict.keys():
                print "skipping 3", raw
                continue
            conversion = result_to_latex_dict[r]
            if "EO" in conversion:
                conversion = "EO"
                if t == "word":
                    print "skipping 4", raw
                    continue  # only one interval level eval
            header.append(conversion)
            for sys in results.keys():
                if "asr" in sys and t == "word":
                    result = "-"
                else:
                    result = results[sys][r]
                    if "f1" in raw or "t_t_d" in raw or 'correl' in raw:
                        result = '{0:.3f}'.format(result)
                    else:
                        result = '{0:.2f}'.format(result)
                system_results[sys].append(result)
    rows = []
    for sys in sorted(system_results.keys()):
        corpus = "transcript"
        if 'asr' in sys:
            corpus = "ASR results"
        system = sys.split("_")[-1]
        row = [system + " ({0})".format(corpus)]
        for r in system_results[sys]:
            row.append(r)
        row = tuple(row)
        rows.append(row)
    table = pd.DataFrame(rows, columns=['System (eval. method)'] + header)
    return table

In [15]:
# List the result keys collected so far; each is division + "_" + system.
print all_results.keys()

['heldout_021/epoch_40', 'test_021/epoch_40']


In [19]:
#filter out the heldout files
#print all_results
print all_results.keys()
# all_results['LSTM (with POS)'] = all_results['test_041/epoch_16'] #all_results['heldout_stack_d3_lm3pos_Partial_swbd_disf_train_1_partial_data_3_8_4_2']
all_results['RNN (with POS)'] = all_results['test_021/epoch_40']
test_only = [x for x in all_results.keys() if 'RNN' in x] #if not "heldout" in x and not "complex" in x]
# print test_only
test_results = {k : all_results[k] for k in test_only}
# print "*" * 30
# print test_results
#have a look at the test results
final = convert_to_latex(test_results, eval_level=['word'], inc=False, utt_seg=False, only_include=
                        [])
#final = final.drop(final.columns[[-2]], axis=1)
final

['heldout_021/epoch_40', 'test_021/epoch_40', 'RNN (with POS)']
skipping 3 p_<rm_{0}
skipping 3 r_<rm_{0}
skipping 3 p_<rm.<rp.<i_{0}
skipping 3 r_<rm.<rp.<i_{0}
skipping 3 f1_<rm.<rp.<i_{0}
skipping 3 p_<rps_{0}
skipping 3 r_<rps_{0}
skipping 3 p_<rps_relaxed_{0}
skipping 3 r_<rps_relaxed_{0}
skipping 3 f1_<rps_relaxed_{0}
skipping 3 p_<e_{0}
skipping 3 r_<e_{0}
skipping 3 p_<e_relaxed_{0}
skipping 3 r_<e_relaxed_{0}
skipping 3 f1_<e_relaxed_{0}
skipping 1 p_t/>_{0}
skipping 1 r_t/>_{0}
skipping 1 f1_t/>_{0}
skipping 1 p_t/>_{0}
skipping 1 \ r_t/>_relaxed_{0}
skipping 1 f1_t/>_relaxed_{0}
skipping 1 NIST_SU
skipping 1 DSER
skipping 1 SegER
skipping 3 delayed_acc_<rm_1_{0}
skipping 3 delayed_acc_<rm_2_{0}
skipping 3 delayed_acc_<rm_3_{0}
skipping 3 delayed_acc_<rm_4_{0}
skipping 3 delayed_acc_<rm_5_{0}
skipping 3 delayed_acc_<rm_6_{0}
skipping 3 delayed_acc_<rm_mean_{0}
skipping 3 t_t_detection_<rms_{0}
skipping 3 t_t_detection_<rps_{0}
skipping 3 t_t_detection_<e_{0}
skipping 3 proces

Unnamed: 0,System (eval. method),$F_{rm}$ (per word),$F_{rps}$ (per word),$F_{e}$ (per word),$rps$ per utterance per speaker correlation
0,RNN (with POS) (transcript),0.668,0.744,0.853,0.956


# Error analysis

In [28]:
#Error analyses
target_tags = ['<rms', '<rps']

for div,all_error in all_error_dicts.items():
    # print div, type(all_error)
   
    if type(all_error) == bool: continue
    if "test" in div: continue
    #if not 'TTO only' in div or "asr" in div: continue
    for tag, errors in all_error.items():
        if tag not in target_tags:
            continue
        print "*" * 30, div, tag, "*" * 30
        # print errors
        # continue
        #if not 'TTO only' in div or "asr" in div: continue
        error = {"TP" : {}, "FP" : {}, "FN": {} }
        for k,v in errors.items():
            #if k == "FP":
            #    continue
            print k, len(v)
            typedict = defaultdict(int)
            lendict = defaultdict(int)
            for repair in v:

                #print repair.gold_context
                onset = ""
                if tag == "<rps" or tag == "<rms":
                    
                    
                    for i in range(0,len(repair.gold_context)):
                        if repair.gold_context[i] == "+|+":
                            onset = repair.gold_context[i+1]
                            break

                    word = onset.split("|")[0]
                    #if k == "FP":
                    #    onset = gold_onset
                    if "<e" in onset and not tag == "<e":
                        typedict["<e"]+=1
                    else:
                        if word in ["and","or","but","so","because","that","although"]:
                            typedict["CC"]+=1
                        elif word in ["i","we","they","im","ive","he","she","id"]:
                            typedict["subj"]+=1
                        elif word in ["you","the"] or "$" in word:
                            typedict["proper_other"]+=1
                        elif word in ["yeah","no","okay","yes","right","uh-huh"]:
                            typedict["ack"]+=1
                        elif word in ["it","its"]:
                            typedict["it"]+=1
                        else:
                            typedict[word]+=1
                
                if tag == "<rps" or tag == "<rms": # and not k == 'FP':
                    #print repair.type, "*****"
                    if k == "TP" and len(repair.reparandumWords)>8:
                        print "** overlength repair!"
                        print repair
                        #X = input()
                    lendict[len(repair.reparandumWords) + len(repair.interregnumWords)]+=1
                    repair_type = None
                    if repair.type:
                        repair_type = repair.type 
                        typedict[repair_type]+=1

            error[k]['len'] = deepcopy(lendict)
            error[k]['type'] = deepcopy(typedict)

                
        for mode in ['type', 'len']:
            #q1. THE RECALL RATES FOR VARIOUS GOLD REPAIR TYPES
            print mode, "*" * 30
            tps = error['TP'][mode]
            fns = error['FN'][mode]
            fps = error['FP'][mode]

            total_tps = 0
            total_fns = 0
            total_fps = 0
            top_n = 50
            all_items = list(set(tps.keys() + fns.keys()))
            # print all_items
            for k in sorted(all_items,  reverse=False):
                #print k, "*" * 30
                if mode == 'type' and k not in ["rep", "del", "sub"]:
                    continue
                recall_total = tps[k] + fns[k]
                recall = 0 if tps[k] == 0 else tps[k]/recall_total
                precision_total = tps[k] + fps[k]
                precision = 0 if tps[k] == 0 else tps[k]/precision_total
                fscore = 0 if precision == 0 or recall == 0 else (2 * (precision * recall))/(precision + recall)
                # print k, ':', tps[k], "out of", recall_total
                #print k, ':', tps[k], "out of", precision_total
                total_tps += tps[k]
                total_fns += fns[k]
                total_fps += fps[k]
                print " & ".join([str(k), "({0}/{1})".format(tps[k],recall_total), 
                                  '{0:.1f}'.format(100 * fscore)]) + "\\\\"
                top_n-=1
                if top_n <= 0:
                    break
            print total_tps/(total_fns + total_tps)


****************************** heldout_021/epoch_40 <rps ******************************
FP 280
TP 1433
FN 513
type ******************************
del & (31/101) & 46.3\\
rep & (892/1001) & 93.0\\
sub & (510/844) & 67.1\\
0.736382322713
len ******************************
0 & (0/1) & 0.0\\
1 & (917/1096) & 88.1\\
2 & (338/467) & 74.5\\
3 & (102/205) & 59.0\\
4 & (44/88) & 54.3\\
5 & (16/46) & 49.2\\
6 & (10/26) & 54.1\\
7 & (3/8) & 54.5\\
8 & (2/5) & 57.1\\
9 & (1/1) & 100.0\\
10 & (0/1) & 0.0\\
11 & (0/1) & 0.0\\
15 & (0/1) & 0.0\\
0.736382322713
****************************** heldout_021/epoch_40 <rms ******************************
FP 382
TP 1172
FN 503
type ******************************
del & (0/75) & 0.0\\
rep & (903/1034) & 88.4\\
sub & (250/547) & 49.2\\
0.696256038647
len ******************************
1 & (866/1065) & 84.5\\
2 & (236/363) & 64.0\\
3 & (51/143) & 38.2\\
4 & (16/53) & 29.6\\
5 & (3/29) & 13.6\\
6 & (0/15) & 0.0\\
7 & (0/3) & 0.0\\
8 & (0/2) & 0.0\\
9 & (0/1) & 0.0