In [1]:
import pandas as pd
import glob
import os
import re
import sys
import ast
from collections import Counter

In [98]:
#read files and save the labels to lists
def readFiles(dir):
    """Load one CSV and return its utterances paired with parsed token lists.

    Parameters
    ----------
    dir : path of the CSV file to read (passed straight to ``pd.read_csv``);
        the file must have a header row with ``utt`` and ``tokenized`` columns.

    Returns
    -------
    (pairs, utt) where ``utt`` is the list of whitespace-stripped utterances
    and ``pairs`` is ``[(utterance, tokens), ...]`` in file order. Each
    ``tokens`` value is the ``tokenized`` cell parsed as a Python literal
    (presumably a list of label ids -- confirm against the CSV contents).
    """
    frame = pd.read_csv(dir, index_col=None, header=0)
    utt = [text.strip() for text in frame['utt'].to_list()]
    # The 'tokenized' column stores Python literals as text; parse them back.
    clean_tokens = [ast.literal_eval(cell) for cell in frame['tokenized'].to_list()]
    pairs = list(zip(utt, clean_tokens))
    return pairs, utt


In [99]:
# Load both inputs with readFiles: each call yields the (utt, tokens) pairs
# plus the bare utterance list for that file.
odin_csv, golden_csv = "metadata2tokenzied.csv", "cleanAnnotation.csv"
Odin, Odin_utt = readFiles(odin_csv)
Golden, Golden_utt = readFiles(golden_csv)

In [100]:
#select the Vers-3 data from Odin(Vers-6)
# Index the Odin rows by utterance text (a repeated utterance keeps its last tokens).
OdinDict = {row[0]: row[1] for row in Odin}
# Walk the Golden utterances in order and keep the Odin tokens for each one found.
# Golden utterances absent from Odin are dropped, so the result can be shorter
# than Golden.
selected_Odin = [(utt, OdinDict[utt]) for utt in Golden_utt if utt in OdinDict]
Odin = selected_Odin

In [101]:
#get the label tokens
# Each row is a (utt, tokens) pair; keep only the tokens, preserving row order.
OdinTokens = [tokens for _, tokens in Odin]
GoldenTokens = [tokens for _, tokens in Golden]

In [91]:
# Label names in id order: the position in this tuple IS the numeric label id,
# so new names must only ever be appended. Id 0 is the placeholder label '0'.
_label_names = (
    '0',
    "CriticalVictim",
    'Victim',
    'Room',
    'Engineer',
    'Transporter',
    'Medic',
    'Rubble',
    'MarkerBlock',
    'Meeting',
    'Move',
    'Precedence',
    'RescueInteractions',
    'KnowledgeSharing',
    'ReportLocation',
    'Search',
    'HelpRequest',
    'Question',
    'YesNoQuestion',
    'Instruction',
    'Plan',
)
# name -> id
labels = {name: idx for idx, name in enumerate(_label_names)}
# id -> name (inverse lookup used when writing human-readable reports)
reverseLabels = {idx: name for name, idx in labels.items()}

In [102]:
# Find differences
# Pair every Golden row with the Odin tokens of the SAME utterance text rather
# than with whatever sits at the same list index. Odin may have fewer rows than
# Golden (Golden utterances missing from Odin are dropped upstream), and an
# index-based comparison would then compare unrelated utterances from the first
# missing row onward.
odinByUtt = dict(Odin)  # Odin rows are (utt, tokens) pairs
diffAll = []            # rows for diffAll.csv: [utt, odin label names, golden label names]
uttD, OTD, GTD = [], [], []  # utterances / Odin tokens / Golden tokens of the differing rows
for sentence, goldenTokens in Golden:
    if sentence not in odinByUtt:
        # No Odin extraction for this utterance: nothing to compare against.
        continue
    odinTokens = odinByUtt[sentence]
    if odinTokens != goldenTokens:
        uttD.append(sentence)
        OTD.append(odinTokens)
        GTD.append(goldenTokens)
        diffAll.append([
            sentence,
            [reverseLabels[j] for j in sorted(odinTokens)],
            [reverseLabels[j] for j in sorted(goldenTokens)],
        ])
pd.DataFrame(diffAll,columns=["utt", "odin_extractions", "annotations"]).to_csv("diffAll.csv", index=False)
len(diffAll)

1945

In [103]:
# diff for each label
# add utt to single list tokens
# OT / GT are flat lists of alternating [sentence, token, sentence, token, ...]
# entries; position k in OT lines up with position k in GT.
OT = []
GT = []
for i in range(len(uttD)):
    sentence = uttD[i]
    # Work on copies: the removals below must not touch OTD[i] / GTD[i], which
    # are the same list objects stored in the Odin / Golden rows. Mutating them
    # would corrupt the source data and make this cell give different results
    # when it is re-run.
    r1, r2 = list(OTD[i]), list(GTD[i])
    #padding 0 to shorter lists
    width = max(len(r1), len(r2))
    r1 += [0] * (width - len(r1))
    r2 += [0] * (width - len(r2))
    #deal with overlap
    # Multiset intersection: tokens present on both sides are paired with
    # themselves first, then removed from both working lists.
    overlap = Counter(r1) & Counter(r2)
    for j in overlap:
        OT += [sentence, j] * overlap[j]
        GT += [sentence, j] * overlap[j]
        for n in range(overlap[j]):
            r1.remove(j)
            r2.remove(j)
    # The leftovers have equal length (both lists were padded to the same
    # width) and are paired up by position.
    for k in r1:
        OT +=[sentence, k]
    for l in r2:
        GT +=[sentence, l]

In [104]:
# convert single list to nested list
# OT / GT alternate sentence, token, sentence, token, ...; regroup every two
# consecutive entries into one [sentence, token] row.
pair_starts = range(0, len(OT), 2)
OT2 = [[OT[k], OT[k + 1]] for k in pair_starts]
GT2 = [[GT[k], GT[k + 1]] for k in pair_starts]

In [105]:
# get diff
def getDiff(label_token):
    """Report the rows where Odin and Golden disagree and `label_token` is involved.

    Scans the module-level OT2 / GT2 row lists in parallel. A row is kept when
    the two tokens differ and either of them equals `label_token`. The kept
    rows are written to "<label name>_Diff.csv" (columns: utt,
    odin_extractions, annotations) and their count is printed.

    Parameters
    ----------
    label_token : numeric label id; must be a key of `reverseLabels`.
    """
    res = []
    for idx in range(len(OT2)):
        odin_row = OT2[idx]
        ot = odin_row[1]
        gt = GT2[idx][1]
        if ot == gt:
            # Both sides agree on this token: not a difference.
            continue
        if label_token == ot or label_token == gt:
            res.append([odin_row[0], reverseLabels[ot], reverseLabels[gt]])
    label_name = reverseLabels[label_token]
    filename = label_name + "_Diff.csv"
    print("number of diff for", label_name, "label:", len(res))
    report = pd.DataFrame(res, columns=["utt", "odin_extractions", "annotations"])
    report.to_csv(filename, index=False)

In [106]:
# Produce one report per label id, taken from the label table itself rather
# than a hard-coded range, so added labels are picked up automatically.
# Id 0 (the '0' placeholder, also used as padding) gets no report.
for i in sorted(reverseLabels):
    if i != 0:
        getDiff(i)

number of diff for CriticalVictim label: 191
number of diff for Victim label: 298
number of diff for Room label: 483
number of diff for Engineer label: 41
number of diff for Transporter label: 54
number of diff for Medic label: 21
number of diff for Rubble label: 31
number of diff for MarkerBlock label: 55
number of diff for Meeting label: 51
number of diff for Move label: 271
number of diff for Precedence label: 193
number of diff for RescueInteractions label: 73
number of diff for KnowledgeSharing label: 255
number of diff for ReportLocation label: 110
number of diff for Search label: 56
number of diff for HelpRequest label: 123
number of diff for Question label: 111
number of diff for YesNoQuestion label: 167
number of diff for Instruction label: 345
number of diff for Plan label: 486
