In [21]:
import glob
import sys
import json
import os
import pandas as pd

In [22]:
def document_evaluation(sys_documents, gold_documents, questions):
    """Compute document-level precision, recall and F1 for each question.

    Args:
        sys_documents: mapping question id -> set of document ids given by the system.
        gold_documents: mapping question id -> set of gold document ids.
        questions: iterable of question ids to score; every id must be a key
            of both mappings.

    Returns:
        A tuple ``(p, r, f1)`` of dicts keyed by question id.
        When both the system and the gold set are empty for a question,
        all three scores are 1.0.
    """
    f1 = {}
    p = {}
    r = {}

    for q in questions:
        # Read from the arguments, not from notebook-level globals, so the
        # function gives the same result regardless of kernel state.
        sys_set = sys_documents[q]
        gold_set = gold_documents[q]

        tp = len(sys_set & gold_set) * 1.0
        fp = len(sys_set - gold_set) * 1.0
        fn = len(gold_set - sys_set) * 1.0

        if tp + fp + fn > 0:
            p[q] = tp / (fp + tp) if fp + tp > 0.0 else 0.0
            r[q] = tp / (fn + tp) if fn + tp > 0.0 else 0.0
            if p[q] + r[q] > 0.0:
                f1[q] = 2 * p[q] * r[q] / (p[q] + r[q])
            else:
                f1[q] = 0.0
        else:
            # Nothing expected and nothing returned: count as a perfect answer.
            p[q] = 1.0
            r[q] = 1.0
            f1[q] = 1.0

    return p, r, f1

def compute_avg(v):
    """Return the arithmetic mean of the values of the dict ``v``.

    Raises ZeroDivisionError when ``v`` is empty.
    """
    total = sum(v.values())
    count = len(v)
    return total / count

In [23]:
# Directories holding the participants' submissions, the gold answers and the task input.
submission_dir = 'submissions/'
gold_dir = 'ref/'
input_dir = 'input/'

# File names expected inside each subtask directory.
answers_file = 'answers.json'
questions_file = 'questions.json'

# Subtasks, event types and event-property pairs used to slice the results.
subtasks = ['s1', 's2', 's3']
event_types = {'injuring', 'killing', 'fire_burning', 'job_firing'}
event_props = [
    ['participant', 'location'],
    ['participant', 'time'],
    ['location', 'time'],
]

In [52]:
# Preload the questions of every subtask, keyed by subtask name.
questions = {}
for subtask in subtasks:
    subtask_questions_file = f'{input_dir}/{subtask}/{questions_file}'
    with open(subtask_questions_file, 'r') as f:
        questions[subtask] = json.load(f)

In [25]:
from collections import defaultdict

# Count the questions per "<subtask>_<event type>" and per
# "<subtask>_<prop>&<prop>" key; missing keys read as 0.
totals = defaultdict(int)
for subtask, qs in questions.items():
    for qdata in qs.values():
        totals[f"{subtask}_{qdata['event_type']}"] += 1
        for prop in event_props:
            # A question counts for a pair when it carries both properties.
            has_both = prop[0] in qdata.keys() and prop[1] in qdata.keys()
            if has_both:
                totals[f"{subtask}_{'&'.join(prop)}"] += 1

In [26]:
# Load the gold answers of every subtask, keyed by subtask name.
gold = {}
for subtask in subtasks:
    subtask_gold_file = f'{gold_dir}/{subtask}/{answers_file}'
    with open(subtask_gold_file, 'r') as f:
        gold[subtask] = json.load(f)

In [27]:
def compute_subset_accuracy(ques, anss, gold, event_type=None, event_prop=None):
    """Compute the accuracy of the numerical answers on a subset of questions.

    Args:
        ques: mapping question id -> question data (a dict; ``'event_type'``
            is read when ``event_type`` is given).
        anss: mapping question id -> system answer with a ``'numerical_answer'`` key.
        gold: mapping question id -> gold answer with a ``'numerical_answer'`` key.
        event_type: when given, only questions of this event type are scored.
        event_prop: when given, only questions carrying all of these
            properties as keys are scored.

    Returns:
        ``(accuracy, total)`` where accuracy is a percentage rounded to two
        decimals and total is the number of scored questions, or ``('', '')``
        when no answered question falls in the subset.

    Side effects:
        Prints the number of correct and scored answers.
    """
    required_props = set(event_prop) if event_prop else None
    correct = 0
    total = 0
    for qid, adata in anss.items():
        if event_type and event_type != ques[qid]['event_type']:
            continue
        # Non-strict subset: a question whose keys are exactly the requested
        # properties still has them all (a strict '<' would drop it, and would
        # disagree with how the per-property totals are counted).
        if required_props and not required_props <= set(ques[qid].keys()):
            continue
        if adata['numerical_answer'] == gold[qid]['numerical_answer']:
            correct += 1
        total += 1
    print('correct=', correct, 'total=', total)
    if total > 0:
        return round(correct*100.0/total, 2), total
    return '', ''

In [28]:
def generate_acc_table(event_types={}, event_props={}):
    """Build a table of numerical-answer accuracies for every submission.

    Walks ``submission_dir`` (one directory per user, one sub-directory per
    subtask), skips subtask ``'s1'`` and sub-directories without an answers
    file, and scores each remaining answers file once per event type and once
    per event-property pair via ``compute_subset_accuracy``.

    Args:
        event_types: iterable of event types to report on.
        event_props: iterable of event-property collections to report on.

    Returns:
        A dict of columns: ``'#'`` maps each row key to its question count
        taken from ``totals``; every user maps each row key to the accuracy,
        to ``'<accuracy> (<coverage>%)'`` when fewer questions were answered
        than exist, or to ``'-'`` when nothing was scored.

    Side effects:
        Prints progress and the accuracy of every subset.
    """
    table = {'#': {}}

    def store_cell(user, key, acc, total):
        # Record the question count once per row, then fill the user's cell.
        if key not in table['#']:
            table['#'][key] = totals[key]
        if not total:
            table[user][key] = '-'
        elif total < totals[key]:
            table[user][key] = f'{acc} ({round(total*100.0/totals[key], 2)}%)'
        else:
            table[user][key] = acc
        print('ACC=', acc)
        print()

    for user_dir in glob.glob('%s/*' % submission_dir):
        user = user_dir.split('/')[-1]
        print('USER', user)
        print('='*20)
        table[user] = {}
        for subtask_dir in glob.glob('%s/*' % user_dir):
            subtask = subtask_dir.split('/')[-1]
            if subtask == 's1':
                continue
            answers_path = '%s/%s' % (subtask_dir, answers_file)
            if not os.path.exists(answers_path):
                continue
            print('SUBTASK', subtask)
            print('='*20)
            with open(answers_path, 'r') as f:
                answers = json.load(f)

            for event_type in event_types:
                print(event_type)
                acc, total = compute_subset_accuracy(
                    questions[subtask], answers, gold[subtask],
                    event_type=event_type)
                store_cell(user, '%s_%s' % (subtask, event_type), acc, total)

            for event_property in event_props:
                print(event_property)
                acc, total = compute_subset_accuracy(
                    questions[subtask], answers, gold[subtask],
                    event_prop=event_property)
                store_cell(user,
                           f'{subtask}_{"&".join(list(event_property))}',
                           acc, total)
    return table

In [29]:
# Build one accuracy table sliced by event type and one sliced by event-property pair.
et_json=generate_acc_table(event_types=event_types)
ep_json=generate_acc_table(event_props=event_props)

USER CarlaAbreu
SUBTASK s2
fire_burning
correct= 32 total= 79
ACC= 40.51

killing
correct= 112 total= 371
ACC= 30.19

job_firing
correct= 0 total= 4
ACC= 0.0

injuring
correct= 119 total= 543
ACC= 21.92

USER NAI-SEA
SUBTASK s2
fire_burning
correct= 25 total= 79
ACC= 31.65

killing
correct= 69 total= 371
ACC= 18.6

job_firing
correct= 1 total= 4
ACC= 25.0

injuring
correct= 78 total= 543
ACC= 14.36

SUBTASK s3
fire_burning
correct= 0 total= 0
ACC= 

killing
correct= 237 total= 928
ACC= 25.54

job_firing
correct= 7 total= 26
ACC= 26.92

injuring
correct= 252 total= 1502
ACC= 16.78

USER NewsReader
SUBTASK s2
fire_burning
correct= 31 total= 79
ACC= 39.24

killing
correct= 68 total= 371
ACC= 18.33

job_firing
correct= 1 total= 4
ACC= 25.0

injuring
correct= 118 total= 543
ACC= 21.73

SUBTASK s3
fire_burning
correct= 0 total= 0
ACC= 

killing
correct= 165 total= 928
ACC= 17.78

job_firing
correct= 4 total= 26
ACC= 15.38

injuring
correct= 348 total= 1502
ACC= 23.17

USER ID-DE
SUBTASK s2
f

In [30]:
# Add the 'Subtask' and label columns by splitting every row key
# ("<subtask>_<label>") at its first underscore.
for table, label_column in ((et_json, 'Event type'), (ep_json, 'Event properties')):
    # Take the row keys from every data column rather than from one
    # hard-coded participant, so the cell neither fails when that participant
    # is absent nor leaves rows unlabeled that the participant did not answer.
    row_keys = {
        k
        for column, cells in table.items()
        if column not in ('Subtask', label_column)
        for k in cells
    }
    table['Subtask'] = {}
    table[label_column] = {}
    for k in sorted(row_keys):
        first, _sep, rest = k.partition('_')
        table['Subtask'][k] = first
        table[label_column][k] = rest
    

In [31]:
# Column order of the final tables: row labels, question count, then one column per system.
columns_et = ['Subtask', 'Event type', '#',
              'CarlaAbreu', 'ID-DE', 'NAI-SEA', 'NewsReader', 'baseline1']
columns_ep = ['Subtask', 'Event properties', '#',
              'CarlaAbreu', 'ID-DE', 'NAI-SEA', 'NewsReader', 'baseline1']

# Cells a system has no value for are shown as '-'.
df_et = pd.DataFrame.from_dict(et_json).fillna('-')
df_ep = pd.DataFrame.from_dict(ep_json).fillna('-')


In [32]:
# Keep only the report columns, in report order.
df_et = df_et.loc[:, columns_et]
df_ep = df_ep.loc[:, columns_ep]

In [33]:
# Write both accuracy tables as tab-separated files.
for frame, tsv_path in ((df_et, 'acc_et.tsv'), (df_ep, 'acc_ep.tsv')):
    frame.to_csv(tsv_path, sep='\t')

In [34]:
# Print the event-type table as LaTeX, without the index column.
print(df_et.to_latex(index=False))

\begin{tabular}{llrlllll}
\toprule
Subtask &    Event type &     \# & CarlaAbreu &           ID-DE & NAI-SEA & NewsReader & baseline1 \\
\midrule
     s2 &  fire\_burning &    79 &      40.51 &               - &   31.65 &      39.24 &     49.37 \\
     s2 &      injuring &   543 &      21.92 &  18.67 (72.01\%) &   14.36 &      21.73 &     17.68 \\
     s2 &    job\_firing &     4 &          0 &               - &      25 &         25 &        50 \\
     s2 &       killing &   371 &      30.19 &   22.7 (76.01\%) &    18.6 &      18.33 &     12.13 \\
     s3 &  fire\_burning &     0 &          - &               - &       - &          - &         - \\
     s3 &      injuring &  1502 &          - &  12.86 (65.25\%) &   16.78 &      23.17 &         - \\
     s3 &    job\_firing &    26 &          - &               - &   26.92 &      15.38 &         - \\
     s3 &       killing &   928 &          - &  28.96 (70.69\%) &   25.54 &      17.78 &         - \\
\bottomrule
\end{tabular}



In [35]:
# Print the event-properties table as LaTeX, without the index column.
print(df_ep.to_latex(index=False))

\begin{tabular}{llrllrrl}
\toprule
Subtask &      Event properties &     \# & CarlaAbreu &           ID-DE &  NAI-SEA &  NewsReader & baseline1 \\
\midrule
     s2 &         location\&time &   680 &      24.56 &   19.1 (68.53\%) &    14.71 &       23.68 &        20 \\
     s2 &  participant\&location &    49 &      14.29 &  17.39 (46.94\%) &    36.73 &       10.20 &      6.12 \\
     s2 &      participant\&time &   268 &      33.21 &  23.91 (68.66\%) &    20.52 &       19.40 &     16.04 \\
     s3 &         location\&time &  1335 &          - &  17.66 (58.95\%) &    17.53 &       12.13 &         - \\
     s3 &  participant\&location &   301 &          - &  24.87 (62.79\%) &    26.58 &       39.53 &         - \\
     s3 &      participant\&time &   820 &          - &   19.7 (80.49\%) &    22.20 &       28.78 &         - \\
\bottomrule
\end{tabular}



In [90]:
def extract_data(data, extract_incidents=True, gold=True):
    """Extract documents, numerical answers and question ids from answer data.

    Works for both the gold data and a system response.

    Args:
        data: mapping question id -> answer entry.
        extract_incidents: when true, also read each entry's
            ``"numerical_answer"``.
        gold: when true, ``"answer_docs"`` is read as a mapping incident id ->
            documents and flattened; otherwise it is read as a plain
            collection of documents, and an entry without ``"answer_docs"``
            is taken to be an empty set of documents.

    Returns:
        ``(docs, incidents, qs)``: question id -> set of documents, question
        id -> numerical answer (empty unless ``extract_incidents``), and the
        set of question ids.
    """
    docs = {}
    incidents = {}
    for qid, entry in data.items():
        if gold:
            answer_docs = entry["answer_docs"]
            found = set()
            for inc_id in answer_docs:
                found.update(answer_docs[inc_id])
        elif "answer_docs" in entry:
            # The system's format is simpler: a flat collection of documents.
            found = set(entry["answer_docs"])
        else:
            found = set()
        docs[qid] = found
        if extract_incidents:
            incidents[qid] = entry["numerical_answer"]
    return docs, incidents, set(data.keys())

In [130]:
def generate_subset(qdata, subtask, etype=None, eprop=None):
    """Return the ids of the questions of a subtask that match the filters.

    Args:
        qdata: mapping subtask -> mapping question id -> question data (a
            dict; ``'event_type'`` is read when ``etype`` is given).
        subtask: key of ``qdata`` to select from.
        etype: when given, keep only questions of this event type.
        eprop: when given, keep only questions carrying all of these
            properties as keys.

    Returns:
        A set of question ids.
    """
    required_props = set(eprop) if eprop else None
    subset = set()
    for qid, qd in qdata[subtask].items():
        if etype and qd['event_type'] != etype:
            continue
        # Non-strict subset: a question whose keys are exactly the requested
        # properties still has them all (a strict '<' would wrongly drop it).
        if required_props and not required_props <= set(qd.keys()):
            continue
        subset.add(qid)
    return subset

In [145]:
# '#' column of the event-type table: number of questions per subtask and event type.
et_json = {'#': {}}
for subtask in subtasks:
    for t in event_types:
        n_questions = len(generate_subset(questions, subtask, etype=t))
        et_json['#']['%s_%s' % (subtask, t)] = n_questions
        print(n_questions)

# '#' column of the event-properties table: number of questions per subtask and property pair.
ep_json = {'#': {}}
for subtask in subtasks:
    for pr in event_props:
        n_questions = len(generate_subset(questions, subtask, eprop=pr))
        ep_json['#'][f'{subtask}_{"&".join(list(pr))}'] = n_questions
        print(n_questions)

142
326
13
551
79
371
4
543
0
928
26
1502
140
298
594
49
268
680
301
820
1335


In [146]:
def _format_f1_cell(sys_docs, gold_docs, eval_qs, subset_qs):
    """Return the table cell for one question subset.

    The cell is the average document-level F1 over ``eval_qs`` in percent,
    rounded to two decimals, followed by ' (<coverage>)' when fewer questions
    were answered (``eval_qs``) than exist in the subset (``subset_qs``).
    """
    p, r, f1 = document_evaluation(sys_docs, gold_docs, eval_qs)
    cell = str(round(compute_avg(f1)*100.0, 2))
    if len(eval_qs) < len(subset_qs):
        cell += ' (%s)' % str(round(len(eval_qs)*100.0/len(subset_qs), 2))
    return cell


# Fill one column per user in et_json / ep_json with document-level F1 scores.
for user_submission_dir in glob.glob('%s/*' % submission_dir):
    user = user_submission_dir.split('/')[-1]
    print('USER', user)
    print('='*20)
    et_json[user] = {}
    ep_json[user] = {}
    for subtask_user_submission_dir in glob.glob('%s/*' % user_submission_dir):
        subtask = subtask_user_submission_dir.split('/')[-1]
        this_answers_file = '%s/%s' % (subtask_user_submission_dir, answers_file)
        if not os.path.exists(this_answers_file):
            continue
        print('SUBTASK', subtask)
        print('='*20)
        with open(this_answers_file, 'r') as f:
            answers = json.load(f)
        # Numerical answers are only extracted for subtasks other than s1.
        gold_docs, gold_incidents, gold_qs = extract_data(gold[subtask],
                                                          subtask != "s1")
        sys_docs, sys_incidents, sys_qs = extract_data(answers,
                                                       subtask != "s1",
                                                       False)

        for t in event_types:
            # generate_subset is the helper defined in this notebook; the
            # previously called generate_etype_subset is not defined anywhere.
            subset_qs = generate_subset(questions, subtask, etype=t)
            # Score only the questions the system actually answered.
            eval_qs = subset_qs & sys_qs
            print(t, len(eval_qs), len(subset_qs))
            if not len(eval_qs):
                continue
            k = '%s_%s' % (subtask, t)
            et_json[user][k] = _format_f1_cell(sys_docs, gold_docs, eval_qs, subset_qs)

        for pr in event_props:
            subset_qs = generate_subset(questions, subtask, eprop=pr)
            eval_qs = subset_qs & sys_qs
            print(pr, len(eval_qs), len(subset_qs))
            if not len(eval_qs):
                continue
            k = f'{subtask}_{"&".join(list(pr))}'
            ep_json[user][k] = _format_f1_cell(sys_docs, gold_docs, eval_qs, subset_qs)

USER CarlaAbreu
SUBTASK s2
fire_burning 79 79
killing 371 371
job_firing 4 4
injuring 543 543
['participant', 'location'] 49 49
['participant', 'time'] 268 268
['location', 'time'] 680 680
SUBTASK s1
fire_burning 142 142
killing 326 326
job_firing 13 13
injuring 551 551
['participant', 'location'] 140 140
['participant', 'time'] 298 298
['location', 'time'] 594 594
USER NAI-SEA
SUBTASK s2
fire_burning 79 79
killing 371 371
job_firing 4 4
injuring 543 543
['participant', 'location'] 49 49
['participant', 'time'] 268 268
['location', 'time'] 680 680
SUBTASK s3
fire_burning 0 0
killing 928 928
job_firing 26 26
injuring 1502 1502
['participant', 'location'] 301 301
['participant', 'time'] 820 820
['location', 'time'] 1335 1335
SUBTASK s1
fire_burning 142 142
killing 326 326
job_firing 13 13
injuring 551 551
['participant', 'location'] 140 140
['participant', 'time'] 298 298
['location', 'time'] 594 594
USER NewsReader
SUBTASK s2
fire_burning 79 79
killing 371 371
job_firing 4 4
injuring 54

In [147]:

# Add the 'Subtask' and label columns by splitting every row key
# ("<subtask>_<label>") at its first underscore.
for table, label_column in ((et_json, 'Event type'), (ep_json, 'Event properties')):
    # Take the row keys from every data column (including the '#' counts)
    # rather than from one hard-coded participant: rows that participant has
    # no score for would otherwise end up without a subtask or label.
    row_keys = {
        k
        for column, cells in table.items()
        if column not in ('Subtask', label_column)
        for k in cells
    }
    table['Subtask'] = {}
    table[label_column] = {}
    for k in sorted(row_keys):
        first, _sep, rest = k.partition('_')
        table['Subtask'][k] = first
        table[label_column][k] = rest

In [148]:

# Turn the column dicts into tables; cells without a value are shown as '-'.
df_et = pd.DataFrame.from_dict(et_json).fillna('-')
df_ep = pd.DataFrame.from_dict(ep_json).fillna('-')


In [149]:

# Keep only the report columns, in report order.
df_et = df_et.loc[:, columns_et]
df_ep = df_ep.loc[:, columns_ep]

In [150]:
# Print the event-type F1 table as LaTeX, without the index column.
print(df_et.to_latex(index=False))

\begin{tabular}{llrlllll}
\toprule
Subtask &    Event type &     \# & CarlaAbreu &          ID-DE & NAI-SEA &     NewsReader &      baseline1 \\
\midrule
     s1 &  fire\_burning &   142 &      20.69 &              - &   93.21 &    50.0 (1.41) &  81.48 (21.83) \\
     s1 &      injuring &   551 &      17.08 &   82.49 (49.0) &   75.71 &  49.85 (57.89) &  63.53 (19.42) \\
     s1 &    job\_firing &    13 &      31.28 &              - &    57.2 &   34.8 (53.85) &   85.0 (30.77) \\
     s1 &       killing &   326 &      38.89 &  83.71 (57.06) &   77.12 &  40.85 (62.58) &   63.69 (8.59) \\
     s2 &  fire\_burning &    79 &      50.65 &              - &   69.28 &          39.24 &          61.84 \\
     s2 &      injuring &   543 &      23.42 &  54.37 (72.01) &   48.07 &          38.43 &          29.35 \\
     s2 &    job\_firing &     4 &      18.75 &              - &   22.22 &          27.78 &          50.29 \\
     s2 &       killing &   371 &      36.72 &  56.26 (76.01) &    50.4 &      

In [151]:
# Print the event-properties F1 table as LaTeX, without the index column.
print(df_ep.to_latex(index=False))

\begin{tabular}{llrlllll}
\toprule
Subtask &      Event properties &     \# & CarlaAbreu &          ID-DE & NAI-SEA &     NewsReader &      baseline1 \\
\midrule
     s1 &         location\&time &   594 &      23.06 &  86.47 (30.81) &   82.91 &  57.69 (45.45) &  72.85 (11.95) \\
     s1 &  participant\&location &   140 &      13.48 &  81.87 (53.57) &   70.22 &   24.0 (49.29) &   65.09 (15.0) \\
     s1 &      participant\&time &   298 &      33.06 &  80.19 (66.44) &   73.01 &  38.06 (64.77) &  62.91 (26.17) \\
     s2 &         location\&time &   680 &      30.95 &  61.01 (68.53) &   49.99 &          39.22 &          28.61 \\
     s2 &  participant\&location &    49 &      14.66 &   45.3 (46.94) &   50.41 &          13.53 &          10.02 \\
     s2 &      participant\&time &   268 &      32.27 &  41.59 (68.66) &   51.87 &          35.34 &          23.71 \\
     s3 &         location\&time &  1335 &          - &  70.48 (58.95) &   63.27 &          36.15 &              - \\
     s3 &  p