# Evaluation of different crowd analysis metrics


(1) Dataset filtering:

* Exclude workers based on:
    - contradiction ratio (absolute threshold, batch stdv, pair stdv, total stdv)
    - worker quality score (thresholds)
    - attention check fails 
    
(2) Label aggregation:

* Majority vote
* Top vote
* CT unit-label score (thresholds)

In [1]:
from clean_annotations import clean_workers
from load_data import load_experiment_data, load_expert_data, load_gold_data
from aggregation import aggregate_binary_labels

from load_data import load_experiment_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key
from utils_analysis import load_analysis, load_ct

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
from collections import defaultdict


def iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict):
    """Print the 'Krippendorff' agreement score per expected-agreement group.

    The annotations are grouped with ``sort_by_key`` on the ``relation``,
    ``property`` and ``concept`` fields. Every unit listed in
    ``expert_unit_agreement_dict`` is then pooled with the other units that
    share its label, and ``get_agreement`` is run once per pool.

    Args:
        data_dict_list: annotation dicts, handed unchanged to ``sort_by_key``.
        expert_unit_agreement_dict: maps a unit key to its expected-agreement
            label. The keys index the ``sort_by_key`` result, so they
            presumably have to follow its key format -- TODO confirm.

    Returns:
        None. One line is printed per label: the label followed by the
        ``'Krippendorff'`` entry of the ``get_agreement`` result.
    """
    unit_fields = ['relation', 'property', 'concept']
    annotations_by_unit = sort_by_key(data_dict_list, unit_fields)

    # Pool the annotations of all units that carry the same expected label.
    pooled = defaultdict(list)
    for unit, expected in expert_unit_agreement_dict.items():
        unit_annotations = annotations_by_unit[unit]
        pooled[expected].extend(unit_annotations)

    for expected, annotations in pooled.items():
        scores = get_agreement(annotations, v=False)
        print(expected, scores['Krippendorff'])
        

def get_expert_agreement_labels(expert_annotations):
    """Derive one expected-agreement label per unit from expert annotations.

    Annotations are grouped per unit with ``sort_by_key`` (on ``relation``,
    ``property`` and ``concept``). For each unit, every key starting with
    ``'disagreement_'`` is collected from the annotations whose ``workerid``
    does not end in ``'_test1'``. Only the presence of a key is counted; its
    value is not inspected.

    Labels:
        * ``'agreement'`` -- every collected key is
          ``'disagreement_agreement'``
        * ``'possible_disagreement'`` -- some, but not all, of them are
        * ``'disagreement'`` -- none of them is

    Args:
        expert_annotations: expert annotation dicts; each needs a string
            ``'workerid'`` entry.

    Returns:
        dict mapping each unit key (as produced by ``sort_by_key``) to its
        label. Units for which no ``'disagreement_*'`` key was collected are
        left out: no label can be derived for them, and computing the
        agreement proportion used to fail with ``ZeroDivisionError``.
    """
    agree_key = 'disagreement_agreement'
    annotations_by_unit = sort_by_key(
        expert_annotations, ['relation', 'property', 'concept'])

    unit_agreement_dict = {}
    for unit, annotations in annotations_by_unit.items():
        judgments = [
            key
            for annotation in annotations
            if not annotation['workerid'].endswith('_test1')
            for key in annotation
            if key.startswith('disagreement_')
        ]
        if not judgments:
            # Nothing to base a label on (e.g. only '_test1' annotations).
            continue

        # Comparing counts avoids the float division of the proportion.
        n_agree = judgments.count(agree_key)
        if n_agree == len(judgments):
            unit_agreement_dict[unit] = 'agreement'
        elif n_agree > 0:
            unit_agreement_dict[unit] = 'possible_disagreement'
        else:
            unit_agreement_dict[unit] = 'disagreement'
    return unit_agreement_dict




def get_gold_dis_agreement(gold, agreement_labels):
    """Group gold instances by the agreement label of their triple.

    Args:
        gold: iterable of dicts with ``'relation'``, ``'property'`` and
            ``'concept'`` entries.
        agreement_labels: mapping from ``"<relation>-<property>-<concept>"``
            to an agreement label.

    Returns:
        A ``defaultdict(list)`` mapping each label to the gold dicts that
        carry it, in input order.

    Raises:
        KeyError: if an instance lacks one of the three fields or its triple
            is not a key of ``agreement_labels``.
    """
    instances_by_label = defaultdict(list)
    for instance in gold:
        triple = f"{instance['relation']}-{instance['property']}-{instance['concept']}"
        instances_by_label[agreement_labels[triple]].append(instance)
    return instances_by_label

    


# Total evaluation

In [2]:
from load_data import load_experiment_data, load_gold_data
from evaluation import evaluate_configs

import pandas as pd

In [3]:

# Gold data: run 4, groups selected by the pattern below.
run, group = 4, 'reason_agreement*_expert_inspection*'
gold = load_gold_data(run, group)

# Show every gold instance that has no 'answer' entry.
for d in gold:
    if 'answer' in d:
        continue
    print(d)
print('number of gold instances: ', len(gold))

# Crowd data: '*' for run, n_q and batch, and the 'experiment*' groups.
run = n_q = batch = '*'
group = 'experiment*'
crowd = load_experiment_data(run, group, n_q, batch)

number of gold instances:  154
Discarded 655.0 annotations.


In [10]:

# Evaluate the crowd data against the complete gold set.
overview_dicts = evaluate_configs(gold, crowd)

# One row per result dict, best 'relations-f1' first, reduced to the
# columns that go into the report.
df = pd.DataFrame(overview_dicts).sort_values(by=['relations-f1'], ascending=False)
df = df[[
    'filtering',
    'aggregation',
    'relations-f1',
    'relations-p',
    'relations-r',
    'alpha',
    'coverage',
]]
df.round(2).to_csv('../analyses/evaluation_accuracy_full.csv')
df

----Label distribution----
True: 30
False 124
----------------------------
154 17917 1743
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,filtering,aggregation,relations-f1,relations-p,relations-r,alpha,coverage
28,batch-contradictions-0.5,majority_vote,0.829942,0.855785,0.818182,0.197202,1.0
52,total-contradictions-1.5,majority_vote,0.821906,0.840843,0.811688,0.199057,1.0
48,total-contradictions-1,majority_vote,0.821906,0.840843,0.811688,0.208858,1.0
36,batch-contradictions-1.5,majority_vote,0.813515,0.847849,0.798701,0.195085,1.0
44,total-contradictions-0.5,majority_vote,0.813407,0.826243,0.805195,0.226988,1.0
32,batch-contradictions-1,majority_vote,0.812352,0.841336,0.798701,0.189528,1.0
16,pair-contradictions-1,majority_vote,0.804241,0.826067,0.792208,0.196745,1.0
56,total-contradictions-2,majority_vote,0.793501,0.820543,0.779221,0.196364,1.0
2,none,uqs-0.5,0.789453,0.824509,0.772727,0.183774,1.0
5,none,uqs-0.65,0.788171,0.781579,0.798701,0.183774,1.0


In [11]:
# Print the table as LaTeX, rounded to two decimals and without the index column.
print(df.round(2).to_latex(index=False))


\begin{tabular}{llrrrrr}
\toprule
                         filtering &    aggregation &  relations-f1 &  relations-p &  relations-r &  alpha &  coverage \\
\midrule
          batch-contradictions-0.5 &  majority\_vote &          0.83 &         0.86 &         0.82 &   0.20 &       1.0 \\
          total-contradictions-1.5 &  majority\_vote &          0.82 &         0.84 &         0.81 &   0.20 &       1.0 \\
            total-contradictions-1 &  majority\_vote &          0.82 &         0.84 &         0.81 &   0.21 &       1.0 \\
          batch-contradictions-1.5 &  majority\_vote &          0.81 &         0.85 &         0.80 &   0.20 &       1.0 \\
          total-contradictions-0.5 &  majority\_vote &          0.81 &         0.83 &         0.81 &   0.23 &       1.0 \\
            batch-contradictions-1 &  majority\_vote &          0.81 &         0.84 &         0.80 &   0.19 &       1.0 \\
             pair-contradictions-1 &  majority\_vote &          0.80 &         0.83 &         0.7

 # Evaluation with respect to expected worker behavior

In [12]:
from utils_analysis import sort_by_key

# Agreement data.
# The query settings below are only consumed by the commented-out expert
# loading code; the live code groups the already loaded gold data instead.
run = "4"
group = 'reason_agreement*_expert_inspection*'
n_q = batch = '*'
# group1 = 'reason_agreement_expert_inspection1'

# Disabled: derive the agreement labels from the raw expert annotations
# (run4-group_reason_agreement_expert_inspection1).
# expert_annotations = load_expert_data(run, group, n_q, batch)
# expert_annotations2 = load_expert_data(run, group2, n_q, batch)
# expert_annotations = expert_annotations1 + expert_annotations2
# expert_unit_agreement_dict = get_expert_agreement_labels(expert_annotations)
# agreement_labels = get_expert_agreement_labels(expert_annotations)

# Group the gold instances via their 'expected_agreement' field and report
# the size of every group.
gold_by_agreement = sort_by_key(gold, ['expected_agreement'])
print('\n--- agreement categories---')
for label, instances in gold_by_agreement.items():
    print(label, len(instances))


--- agreement categories---
possible_disagreement 86
disagreement 9
agreement 59


In [13]:
# Pick the gold subset of each expected-agreement category ...
gold_agree = gold_by_agreement['agreement']
gold_poss_disagree = gold_by_agreement['possible_disagreement']
gold_disagree = gold_by_agreement['disagreement']

# ... and print the subset sizes in that order.
for subset in (gold_agree, gold_poss_disagree, gold_disagree):
    print(len(subset))

59
86
9


In [17]:
# Gold units labelled 'agreement'.
overview_dicts_total = evaluate_configs(gold_agree, crowd)
for d in overview_dicts_total:
    d['exp. behavior'] = 'agreement'

# Gold units labelled 'possible_disagreement' or 'disagreement', evaluated
# together (gold_poss_disagree followed by gold_disagree).
gold_disagree_all = [*gold_poss_disagree, *gold_disagree]

overview_dicts = evaluate_configs(gold_disagree_all, crowd)
for d in overview_dicts:
    d['exp. behavior'] = 'possible disagreement'
overview_dicts_total.extend(overview_dicts)

# Both result sets in one table, best 'relations-f1' first.
df = pd.DataFrame(overview_dicts_total).sort_values(by=['relations-f1'], ascending=False)
df = df[[
    'exp. behavior',
    'filtering',
    'aggregation',
    'relations-f1',
    'relations-p',
    'relations-r',
    'alpha',
    'coverage',
]]
df.round(2).to_csv('../analyses/evaluation_accuracy_agree.csv')
df

----Label distribution----
True: 7
False 52
----------------------------
59 17917 714
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations
----Label distribution----
True: 23
False 72
----------------------------
95 17917 1029
aggretation
no filtering - different aggretation methods
cleaning and aggregation
clean all contradictory annotations


Unnamed: 0,exp. behavior,filtering,aggregation,relations-f1,relations-p,relations-r,alpha,coverage
5,agreement,none,uqs-0.65,0.907571,0.927709,0.898305,0.253746,1.0
6,agreement,none,uqs-0.7,0.903664,0.912015,0.898305,0.253746,1.0
7,agreement,none,uqs-0.75,0.898305,0.898305,0.898305,0.253746,1.0
8,agreement,none,uqs-0.8,0.898305,0.898305,0.898305,0.253746,1.0
44,agreement,total-contradictions-0.5,majority_vote,0.896849,0.940678,0.881356,0.342014,1.0
12,agreement,pair-contradictions-0.5,majority_vote,0.896849,0.940678,0.881356,0.323356,1.0
48,agreement,total-contradictions-1,majority_vote,0.896849,0.940678,0.881356,0.302838,1.0
52,agreement,total-contradictions-1.5,majority_vote,0.883410,0.936723,0.864407,0.291210,1.0
4,agreement,none,uqs-0.6,0.883410,0.936723,0.864407,0.253746,1.0
28,agreement,batch-contradictions-0.5,majority_vote,0.870077,0.933263,0.847458,0.302973,1.0


In [18]:
# Print the table as LaTeX, rounded to two decimals and without the index column.
print(df.round(2).to_latex(index=False))

\begin{tabular}{lllrrrrr}
\toprule
         exp. behavior &                          filtering &    aggregation &  relations-f1 &  relations-p &  relations-r &  alpha &  coverage \\
\midrule
             agreement &                               none &       uqs-0.65 &          0.91 &         0.93 &         0.90 &   0.25 &       1.0 \\
             agreement &                               none &        uqs-0.7 &          0.90 &         0.91 &         0.90 &   0.25 &       1.0 \\
             agreement &                               none &       uqs-0.75 &          0.90 &         0.90 &         0.90 &   0.25 &       1.0 \\
             agreement &                               none &        uqs-0.8 &          0.90 &         0.90 &         0.90 &   0.25 &       1.0 \\
             agreement &           total-contradictions-0.5 &  majority\_vote &          0.90 &         0.94 &         0.88 &   0.34 &       1.0 \\
             agreement &            pair-contradictions-0.5 &  majority\_

In [10]:
#print(df.to_latex(index=False))

# Evaluation restricted to the gold units labelled 'possible_disagreement'.
overview_dicts = evaluate_configs(gold_poss_disagree, crowd)

# One row per result dict, best 'relations-f1' first, reduced to the
# reported columns.
df = pd.DataFrame(overview_dicts).sort_values(by=['relations-f1'], ascending=False)
df = df[[
    'filtering',
    'aggregation',
    'relations-f1',
    'levels-f1',
    'negative_relations-f1',
    'alpha',
    'coverage',
]]
df.round(2).to_csv('../analyses/evaluation_accuracy_poss_disagree.csv')
df

----Label distribution----
True: 15
False 71
----------------------------
86 17917 926
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,filtering,aggregation,relations-f1,levels-f1,negative_relations-f1,alpha,coverage
28,batch-contradictions-0.5,majority_vote,0.827984,0.712569,0.796961,0.097433,1.0
32,batch-contradictions-1,majority_vote,0.806482,0.690999,0.769889,0.099448,1.0
52,total-contradictions-1.5,majority_vote,0.803523,0.689535,0.766422,0.121144,1.0
16,pair-contradictions-1,majority_vote,0.803523,0.689535,0.766422,0.119505,1.0
56,total-contradictions-2,majority_vote,0.794241,0.668349,0.754623,0.122368,1.0
48,total-contradictions-1,majority_vote,0.794241,0.668349,0.754623,0.127916,1.0
44,total-contradictions-0.5,majority_vote,0.790819,0.665935,0.750513,0.135779,1.0
17,pair-contradictions-1,top_vote,0.786728,0.645183,0.750513,0.119505,1.0
45,total-contradictions-0.5,top_vote,0.784980,0.668349,0.757886,0.135779,1.0
57,total-contradictions-2,top_vote,0.784980,0.668349,0.757886,0.122368,1.0


In [11]:
# Possible disagreement + disagreement: both gold subsets evaluated together.
gold_disagree_all = [*gold_poss_disagree, *gold_disagree]

overview_dicts = evaluate_configs(gold_disagree_all, crowd)

# One row per result dict, best 'relations-f1' first, reduced to the
# reported columns.
df = pd.DataFrame(overview_dicts).sort_values(by=['relations-f1'], ascending=False)
df = df[[
    'filtering',
    'aggregation',
    'relations-f1',
    'levels-f1',
    'negative_relations-f1',
    'alpha',
    'coverage',
]]
# Written to its own file: this cell used to reuse
# 'evaluation_accuracy_poss_disagree.csv' and thereby overwrote the results
# of the evaluation on gold_poss_disagree alone.
df.round(2).to_csv('../analyses/evaluation_accuracy_disagree_all.csv')
df

----Label distribution----
True: 23
False 72
----------------------------
95 17917 1029
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,filtering,aggregation,relations-f1,levels-f1,negative_relations-f1,alpha,coverage
28,batch-contradictions-0.5,majority_vote,0.806207,0.760542,0.829820,0.126860,1.0
32,batch-contradictions-1,majority_vote,0.797043,0.739001,0.803639,0.126006,1.0
36,batch-contradictions-1.5,majority_vote,0.787898,0.717004,0.777457,0.141405,1.0
52,total-contradictions-1.5,majority_vote,0.783363,0.738272,0.786892,0.138565,1.0
48,total-contradictions-1,majority_vote,0.774376,0.717004,0.775002,0.146647,1.0
56,total-contradictions-2,majority_vote,0.774376,0.717004,0.775002,0.142126,1.0
16,pair-contradictions-1,majority_vote,0.771615,0.738272,0.771998,0.136139,1.0
44,total-contradictions-0.5,majority_vote,0.759624,0.715613,0.771998,0.154618,1.0
40,batch-contradictions-2,majority_vote,0.758652,0.695502,0.753466,0.136848,1.0
2,none,uqs-0.5,0.756451,0.739001,0.765466,0.136073,1.0


In [12]:
# Evaluation restricted to the gold units labelled 'disagreement'.
overview_dicts = evaluate_configs(gold_disagree, crowd)

# One row per result dict, best 'relations-f1' first, reduced to the
# reported columns.
df = pd.DataFrame(overview_dicts).sort_values(by=['relations-f1'], ascending=False)
df = df[[
    'filtering',
    'aggregation',
    'relations-f1',
    'levels-f1',
    'negative_relations-f1',
    'alpha',
    'coverage',
]]
df.round(2).to_csv('../analyses/evaluation_accuracy_disagree.csv')
df

----Label distribution----
True: 8
False 1
----------------------------
9 17917 103
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,filtering,aggregation,relations-f1,levels-f1,negative_relations-f1,alpha,coverage
36,batch-contradictions-1.5,majority_vote,0.903704,0.891026,0.891026,0.237245,1.0
40,batch-contradictions-2,majority_vote,0.817460,0.891026,0.891026,0.217878,1.0
32,batch-contradictions-1,majority_vote,0.817460,0.891026,0.891026,0.254860,1.0
31,batch-ct_wqs-0.5,top_vote,0.777778,0.816667,0.816667,0.224904,1.0
27,pair-ct_wqs-2,top_vote,0.777778,0.816667,0.816667,0.224904,1.0
29,batch-contradictions-0.5,top_vote,0.777778,0.816667,0.816667,0.286507,1.0
1,none,top_vote,0.777778,0.816667,0.816667,0.224904,1.0
33,batch-contradictions-1,top_vote,0.777778,0.816667,0.816667,0.254860,1.0
35,batch-ct_wqs-1,top_vote,0.777778,0.816667,0.816667,0.224904,1.0
37,batch-contradictions-1.5,top_vote,0.777778,0.816667,0.816667,0.237245,1.0
