# Evaluation of different crowd analysis metrics


(1) Dataset filtering:

* Exclude workers based on:
    - contradiction ratio (absolute threshold, batch std. dev., pair std. dev., total std. dev.)
    - worker quality score (thresholds)
    - attention check fails 
    
(2) Label aggregation:

* Majority vote
* Top vote
* CT unit-label score (thresholds)

In [1]:
from clean_annotations import clean_workers
from load_data import load_experiment_data, load_expert_data, load_gold_data
from aggregation import aggregate_binary_labels

from load_data import load_experiment_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key
from utils_analysis import load_analysis, load_ct

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
from collections import defaultdict


def iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict):
    """Print Krippendorff agreement per expected-agreement category.

    The annotations in ``data_dict_list`` are grouped by their
    (relation, property, concept) key via ``sort_by_key``. Every unit listed
    in ``expert_unit_agreement_dict`` contributes its annotations to the
    bucket named by its expected-agreement label, and ``get_agreement`` is
    then run once per bucket.

    Args:
        data_dict_list: annotation dicts, passed to ``sort_by_key``.
        expert_unit_agreement_dict: maps a unit key (as produced by
            ``sort_by_key``) to its expected-agreement label.

    Returns:
        None. One line per label is printed: the label followed by the
        'Krippendorff' entry of the agreement result.
    """
    annotations_by_unit = sort_by_key(
        data_dict_list, ['relation', 'property', 'concept'])

    # Pool the annotations of all units that share an expected label.
    pooled = defaultdict(list)
    for unit, expected_label in expert_unit_agreement_dict.items():
        unit_annotations = annotations_by_unit[unit]
        pooled[expected_label].extend(unit_annotations)

    for expected_label, annotations in pooled.items():
        scores = get_agreement(annotations, v=False)
        print(expected_label, scores['Krippendorff'])
        

def get_expert_agreement_labels(expert_annotations):
    """Derive one expected-agreement label per unit from expert annotations.

    Annotations are grouped by (relation, property, concept) via
    ``sort_by_key``. For each unit, every dict key starting with
    ``'disagreement_'`` is collected from all annotations whose
    ``'workerid'`` does not end in ``'_test1'``. Note that the key *names*
    are counted; the values stored under them are not inspected.

    Labels:
        * ``'agreement'``: every collected key is ``'disagreement_agreement'``.
        * ``'possible_disagreement'``: some, but not all, are.
        * ``'disagreement'``: none are.

    Units with no collected keys (e.g. only ``'_test1'`` workers, or no
    ``'disagreement_*'`` keys at all) are left out of the result; previously
    they caused a ZeroDivisionError.

    Args:
        expert_annotations: annotation dicts; each must have a ``'workerid'``
            string plus whatever keys ``sort_by_key`` needs.

    Returns:
        dict mapping each labelled unit key to one of the three labels above.
    """
    expert_annotations_by_unit = sort_by_key(
        expert_annotations, ['relation', 'property', 'concept'])

    unit_agreement_dict = dict()
    for unit, data in expert_annotations_by_unit.items():
        agreements = [
            k
            for d in data
            if not d['workerid'].endswith('_test1')
            for k in d
            if k.startswith('disagreement_')
        ]
        if not agreements:
            # No evidence for this unit: skip it instead of dividing by zero.
            continue

        n_agree = agreements.count('disagreement_agreement')
        # Integer comparison avoids relying on an exact float ratio of 1.0.
        if n_agree == len(agreements):
            unit_agreement_dict[unit] = 'agreement'
        elif n_agree:
            unit_agreement_dict[unit] = 'possible_disagreement'
        else:
            unit_agreement_dict[unit] = 'disagreement'
    return unit_agreement_dict




def get_gold_dis_agreement(gold, agreement_labels):
    """Bucket gold instances by their expected-agreement label.

    Args:
        gold: iterable of dicts with 'relation', 'property' and 'concept'.
        agreement_labels: maps ``"<relation>-<property>-<concept>"`` to a
            label; a missing key raises KeyError.

    Returns:
        defaultdict(list) mapping each label to the gold dicts carrying it,
        in input order.
    """
    buckets = defaultdict(list)
    for instance in gold:
        unit_key = '{}-{}-{}'.format(
            instance['relation'], instance['property'], instance['concept'])
        buckets[agreement_labels[unit_key]].append(instance)
    return buckets

    


# Total evaluation

In [9]:
from load_data import load_experiment_data, load_gold_data
from evaluation import evaluate_configs

import pandas as pd

In [10]:

# Load the gold annotations: run 4, with `group` given as a wildcard pattern
# that is passed straight through to load_gold_data.
group = 'reason_agreement*_expert_inspection*'
run = 4
gold = load_gold_data(run, group)
# Print any gold instance that has no 'answer' key so it can be inspected.
for d in gold:
    if 'answer' not in d:
        print(d)
print('number of gold instances: ', len(gold))

# Load the crowd annotations. `run` and `group` are rebound here, so the gold
# settings above no longer apply after this point. '*' is presumably a
# match-all wildcard -- confirm against load_experiment_data.
run = '*'
group = 'experiment*'
n_q = '*'
batch = '*'
crowd = load_experiment_data(run, group, n_q, batch)

number of gold instances:  154
Discarded 655.0 annotations.


In [6]:

# Evaluate every configuration against the complete gold set, rank the
# results by relations-f1 (best first) and keep only the reported columns.
overview_dicts = evaluate_configs(gold, crowd)
df = pd.DataFrame(overview_dicts)
df = df.sort_values(by=['relations-f1'], ascending=False)
df = df[['config', 'relations-f1', 'levels-f1',
         'negative_relations-f1', 'alpha']]
# The CSV holds values rounded to two decimals; `df` itself stays unrounded.
df.round(2).to_csv('../analyses/evaluation_accuracy_full.csv')
df

number of gold instances:  154
Discarded 655.0 annotations.
----Label distribution----
True: 30
False 124
----------------------------
154 17917 1743
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,config,relations-f1,levels-f1,negative_relations-f1,alpha
38,"(batch, 0.5, contradictions, majority_vote)",0.829942,0.812390,0.838247,0.197202
62,"(total, 1.5, contradictions, majority_vote)",0.821906,0.796042,0.820254,0.199057
58,"(total, 1, contradictions, majority_vote)",0.821906,0.796042,0.820254,0.208858
46,"(batch, 1.5, contradictions, majority_vote)",0.813515,0.766163,0.800347,0.195085
54,"(total, 0.5, contradictions, majority_vote)",0.813407,0.794204,0.818687,0.226988
42,"(batch, 1, contradictions, majority_vote)",0.812352,0.782249,0.807437,0.189528
26,"(pair, 1, contradictions, majority_vote)",0.804241,0.766163,0.797615,0.196745
60,"(total, 1, crowdtruth, majority_vote)",0.800245,0.751142,0.776627,0.176613
28,"(pair, 1, crowdtruth, majority_vote)",0.794848,0.736035,0.769632,0.186099
66,"(total, 2, contradictions, majority_vote)",0.793501,0.766163,0.783620,0.196364


In [6]:
#print(df.round(2).to_latex(index=False))
#gold[0]

 # Evaluation with respect to expected worker behavior

In [11]:
from utils_analysis import sort_by_key

# get agreement data
# NOTE: run/group/batch/n_q below are only consumed by the commented-out
# expert-data loading; the active code groups the already-loaded `gold`.

run = "4"
#group1 = 'reason_agreement_expert_inspection1'
group = 'reason_agreement*_expert_inspection*'
batch = '*'
n_q = '*'

# Disabled: derive agreement labels from the raw expert annotations.
# (As written it refers to group2 / expert_annotations1, which are not
# defined in this cell.)
#run4-group_reason_agreement_expert_inspection1
#expert_annotations = load_expert_data(run, group, n_q, batch)
#expert_annotations2 = load_expert_data(run, group2, n_q, batch)
#expert_annotations = expert_annotations1 + expert_annotations2
#expert_unit_agreement_dict = get_expert_agreement_labels(expert_annotations)
#agreement_labels = get_expert_agreement_labels(expert_annotations)

# Group the gold instances by their 'expected_agreement' field and list the
# categories that occur.
gold_by_agreement = sort_by_key(gold, ['expected_agreement'])
print('\n--- agreement categories---')
for l in gold_by_agreement.keys():
    print(l)

possible_disagreement
disagreement
disagreement

--- agreement categories---
possible_disagreement
disagreement
agreement


In [12]:
# Split the gold set into the three expected-agreement categories and
# report the size of each (agreement, possible disagreement, disagreement).
gold_agree = gold_by_agreement['agreement']
gold_poss_disagree = gold_by_agreement['possible_disagreement']
gold_disagree = gold_by_agreement['disagreement']
print(len(gold_agree), len(gold_poss_disagree), len(gold_disagree), sep='\n')

59
86
9


In [13]:
# Evaluate all configurations on the gold instances labelled 'agreement',
# rank them by relations-f1 (best first) and keep the reported columns.
overview_dicts = evaluate_configs(gold_agree, crowd)
df = pd.DataFrame(overview_dicts)
df = df.sort_values(by=['relations-f1'], ascending=False)
df = df[['config', 'relations-f1', 'levels-f1',
         'negative_relations-f1', 'alpha']]
# The CSV holds values rounded to two decimals; `df` itself stays unrounded.
df.round(2).to_csv('../analyses/evaluation_accuracy_agree.csv')
df

----Label distribution----
True: 7
False 52
----------------------------
59 17917 714
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,config,relations-f1,levels-f1,negative_relations-f1,alpha
5,ct_vote_0.65,0.907571,0.945028,0.892624,0.253746
6,ct_vote_0.7,0.903664,0.915001,0.888338,0.253746
8,ct_vote_0.8,0.898305,0.907611,0.882353,0.253746
7,ct_vote_0.75,0.898305,0.907611,0.882353,0.253746
22,"(pair, 0.5, contradictions, majority_vote)",0.896849,0.894802,0.879690,0.323356
54,"(total, 0.5, contradictions, majority_vote)",0.896849,0.894802,0.879690,0.342014
58,"(total, 1, contradictions, majority_vote)",0.896849,0.894802,0.879690,0.302838
4,ct_vote_0.6,0.883410,0.919539,0.863815,0.253746
62,"(total, 1.5, contradictions, majority_vote)",0.883410,0.870514,0.863815,0.291210
38,"(batch, 0.5, contradictions, majority_vote)",0.870077,0.822376,0.848004,0.302973


In [45]:
#print(df.to_latex(index=False))

In [14]:
# Evaluate all configurations on the gold instances labelled
# 'possible_disagreement', rank them by relations-f1 (best first) and keep
# the reported columns.
overview_dicts = evaluate_configs(gold_poss_disagree, crowd)
df = pd.DataFrame(overview_dicts)
df = df.sort_values(by=['relations-f1'], ascending=False)
df = df[['config', 'relations-f1', 'levels-f1',
         'negative_relations-f1', 'alpha']]
# The CSV holds values rounded to two decimals; `df` itself stays unrounded.
df.round(2).to_csv('../analyses/evaluation_accuracy_poss_disagree.csv')
df

----Label distribution----
True: 15
False 71
----------------------------
86 17917 926
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,config,relations-f1,levels-f1,negative_relations-f1,alpha
38,"(batch, 0.5, contradictions, majority_vote)",0.827984,0.712569,0.796961,0.097433
42,"(batch, 1, contradictions, majority_vote)",0.806482,0.690999,0.769889,0.099448
26,"(pair, 1, contradictions, majority_vote)",0.803523,0.689535,0.766422,0.119505
62,"(total, 1.5, contradictions, majority_vote)",0.803523,0.689535,0.766422,0.121144
66,"(total, 2, contradictions, majority_vote)",0.794241,0.668349,0.754623,0.122368
58,"(total, 1, contradictions, majority_vote)",0.794241,0.668349,0.754623,0.127916
54,"(total, 0.5, contradictions, majority_vote)",0.790819,0.665935,0.750513,0.135779
45,"(batch, 1, crowdtruth, top_vote)",0.787703,0.690999,0.748173,0.125144
61,"(total, 1, crowdtruth, top_vote)",0.787703,0.690999,0.760412,0.114164
16,top_vote_ct_0.7,0.786728,0.645183,0.750513,0.119937


In [47]:
# Evaluate all configurations on the gold instances labelled 'disagreement',
# rank them by relations-f1 (best first) and keep the reported columns.
overview_dicts = evaluate_configs(gold_disagree, crowd)
df = pd.DataFrame(overview_dicts)
df = df.sort_values(by=['relations-f1'], ascending=False)
df = df[['config', 'relations-f1', 'levels-f1',
         'negative_relations-f1', 'alpha']]
# The CSV holds values rounded to two decimals; `df` itself stays unrounded.
df.round(2).to_csv('../analyses/evaluation_accuracy_disagree.csv')
df

----Label distribution----
True: 3
False 2
----------------------------
5 17917 53
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,config,relations-f1,levels-f1,negative_relations-f1,alpha
38,"(batch, 0.5, contradictions, majority_vote)",0.780952,0.780952,0.780952,-0.079394
54,"(total, 0.5, contradictions, majority_vote)",0.600000,0.600000,0.600000,-0.059838
6,ct_vote_0.7,0.566667,0.566667,0.566667,-0.085522
71,"(None, None, exclude_contradictory_annotations...",0.450000,0.450000,0.450000,0.001299
39,"(batch, 0.5, contradictions, top_vote)",0.450000,0.450000,0.450000,-0.079394
29,"(pair, 1, crowdtruth, top_vote)",0.450000,0.450000,0.450000,-0.087568
59,"(total, 1, contradictions, top_vote)",0.450000,0.450000,0.450000,-0.062309
31,"(pair, 1.5, contradictions, top_vote)",0.450000,0.450000,0.450000,-0.088757
33,"(pair, 1.5, crowdtruth, top_vote)",0.450000,0.450000,0.450000,-0.084635
57,"(total, 0.5, crowdtruth, top_vote)",0.450000,0.450000,0.450000,-0.136601
