# Evaluation of different crowd analysis metrics


(1) Dataset filtering:

* Exclude workers based on:
    - contradiction ratio (absolute threshold, batch stdev, pair stdev, total stdev)
    - worker quality score (thresholds)
    - attention check fails 
    
(2) Label aggregation:

* Majority vote
* Top vote
* CT unit-label score (thresholds)

In [17]:
from clean_annotations import clean_workers
from load_data import load_experiment_data, load_expert_data, load_gold_data
from aggregation import aggregate_binary_labels

from load_data import load_experiment_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key
from utils_analysis import load_analysis, load_ct

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
from collections import defaultdict


def iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict):
    """Print the inter-annotator agreement per expert agreement category.

    The annotations in ``data_dict_list`` are grouped per unit with
    ``sort_by_key`` (keyed on relation, property and concept). The
    annotations of all units sharing the same category in
    ``expert_unit_agreement_dict`` are then pooled, and for every pool the
    ``'Krippendorff'`` entry returned by ``get_agreement`` is printed next
    to the category name.

    Nothing is returned; the function only prints.
    """
    annotations_per_unit = sort_by_key(
        data_dict_list, ['relation', 'property', 'concept'])

    # Pool the annotations of all units that experts put in the same category.
    annotations_per_category = defaultdict(list)
    for unit, category in expert_unit_agreement_dict.items():
        annotations_per_category[category].extend(annotations_per_unit[unit])

    # One agreement score per category (v=False is passed through to
    # get_agreement; presumably it silences verbose output).
    for category, pooled in annotations_per_category.items():
        scores = get_agreement(pooled, v=False)
        print(category, scores['Krippendorff'])
        

def get_expert_agreement_labels(expert_annotations):
    """Assign an expected-agreement category to every expert-annotated unit.

    The expert annotations are grouped per unit with ``sort_by_key`` (keyed
    on relation, property and concept). For every annotation whose
    ``workerid`` does not end in ``'_test1'``, the *names* of all fields
    starting with ``'disagreement_'`` are collected. A unit is labelled

    * ``'agreement'`` if every collected field is
      ``'disagreement_agreement'``,
    * ``'possible_disagreement'`` if only some of them are,
    * ``'disagreement'`` if none of them is.

    Units for which no such field was collected (for example, units that
    were only annotated by ``'_test1'`` workers) are left out of the result;
    previously they caused a ``ZeroDivisionError``.

    Returns a dict mapping each unit key (as produced by ``sort_by_key``) to
    one of the three category strings.
    """
    expert_annotations_by_unit = sort_by_key(
        expert_annotations, ['relation', 'property', 'concept'])
    unit_agreement_dict = dict()
    for unit, data in expert_annotations_by_unit.items():
        # NOTE(review): only the presence of a 'disagreement_*' key counts,
        # not its value - presumably the loader keeps only the options an
        # expert actually selected; confirm against load_expert_data.
        agreements = []
        for d in data:
            if d['workerid'].endswith('_test1'):
                # Annotations from test workers are not counted.
                continue
            for k in d:
                if k.startswith('disagreement_'):
                    agreements.append(k)

        if not agreements:
            # No expert judgement available, so no category can be derived.
            continue

        n_agree = agreements.count('disagreement_agreement')
        # Integer comparison instead of checking a float proportion == 1.0.
        if n_agree == len(agreements):
            unit_agreement_dict[unit] = 'agreement'
        elif n_agree > 0:
            unit_agreement_dict[unit] = 'possible_disagreement'
        else:
            unit_agreement_dict[unit] = 'disagreement'
    return unit_agreement_dict




def get_gold_dis_agreement(gold, agreement_labels):
    """Group gold instances by their expert agreement category.

    For every gold instance the unit key ``'<relation>-<property>-<concept>'``
    is built and looked up in ``agreement_labels``; the instance is appended
    to the list of the category found there.

    Returns a ``defaultdict(list)`` mapping category -> list of gold
    instances. Raises ``KeyError`` if a gold unit has no entry in
    ``agreement_labels``.
    """
    grouped = defaultdict(list)
    for instance in gold:
        unit_key = '{}-{}-{}'.format(
            instance['relation'], instance['property'], instance['concept'])
        grouped[agreement_labels[unit_key]].append(instance)
    return grouped
    


# Total evaluation

In [40]:
from load_data import load_experiment_data, load_gold_data
from evaluation import evaluate_configs

import pandas as pd


# Load the gold annotations of run 4. run/group/n_q/batch are handed to
# load_gold_data to select the input data; '*' presumably acts as a wildcard
# matching every group / question count / batch (confirm in load_data).
run = 4
group = '*'
n_q = '*'
batch = '*'
gold = load_gold_data(run, group, n_q, batch)
# Sanity check: show every gold entry that has no 'answer' field.
for d in gold:
    if 'answer' not in d:
        print(d)
print('number of gold instances: ', len(gold))

# Load the crowd annotations. The selection variables are rebound here, so
# from this point on run/group/n_q/batch describe the crowd data
# (any run, groups matching 'experiment*').
run = '*'
group = 'experiment*'
n_q = '*'
batch = '*'
crowd = load_experiment_data(run, group, n_q, batch)

# Evaluate the crowd data against the complete gold set; the result is turned
# into one table row per entry (see the 'config' column).
overview_dicts = evaluate_configs(gold, crowd)
df =  pd.DataFrame(overview_dicts) 
# Rank by 'relations-f1' (best first) and keep only the reported columns.
df = df.sort_values(by=['relations-f1'], ascending=False)[['config',
                                                      'relations-f1',
                                                      'levels-f1', 
                                                        'negative_relations-f1', 'alpha']]
# The CSV gets values rounded to two decimals; df itself stays unrounded.
df.round(2).to_csv('../analyses/evaluation_accuracy_full.csv')
# Bare expression: displays the table as the notebook cell output.
df

number of gold instances:  90
Discarded 655.0 annotations.
----Label distribution----
True: 20
False 70
----------------------------
90 17917 1073
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,config,relations-f1,levels-f1,negative_relations-f1,alpha
38,"(batch, 0.5, contradictions, majority_vote)",0.874872,0.925088,0.884019,0.190106
54,"(total, 0.5, contradictions, majority_vote)",0.851107,0.899408,0.856002,0.207026
58,"(total, 1, contradictions, majority_vote)",0.841359,0.899408,0.856002,0.185277
46,"(batch, 1.5, contradictions, majority_vote)",0.835391,0.876306,0.833309,0.173638
62,"(total, 1.5, contradictions, majority_vote)",0.831650,0.875147,0.843644,0.174478
22,"(pair, 0.5, contradictions, majority_vote)",0.827689,0.899408,0.840169,0.191624
5,ct_vote_0.65,0.824302,0.866011,0.815663,0.149496
42,"(batch, 1, contradictions, majority_vote)",0.823428,0.876306,0.832473,0.174412
55,"(total, 0.5, contradictions, top_vote)",0.818796,0.725323,0.780059,0.207026
6,ct_vote_0.7,0.814173,0.770940,0.775669,0.149496


In [43]:
#print(df.round(2).to_latex(index=False))

 # Evaluation with respect to expected worker behavior

In [18]:


# get agreement data

# Expert annotations of run 4. The group pattern covers both the
# 'reason_agreement_expert_inspection*' and the
# 'reason_agreement_False_expert_inspection*' groups.
run = "4"
group = 'reason_agreement*_expert_inspection*'
batch = '*'
n_q = '*'

expert_annotations = load_expert_data(run, group, n_q, batch)

# Derive the per-unit agreement category once. Both names are kept so that
# code relying on either of them keeps working; agreement_labels is a copy,
# so the two dicts stay independent objects as before.
expert_unit_agreement_dict = get_expert_agreement_labels(expert_annotations)
agreement_labels = dict(expert_unit_agreement_dict)

# Split the gold instances (loaded in the "Total evaluation" cell) by category.
gold_by_agreement = get_gold_dis_agreement(gold, agreement_labels)
print('\n--- agreement categories---')
for label in gold_by_agreement:
    print(label)

run4-group_reason_agreement_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data

--- agreement categories---
agreement
possible_disagreement
disagreement


In [23]:
# Split the gold set into the three expert agreement categories and print the
# size of each subset. gold_by_agreement comes from get_gold_dis_agreement,
# which returns a defaultdict(list), so a category without any gold instance
# yields an empty list here instead of raising a KeyError.
gold_agree = gold_by_agreement['agreement']
gold_poss_disagree = gold_by_agreement['possible_disagreement']
gold_disagree = gold_by_agreement['disagreement']
print(len(gold_agree))
print(len(gold_poss_disagree))
print(len(gold_disagree))

40
45
5


In [44]:
# agree
# Same evaluation as for the full gold set, restricted to the gold instances
# of the 'agreement' category (gold_agree).

overview_dicts = evaluate_configs(gold_agree, crowd)
df =  pd.DataFrame(overview_dicts) 
# Rank by 'relations-f1' (best first) and keep only the reported columns.
df = df.sort_values(by=['relations-f1'], ascending=False)[['config',
                                                      'relations-f1',
                                                      'levels-f1', 
                                                        'negative_relations-f1', 'alpha']]
# The CSV gets values rounded to two decimals; df itself stays unrounded.
df.round(2).to_csv('../analyses/evaluation_accuracy_agree.csv')
# Bare expression: displays the table as the notebook cell output.
df

----Label distribution----
True: 7
False 33
----------------------------
40 17917 513
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,config,relations-f1,levels-f1,negative_relations-f1,alpha
22,"(pair, 0.5, contradictions, majority_vote)",0.929832,0.959664,0.919423,0.346069
54,"(total, 0.5, contradictions, majority_vote)",0.929832,0.921296,0.919423,0.370216
58,"(total, 1, contradictions, majority_vote)",0.929832,0.921296,0.919423,0.334021
6,ct_vote_0.7,0.926923,0.916667,0.916364,0.267403
8,ct_vote_0.8,0.922675,0.908333,0.911741,0.267403
7,ct_vote_0.75,0.922675,0.908333,0.911741,0.267403
62,"(total, 1.5, contradictions, majority_vote)",0.907885,0.884066,0.894017,0.320205
5,ct_vote_0.65,0.904687,0.916667,0.890741,0.267403
28,"(pair, 1, crowdtruth, majority_vote)",0.886324,0.884066,0.868937,0.274504
60,"(total, 1, crowdtruth, majority_vote)",0.886324,0.884066,0.868937,0.258721


In [45]:
#print(df.to_latex(index=False))

In [46]:
# possible disagreement 
# Same evaluation as for the full gold set, restricted to the gold instances
# of the 'possible_disagreement' category (gold_poss_disagree).

overview_dicts = evaluate_configs(gold_poss_disagree, crowd)
df =  pd.DataFrame(overview_dicts) 
# Rank by 'relations-f1' (best first) and keep only the reported columns.
df = df.sort_values(by=['relations-f1'], ascending=False)[['config',
                                                      'relations-f1',
                                                      'levels-f1', 
                                                        'negative_relations-f1', 'alpha']]
# The CSV gets values rounded to two decimals; df itself stays unrounded.
df.round(2).to_csv('../analyses/evaluation_accuracy_poss_disagree.csv')
# Bare expression: displays the table as the notebook cell output.
df

----Label distribution----
True: 10
False 35
----------------------------
45 17917 507
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,config,relations-f1,levels-f1,negative_relations-f1,alpha
38,"(batch, 0.5, contradictions, majority_vote)",0.893648,0.898190,0.925996,0.051811
26,"(pair, 1, contradictions, majority_vote)",0.827689,0.830317,0.850400,0.051209
46,"(batch, 1.5, contradictions, majority_vote)",0.814815,0.832859,0.806706,0.057680
58,"(total, 1, contradictions, majority_vote)",0.808566,0.798246,0.827323,0.049030
62,"(total, 1.5, contradictions, majority_vote)",0.808566,0.798246,0.827323,0.043946
42,"(batch, 1, contradictions, majority_vote)",0.808566,0.832859,0.827323,0.048032
54,"(total, 0.5, contradictions, majority_vote)",0.808566,0.798246,0.827323,0.062219
22,"(pair, 0.5, contradictions, majority_vote)",0.803313,0.830317,0.823212,0.064561
55,"(total, 0.5, contradictions, top_vote)",0.793002,0.700000,0.758383,0.062219
5,ct_vote_0.65,0.791145,0.803030,0.760531,0.045904


In [47]:
# disagree
# Same evaluation as for the full gold set, restricted to the gold instances
# of the 'disagreement' category (gold_disagree).
overview_dicts = evaluate_configs(gold_disagree, crowd)
df =  pd.DataFrame(overview_dicts) 
# Rank by 'relations-f1' (best first) and keep only the reported columns.
df = df.sort_values(by=['relations-f1'], ascending=False)[['config',
                                                      'relations-f1',
                                                      'levels-f1', 
                                                        'negative_relations-f1', 'alpha']]
# The CSV gets values rounded to two decimals; df itself stays unrounded.
df.round(2).to_csv('../analyses/evaluation_accuracy_disagree.csv')
# Bare expression: displays the table as the notebook cell output.
df

----Label distribution----
True: 3
False 2
----------------------------
5 17917 53
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)


clean all contradictory annotations


Unnamed: 0,config,relations-f1,levels-f1,negative_relations-f1,alpha
38,"(batch, 0.5, contradictions, majority_vote)",0.780952,0.780952,0.780952,-0.079394
54,"(total, 0.5, contradictions, majority_vote)",0.600000,0.600000,0.600000,-0.059838
6,ct_vote_0.7,0.566667,0.566667,0.566667,-0.085522
71,"(None, None, exclude_contradictory_annotations...",0.450000,0.450000,0.450000,0.001299
39,"(batch, 0.5, contradictions, top_vote)",0.450000,0.450000,0.450000,-0.079394
29,"(pair, 1, crowdtruth, top_vote)",0.450000,0.450000,0.450000,-0.087568
59,"(total, 1, contradictions, top_vote)",0.450000,0.450000,0.450000,-0.062309
31,"(pair, 1.5, contradictions, top_vote)",0.450000,0.450000,0.450000,-0.088757
33,"(pair, 1.5, crowdtruth, top_vote)",0.450000,0.450000,0.450000,-0.084635
57,"(total, 0.5, crowdtruth, top_vote)",0.450000,0.450000,0.450000,-0.136601
