In [12]:
from clean_annotations import clean_workers
from load_data import load_experiment_data, load_expert_data, load_gold_data
from aggregation import aggregate_binary_labels

from load_data import load_experiment_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key
from utils_analysis import load_analysis, load_ct

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
from collections import defaultdict


def iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict):
    """Print the crowd agreement for each expert agreement label.

    The crowd annotations are grouped per (relation, property, concept)
    triple with ``sort_by_key``.  The annotations of all triples that share
    the same expert label are pooled, and ``get_agreement`` is run once per
    pool.

    Args:
        data_dict_list: crowd annotations, one dict per annotation.
        expert_unit_agreement_dict: mapping from triple key to the expert
            label of that triple (as produced by
            ``get_expert_agreement_labels``).

    Prints one line per expert label: the label, the ``'Krippendorff'``
    entry of the agreement result and the number of pooled annotations.
    Returns nothing.
    """
    # NOTE(review): assumes sort_by_key returns a mapping whose keys have the
    # same format as the keys of expert_unit_agreement_dict -- confirm.
    annotations_by_triple = sort_by_key(
        data_dict_list, ['relation', 'property', 'concept'])

    pooled_by_label = defaultdict(list)
    for triple, expert_label in expert_unit_agreement_dict.items():
        pooled_by_label[expert_label].extend(annotations_by_triple[triple])

    for expert_label, pooled in pooled_by_label.items():
        scores = get_agreement(pooled, v=False)
        print(expert_label, scores['Krippendorff'], len(pooled))
        

def get_expert_agreement_labels(expert_annotations):
    """Derive one agreement label per unit from the expert annotations.

    Annotations are grouped per (relation, property, concept) unit with
    ``sort_by_key``.  Within a unit, every dict key that starts with
    ``'disagreement_'`` counts as one expert judgement; annotations whose
    ``workerid`` ends in ``'_test1'`` are ignored.

    Args:
        expert_annotations: expert annotations, one dict per annotation.
            Each dict must contain a ``'workerid'`` entry.

    Returns:
        dict mapping each unit key to one of:

        * ``'agreement'`` -- every judgement is ``'disagreement_agreement'``
        * ``'possible_disagreement'`` -- some, but not all, judgements are
          ``'disagreement_agreement'``
        * ``'disagreement'`` -- no judgement is ``'disagreement_agreement'``

        Units without a single countable judgement are left out of the
        result; dividing by their judgement count used to raise
        ``ZeroDivisionError``.
    """
    expert_annotations_by_unit = sort_by_key(
        expert_annotations, ['relation', 'property', 'concept'])

    unit_agreement_dict = dict()
    for unit, annotations in expert_annotations_by_unit.items():
        judgements = []
        for annotation in annotations:
            if annotation['workerid'].endswith('_test1'):
                continue
            # NOTE(review): keys are counted by presence, not by value --
            # presumably the loader only sets the keys an expert selected;
            # confirm against load_expert_data.
            judgements.extend(
                key for key in annotation if key.startswith('disagreement_'))

        if not judgements:
            # Nothing to base a label on (only test workers, or no
            # 'disagreement_*' keys): skip instead of dividing by zero.
            continue

        n_agree = judgements.count('disagreement_agreement')
        # Integer comparison instead of testing a float proportion == 1.0.
        if n_agree == len(judgements):
            unit_agreement_dict[unit] = 'agreement'
        elif n_agree > 0:
            unit_agreement_dict[unit] = 'possible_disagreement'
        else:
            unit_agreement_dict[unit] = 'disagreement'
    return unit_agreement_dict


def get_agreement_by_unit(data_dict_list):
    """Return the ``'Proportional'`` agreement score of every unit.

    The annotations are grouped per (relation, property, concept) unit with
    ``sort_by_key`` and ``get_agreement`` is called once per group (kappa
    disabled, non-verbose).

    Args:
        data_dict_list: crowd annotations, one dict per annotation.

    Returns:
        dict mapping each unit key to the ``'Proportional'`` entry of the
        agreement result for that unit.
    """
    annotations_by_unit = sort_by_key(
        data_dict_list, ['relation', 'property', 'concept'])
    return {
        unit: get_agreement(
            annotations, v=False, disable_kappa=True)['Proportional']
        for unit, annotations in annotations_by_unit.items()
    }


def get_agreement_by_pair(data_dict_list, ag_metric):
    """Assign each triple the agreement score of its property-concept pair.

    The annotations are grouped per (property, concept) pair with
    ``sort_by_key``; ``get_agreement`` is computed once per pair and the
    requested metric is stored for every triple that occurs in that pair.

    Args:
        data_dict_list: crowd annotations, one dict per annotation.
        ag_metric: key of the agreement result to report
            (e.g. ``'Proportional'``).

    Returns:
        dict mapping ``"<relation>-<property>-<concept>"`` to the pair-level
        score.
    """
    scores_by_triple = {}
    annotations_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    for annotations in annotations_by_pair.values():
        pair_scores = get_agreement(annotations, v=False, disable_kappa=True)
        # All triples of a pair share the pair's score.
        scores_by_triple.update({
            "{}-{}-{}".format(
                annotation['relation'],
                annotation['property'],
                annotation['concept'],
            ): pair_scores[ag_metric]
            for annotation in annotations
        })
    return scores_by_triple

def get_contradictions_by_pair(data_dict_list, pair_analysis):
    """Assign each triple the contradiction ratio of its property-concept pair.

    For every (property, concept) pair found in the annotations, the first
    analysis record of that pair is read and the ratio
    ``n_workers_contradicting / n_workers`` is stored for every triple that
    occurs in the pair.

    Args:
        data_dict_list: crowd annotations, one dict per annotation.
        pair_analysis: analysis records with at least the entries ``'pair'``,
            ``'n_workers'`` and ``'n_workers_contradicting'``.

    Returns:
        dict mapping ``"<relation>-<property>-<concept>"`` to the ratio of
        contradicting workers of the corresponding pair.
    """
    ratio_by_triple = {}
    annotations_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    # NOTE(review): assumes the 'pair' values of the analysis records use the
    # same key format as the (property, concept) grouping above -- confirm.
    records_by_pair = sort_by_key(pair_analysis, ['pair'])

    for pair, annotations in annotations_by_pair.items():
        record = records_by_pair[pair][0]
        n_total = record['n_workers']
        n_contradicting = record['n_workers_contradicting']
        pair_ratio = n_contradicting / n_total
        ratio_by_triple.update({
            "{}-{}-{}".format(
                annotation['relation'],
                annotation['property'],
                annotation['concept'],
            ): pair_ratio
            for annotation in annotations
        })
    return ratio_by_triple
            
        


def get_uqs_by_unit(data_dict_list, ct_units):
    """Look up the ``'uqs'`` value of every annotated triple.

    Args:
        data_dict_list: crowd annotations, one dict per annotation; each
            needs the entries ``'quid'``, ``'relation'``, ``'property'`` and
            ``'concept'``.
        ct_units: unit-level records with at least the entries ``'unit'``
            and ``'uqs'`` (presumably the CrowdTruth unit quality score --
            confirm against ``load_ct``).

    Returns:
        dict mapping ``"<relation>-<property>-<concept>"`` to the ``'uqs'``
        value of the first record whose unit matches the annotation's
        ``'quid'``.  Annotations without a matching record are skipped.
    """
    records_by_unit = sort_by_key(ct_units, ['unit'])
    uqs_by_triple = {}
    for annotation in data_dict_list:
        question_id = annotation['quid']
        if question_id not in records_by_unit:
            continue
        score = records_by_unit[question_id][0]['uqs']
        key = "{}-{}-{}".format(
            annotation['relation'],
            annotation['property'],
            annotation['concept'],
        )
        uqs_by_triple[key] = score
    return uqs_by_triple


def evaluate(expert_unit_agreement_dict, crowd_data, thresh, v=True):
    """Score threshold-based disagreement predictions against expert labels.

    The task is binary: the expert label ``'disagreement'`` is folded into
    ``'possible_disagreement'``.  A unit is predicted as
    ``'possible_disagreement'`` when its crowd score is below ``thresh`` and
    as ``'agreement'`` otherwise.  Units without a crowd score are ignored.

    Args:
        expert_unit_agreement_dict: mapping from unit key to expert label.
        crowd_data: mapping from unit key to a numeric crowd score.
        thresh: scores strictly below this value count as disagreement.
        v: when equal to ``True``, print a count table (gold / predicted /
            correct per class) followed by precision, recall and F1.

    Returns:
        The weighted F1 score computed by ``p_r_f1``.
    """
    gold, predictions, correct_predictions = [], [], []

    for unit, label in expert_unit_agreement_dict.items():
        if unit not in crowd_data:
            # No crowd annotations for this unit.
            continue
        expected = 'possible_disagreement' if label == 'disagreement' else label
        pred = 'possible_disagreement' if crowd_data[unit] < thresh else 'agreement'
        gold.append(expected)
        predictions.append(pred)
        if pred == expected:
            correct_predictions.append(pred)

    p, r, f1, support = p_r_f1(gold, predictions, average='weighted')

    # Equality test kept on purpose: only True (or 1) enables the report.
    if v == True:
        print('-------------------------------')
        print('\t gold \t prediction \t correct' )
        rows = (("Agreement", 'agreement'),
                ("Disagreement", 'possible_disagreement'))
        for title, cls in rows:
            print(title, '\t', gold.count(cls),
                  '\t', predictions.count(cls),
                  '\t', correct_predictions.count(cls))
        print('--------------------------------')
        print(p, r, f1)
    return f1



In [6]:
# Load the expert annotations and derive one agreement label per unit.

# Selection passed to load_expert_data; '*' is used as a wildcard.
run = "4"
# Earlier, single-group selection:
#group1 = 'reason_agreement_expert_inspection1'
group = 'reason_agreement*_expert_inspection*'
batch = '*'
n_q = '*'

# Example of a matching directory: run4-group_reason_agreement_expert_inspection1
expert_annotations = load_expert_data(run, group, n_q, batch)
# Earlier variant that loaded two groups separately and concatenated them:
#expert_annotations2 = load_expert_data(run, group2, n_q, batch)
#expert_annotations = expert_annotations1 + expert_annotations2
expert_unit_agreement_dict = get_expert_agreement_labels(expert_annotations)
# Debug helper: print the label of every unit.
#for k, v in expert_unit_agreement_dict.items():
 #   print(k, v)

run4-group_reason_agreement_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection4/qu44-s_qu44-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data


In [7]:
# Load the crowd data and the precomputed analyses for all experiment runs.
# The names run/group/batch/n_q are rebound here (they held the expert
# selection before).
run = "*"
group = 'experiment*'
batch = '*'
n_q = '*'

# Unit-level records; read by get_uqs_by_unit (entries 'unit' and 'uqs').
analysis_type = 'units'
ct_units = load_ct(run, group, batch, analysis_type, as_dict=True)

# Pair-level records; read by get_contradictions_by_pair.
analysis_type = 'pairs'
pair_analysis =  load_analysis(analysis_type, run, group, batch, as_dict=True)


# Crowd annotations, one dict per annotation.
# NOTE(review): remove_not_val=True presumably drops invalid annotations
# (the cell output reports the number discarded) -- confirm in load_data.
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)

Discarded 655.0 annotations.


In [13]:
# Agreement overview: for each expert label, print the label, the crowd's
# 'Krippendorff' agreement entry and the number of pooled crowd annotations.

iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict)

agreement 0.292535628185175 573
possible_disagreement 0.10690028757842951 947
disagreement 0.23863259210089616 223


In [14]:
# Threshold sweep using the per-triple 'uqs' value as the crowd score.
data_uqs = get_uqs_by_unit(data_dict_list, ct_units)

# Units scoring below the threshold are predicted as disagreement.
threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []  # weighted F1 per threshold, in the order of threshs
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_uqs, thresh, v=True)
    f1s.append(f1)
    print(f1, thresh)

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 154 	 46
Disagreement 	 108 	 0 	 0
--------------------------------
0.08922246584584247 0.2987012987012987 0.1374025974025974
0.1374025974025974 0.4
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 134 	 44
Disagreement 	 108 	 20 	 18
--------------------------------
0.729249854622989 0.4025974025974026 0.3432720057720058
0.3432720057720058 0.45
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 89 	 31
Disagreement 	 108 	 65 	 50
--------------------------------
0.6435025648508794 0.525974025974026 0.5425563090302975
0.5425563090302975 0.5
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 62 	 25
Disagreement 	 108 	 92 	 71
--------------------------------
0.6616637219722773 0.6233766233766234 0.636209716209716
0.636209716209716 0.55
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 53 	 2

  'precision', 'predicted', average, warn_for)


In [14]:
# Threshold sweep using the per-unit 'Proportional' agreement as the crowd
# score.
data_ag = get_agreement_by_unit(data_dict_list)

# Units scoring below the threshold are predicted as disagreement.
threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []  # weighted F1 per threshold, in the order of threshs
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_ag, thresh, v=True)
    f1s.append(f1)
    print(f1, thresh)

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 40 	 110 	 40
Disagreement 	 70 	 0 	 0
--------------------------------
0.1322314049586777 0.36363636363636365 0.19393939393939394
0.19393939393939394 0.4
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 40 	 85 	 36
Disagreement 	 70 	 25 	 21
--------------------------------
0.6885561497326202 0.5181818181818182 0.4907942583732058
0.4907942583732058 0.45
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 40 	 56 	 28
Disagreement 	 70 	 54 	 42
--------------------------------
0.6767676767676768 0.6363636363636364 0.6432062561094819
0.6432062561094819 0.5
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 40 	 31 	 19
Disagreement 	 70 	 79 	 58
--------------------------------
0.6900775826868111 0.7 0.6900463181775216
0.6900463181775216 0.55
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 40 	 25 	 16
Disagreement

In [16]:
# Threshold sweep using the agreement of the (property, concept) pair as the
# crowd score of each triple.
ag_metric = 'Proportional'
data_ag_pair = get_agreement_by_pair(data_dict_list, ag_metric)

# Alternative, lower threshold range tried earlier:
#threshs = [0, 0.05, 0.1, 0.15, 0.20]
threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []  # weighted F1 per threshold, in the order of threshs
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_ag_pair, thresh, v=True)
    f1s.append(f1)
    print(f1, thresh)
    print()


-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 154 	 46
Disagreement 	 108 	 0 	 0
--------------------------------
0.08922246584584247 0.2987012987012987 0.1374025974025974
0.1374025974025974 0.4

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 154 	 46
Disagreement 	 108 	 0 	 0
--------------------------------
0.08922246584584247 0.2987012987012987 0.1374025974025974
0.1374025974025974 0.45

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 82 	 23
Disagreement 	 108 	 72 	 49
--------------------------------
0.5610547988596769 0.4675324675324675 0.4891639610389611
0.4891639610389611 0.5

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 58 	 21
Disagreement 	 108 	 96 	 71
--------------------------------
0.6268193013882669 0.5974025974025974 0.6087882705529765
0.6087882705529765 0.55

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	

In [15]:
# Threshold sweep using the ratio of contradicting workers of each
# (property, concept) pair as the crowd score.

data_cont = get_contradictions_by_pair(data_dict_list, pair_analysis)

# NOTE(review): evaluate() predicts 'possible_disagreement' when the score is
# BELOW the threshold.  For a contradiction ratio that means pairs with few
# contradicting workers are predicted as disagreement, which looks inverted
# compared to the agreement-based scores -- confirm that this is intended.
threshs = [0.1, 0.15, 0.20, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5]
f1s = []  # weighted F1 per threshold, in the order of threshs
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_cont, thresh, v=True)
    f1s.append(f1)
    print(f1, thresh)
    print()

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 100 	 31
Disagreement 	 108 	 54 	 39
--------------------------------
0.5990909090909091 0.45454545454545453 0.4645080946450809
0.4645080946450809 0.1

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 80 	 29
Disagreement 	 108 	 74 	 57
--------------------------------
0.6484687609687609 0.5584415584415584 0.5767724339152911
0.5767724339152911 0.15

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 80 	 29
Disagreement 	 108 	 74 	 57
--------------------------------
0.6484687609687609 0.5584415584415584 0.5767724339152911
0.5767724339152911 0.2

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46 	 60 	 21
Disagreement 	 108 	 94 	 69
--------------------------------
0.6193285437966288 0.5844155844155844 0.5974583981309183
0.5974583981309183 0.25

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 46