In [6]:
from clean_annotations import clean_workers
from load_data import load_experiment_data, load_gold_data
from aggregation import aggregate_binary_labels

from load_data import load_experiment_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key
from utils_analysis import load_analysis, load_ct
from utils_analysis import load_contradiction_pairs
from utils_analysis import collect_contradictions

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
from collections import defaultdict


def iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict):
    
    data_by_agreement = defaultdict(list)
    data_by_triple = sort_by_key(data_dict_list, ['relation', 'property', 'concept'])
    
    for t, gold_expect in expert_unit_agreement_dict.items():
        data = data_by_triple[t]
        data_by_agreement[gold_expect].extend(data)
    
        
    for exp, data in data_by_agreement.items():
        agreement = get_agreement(data, v=False)
        data_by_triple = sort_by_key(data, ['relation', 'property', 'concept'])
        print(exp, agreement['Krippendorff'], len(data_by_triple))
        

def get_expected_behavior(gold):
    unit_behavior_dict = dict()
    for d in gold:
        unit =  f"{d['relation']}-{d['property']}-{d['concept']}"
        exp = d['expected_agreement']
        if exp != 'agreement':
            unit_behavior_dict[unit] = 'disagreement'
        else:
            unit_behavior_dict[unit] = 'agreement'
    return unit_behavior_dict





def get_agreement_by_unit(data_dict_list):

    agreement_unit_dict = dict()
    data_by_unit = sort_by_key(data_dict_list, ['relation', 'property', 'concept'])
    for unit, dl_unit in data_by_unit.items():
        agreement = get_agreement(dl_unit, v=False, disable_kappa=True)
        agreement_unit_dict[unit] = agreement['Proportional']
    return agreement_unit_dict


def get_agreement_by_pair(data_dict_list, ag_metric):

    agreement_unit_dict = dict()
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    for pair, dl_unit in data_by_pair.items():
        agreement = get_agreement(dl_unit, v=False, disable_kappa=True)
        for d in dl_unit:
            triple = f"{d['relation']}-{d['property']}-{d['concept']}"
            agreement_unit_dict[triple] = agreement[ag_metric]
    
    return agreement_unit_dict

def get_contradictions_by_pair(data_dict_list):
    contradictions = load_contradiction_pairs()
    contradictions_unit_dict = dict()
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    analysis_by_pair = sort_by_key(pair_analysis, ['pair'])
    for pair, data_pair in data_by_pair.items():
        data_by_worker = sort_by_key(data_pair, ['workerid'])
        n_possible_contradictions = 0
        n_contradictions = 0
        for w, data in data_by_worker.items():
            pair_worker_cont = collect_contradictions(data, contradictions, threshold = 0)
            relations = [d['relation'] for d in data]
            for r1, r2 in contradictions:
                if r1 in relations and r2 in relations:
                    n_possible_contradictions += 1
            n_contradictions += len(pair_worker_cont)
        relations = set([d['relation'] for d in data_pair])
        for r in relations:
            unit = f'{r}-{pair}'
            if n_possible_contradictions == 0:
                contradictions_unit_dict[unit] = 0
            else:
                contradictions_unit_dict[unit] = n_contradictions/n_possible_contradictions

    return contradictions_unit_dict
            
        


def get_uqs_by_unit(data_dict_list, ct_units):
    ct_by_unit = sort_by_key(ct_units, ['unit'])
    uqs_unit_dict = dict()
    for d in data_dict_list:
        quid = d['quid']
        if quid in ct_by_unit:
            uqs = ct_by_unit[quid][0]['uqs']
            triple = f"{d['relation']}-{d['property']}-{d['concept']}"
            uqs_unit_dict[triple] = uqs
    return uqs_unit_dict


def evaluate(expert_unit_agreement_dict, crowd_data, thresh, v=True):
    gold = []
    predictions = []
    correct_predictions = []
    units_disagree = []
    for unit, label in expert_unit_agreement_dict.items():
        if label == 'disagreement':
            label = 'possible_disagreement'
        if unit in crowd_data:
            score = crowd_data[unit]
            if score < thresh:
                pred = 'possible_disagreement'
                units_disagree.append(unit)
            else:
                pred = 'agreement'  
            if pred == label:
                #print(label, pred) 
                correct_predictions.append(pred)
            gold.append(label)
            predictions.append(pred)
        else:
            pass
            #print(unit, 'no annotations')
    p, r, f1, support = p_r_f1(gold, predictions, average = 'weighted')
    if v == True:
        print('-------------------------------')
        print('\t gold \t prediction \t correct' )
        print("Agreement" ,'\t', gold.count('agreement'),
              '\t', predictions.count('agreement'), '\t', correct_predictions.count('agreement'))
        print("Disagreement",'\t', gold.count('possible_disagreement'),
              '\t', predictions.count('possible_disagreement'),
              '\t', correct_predictions.count('possible_disagreement'))
        print('--------------------------------')
    print(f1, p, r)
    return round(f1, 2), round(p, 2), round(r, 2), units_disagree



In [2]:
# load expert data 

# load gold
group = 'reason_agreement*_expert_inspection*'
run = 4
gold = load_gold_data(run, group)
print(gold[0])
for d in gold:
    if 'answer' not in d:
        print(d)
print('number of gold instances: ', len(gold))|

expert_unit_agreement_dict = get_expected_behavior(gold)


#for k, v in expert_unit_agreement_dict.items():
 #   print(k, v)

{'answer': False, 'completionurl': 'expert_annotation', 'concept': 'shovel', 'expected_agreement': 'possible_disagreement', 'property': 'roll', 'quid': 'impossible-shovel-roll', 'relation': 'impossible', 'workerid': 'gold'}
number of gold instances:  154


In [3]:
run = "*"
group = 'experiment*'
batch = '*'
n_q = '*'

analysis_type = 'units'
ct_units = load_ct(run, group, batch, analysis_type, as_dict=True)

analysis_type = 'pairs'
pair_analysis =  load_analysis(analysis_type, run, group, batch, as_dict=True)


crowd = load_experiment_data(run, group, n_q, batch, remove_not_val = True)

run = "*"
group = 'experiment*'
batch = '*'
n_q = '*'

metric = 'contradictions'
unit = 'batch'
n_stdv = 0.5
crowd_clean = clean_workers(crowd, run, group, batch, metric, unit, n_stdv)

Discarded 655.0 annotations.


In [4]:
# Agreement overview

iaa_dis_agreement(crowd, expert_unit_agreement_dict)
print()
iaa_dis_agreement(crowd_clean, expert_unit_agreement_dict)

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


disagreement 0.13607292962181272 95
agreement 0.25374626711976134 59

disagreement 0.12686020346418592 95
agreement 0.30297304106827916 59


In [9]:
# uqs

data_uqs = get_uqs_by_unit(crowd, ct_units)

threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []
for thresh in threshs:
    f1, p, r, units_disagree_uqs = evaluate(expert_unit_agreement_dict, data_uqs, thresh, v=False)
    f1s.append((f1, thresh))
    #print(f1, thresh)
print(max(f1s))

0.21224315590512774 0.1467785461291955 0.38311688311688313
0.36061415493720056 0.583349486334561 0.43506493506493504
0.5231331168831169 0.5673652190506123 0.5194805194805194
0.6056291787939819 0.607809967031566 0.6038961038961039
0.6190531407368142 0.6166077524194469 0.6233766233766234
0.6377813363107481 0.6373035730833896 0.6493506493506493
0.6193706293706295 0.6204697525655868 0.6363636363636364
(0.64, 0.65)


In [28]:
# agreement per unit full


data_ag = get_agreement_by_unit(crowd)

threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []
for thresh in threshs:
    f1, p, r, units_disagree_prop = evaluate(expert_unit_agreement_dict, data_ag, thresh, v=False)
    f1s.append((f1, thresh))
    #print(f1, thresh)
print(max(f1s))   

f1, p, r, units_disagree_prop_07 = evaluate(expert_unit_agreement_dict, data_ag, 0.7, v=True)

0.21224315590512774 0.1467785461291955 0.38311688311688313
0.40224027690976394 0.5879530210238871 0.45454545454545453
0.568666766537219 0.613439590712318 0.564935064935065
0.6206519109394855 0.6187590187590187 0.6233766233766234
0.6153559403559404 0.6125982860676739 0.6233766233766234
0.5859623866699338 0.587091587091587 0.6103896103896104
0.5773325759982544 0.5782308557732286 0.6038961038961039
(0.62, 0.6)
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 59 	 36 	 17
Disagreement 	 95 	 118 	 76
--------------------------------
0.5773325759982544 0.5782308557732286 0.6038961038961039


In [33]:
# what is the intersection?

print(len(units_disagree_prop))
print(len(units_disagree_uqs))

overlap = set(units_disagree_prop_07).intersection(set(units_disagree_uqs))
print(len(overlap))

prop_only = set(units_disagree_prop_07).difference(set(units_disagree_uqs))
print(len(prop_only))

print('---low quality annotations ---')
labels = []
for u in prop_only:
    label = expert_unit_agreement_dict[u]
    #print(u, label)
    labels.append(label)
acc = labels.count('agreement') / len(labels)
print(acc, len(labels), labels.count('agreement'))
print()
print('---- correct disagreement?---')
labels = []
for u in overlap:
    label = expert_unit_agreement_dict[u]
    #print(u, label)
    labels.append(label)
acc = labels.count('disagreement')/len(labels)
print(acc, len(labels), labels.count('disagreement'))

118
113
111
7
---low quality annotations ---
0.8571428571428571 7 6

---- correct disagreement?---
0.6756756756756757 111 75


In [25]:
# agreement per unit clean

data_ag = get_agreement_by_unit(crowd_clean)

threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []
for thresh in threshs:
    f1, p, r, units_disagree_prop = evaluate(expert_unit_agreement_dict, data_ag, thresh, v=False)
    f1s.append((f1, thresh))
    #print(f1, thresh)
print(max(f1s))   

0.29827320827320825 0.5263459944311009 0.4025974025974026
0.4369235987733098 0.5522043745727956 0.461038961038961
0.5655771595050533 0.6276365478493138 0.564935064935065
0.6323913397084129 0.6362170815295816 0.6298701298701299
0.643412920723845 0.6440222897669706 0.6428571428571429
0.639534506791144 0.6375661375661376 0.6428571428571429
0.6321892393320965 0.6299441264881205 0.6363636363636364
(0.64, 0.65)


In [26]:
# contradiction counts full

conts = get_contradictions_by_pair(crowd)

threshs = [0.05, 0.1, 0.15, 0.20, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5]
f1s = []
for thresh in threshs:
    f1, p, r, units_disagree_prop = evaluate(expert_unit_agreement_dict, data_ag, thresh, v=False)
    f1s.append((f1, thresh))
    #print(f1, thresh)
print(max(f1s))

0.21224315590512774 0.1467785461291955 0.38311688311688313
0.21224315590512774 0.1467785461291955 0.38311688311688313
0.21224315590512774 0.1467785461291955 0.38311688311688313
0.21224315590512774 0.1467785461291955 0.38311688311688313
0.263011988011988 0.6426392399546762 0.4025974025974026
0.2598539357041294 0.5588071838071837 0.3961038961038961
0.2598539357041294 0.5588071838071837 0.3961038961038961
0.29827320827320825 0.5263459944311009 0.4025974025974026
0.4369235987733098 0.5522043745727956 0.461038961038961
0.5655771595050533 0.6276365478493138 0.564935064935065
(0.57, 0.5)


In [27]:
# contradiction counts clean

conts = get_contradictions_by_pair(crowd_clean)

threshs = [0.05, 0.1, 0.15, 0.20, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5]
f1s = []
for thresh in threshs:
    f1, p, r, units_disagree_prop = evaluate(expert_unit_agreement_dict, data_ag, thresh, v=False)
    f1s.append((f1, thresh))
    #print(f1, thresh)
print(max(f1s))

0.21224315590512774 0.1467785461291955 0.38311688311688313
0.21224315590512774 0.1467785461291955 0.38311688311688313
0.21224315590512774 0.1467785461291955 0.38311688311688313
0.21224315590512774 0.1467785461291955 0.38311688311688313
0.263011988011988 0.6426392399546762 0.4025974025974026
0.2598539357041294 0.5588071838071837 0.3961038961038961
0.2598539357041294 0.5588071838071837 0.3961038961038961
0.29827320827320825 0.5263459944311009 0.4025974025974026
0.4369235987733098 0.5522043745727956 0.461038961038961
0.5655771595050533 0.6276365478493138 0.564935064935065
(0.57, 0.5)


In [112]:
# agreement per pair full

ag_metric = 'Krippendorff'
data_ag_pair = get_agreement_by_pair(crowd, ag_metric)

#threshs = [0, 0.05, 0.1, 0.15, 0.20]
threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75]
f1s = []
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_ag_pair, thresh, v=False)
    f1s.append((f1, thresh))
    #print(f1, thresh)
print(max(f1s))

0.6581678125795773 0.6688311688311688 0.6435732309700803
0.6581678125795773 0.6688311688311688 0.6435732309700803
0.6061230436230436 0.6298701298701299 0.5806185782089396
0.6061230436230436 0.6298701298701299 0.5806185782089396
0.48399814471243036 0.577922077922078 0.48824128363621216
0.48399814471243036 0.577922077922078 0.48824128363621216
0.48399814471243036 0.577922077922078 0.48824128363621216
0.48399814471243036 0.577922077922078 0.48824128363621216
((0.64, 0.66, 0.67), 0.45)


In [113]:
# agreement per pair clean

ag_metric = 'Krippendorff'
data_ag_pair = get_agreement_by_pair(crowd_clean, ag_metric)

#threshs = [0, 0.05, 0.1, 0.15, 0.20]
threshs = [0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75]
f1s = []
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_ag_pair, thresh, v=False)
    f1s.append((f1, thresh))
    #print(f1, thresh)
print(max(f1s))

0.6581678125795773 0.6688311688311688 0.6435732309700803
0.6581678125795773 0.6688311688311688 0.6435732309700803
0.6581678125795773 0.6688311688311688 0.6435732309700803
0.6581678125795773 0.6688311688311688 0.6435732309700803
0.6061230436230436 0.6298701298701299 0.5806185782089396
0.6061230436230436 0.6298701298701299 0.5806185782089396
0.6061230436230436 0.6298701298701299 0.5806185782089396
0.6061230436230436 0.6298701298701299 0.5806185782089396
0.48399814471243036 0.577922077922078 0.48824128363621216
0.48399814471243036 0.577922077922078 0.48824128363621216
((0.64, 0.66, 0.67), 0.45)
