In [10]:
from clean_annotations import clean_workers
from load_data import load_experiment_data, load_expert_data, load_gold_data
from aggregation import aggregate_binary_labels

from load_data import load_experiment_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key
from utils_analysis import load_analysis, load_ct

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
from collections import defaultdict


def iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict):
    """Print Krippendorff agreement of the annotations, pooled per expert label.

    The annotations in ``data_dict_list`` are grouped by their
    (relation, property, concept) unit via ``sort_by_key``.  For every unit
    in ``expert_unit_agreement_dict`` the unit and its expert label are
    printed and the unit's annotations are added to the pool of that label.
    Afterwards ``get_agreement`` is run once per pool and the value stored
    under ``'Krippendorff'`` is printed next to the label.

    Nothing is returned; the function only prints.
    """
    annotations_per_unit = sort_by_key(
        data_dict_list, ['relation', 'property', 'concept'])

    # expert label -> all annotations of the units carrying that label
    annotations_per_label = defaultdict(list)
    for unit, expert_label in expert_unit_agreement_dict.items():
        unit_annotations = annotations_per_unit[unit]
        print(unit, expert_label)
        annotations_per_label[expert_label] += unit_annotations

    for expert_label, pooled in annotations_per_label.items():
        scores = get_agreement(pooled, v=False)
        print(expert_label, scores['Krippendorff'])
        

def get_expert_agreement_labels(expert_annotations):
    """Derive one agreement label per unit from the expert annotations.

    The annotations are grouped by (relation, property, concept) via
    ``sort_by_key``.  Within a unit, annotations of workers whose
    ``workerid`` ends in ``'_test1'`` are ignored; from the remaining ones
    every key that starts with ``'disagreement_'`` is counted as one
    judgment (only the key is inspected, not its value).

    Labels:
        * ``'agreement'`` - every judgment is ``'disagreement_agreement'``
        * ``'possible_disagreement'`` - some, but not all, judgments are
          ``'disagreement_agreement'``
        * ``'disagreement'`` - no judgment is ``'disagreement_agreement'``

    Units without any usable judgment are left out of the result instead of
    raising ``ZeroDivisionError`` when the agreement proportion is computed.

    Returns:
        dict mapping each labelled unit key (as produced by ``sort_by_key``)
        to its label string.
    """
    annotations_by_unit = sort_by_key(expert_annotations,
                                      ['relation', 'property', 'concept'])
    unit_agreement_dict = dict()
    for unit, annotations in annotations_by_unit.items():
        judgments = []
        for annotation in annotations:
            if annotation['workerid'].endswith('_test1'):
                continue
            judgments.extend(
                key for key in annotation if key.startswith('disagreement_'))

        if not judgments:
            # Nothing to base a label on (e.g. only '_test1' workers);
            # dividing by the judgment count would fail here.
            continue

        n_agree = judgments.count('disagreement_agreement')
        if n_agree == len(judgments):
            unit_agreement_dict[unit] = 'agreement'
        elif n_agree > 0:
            unit_agreement_dict[unit] = 'possible_disagreement'
        else:
            unit_agreement_dict[unit] = 'disagreement'
    return unit_agreement_dict


def get_agreement_by_unit(data_dict_list):
    """Map every (relation, property, concept) unit to its proportional agreement.

    The annotations are grouped with ``sort_by_key``; ``get_agreement`` is
    called once per group (quiet, kappa disabled) and the value stored under
    ``'Proportional'`` is kept.
    """
    annotations_per_unit = sort_by_key(
        data_dict_list, ['relation', 'property', 'concept'])
    return {
        unit: get_agreement(unit_annotations, v=False,
                            disable_kappa=True)['Proportional']
        for unit, unit_annotations in annotations_per_unit.items()
    }


def get_agreement_by_pair(data_dict_list, ag_metric):
    """Give every triple the agreement score of its (property, concept) pair.

    Annotations are grouped by (property, concept) with ``sort_by_key`` and
    ``get_agreement`` is computed once per group (quiet, kappa disabled).
    Each annotation in the group then contributes the key
    ``"<relation>-<property>-<concept>"`` mapped to the group's
    ``ag_metric`` value.

    Returns:
        dict from triple string to the pair-level agreement value.
    """
    scores_by_triple = {}
    annotations_per_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    for pair_annotations in annotations_per_pair.values():
        pair_scores = get_agreement(pair_annotations, v=False,
                                    disable_kappa=True)
        for d in pair_annotations:
            # The metric is looked up per annotation, so an empty group
            # never touches pair_scores.
            scores_by_triple[
                f"{d['relation']}-{d['property']}-{d['concept']}"
            ] = pair_scores[ag_metric]

    return scores_by_triple

def get_contradictions_by_pair(data_dict_list, pair_analysis):
    """Give every triple the contradiction ratio of its (property, concept) pair.

    For each pair found in ``data_dict_list`` the first matching entry of
    ``pair_analysis`` (grouped by its ``'pair'`` field) is read and the ratio
    ``n_workers_contradicting / n_workers`` is computed.  Every annotation of
    the pair maps its ``"<relation>-<property>-<concept>"`` string to that
    ratio.

    Returns:
        dict from triple string to the pair's contradiction ratio.
    """
    annotations_per_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    analysis_per_pair = sort_by_key(pair_analysis, ['pair'])

    ratio_by_triple = {}
    for pair, pair_annotations in annotations_per_pair.items():
        summary = analysis_per_pair[pair][0]
        total_workers = summary['n_workers']
        contradicting_workers = summary['n_workers_contradicting']
        contradiction_ratio = contradicting_workers / total_workers
        ratio_by_triple.update({
            f"{d['relation']}-{d['property']}-{d['concept']}": contradiction_ratio
            for d in pair_annotations
        })
    return ratio_by_triple
            
        


def get_uqs_by_unit(data_dict_list, ct_units):
    """Map every triple to the ``'uqs'`` value of its unit in ``ct_units``.

    ``ct_units`` is grouped by its ``'unit'`` field.  Each annotation whose
    ``'quid'`` appears in that grouping maps its
    ``"<relation>-<property>-<concept>"`` string to the ``'uqs'`` value of
    the first entry of the group; annotations with an unknown ``'quid'`` are
    ignored.

    Returns:
        dict from triple string to the ``'uqs'`` value.
    """
    units_by_id = sort_by_key(ct_units, ['unit'])
    uqs_by_triple = {}
    for d in data_dict_list:
        unit_id = d['quid']
        if unit_id not in units_by_id:
            continue
        first_entry = units_by_id[unit_id][0]
        uqs_by_triple[
            f"{d['relation']}-{d['property']}-{d['concept']}"
        ] = first_entry['uqs']
    return uqs_by_triple


def evaluate(expert_unit_agreement_dict, crowd_data, thresh, v=True):
    """Score threshold-based predictions against the expert labels.

    Every unit of ``expert_unit_agreement_dict`` that also occurs in
    ``crowd_data`` is predicted as ``'possible_disagreement'`` when its crowd
    score is below ``thresh`` and as ``'agreement'`` otherwise; units without
    a crowd score are skipped.  Precision, recall and F1 are computed with
    weighted averaging.

    Args:
        expert_unit_agreement_dict: unit -> expert (gold) label.
        crowd_data: unit -> numeric crowd score.
        thresh: scores strictly below this value count as disagreement.
        v: when equal to ``True`` a small count table and the
            precision/recall/F1 values are printed.

    Returns:
        The weighted F1 score.
    """
    # (gold label, predicted label) for every unit that has a crowd score
    labelled = [
        (label,
         'possible_disagreement' if crowd_data[unit] < thresh else 'agreement')
        for unit, label in expert_unit_agreement_dict.items()
        if unit in crowd_data
    ]
    gold = [label for label, _ in labelled]
    predictions = [pred for _, pred in labelled]
    correct_predictions = [pred for label, pred in labelled if pred == label]

    p, r, f1, _ = p_r_f1(gold, predictions, average='weighted')

    # Equality with True (not plain truthiness) is kept on purpose so that
    # non-bool values of v behave exactly as before.
    if v == True:  # noqa: E712
        print('-------------------------------')
        print('\t gold \t prediction \t correct')
        rows = (("Agreement", 'agreement'),
                ("Disagreement", 'possible_disagreement'))
        for title, label in rows:
            print(title, '\t', gold.count(label),
                  '\t', predictions.count(label),
                  '\t', correct_predictions.count(label))
        print('--------------------------------')
        print(p, r, f1)
    return f1


In [5]:
# Load the expert annotations and derive one agreement label per unit.

run = "4"
group = 'reason_agreement_expert_inspection*'
batch = '*'
n_q = '*'

# The wildcards select every expert-inspection group of run 4
# (e.g. run4-group_reason_agreement_expert_inspection1).
expert_annotations = load_expert_data(run, group, n_q, batch)

# The evaluation cells below all read expert_unit_agreement_dict, so it has
# to be defined here for the notebook to run top to bottom.
expert_unit_agreement_dict = get_expert_agreement_labels(expert_annotations)

run4-group_reason_agreement_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data


In [6]:
# Load the crowd data. Note that run/group/batch/n_q are reused here and
# overwrite the values set when loading the expert data.
run = "*"
group = 'experiment*'
batch = '*'
n_q = '*'

# Unit-level table; get_uqs_by_unit reads its 'unit' and 'uqs' fields.
analysis_type = 'units'
ct_units = load_ct(run, group, batch, analysis_type, as_dict=True)

# Pair-level analysis; get_contradictions_by_pair reads its 'pair',
# 'n_workers' and 'n_workers_contradicting' fields.
analysis_type = 'pairs'
pair_analysis =  load_analysis(analysis_type, run, group, batch, as_dict=True)


# Crowd annotations used by all agreement functions.
# NOTE(review): remove_not_val presumably drops invalid annotations - confirm
# against load_experiment_data.
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)

Discarded 655.0 annotations.


In [11]:
# Agreement overview: print the Krippendorff agreement of the crowd
# annotations, pooled by the expert label of their unit.

iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict)

agreement
agreement 0.2427437367364772
possible_disagreement
possible_disagreement 0.026575395486579945
disagreement
disagreement 0.0


In [120]:

# Threshold sweep using the per-unit 'uqs' value as the crowd score.
data_uqs = get_uqs_by_unit(data_dict_list, ct_units)

# Units scoring below the threshold are predicted 'possible_disagreement'.
threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []  # weighted F1 per threshold, in the order of threshs
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_uqs, thresh, v=True)
    f1s.append(f1)
    print(f1, thresh)

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 57 	 32
Disagreement 	 25 	 0 	 0
--------------------------------
0.3151738996614343 0.5614035087719298 0.40370589394835404
0.40370589394835404 0.4
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 46 	 30
Disagreement 	 25 	 11 	 9
--------------------------------
0.7249843977532766 0.6842105263157895 0.6511470985155196
0.6511470985155196 0.45
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 30 	 22
Disagreement 	 25 	 27 	 17
--------------------------------
0.6878492527615334 0.6842105263157895 0.6851900222019068
0.6851900222019068 0.5
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 22 	 19
Disagreement 	 25 	 35 	 22
--------------------------------
0.760537707906129 0.7192982456140351 0.7166991552956465
0.7166991552956465 0.55
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 18 	 16
D

In [119]:

# Threshold sweep using the per-unit proportional agreement as the crowd score.
data_ag = get_agreement_by_unit(data_dict_list)

# Units scoring below the threshold are predicted 'possible_disagreement'.
threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []  # weighted F1 per threshold, in the order of threshs
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_ag, thresh, v=True)
    f1s.append(f1)
    print(f1, thresh)

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 57 	 32
Disagreement 	 25 	 0 	 0
--------------------------------
0.3151738996614343 0.5614035087719298 0.40370589394835404
0.40370589394835404 0.4
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 46 	 30
Disagreement 	 25 	 11 	 9
--------------------------------
0.7249843977532766 0.6842105263157895 0.6511470985155196
0.6511470985155196 0.45
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 32 	 24
Disagreement 	 25 	 25 	 17
--------------------------------
0.7192982456140351 0.7192982456140351 0.7192982456140351
0.7192982456140351 0.5
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 20 	 17
Disagreement 	 25 	 37 	 22
--------------------------------
0.7379800853485065 0.6842105263157895 0.6783335509990857
0.6783335509990857 0.55
-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 17 	 14


In [118]:
# Threshold sweep using the per-pair contradiction ratio as the crowd score.
data_cont = get_contradictions_by_pair(data_dict_list, pair_analysis)

# NOTE(review): evaluate() predicts 'possible_disagreement' for scores BELOW
# the threshold. For a contradiction ratio that means units with FEW
# contradicting workers are predicted as disagreement - this looks inverted
# compared to the agreement-based scores; confirm the intended direction.
threshs = [0.1, 0.15, 0.20, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5]
f1s = []  # weighted F1 per threshold, in the order of threshs
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_cont, thresh, v=True)
    f1s.append(f1)
    print(f1, thresh)
    print()

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 34 	 22
Disagreement 	 25 	 23 	 13
--------------------------------
0.6111634585184189 0.6140350877192983 0.6118421052631579
0.6118421052631579 0.05

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 34 	 22
Disagreement 	 25 	 23 	 13
--------------------------------
0.6111634585184189 0.6140350877192983 0.6118421052631579
0.6118421052631579 0.1

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 34 	 22
Disagreement 	 25 	 23 	 13
--------------------------------
0.6111634585184189 0.6140350877192983 0.6118421052631579
0.6118421052631579 0.15

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 34 	 22
Disagreement 	 25 	 23 	 13
--------------------------------
0.6111634585184189 0.6140350877192983 0.6118421052631579
0.6118421052631579 0.2

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 24 

In [124]:
# Threshold sweep using the agreement of the (property, concept) pair as the
# crowd score of each triple; ag_metric selects which value of the
# get_agreement result is used.
ag_metric = 'Proportional'
data_ag_pair = get_agreement_by_pair(data_dict_list, ag_metric)

# Alternative (lower) threshold range kept for reference:
#threshs = [0, 0.05, 0.1, 0.15, 0.20]
threshs = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]
f1s = []  # weighted F1 per threshold, in the order of threshs
for thresh in threshs:
    f1 = evaluate(expert_unit_agreement_dict, data_ag_pair, thresh, v=True)
    f1s.append(f1)
    print(f1, thresh)
    print()


-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 57 	 32
Disagreement 	 25 	 0 	 0
--------------------------------
0.3151738996614343 0.5614035087719298 0.40370589394835404
0.40370589394835404 0.4

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 57 	 32
Disagreement 	 25 	 0 	 0
--------------------------------
0.3151738996614343 0.5614035087719298 0.40370589394835404
0.40370589394835404 0.45

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 21 	 16
Disagreement 	 25 	 36 	 20
--------------------------------
0.6714007240323029 0.631578947368421 0.6265648656128413
0.6265648656128413 0.5

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 18 	 14
Disagreement 	 25 	 39 	 21
--------------------------------
0.6728145149197782 0.6140350877192983 0.6022149122807018
0.6022149122807018 0.55

-------------------------------
	 gold 	 prediction 	 correct
Agreement 	 32 	 18 	