In [142]:
from clean_annotations import clean_workers
from load_data import load_experiment_data, load_gold_data
from aggregation import aggregate_binary_labels

from load_data import load_experiment_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key
from utils_analysis import load_analysis, load_ct
from utils_analysis import load_contradiction_pairs
from utils_analysis import collect_contradictions

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
from collections import defaultdict


def iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict):
    """Print agreement scores for annotations pooled by expert expectation.

    The annotations are grouped per relation-property-concept triple with
    ``sort_by_key`` and then pooled by the label the experts assigned to each
    triple. For every label one line is printed: the label, the
    ``'Krippendorff'`` entry of ``get_agreement``'s result for the pooled
    annotations, and the number of triple groups in that pool.

    Args:
        data_dict_list: annotation dicts, grouped on their 'relation',
            'property' and 'concept' fields.
        expert_unit_agreement_dict: maps a triple key to an expected
            behaviour label. The keys are assumed to match the group keys
            produced by ``sort_by_key`` -- TODO confirm.
    """
    triple_fields = ('relation', 'property', 'concept')
    annotations_by_triple = sort_by_key(data_dict_list, list(triple_fields))

    # Pool the annotations of all triples that share an expert label.
    pooled = defaultdict(list)
    for triple, expected in expert_unit_agreement_dict.items():
        pooled[expected].extend(annotations_by_triple[triple])

    for expected, annotations in pooled.items():
        scores = get_agreement(annotations, v=False)
        n_triples = len(sort_by_key(annotations, list(triple_fields)))
        print(expected, scores['Krippendorff'], n_triples)
        

def get_expected_behavior(gold, certain_threshold=3):
    """Map each gold unit to the agreement behaviour the experts expect.

    The unit key is ``"<relation>-<property>-<concept>"``. The value is the
    row's 'expected_agreement' label, except that a 'disagreement' row whose
    'disagreement_cnt' reaches ``certain_threshold`` is upgraded to
    'certain_disagreement'. If several rows share a unit key, the last row
    wins.

    Args:
        gold: iterable of gold annotation dicts with the keys 'relation',
            'property', 'concept' and 'expected_agreement'. Rows labelled
            'disagreement' must also carry 'disagreement_cnt'.
        certain_threshold: minimum 'disagreement_cnt' for a 'disagreement'
            row to count as 'certain_disagreement'. Defaults to 3.

    Returns:
        dict mapping unit key to expected-behaviour label.

    Raises:
        KeyError: if a row lacks one of the required keys.
    """
    unit_behavior_dict = dict()
    for d in gold:
        unit = f"{d['relation']}-{d['property']}-{d['concept']}"
        exp = d['expected_agreement']
        # The count only matters for 'disagreement' rows, so it is looked up
        # lazily; rows with another label need not carry the field.
        if exp == 'disagreement' and d['disagreement_cnt'] >= certain_threshold:
            exp = 'certain_disagreement'
        unit_behavior_dict[unit] = exp
    return unit_behavior_dict





def get_agreement_by_unit(data_dict_list):
    """Return the 'Proportional' agreement score for every unit.

    Annotations are grouped on 'relation', 'property' and 'concept' with
    ``sort_by_key``; ``get_agreement`` is run on each group (non-verbose,
    kappa disabled) and its ``'Proportional'`` entry is kept.

    Args:
        data_dict_list: annotation dicts carrying the three grouping fields.

    Returns:
        dict mapping each group key to its proportional agreement.
    """
    grouped = sort_by_key(data_dict_list, ['relation', 'property', 'concept'])
    return {
        unit: get_agreement(rows, v=False, disable_kappa=True)['Proportional']
        for unit, rows in grouped.items()
    }


def get_agreement_by_pair(data_dict_list, ag_metric):
    """Assign every triple the agreement score of its property-concept pair.

    Agreement is computed once per 'property'/'concept' group (over all
    relations of that pair) and then stored under the
    ``"<relation>-<property>-<concept>"`` key of each annotation in the group.

    Args:
        data_dict_list: annotation dicts with 'relation', 'property' and
            'concept' fields.
        ag_metric: key of the ``get_agreement`` result to report.

    Returns:
        dict mapping triple key to the pair-level agreement score.
    """
    scores = {}
    pair_groups = sort_by_key(data_dict_list, ['property', 'concept'])
    for rows in pair_groups.values():
        pair_agreement = get_agreement(rows, v=False, disable_kappa=True)
        for row in rows:
            key = f"{row['relation']}-{row['property']}-{row['concept']}"
            scores[key] = pair_agreement[ag_metric]
    return scores

def get_contradictions_by_pair(data_dict_list):
    """Return the contradiction rate of each property-concept pair per unit.

    For every 'property'/'concept' group the annotations are split by
    'workerid'. Per worker, ``collect_contradictions`` yields the observed
    contradictions, and every contradiction pair (r1, r2) whose two relations
    were both annotated by that worker counts as one possible contradiction.
    The pair's rate is observed / possible, summed over workers, or 0 when
    nothing was possible. The rate is stored under ``"<relation>-<pair>"``
    for every relation occurring in the group.

    Args:
        data_dict_list: annotation dicts with 'relation', 'property',
            'concept' and 'workerid' fields.

    Returns:
        dict mapping unit key to the pair-level contradiction rate.
    """
    contradictions = load_contradiction_pairs()
    contradictions_unit_dict = dict()
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    for pair, data_pair in data_by_pair.items():
        data_by_worker = sort_by_key(data_pair, ['workerid'])
        n_possible_contradictions = 0
        n_contradictions = 0
        for data in data_by_worker.values():
            pair_worker_cont = collect_contradictions(data, contradictions, threshold = 0)
            relations = {d['relation'] for d in data}
            n_possible_contradictions += sum(
                1 for r1, r2 in contradictions
                if r1 in relations and r2 in relations
            )
            n_contradictions += len(pair_worker_cont)

        # The rate is the same for every relation of the pair: compute once.
        if n_possible_contradictions == 0:
            rate = 0
        else:
            rate = n_contradictions/n_possible_contradictions
        for r in {d['relation'] for d in data_pair}:
            contradictions_unit_dict[f'{r}-{pair}'] = rate

    return contradictions_unit_dict
            
        


def get_uqs_by_unit(data_dict_list, ct_units):
    """Look up the 'uqs' value of every annotated triple.

    ``ct_units`` is indexed on its 'unit' field; an annotation's 'quid' is
    used as the lookup key. Annotations whose 'quid' is not in the index are
    skipped. When several entries share a unit, the first one is used.

    Args:
        data_dict_list: annotation dicts with 'quid', 'relation', 'property'
            and 'concept' fields.
        ct_units: dicts with 'unit' and 'uqs' fields.

    Returns:
        dict mapping ``"<relation>-<property>-<concept>"`` to the 'uqs' value.
    """
    ct_lookup = sort_by_key(ct_units, ['unit'])
    scores = {}
    for row in data_dict_list:
        question_id = row['quid']
        if question_id not in ct_lookup:
            continue
        score = ct_lookup[question_id][0]['uqs']
        key = f"{row['relation']}-{row['property']}-{row['concept']}"
        scores[key] = score
    return scores


def _safe_ratio(hits, total):
    """Return hits / total, or NaN when total is zero (score undefined)."""
    return hits / total if total else float('nan')


def disagreement_acc(target_units_true, target_units_false, unit_score_dict, thresh, below = True):
    """Print how well thresholding a unit score separates two unit groups.

    A unit is predicted positive when its score lies strictly below
    ``thresh`` (``below`` truthy) or strictly above it (``below`` falsy).
    Units in ``target_units_true`` are labelled positive, units in
    ``target_units_false`` negative. Printed, each rounded to two decimals:
    the micro-averaged F1, the accuracy on the positives ('acc true'), the
    accuracy on the negatives ('acc neg') and the overall accuracy.

    An accuracy whose group is empty is reported as NaN instead of raising
    ZeroDivisionError.

    Args:
        target_units_true: units expected to be predicted positive.
        target_units_false: units expected to be predicted negative.
        unit_score_dict: maps unit to a numeric score.
        thresh: decision threshold.
        below: direction of the comparison, see above.

    Raises:
        KeyError: if a target unit has no entry in ``unit_score_dict``.
    """
    positives = list(target_units_true)
    negatives = list(target_units_false)
    target_units = positives + negatives
    labels = [True] * len(positives) + [False] * len(negatives)

    if below:
        predictions = [bool(unit_score_dict[u] < thresh) for u in target_units]
    else:
        predictions = [bool(unit_score_dict[u] > thresh) for u in target_units]

    # NOTE: micro-averaging over single-label predictions makes this value
    # coincide with the overall accuracy printed below.
    p, r, f1, support = p_r_f1(labels, predictions, average = 'micro')
    print('f1', round(f1, 2))

    n_correct_pos = sum(1 for l, pred in zip(labels, predictions) if l and pred)
    n_correct_neg = sum(1 for l, pred in zip(labels, predictions) if not l and not pred)

    acc_true = _safe_ratio(n_correct_pos, len(positives))
    acc_false = _safe_ratio(n_correct_neg, len(negatives))
    acc_total = _safe_ratio(n_correct_pos + n_correct_neg, len(target_units))

    print('acc true', round(acc_true, 2))
    print('acc neg', round(acc_false, 2))
    print('acc total', round(acc_total, 2))

In [98]:
# Expert (gold) annotations: load them, sanity-check them and derive the
# expected behaviour label per unit.

run, group = 4, 'reason_agreement*_expert_inspection*'
gold = load_gold_data(run, group)

# Show one example instance, then every instance without an 'answer' field.
print(gold[0])
for d in gold:
    if 'answer' in d:
        continue
    print(d)
print('number of gold instances: ', len(gold))

expert_unit_agreement_dict = get_expected_behavior(gold)

# Uncomment to inspect the full mapping:
#for k, v in expert_unit_agreement_dict.items():
 #   print(k, v)

{'answer': 'false', 'completionurl': 'expert_annotation', 'concept': 'shovel', 'disagreement_cnt': 4, 'expected_agreement': 'disagreement', 'property': 'roll', 'quid': 'impossible-shovel-roll', 'relation': 'impossible', 'workerid': 'gold'}
number of gold instances:  154


In [94]:
# Wildcard patterns for the run, batch and n_q selectors; the group pattern
# selects the 'experiment*' groups.
run = batch = n_q = '*'
group = 'experiment*'

# Unit-level data from load_ct and pair-level analysis results.
analysis_type = 'units'
ct_units = load_ct(run, group, batch, analysis_type, as_dict=True)

analysis_type = 'pairs'
pair_analysis = load_analysis(analysis_type, run, group, batch, as_dict=True)

# Crowd annotations, plus a copy filtered by clean_workers with the
# 'contradictions' metric per batch and n_stdv = 0.5.
crowd = load_experiment_data(run, group, n_q, batch, remove_not_val=True)

metric, unit, n_stdv = 'contradictions', 'batch', 0.5
crowd_clean = clean_workers(crowd, run, group, batch, metric, unit, n_stdv)

Discarded 655.0 annotations.


In [71]:
# Agreement overview: first all crowd annotations, then the cleaned subset,
# separated by a blank line.

for i, annotations in enumerate((crowd, crowd_clean)):
    if i:
        print()
    iaa_dis_agreement(annotations, expert_unit_agreement_dict)

certain_disagreement 0.21325657742636528 41
agreement 0.22530689627444067 49
possible_disagreement 0.12087774980792576 48
disagreement 0.14993472839217536 16

certain_disagreement 0.2168361244019137 41
agreement 0.27655299539170497 49
possible_disagreement 0.1230574217264544 48
disagreement 0.11274762248085968 16


In [95]:
# List every unit the experts labelled 'certain_disagreement'.
for u, ag in expert_unit_agreement_dict.items():
    if ag != 'certain_disagreement':
        continue
    print(u)

impossible-roll-shovel
unusual-red-carrot
implied_category-dangerous-freebooter
rare-red-carrot
rare-roll-shovel
variability_limited-yellow-pineapple
variability_limited-square-recliner
variability_open-square-recliner
creative-square-recliner
unusual-square-recliner
unusual-yellow-leopard
implied_category-round-pen
typical_of_concept-round-pen
implied_category-square-recliner
typical_of_concept-square-recliner
rare-square-recliner
impossible-yellow-leopard
variability_limited-round-pen
creative-fly-stock
rare-fly-stock
unusual-fly-stock
impossible-fly-stock
implied_category-wheels-cruiser
unusual-wheels-cruiser
implied_category-fly-stock
typical_of_property-fly-stock
typical_of_concept-fly-stock
afforded_unusual-fly-stock
afforded_usual-fly-stock
creative-wheels-cruiser
variability_open-wheels-cruiser
rare-wheels-cruiser
impossible-wheels-cruiser
rare-black-rhino
creative-yellow-leopard
impossible-square-recliner
unusual-roll-shovel
impossible-red-carrot
implied_category-fly-arrow
aff

In [143]:
from statistics import stdev

# uqs as a disagreement signal (presumably the CrowdTruth unit quality
# score -- TODO confirm): a unit is predicted to be a disagreement when its
# uqs lies below mean - n_sd * sd.
unit_uqs_dict = get_uqs_by_unit(crowd, ct_units)

mean = sum(unit_uqs_dict.values()) / len(unit_uqs_dict)
sd = stdev(unit_uqs_dict.values())

# Evaluation targets: expert 'certain_disagreement' units are the positives,
# expert 'agreement' units the negatives.
target_units_dis, target_units_ag = (
    [u for u, ex in expert_unit_agreement_dict.items() if ex == label]
    for label in ('certain_disagreement', 'agreement')
)

print(len(target_units_dis), len(target_units_ag))

for n_sd in (0, 0.5, 1, 1.5, 2):
    print('n_sd', n_sd)
    thresh = mean - n_sd * sd
    disagreement_acc(target_units_dis, target_units_ag, unit_uqs_dict, thresh, below=True)
    print('---')

41 49
n_sd 0
f1 0.5
acc true 0.68
acc neg 0.35
acc total 0.5
---
n_sd 0.5
f1 0.51
acc true 0.63
acc neg 0.41
acc total 0.51
---
n_sd 1
f1 0.53
acc true 0.44
acc neg 0.61
acc total 0.53
---
n_sd 1.5
f1 0.54
acc true 0.0
acc neg 1.0
acc total 0.54
---
n_sd 2
f1 0.54
acc true 0.0
acc neg 1.0
acc total 0.54
---


In [145]:

# Proportional agreement (all crowd annotations) as a disagreement signal:
# a unit is predicted to be a disagreement when its agreement lies below
# mean - n_sd * sd.

unit_ag_dict = get_agreement_by_unit(crowd)

mean = sum(unit_ag_dict.values()) / len(unit_ag_dict)
sd = stdev(unit_ag_dict.values())

# Positives: expert 'certain_disagreement' units; negatives: 'agreement'.
target_units_dis = [
    u for u in expert_unit_agreement_dict
    if expert_unit_agreement_dict[u] == 'certain_disagreement'
]
target_units_ag = [
    u for u in expert_unit_agreement_dict
    if expert_unit_agreement_dict[u] == 'agreement'
]

for n_sd in (0, 0.5, 1, 1.5, 2):
    thresh = mean - n_sd * sd
    disagreement_acc(target_units_dis, target_units_ag, unit_ag_dict, thresh, below=True)
    print('---')

f1 0.48
acc true 0.71
acc neg 0.29
acc total 0.48
---
f1 0.52
acc true 0.68
acc neg 0.39
acc total 0.52
---
f1 0.57
acc true 0.46
acc neg 0.65
acc total 0.57
---
f1 0.54
acc true 0.0
acc neg 1.0
acc total 0.54
---
f1 0.54
acc true 0.0
acc neg 1.0
acc total 0.54
---


In [147]:

# Proportional agreement on the cleaned crowd annotations as a disagreement
# signal: predict disagreement when agreement lies below mean - n_sd * sd.

unit_ag_dict = get_agreement_by_unit(crowd_clean)

mean = sum(unit_ag_dict.values()) / len(unit_ag_dict)
sd = stdev(unit_ag_dict.values())

# The target units do not depend on the threshold, so build them once.
target_units_dis = [u for u, ex in expert_unit_agreement_dict.items() if ex == 'certain_disagreement']
target_units_ag = [u for u, ex in expert_unit_agreement_dict.items() if ex == 'agreement']

for n_sd in (0, 0.5, 1, 1.5, 2):
    print(n_sd)
    thresh = mean - n_sd * sd
    disagreement_acc(target_units_dis, target_units_ag, unit_ag_dict, thresh, below=True)
    print('---')

0
f1 0.56
acc true 0.68
acc neg 0.45
acc total 0.56
---
0.5
f1 0.59
acc true 0.68
acc neg 0.51
acc total 0.59
---
1
f1 0.58
acc true 0.44
acc neg 0.69
acc total 0.58
---
1.5
f1 0.52
acc true 0.05
acc neg 0.92
acc total 0.52
---
2
f1 0.53
acc true 0.0
acc neg 0.98
acc total 0.53
---


In [149]:
# Contradiction rate on all crowd annotations as a disagreement signal:
# predict disagreement when the rate lies above mean + n_sd * sd.

unit_cont_dict = get_contradictions_by_pair(crowd)

mean = sum(unit_cont_dict.values()) / len(unit_cont_dict)
sd = stdev(unit_cont_dict.values())

# Positives: expert 'certain_disagreement' units; negatives: 'agreement'.
target_units_dis, target_units_ag = (
    [u for u, ex in expert_unit_agreement_dict.items() if ex == label]
    for label in ('certain_disagreement', 'agreement')
)

for n_sd in (0, 0.5, 1, 1.5, 2):
    print(n_sd)
    thresh = mean + n_sd * sd
    disagreement_acc(target_units_dis, target_units_ag, unit_cont_dict, thresh, below=False)
    print('---')

0
f1 0.31
acc true 0.32
acc neg 0.31
acc total 0.31
---
0.5
f1 0.32
acc true 0.32
acc neg 0.33
acc total 0.32
---
1
f1 0.41
acc true 0.32
acc neg 0.49
acc total 0.41
---
1.5
f1 0.49
acc true 0.29
acc neg 0.65
acc total 0.49
---
2
f1 0.49
acc true 0.29
acc neg 0.65
acc total 0.49
---


In [150]:
# Contradiction rate on the cleaned crowd annotations as a disagreement
# signal: predict disagreement when the rate lies above mean + n_sd * sd.

unit_cont_dict = get_contradictions_by_pair(crowd_clean)

mean = sum(unit_cont_dict.values()) / len(unit_cont_dict)
sd = stdev(unit_cont_dict.values())

# Positives: expert 'certain_disagreement' units; negatives: 'agreement'.
target_units_dis = [
    u for u in expert_unit_agreement_dict
    if expert_unit_agreement_dict[u] == 'certain_disagreement'
]
target_units_ag = [
    u for u in expert_unit_agreement_dict
    if expert_unit_agreement_dict[u] == 'agreement'
]

for n_sd in (0, 0.5, 1, 1.5, 2):
    print(n_sd)
    thresh = mean + n_sd * sd
    disagreement_acc(target_units_dis, target_units_ag, unit_cont_dict, thresh, below=False)
    print('---')

0
f1 0.41
acc true 0.32
acc neg 0.49
acc total 0.41
---
0.5
f1 0.41
acc true 0.32
acc neg 0.49
acc total 0.41
---
1
f1 0.49
acc true 0.29
acc neg 0.65
acc total 0.49
---
1.5
f1 0.49
acc true 0.29
acc neg 0.65
acc total 0.49
---
2
f1 0.41
acc true 0.1
acc neg 0.67
acc total 0.41
---
