In [41]:
from load_data import load_experiment_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key
from utils_analysis import get_annotation_ids
from utils_analysis import load_analysis
from utils_analysis import load_ct

from collections import Counter, defaultdict
import pandas as pd
from scipy.stats import spearmanr


def get_agreement_by_property(data_dict_list):
    """Compute agreement separately for every property-concept pair.

    The annotations are grouped with ``sort_by_key`` on the 'property' and
    'concept' fields, and ``get_agreement`` is called once per group with
    verbose output switched off (``v=False``) and kappa disabled
    (``disable_kappa=True``).

    Parameters
    ----------
    data_dict_list : list of dict
        Annotation records; each one must carry 'property' and 'concept'.

    Returns
    -------
    dict
        Maps each group key produced by ``sort_by_key`` to whatever
        ``get_agreement`` returns for that group's annotations.
    """
    annotations_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    return {
        pair: get_agreement(pair_annotations, v=False, disable_kappa=True)
        for pair, pair_annotations in annotations_by_pair.items()
    }


def get_pairs_by_day(data_dict_list):
    """Map every property-concept pair to the day it was annotated.

    Annotations are first grouped by their 'timestamp' value; the day is the
    part of the timestamp key before the first space. Each timestamp group is
    then grouped by 'property' and 'concept' to find the pairs it contains.

    If a pair shows up under several timestamps, the day of the timestamp
    group that is iterated last overwrites the earlier ones.

    Parameters
    ----------
    data_dict_list : list of dict
        Annotation records with 'timestamp', 'property' and 'concept'.

    Returns
    -------
    dict
        Pair key (as produced by ``sort_by_key``) -> day string.
    """
    pair_to_day = {}
    annotations_by_timestamp = sort_by_key(data_dict_list, ['timestamp'])
    for timestamp, annotations in annotations_by_timestamp.items():
        day = timestamp.split(' ', 1)[0]
        pairs_at_timestamp = sort_by_key(annotations, ['property', 'concept'])
        # Iterating the grouping yields its keys, i.e. the pair identifiers.
        pair_to_day.update(dict.fromkeys(pairs_at_timestamp, day))
    return pair_to_day


def get_agreement_contradiction_data(data_dict_list, pair_analysis, ct_dicts):
    """Build one summary record per property-concept pair.

    Each record combines the pair's Krippendorff agreement, its contradiction
    ratio from the pair-level analysis, the annotation day, and the 'uqs'
    values of the units (questions) that belong to the pair.

    Pairs whose key contains the substring 'test' or 'check' are reported on
    stdout and skipped. Pairs that do not have exactly one row in
    ``pair_analysis`` are reported as 'unexpected length' and skipped.

    Parameters
    ----------
    data_dict_list : list of dict
        Annotation records; read for 'property', 'concept' and 'quid' here
        (and passed on to ``get_agreement_by_property`` / ``get_pairs_by_day``).
    pair_analysis : pandas.DataFrame
        Pair-level analysis; converted with ``to_dict('records')`` and read
        for 'pair' and 'contradiction_poss_contradiction_ratio'.
    ct_dicts : list of dict
        Unit-level records; read for 'unit' and 'uqs'. Only the first record
        found for a unit is used.

    Returns
    -------
    list of dict
        Records with the keys 'pair', 'agreement', 'contradiction_rate',
        'date', 'units' (set of unit ids), 'uqs_list' and 'mean_uqs',
        ordered from highest to lowest agreement. 'uqs_list' follows the
        sorted order of the unit ids so repeated runs give identical output.
    """
    pair_dicts_by_pair = sort_by_key(pair_analysis.to_dict('records'), ['pair'])
    ag_pair_dict = get_agreement_by_property(data_dict_list)
    ct_by_quid = sort_by_key(ct_dicts, ['unit'])
    day_by_pair = get_pairs_by_day(data_dict_list)
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])

    # Highest agreement first; the sort is stable, so ties keep the order in
    # which the pairs were produced by get_agreement_by_property.
    pairs_by_agreement = sorted(
        ((pair, ag['Krippendorff']) for pair, ag in ag_pair_dict.items()),
        key=lambda item: item[1],
        reverse=True,
    )

    ag_cont_dicts = []
    for pair, agreement in pairs_by_agreement:
        # NOTE: plain substring match - a real pair whose property or concept
        # contains 'test' or 'check' would be skipped here as well.
        if 'test' in pair or 'check' in pair:
            print('test pair:', pair)
            continue

        # Validate the pair-level analysis before touching the unit data, so
        # that a malformed pair is reported instead of failing on a lookup.
        pair_records = pair_dicts_by_pair[pair]
        if len(pair_records) != 1:
            print('unexpected length:', len(pair_records), 'for pair', pair)
            continue
        pair_record = pair_records[0]

        units = {annotation['quid'] for annotation in data_by_pair[pair]}
        # Set iteration order varies between interpreter runs; sorting the
        # unit ids makes uqs_list (and the float sum below) reproducible.
        uqs_list = [ct_by_quid[unit][0]['uqs'] for unit in sorted(units, key=str)]

        ag_cont_dicts.append({
            'pair': pair,
            'agreement': agreement,
            'contradiction_rate': pair_record['contradiction_poss_contradiction_ratio'],
            'date': day_by_pair[pair],
            'units': units,
            'uqs_list': uqs_list,
            'mean_uqs': sum(uqs_list) / len(uqs_list),
        })
    return ag_cont_dicts



In [42]:
# Selection patterns passed to the loaders below. '*' is presumably a
# glob-style wildcard (all runs / batches / question counts) and
# 'experiment*' restricts to experiment groups - confirm in load_data.
run = "*"
group = 'experiment*'
batch = '*'
n_q = '*'

# Raw annotations; with remove_not_val=True the loader reports how many
# annotations it discarded (see the cell output).
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
# Pair-level analysis (one row per property-concept pair is expected by
# get_agreement_contradiction_data).
analysis_type = 'pairs'
pair_analysis =  load_analysis(analysis_type, run, group, batch)
# Unit-level records; note that analysis_type is rebound to 'units' here.
analysis_type = 'units'
ct_dicts = load_ct(run, group, batch, analysis_type, as_dict=True)
# One record per pair (agreement, contradiction rate, date, uqs values),
# then as a DataFrame for sorting and correlation analysis.
ag_cont_dicts = get_agreement_contradiction_data(data_dict_list, pair_analysis, ct_dicts)
df_ag_cont = pd.DataFrame(ag_cont_dicts)

Discarded 655.0 annotations.
test pair: _test4-_test
test pair: _check4-_check4
test pair: _test2-_test2
test pair: _check2-_check2
test pair: _test1-_test1
test pair: _test3-_test
test pair: _check3-_check3
test pair: _check1-_check1
test pair: _test4-_test4
test pair: _test3-_test3


In [38]:
# Rank all pairs from highest to lowest agreement and keep the 30 best
# for display; df_ag_cont itself is left unchanged.
df_ag_sorted = df_ag_cont.sort_values(by='agreement', ascending=False)
top_ag = df_ag_sorted.head(30)
print(f'Total number of pairs: {len(df_ag_sorted)}')
top_ag

Total number of pairs: 1935


Unnamed: 0,agreement,contradiction_rate,date,mean_uqs,pair,units,uqs_list
0,1.0,0.0,03-Jun-2020,1.0,made_of_wood-pen,"{39b0e894-347f-44b0-ba5a-d92cdab1b9a9, c657e40...","[1.0, 1.0, 1.0, 1.0]"
1,1.0,0.0,19-May-2020,1.0,yellow-citrus,"{c8625ecb-a5c2-4900-98f4-2010a00e591c, 0969ad6...","[1.0, 1.0, 1.0, 1.0]"
2,0.960384,0.0,28-May-2020,0.980264,sweet-cookie,"{ffa90803-ff04-422f-be8f-f3a75960e737, 7c12ede...","[0.8026404829932009, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,0.929487,0.0,04-Jun-2020,0.965258,sweet-berry,"{b92d84c4-4a39-4cf4-82d0-1a078d67c864, 1bb156f...","[0.6525782317668917, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,0.921667,0.0,02-Jun-2020,0.960974,wheels-lorry,"{78cd9d2a-1d8f-4071-8377-242f7770b6e2, 37e0b8f...","[1.0, 1.0, 1.0, 0.6097350824381351, 1.0, 1.0, ..."
5,0.878654,0.0,02-Jun-2020,0.944678,dangerous-poison,"{aecfa8ff-df52-447b-b5d3-bcac12944231, 7032a17...","[1.0, 1.0, 0.658423794073228, 1.0, 1.0, 0.7883..."
6,0.857143,0.0,25-May-2020,0.93006,wheels-motorcar,"{382d3802-51c1-45ad-9b9e-3d700442b652, 3e555c6...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6538823041786..."
7,0.855885,0.0,04-Jun-2020,0.925698,sweet-peach,"{52d2e83c-9024-40c4-896d-da53844e4aad, e36a791...","[1.0, 0.806080865464913, 1.0, 0.63618057266754..."
8,0.8504,0.0,05-Jun-2020,0.928282,sweet-fruit,"{efff924d-e32b-470e-99e0-68af9be7feb6, 2a0b543...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8028211978680..."
9,0.85016,0.033333,26-May-2020,0.927828,hot-cooker,"{82ad8696-390d-4014-b8a5-6728a67483d8, 751716b...","[1.0, 0.8030443269527517, 1.0, 0.6475719824325..."


In [31]:
# Sanity check: show the container type and the first assembled record.
print(type(ag_cont_dicts))
print(ag_cont_dicts[0])

<class 'list'>
{'pair': 'made_of_wood-pen', 'agreement': 1.0, 'contradiction_rate': 0.0, 'date': '03-Jun-2020', 'units': {'39b0e894-347f-44b0-ba5a-d92cdab1b9a9', 'c657e40d-60d0-4fc3-aafc-df9320a8ec04', '5a59872e-1776-41af-afde-7c77ca860ce1', '4fe89bd2-0a76-4a72-ab9f-1879628f6188'}, 'uqs_list': [1.0, 1.0, 1.0, 1.0]}


### Correlations

* CT (mean uqs) vs. contradiction rate
* CT (mean uqs) vs. IAA
* IAA vs. contradiction rate


**Results**
* Clear, high positive correlation between uqs and IAA (Spearman ρ ≈ 0.84); both measure the same thing
* Small negative correlation between uqs and contradiction rate (ρ ≈ -0.33)
* Smaller negative correlation between contradiction rate and IAA (ρ ≈ -0.22)

--> The task-specific metric tests something different from what the traditional metrics test. It is really important to establish a quality metric independent of agreement, in particular in scenarios in which you expect the accumulated labels to approximate the truth, rather than each worker to know the truth. 

--> Ambiguity is prevalent, but in most cases there is one interpretation that we think is more likely than the others. This does indeed show when you use many annotators. 

In [46]:
# Per-pair measures taken from the assembled DataFrame.
iaa = df_ag_cont['agreement']
cont = df_ag_cont['contradiction_rate']
uqs = df_ag_cont['mean_uqs']
# Spearman rank correlation for each combination of the three measures.
corr_uqs_iaa = spearmanr(uqs, iaa)
corr_uqs_cont = spearmanr(uqs, cont)
corr_cont_iaa = spearmanr(cont, iaa)
# Printed in the order: uqs/IAA, uqs/contradiction, contradiction/IAA.
print(corr_uqs_iaa)
print(corr_uqs_cont)
print(corr_cont_iaa)

SpearmanrResult(correlation=0.8436404257587673, pvalue=0.0)
SpearmanrResult(correlation=-0.33253262649683824, pvalue=3.531225988888631e-51)
SpearmanrResult(correlation=-0.2152463388206515, pvalue=1.0213601812316866e-21)
