In [1]:
from load_data import load_expert_data
from utils_analysis import sort_by_key
import pandas as pd
import csv
from calculate_iaa import get_agreement, get_kappa_pairs, create_matrix
import glob

In [17]:
def get_overview_table(expert_data):
    """Build one overview row per (relation, concept, property) group.

    Annotations are grouped with ``sort_by_key``; the group key it returns is
    stored unchanged under ``'triple'`` and the description of the first
    annotation in the group under ``'description'``.

    For every annotation that has an ``'answer'`` and whose worker is not
    excluded, the row receives per-worker columns:

    * ``answer-<worker>``            -- the annotator's answer
    * ``expected_behavior-<worker>`` -- the sorted names of all
      ``disagreement_*`` fields whose value is the string ``'true'``,
      joined with ``'-'`` (empty string when none is set)
    * ``reason-<worker>`` / ``comment-<worker>`` -- only when present

    Returns a list of row dicts (one per group).
    """
    # Test account whose annotations must not end up in the overview.
    excluded_workers = {'pia_test1'}

    grouped = sort_by_key(expert_data, ['relation', 'concept', 'property'])
    overview_rows = []

    for triple, annotations in grouped.items():
        row = {'triple': triple, 'description': annotations[0]['description']}

        for annotation in annotations:
            worker = annotation['workerid']
            if 'answer' not in annotation or worker in excluded_workers:
                continue

            answer = annotation['answer']
            ticked = sorted(
                field for field, value in annotation.items()
                if field.startswith('disagreement_') and value == 'true'
            )
            row[f'answer-{worker}'] = answer
            row[f'expected_behavior-{worker}'] = '-'.join(ticked)
            if 'reason' in annotation:
                row[f'reason-{worker}'] = annotation['reason']
            if 'comment' in annotation:
                row[f'comment-{worker}'] = annotation['comment']

        overview_rows.append(row)

    return overview_rows


def get_stats(data_dict_list_raw, data_dict_list_discussion):
    """Collect size and agreement statistics for the expert annotations.

    Parameters
    ----------
    data_dict_list_raw : list of dict
        Annotations before discussion; each needs ``'relation'``,
        ``'property'`` and ``'concept'``.
    data_dict_list_discussion : list of dict
        Annotations after discussion (only passed to ``get_agreement``).

    Returns
    -------
    dict
        ``n_properties``, ``n_pairs``, ``n_triples`` (number of groups
        returned by ``sort_by_key``) plus ``iaa_label_raw`` and
        ``iaa_label_discussion`` (``'Av_Cohens_kappa'`` rounded to two
        decimals).
    """
    # NOTE(review): never read afterwards; kept so the function still fails
    # early on annotations without a 'concept' key.
    concepts = {entry['concept'] for entry in data_dict_list_raw}

    by_triple = sort_by_key(data_dict_list_raw, ['relation', 'property', 'concept'])
    by_property = sort_by_key(data_dict_list_raw, ['property'])
    by_pair = sort_by_key(data_dict_list_raw, ['property', 'concept'])

    # Four agreement runs, in this order: raw/labels, raw/levels,
    # discussion/labels, discussion/levels.
    # NOTE(review): the two 'levels' results are not used below.
    agreement = {}
    for stage, annotations in (('raw', data_dict_list_raw),
                               ('discussion', data_dict_list_discussion)):
        for granularity, collapse in (('labels', False), ('levels', 'levels')):
            agreement[stage, granularity] = get_agreement(
                annotations,
                collapse_relations=collapse,
                disable_kappa=False,
                v=False,
            )

    # Number of distinct concepts per property, skipping names starting
    # with '_'.  NOTE(review): computed but not reported.
    n_concepts_prop = [
        len({entry['concept'] for entry in entries
             if not entry['concept'].startswith('_')})
        for prop, entries in by_property.items()
        if not prop.startswith('_')
    ]

    kappa_raw = agreement['raw', 'labels']['Av_Cohens_kappa']
    print(kappa_raw)

    return {
        'n_properties': len(by_property),
        'n_pairs': len(by_pair),
        'n_triples': len(by_triple),
        'iaa_label_raw': round(kappa_raw, 2),
        'iaa_label_discussion': round(
            agreement['discussion', 'labels']['Av_Cohens_kappa'], 2),
    }

def prepare_discussion(expert_rows):
    """Tag every overview row with its kind of conflict and order the rows.

    For each row the distinct values of the ``answer-*`` columns and of the
    ``expected_behavior-*`` columns are counted, ignoring the placeholder
    ``'-'``.  The row gets a ``'conflict'`` entry (the dict is modified in
    place):

    * ``'none'``     -- exactly one answer and exactly one behaviour
    * ``'behavior'`` -- one answer, several behaviours
    * ``'label'``    -- one behaviour, several answers
    * ``'all'``      -- everything else (including rows with no answer or no
      behaviour values at all)

    The sizes of the groups are printed in the order none, behavior, label,
    all.  The returned list holds the same row objects ordered all,
    behavior, label, none.
    """
    def count_distinct(row, prefix):
        # Number of different non-placeholder values in the prefixed columns.
        values = [value for column, value in row.items()
                  if column.startswith(prefix) and value != '-']
        return len(set(values))

    buckets = {'none': [], 'behavior': [], 'label': [], 'all': []}

    for row in expert_rows:
        n_labels = count_distinct(row, 'answer-')
        n_behaviors = count_distinct(row, 'expected_behavior-')

        single_label = n_labels == 1
        single_behavior = n_behaviors == 1

        if single_label and single_behavior:
            conflict = 'none'
        elif single_label and n_behaviors > 1:
            conflict = 'behavior'
        elif single_behavior and n_labels > 1:
            conflict = 'label'
        else:
            conflict = 'all'

        row['conflict'] = conflict
        buckets[conflict].append(row)

    for conflict in ('none', 'behavior', 'label', 'all'):
        print(len(buckets[conflict]))

    # Hardest cases first, full agreement last.
    return buckets['all'] + buckets['behavior'] + buckets['label'] + buckets['none']
    
    
    
def discussion_to_file(run, n_q, batch, group):
    """Write the discussion sheet for one run/group to CSV and return it.

    Loads the expert annotations with ``load_expert_data``, turns them into
    overview rows (``get_overview_table``), fills missing cells with ``'-'``,
    tags and orders the rows with ``prepare_discussion`` and writes the
    result to ``../analyses/expert_annotations/discussion/run<run>-<group>.csv``.

    Column order: ``conflict``, ``triple``, ``description``, then all
    ``answer-*``, ``expected_behavior-*``, ``reason-*`` and ``comment-*``
    columns as they appear in the first row.

    Returns the DataFrame with exactly those columns.  Raises ``IndexError``
    when there are no rows at all.
    """
    # NOTE(review): `group` is interpolated verbatim, so a glob pattern
    # containing '*' ends up literally in the file name -- confirm intended.
    out_path = f'../analyses/expert_annotations/discussion/run{run}-{group}.csv'
    #reason_agreement_False_expert_inspection1-overview.csv

    annotations = load_expert_data(run, group, n_q, batch)
    overview = pd.DataFrame(get_overview_table(annotations)).fillna(value = '-')
    # After fillna every row carries every per-worker column.
    rows_discussion = prepare_discussion(overview.to_dict("records"))

    first_row_columns = list(rows_discussion[0].keys())
    col_seq = ['conflict', 'triple', 'description']
    for prefix in ('answer-', 'expected_behavior-', 'reason-', 'comment-'):
        col_seq.extend(c for c in first_row_columns if c.startswith(prefix))

    ordered = pd.DataFrame(rows_discussion)[col_seq]
    ordered.to_csv(out_path, index=False)
    return ordered


def _classify_expected_behavior(labels):
    """Map one annotator's behaviour labels to (expected_agreement, disagreement_cnt)."""
    agreement_label = 'disagreement_agreement'
    labels = [label for label in labels if label != '']
    disagreement_labels = [label for label in labels if label != agreement_label]

    if not disagreement_labels:
        # Nothing ticked at all, or only 'agreement' ticked.
        return 'agreement', 0
    if agreement_label in labels:
        return 'possible_disagreement', len(disagreement_labels)
    return 'disagreement', len(disagreement_labels)


def load_resolved_data(run, group):
    """Load the post-discussion expert sheets as a flat list of annotations.

    Every ``run<run>-<group>.xlsx`` file in
    ``../analyses/expert_annotations/resolved/`` is read (``group`` may be a
    glob pattern).  Each row must have a ``'triple'`` column of the form
    ``relation-concept-property``.  For every ``answer-<worker>`` column
    whose value is not ``'-'`` and whose name does not end in ``'lea'``, one
    dict is produced with the keys ``relation``, ``concept``, ``property``,
    ``quid``, ``workerid``, ``answer``, ``expected_agreement``,
    ``disagreement_cnt`` and ``completionurl``.

    ``expected_agreement`` is derived from the *same worker's*
    ``expected_behavior-<worker>`` cell (labels joined with ``'-'``):

    * no labels, or only ``disagreement_agreement`` -> ``'agreement'``
    * only other labels                             -> ``'disagreement'``
    * ``disagreement_agreement`` plus other labels  -> ``'possible_disagreement'``

    ``disagreement_cnt`` is the number of labels other than
    ``disagreement_agreement``.  A missing or non-string behaviour cell (for
    example an empty spreadsheet cell) counts as "no labels".

    ``workerid`` is the full column name (``'answer-<worker>'``), as before.

    Returns an empty list when no file matches.
    """
    # Gold dicts should have: property, concept, relation, answer,
    # expected disagreement: agree, poss_disagree, disagree
    answer_prefix = 'answer-'
    behavior_prefix = 'expected_behavior-'
    expert_dicts = []

    path_dir = '../analyses/expert_annotations/resolved/'
    pattern = f'{path_dir}run{run}-{group}.xlsx'

    for path in glob.glob(pattern):
        print(path)
        df = pd.read_excel(path)#, sheetname=f'PageStylerun{run}-{group}')

        for row in df.to_dict('records'):
            triple = row['triple']
            relation, concept, prop = triple.split('-')

            for column, answer in row.items():
                if not (isinstance(column, str) and column.startswith(answer_prefix)):
                    continue
                if answer == '-' or column.endswith('lea'):
                    continue

                # Look the behaviour up by worker name instead of relying on
                # column position, so skipped or empty columns cannot shift
                # one annotator's labels onto another.
                worker = column[len(answer_prefix):]
                behavior = row.get(f'{behavior_prefix}{worker}')
                labels = behavior.split('-') if isinstance(behavior, str) else []
                exp_dis, disagreement_cnt = _classify_expected_behavior(labels)

                expert_dicts.append({
                    'relation': relation,
                    'concept': concept,
                    'property': prop,
                    'quid': triple,
                    'workerid': column,
                    'answer': answer,
                    'expected_agreement': exp_dis,
                    'disagreement_cnt': disagreement_cnt,
                    'completionurl': 'expert_annotation',
                })
    return expert_dicts


def resolved_to_gold(expert_data_resolved):
    """Collapse the resolved expert annotations into one gold entry per unit.

    Annotations are grouped by ``'quid'`` with ``sort_by_key``.  For each
    group a gold dict is created from a *copy* of the first annotation (the
    input dicts are left untouched) with these fields overwritten:

    * ``answer`` -- the shared answer (compared as lower-cased strings), or
      ``'NOGOLD'`` when the annotators still differ
    * ``expected_agreement``:
        - ``'disagreement'`` when there is no gold answer
        - the common value when all annotators chose the same one
        - ``'possible_disagreement'`` when the values are exactly
          ``agreement`` + ``possible_disagreement``, or include both
          ``agreement`` and ``disagreement``
        - ``'disagreement'`` otherwise
    * ``disagreement_cnt`` -- sum of the annotators' counts
    * ``workerid`` -- ``'gold'``

    Prints how many units were collected and how many have no gold answer.
    Returns the list of gold dicts.
    """
    gold_data = []
    no_gold_cnt = 0

    data_by_triple = sort_by_key(expert_data_resolved, ['quid'])
    for annotations in data_by_triple.values():
        # Copy: writing into annotations[0] itself would silently replace
        # that annotator's answer in the caller's data with the gold values.
        gold_dict = dict(annotations[0])

        answers = {str(d['answer']).lower() for d in annotations}
        has_gold = len(answers) == 1
        if has_gold:
            answer = next(iter(answers))
        else:
            answer = 'NOGOLD'
            no_gold_cnt += 1

        agreements = [d['expected_agreement'] for d in annotations]
        agreement_kinds = set(agreements)
        total_disagreements = sum(int(d['disagreement_cnt']) for d in annotations)

        if not has_gold:
            agreement = 'disagreement'
        elif len(agreement_kinds) == 1:
            agreement = agreements[0]
        elif agreement_kinds == {'agreement', 'possible_disagreement'}:
            agreement = 'possible_disagreement'
        elif 'agreement' in agreement_kinds and 'disagreement' in agreement_kinds:
            agreement = 'possible_disagreement'
        else:
            agreement = 'disagreement'

        gold_dict['answer'] = answer
        gold_dict['disagreement_cnt'] = total_disagreements
        gold_dict['workerid'] = 'gold'
        gold_dict['expected_agreement'] = agreement
        gold_data.append(gold_dict)

    print(f'Collected {len(gold_data)} in total')
    print(f'{no_gold_cnt} units of the total do not have a gold standard.')
    return gold_data
        


# Step 1: build the discussion sheet for run 4 and write it to CSV.
# `group` is a glob pattern ('*' is used as a wildcard by load_resolved_data;
# presumably load_expert_data treats `n_q` and `group` the same way -- verify).
run = 4
n_q = '*'
batch = '1'
group = 'reason_agreement*_expert_inspection*'
df = discussion_to_file(run, n_q, batch, group)

#overview_df
#name = f'run{run}-group_{group}-batch{batch}.csv'.replace('*', '-all-')



# Step 2: read the sheets resolved after discussion and collapse them into
# one gold entry per unit.
expert_data_resolved =  load_resolved_data(run, group)
gold_data = resolved_to_gold(expert_data_resolved)

# '*' is not wanted in the output file name, so it is spelled out as '-all-'.
name = f'run{run}-{group}.csv'.replace('*', '-all-')
gold_path = f'../gold_labels/gold_files/{name}'
gold_df = pd.DataFrame(gold_data)
gold_df.to_csv(gold_path, index=False)

# Display the gold table as the cell output.
gold_df

run4-group_reason_agreement_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection4/qu44-s_qu44-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
41
72
6
35
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection2.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection1.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection3.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection4.xlsx
../analyses/e

Unnamed: 0,answer,completionurl,concept,disagreement_cnt,expected_agreement,property,quid,relation,workerid
0,false,expert_annotation,shovel,4,disagreement,roll,impossible-shovel-roll,impossible,gold
1,NOGOLD,expert_annotation,carrot,5,disagreement,red,unusual-carrot-red,unusual,gold
2,true,expert_annotation,freebooter,4,disagreement,dangerous,implied_category-freebooter-dangerous,implied_category,gold
3,true,expert_annotation,carrot,5,disagreement,red,rare-carrot-red,rare,gold
4,NOGOLD,expert_annotation,shovel,3,disagreement,roll,rare-shovel-roll,rare,gold
5,false,expert_annotation,freebooter,0,agreement,dangerous,variability_open-freebooter-dangerous,variability_open,gold
6,NOGOLD,expert_annotation,pineapple,3,disagreement,yellow,variability_limited-pineapple-yellow,variability_limited,gold
7,false,expert_annotation,buttercup,0,agreement,yellow,typical_of_property-buttercup-yellow,typical_of_property,gold
8,false,expert_annotation,buttercup,1,possible_disagreement,yellow,variability_limited-buttercup-yellow,variability_limited,gold
9,false,expert_annotation,tire,0,agreement,roll,variability_limited-tire-roll,variability_limited,gold


# Expert IAA (before discussion)

In [17]:
# Same selection as for the discussion sheet: run 4, all expert-inspection groups.
run = 4
n_q = '*'
batch = '1'
group = 'reason_agreement*_expert_inspection*'

expert_data = load_expert_data(run, group, n_q, batch)

# Agreement can only be computed on annotations that carry an answer;
# the printed pair shows how many entries are dropped by this filter.
expert_data_answer = [d for d in expert_data if 'answer' in d]
print(len(expert_data), len(expert_data_answer))

# Overall agreement on the raw (pre-discussion) labels.
ag = get_agreement(expert_data_answer)

# Pairwise kappa computed from the matrix built by create_matrix
# (presumably one score per annotator pair -- confirm in calculate_iaa).
matrix = create_matrix(expert_data_answer)
pair_kappa_dict = get_kappa_pairs(matrix)

run4-group_reason_agreement_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection4/qu44-s_qu44-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
479 477
Krippendorff's alpha: 0.5255275697753574
Average Cohen's Kappa (pairwise): 0.5085545625425939
Proportional agreement (pairwise): 0.8300865800865797



## IAA after discussion

In [18]:
# Agreement on the labels after discussion; relies on `run` and `group`
# from the cell above. The returned dict is shown as the cell output.
expert_data_resolved = load_resolved_data(run, group)
get_agreement(expert_data_resolved)

../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection2.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection1.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection3.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection4.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection3.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection2.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection1.xlsx
Krippendorff's alpha: 0.6914766480233281
Average Cohen's Kappa (pairwise): 0.7178718774974399
Proportional agreement (pairwise): 0.9025974025974027



{'Krippendorff': 0.6914766480233281,
 'Proportional': 0.9025974025974027,
 'Av_Cohens_kappa': 0.7178718774974399}

In [19]:
# IAA on behavior, fine-grained: agreement on the 'disagreement_agreement'
# field instead of the label.

print(expert_data[0].keys())

# get_agreement reads the 'answer' field, so the behaviour value is put
# there -- on copies, so that `expert_data` / `expert_data_answer` keep the
# original labels and this cell can be re-run safely. Entries without a
# 'disagreement_agreement' field keep their label.
expert_data_behavior = []
for annotation in expert_data_answer:
    relabelled = annotation.copy()
    if 'disagreement_agreement' in relabelled:
        relabelled['answer'] = relabelled['disagreement_agreement']
    expert_data_behavior.append(relabelled)

ag = get_agreement(expert_data_behavior)

odict_keys(['filename', 'listnumber', 'assignmentid', 'hitid', 'workerid', 'origin', 'timestamp', 'partid', 'questionid', 'quid', 'description', 'exampletrue', 'examplefalse', 'run', 'sublist', 'completionurl', 'name', 'id', 'uuid', 'time_taken_batch', 'answer', 'disagreement_agreement', 'relation', 'property', 'concept'])
Krippendorff's alpha: 0.09019717089175716
Average Cohen's Kappa (pairwise): 0.0745949934277079
Proportional agreement (pairwise): 0.6515151515151512



In [20]:
# IAA on behavior, coarse-grained: agreement on the three-way
# 'expected_agreement' value of the resolved annotations.

print(expert_data_resolved[0].keys())

# get_agreement reads the 'answer' field, so the behaviour value is put
# there -- on copies, so that `expert_data_resolved` keeps the resolved
# labels and this cell can be re-run safely.
expert_data_resolved_behavior = []
for annotation in expert_data_resolved:
    relabelled = annotation.copy()
    if 'expected_agreement' in relabelled:
        relabelled['answer'] = relabelled['expected_agreement']
    expert_data_resolved_behavior.append(relabelled)

ag = get_agreement(expert_data_resolved_behavior)

dict_keys(['relation', 'concept', 'property', 'quid', 'workerid', 'answer', 'expected_agreement', 'completionurl'])
Krippendorff's alpha: 0.19219945807005645
Average Cohen's Kappa (pairwise): 0.23531372949101229
Proportional agreement (pairwise): 0.5660173160173163



In [21]:
# Reload everything from disk so the statistics do not depend on what
# earlier cells did to the in-memory annotations.
run = 4
n_q = '*'
batch = '1'
group = 'reason_agreement*_expert_inspection*'

expert_data = load_expert_data(run, group, n_q, batch)
expert_data_answer = [d for d in expert_data if 'answer' in d]
expert_data_resolved = load_resolved_data(run, group)

# Raw annotations vs. annotations after discussion.
stats = get_stats(expert_data_answer, expert_data_resolved)
print(stats)
# One column named 'experts', one row per statistic.
stats_dict = dict()
stats_dict['experts'] = stats
df = pd.DataFrame(stats_dict)
df

run4-group_reason_agreement_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection4/qu44-s_qu44-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection2.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection1.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection3.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection4.xlsx
../analyses/expert_annot

Unnamed: 0,experts
iaa_label_discussion,0.72
iaa_label_raw,0.51
n_pairs,19.0
n_properties,11.0
n_triples,154.0


In [91]:
# Emit the statistics table as LaTeX; expects `df` from the stats cell above.
print(df.to_latex())

\begin{tabular}{lr}
\toprule
{} &  experts \\
\midrule
iaa\_label\_discussion &     0.72 \\
iaa\_label\_raw        &     0.51 \\
n\_pairs              &    19.00 \\
n\_properties         &    11.00 \\
n\_triples            &   154.00 \\
\bottomrule
\end{tabular}

