In [3]:
from load_data import load_expert_data
from utils_analysis import sort_by_key
import pandas as pd
import csv
from calculate_iaa import get_agreement, get_kappa_pairs, create_matrix
import glob

In [9]:
def get_overview_table(expert_data):
    """Build one overview row per (relation, concept, property) triple.

    Records are grouped with the project helper ``sort_by_key`` (used here as a
    mapping from a triple key to the list of records for that triple).
    Each row holds the triple key, the description of the first record in the
    group and, for every worker who gave an answer, the columns
    ``answer-<worker>``, ``expected_behavior-<worker>`` and, when present in the
    record, ``reason-<worker>`` and ``comment-<worker>``.

    ``expected_behavior-<worker>`` is the sorted, '-'-joined list of the
    record's ``disagreement_*`` keys whose value is the string 'true'.

    Records of the test account 'pia_test1' and records without an 'answer'
    key are ignored.

    :param expert_data: list of annotation dicts
    :return: list of row dicts, one per triple
    """
    excluded_workers = {'pia_test1'}
    grouped = sort_by_key(expert_data, ['relation', 'concept', 'property'])

    overview = []
    for triple, annotations in grouped.items():
        row = {'triple': triple, 'description': annotations[0]['description']}
        for annotation in annotations:
            worker = annotation['workerid']
            if 'answer' not in annotation or worker in excluded_workers:
                continue
            flagged = sorted(
                key for key, value in annotation.items()
                if key.startswith('disagreement_') and value == 'true'
            )
            row[f'answer-{worker}'] = annotation['answer']
            row[f'expected_behavior-{worker}'] = '-'.join(flagged)
            # Optional free-text fields are only copied when the record has them.
            for field in ('reason', 'comment'):
                if field in annotation:
                    row[f'{field}-{worker}'] = annotation[field]
        overview.append(row)
    return overview

def prepare_discussion(expert_rows):
    """Tag every overview row with its kind of conflict and order rows for discussion.

    For each row the distinct values of the ``answer-*`` columns and of the
    ``expected_behavior-*`` columns are counted, ignoring the placeholder '-'.
    The row then receives a 'conflict' entry (the row dict is modified in place):

    * 'none'     - exactly one answer value and exactly one behavior value
    * 'behavior' - one answer value, several behavior values
    * 'label'    - one behavior value, several answer values
    * 'all'      - everything else (including rows with no values at all)

    The sizes of the 'none', 'behavior', 'label' and 'all' groups are printed
    in that order.

    :param expert_rows: list of row dicts
    :return: the same row dicts, ordered 'all', 'behavior', 'label', 'none'
    """
    buckets = {'none': [], 'behavior': [], 'label': [], 'all': []}

    for row in expert_rows:
        labels = set()
        behaviors = set()
        for column, value in row.items():
            if column.startswith('answer-'):
                if value != '-':
                    labels.add(value)
            elif column.startswith('expected_behavior-'):
                if value != '-':
                    behaviors.add(value)

        one_label = len(labels) == 1
        one_behavior = len(behaviors) == 1
        if one_label and one_behavior:
            conflict = 'none'
        elif one_label and len(behaviors) > 1:
            conflict = 'behavior'
        elif one_behavior and len(labels) > 1:
            conflict = 'label'
        else:
            conflict = 'all'

        row['conflict'] = conflict
        buckets[conflict].append(row)

    for conflict in ('none', 'behavior', 'label', 'all'):
        print(len(buckets[conflict]))

    # Most contested rows first, fully agreed rows last.
    return buckets['all'] + buckets['behavior'] + buckets['label'] + buckets['none']
    
    
    
def discussion_to_file(run, n_q, batch, group):
    """Write the expert discussion sheet for one run/group to CSV and return it.

    Loads the expert annotations via ``load_expert_data(run, group, n_q, batch)``,
    turns them into overview rows, tags and orders them with
    ``prepare_discussion`` and writes the result to
    ``../analyses/expert_annotations/discussion/run<run>-<group>.csv``.

    Column order: conflict, triple, description, then all ``answer-*``,
    ``expected_behavior-*``, ``reason-*`` and ``comment-*`` columns (taken from
    the first row).

    :return: the DataFrame that was written
    """
    out_path = f'../analyses/expert_annotations/discussion/run{run}-{group}.csv'

    annotations = load_expert_data(run, group, n_q, batch)
    overview_rows = get_overview_table(annotations)
    # Round-trip through a DataFrame so that every row carries every column;
    # cells a worker did not fill become '-'.
    filled_rows = pd.DataFrame(overview_rows).fillna(value = '-').to_dict("records")
    ordered_rows = prepare_discussion(filled_rows)

    first_row_columns = list(ordered_rows[0].keys())
    column_order = ['conflict', 'triple', 'description']
    for prefix in ('answer-', 'expected_behavior-', 'reason-', 'comment-'):
        column_order.extend(c for c in first_row_columns if c.startswith(prefix))

    discussion_df = pd.DataFrame(ordered_rows)[column_order]
    discussion_df.to_csv(out_path, index=False)
    return discussion_df


def _expected_agreement(flags):
    """Map a worker's list of ``disagreement_*`` flags to a coarse category."""
    if not flags or flags == ['disagreement_agreement']:
        return 'agreement'
    if len(flags) > 1 and 'disagreement_agreement' in flags:
        return 'possible_disagreement'
    return 'disagreement'


def load_resolved_data(run, group):
    """Load the resolved (post-discussion) expert annotations from Excel.

    Reads every file matching
    ``../analyses/expert_annotations/resolved/run<run>-<group>.xlsx`` (``group``
    may contain glob wildcards), prints each path, and emits one dict per
    row and per ``answer-<worker>`` column whose value is not the placeholder
    '-'. Columns ending in 'lea' are skipped.

    Each worker's answer is paired with that same worker's
    ``expected_behavior-<worker>`` cell. (Previously answers and behaviors were
    collected into two independently filtered lists and zipped, so an empty or
    missing behavior cell, a '-' answer or the skipped 'lea' column shifted the
    pairing and attributed behaviors to the wrong worker.) A behavior cell that
    is empty, missing or not a string counts as no flags, i.e. 'agreement'.

    Returned dicts have the keys relation, concept, property, quid, workerid,
    answer, expected_agreement ('agreement', 'possible_disagreement' or
    'disagreement') and completionurl. As before, 'workerid' holds the full
    answer column name (``answer-<worker>``).

    :param run: run identifier used in the file name
    :param group: group name or glob pattern used in the file name
    :return: list of annotation dicts
    :raises ValueError: if a triple does not split into exactly three
        '-'-separated parts
    """
    answer_prefix = 'answer-'
    behavior_prefix = 'expected_behavior-'
    expert_dicts = []

    path_dir = '../analyses/expert_annotations/resolved/'
    pattern = f'{path_dir}run{run}-{group}.xlsx'

    for path in glob.glob(pattern):
        print(path)
        df = pd.read_excel(path)

        for row in df.to_dict('records'):
            triple = row['triple']
            relation, concept, prop = triple.split('-')

            for column, answer in row.items():
                if not (isinstance(column, str) and column.startswith(answer_prefix)):
                    continue
                if answer == '-' or column.endswith('lea'):
                    continue

                # Look the behavior up by worker name instead of by position.
                worker = column[len(answer_prefix):]
                behavior = row.get(behavior_prefix + worker)
                if isinstance(behavior, str):
                    # '-' both separates flags and marks an empty cell, so
                    # drop the empty fragments the split leaves behind.
                    flags = [flag for flag in behavior.split('-') if flag != '']
                else:
                    flags = []  # empty Excel cells are read as NaN

                expert_dicts.append({
                    'relation': relation,
                    'concept': concept,
                    'property': prop,
                    'quid': triple,
                    'workerid': column,
                    'answer': answer,
                    'expected_agreement': _expected_agreement(flags),
                    'completionurl': 'expert_annotation',
                })
    return expert_dicts


def resolved_to_gold(expert_data_resolved):
    """Collapse the resolved expert annotations into one gold record per triple.

    Records are grouped by 'quid' with the project helper ``sort_by_key``.
    For each group:

    * 'answer' is 'true' when strictly more than half of the answers
      (compared case-insensitively as strings) are 'true', otherwise 'false'.
    * 'expected_agreement' is the shared value when all experts gave the same
      one, otherwise 'possible_disagreement'. This also covers a mix of
      'agreement' and 'possible_disagreement', which previously left the
      variable unassigned (an UnboundLocalError on the first triple, or a
      stale value from the preceding triple afterwards).
    * 'workerid' is 'gold'; all other fields are taken from the group's first
      record.

    The gold record is a copy, so the input records are not modified.

    :param expert_data_resolved: list of annotation dicts with at least
        'quid', 'answer' and 'expected_agreement'
    :return: list of gold dicts, one per triple
    """
    gold_data = []

    data_by_triple = sort_by_key(expert_data_resolved, ['quid'])
    for _, data in data_by_triple.items():
        answers = [str(d['answer']).lower() for d in data]
        true_prop = answers.count('true') / len(answers)

        agreements = [d['expected_agreement'] for d in data]
        if len(set(agreements)) == 1:
            agreement = agreements[0]
        else:
            # Experts differ on the expected behavior.
            agreement = 'possible_disagreement'

        # Copy so the first expert's record is not turned into the gold record.
        gold_dict = dict(data[0])
        gold_dict['answer'] = 'true' if true_prop > 0.5 else 'false'
        gold_dict['workerid'] = 'gold'
        gold_dict['expected_agreement'] = agreement
        gold_data.append(gold_dict)
    return gold_data
        


# Driver: export the discussion sheet for run 4, then derive gold labels from
# the resolved sheets.
run = 4
n_q = '*'  # presumably a wildcard over questionnaire sizes - confirm in load_expert_data
batch = '1'
# Wildcard pattern; load_resolved_data passes it to glob to match all expert groups.
group = 'reason_agreement*_expert_inspection*'
df = discussion_to_file(run, n_q, batch, group)

#overview_df
#name = f'run{run}-group_{group}-batch{batch}.csv'.replace('*', '-all-')



# Read the resolved Excel sheets and reduce them to one gold record per triple.
expert_data_resolved =  load_resolved_data(run, group)
gold_data = resolved_to_gold(expert_data_resolved)

# The '*' wildcards of the group pattern are replaced by '-all-' in the file name.
name = f'run{run}-{group}.csv'.replace('*', '-all-')
gold_path = f'../gold_labels/gold_files/{name}'
gold_df = pd.DataFrame(gold_data)
gold_df.to_csv(gold_path, index=False)

#df

run4-group_reason_agreement_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection4/qu44-s_qu44-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
41
72
6
35
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection2.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection1.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection3.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection4.xlsx
../analyses/e

154


# Expert IAA (before discussion)

In [135]:
# Same run/group selection as for the discussion export above.
run = 4
n_q = '*'
batch = '1'
group = 'reason_agreement*_expert_inspection*'

expert_data = load_expert_data(run, group, n_q, batch)

# Keep only records that actually contain an answer; print total vs. answered.
expert_data_answer = [d for d in expert_data if 'answer' in d]
print(len(expert_data), len(expert_data_answer))
# get_agreement prints the agreement scores shown in the cell output.
ag = get_agreement(expert_data_answer)

# Pairwise kappa between annotators, computed from the annotation matrix.
matrix = create_matrix(expert_data_answer)
pair_kappa_dict = get_kappa_pairs(matrix)

run4-group_reason_agreement_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection4/qu44-s_qu44-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection2/qu40-s_qu40-batch1.csv
no summary data
run4-group_reason_agreement_expert_inspection1/qu30-s_qu30-batch1.csv
no summary data
run4-group_reason_agreement_False_expert_inspection3/qu40-s_qu40-batch1.csv
no summary data
479 477
Krippendorff's alpha: 0.5255275697753574
Average Cohen's Kappa (pairwise): -
Proportional agreement (pairwise): 0.8300865800865797



## IAA after discussion

In [136]:
# Reload the resolved (post-discussion) annotations; uses run and group from the cell above.
expert_data_resolved = load_resolved_data(run, group)
# Last expression of the cell, so the returned score dict is displayed as well.
get_agreement(expert_data_resolved)

../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection2.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection1.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection3.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection4.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection3.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_expert_inspection2.xlsx
../analyses/expert_annotations/resolved/run4-reason_agreement_False_expert_inspection1.xlsx
Krippendorff's alpha: 0.6914766480233281
Average Cohen's Kappa (pairwise): 0.7178718774974399
Proportional agreement (pairwise): 0.9025974025974027



{'Krippendorff': 0.6914766480233281,
 'Proportional': 0.9025974025974027,
 'Av_Cohens_kappa': 0.7178718774974399}

In [137]:
# IAA on behavior fine-grained

print(expert_data[0].keys())

# Score the 'disagreement_agreement' judgement by exposing it under the
# 'answer' key (assumed to be the field get_agreement compares - confirm in
# calculate_iaa).
# Work on copies: expert_data_answer shares its dicts with expert_data, so
# overwriting 'answer' in place would destroy the label answers and make the
# label IAA above unreproducible without reloading the data.
# Records without a behavior judgement are left out instead of contributing
# their label answer to the behavior score.
expert_data_behavior = [
    {**d, 'answer': d['disagreement_agreement']}
    for d in expert_data_answer
    if 'disagreement_agreement' in d
]

ag = get_agreement(expert_data_behavior)

odict_keys(['filename', 'listnumber', 'assignmentid', 'hitid', 'workerid', 'origin', 'timestamp', 'partid', 'questionid', 'quid', 'description', 'exampletrue', 'examplefalse', 'run', 'sublist', 'completionurl', 'name', 'id', 'uuid', 'time_taken_batch', 'answer', 'disagreement_agreement', 'relation', 'property', 'concept'])
Krippendorff's alpha: 0.09019717089175716
Average Cohen's Kappa (pairwise): -
Proportional agreement (pairwise): 0.6515151515151512



In [138]:
# IAA on behavior, coarse-grained ('expected_agreement' category)

print(expert_data_resolved[0].keys())

# Expose the 'expected_agreement' category under the 'answer' key (assumed to
# be the field get_agreement compares - confirm in calculate_iaa).
# Work on copies so the true/false answers in expert_data_resolved survive;
# overwriting them in place would make a later resolved_to_gold() call on the
# same list produce wrong gold labels.
expert_data_resolved_behavior = [
    {**d, 'answer': d['expected_agreement']}
    for d in expert_data_resolved
    if 'expected_agreement' in d
]

ag = get_agreement(expert_data_resolved_behavior)

dict_keys(['relation', 'concept', 'property', 'quid', 'workerid', 'answer', 'expected_agreement', 'completionurl'])
Krippendorff's alpha: 0.19219945807005645
Average Cohen's Kappa (pairwise): 0.2353137294910123
Proportional agreement (pairwise): 0.5660173160173163

