# Worker evaluation and filtering

* get a ranking of all workers based on contradictions and test questions 
* check test question counting code again
* See if there is a selection of top workers and test whether they have higher agreement than the others
* Perhaps consider putting them on a list of selected top workers with more salary


**Note for later:** Test what happens to agreement if we collapse all annotations into a single group.


# Further steps:

* add pair and relation evaluation and set up something similar to CrowdTruth


## Pair evaluation: 

* pairs with many annotators contradicting themselves are probably difficult (likely because of polysemy)


## Relation evaluation:

* relations with most contradictions are probably difficult

## Example evaluation:

* Some examples may be misleading

In [33]:
from load_data import load_experiment_data
from utils_analysis import load_contradiction_pairs
from utils_analysis import collect_contradictions
from utils_analysis import sort_by_key


from collections import Counter
import pandas as pd
import os



def get_cont_type_dicts(contradictions, cont_type_cnt):
    """Map every contradiction type to its count under a joined string key.

    Each entry of ``contradictions`` is sorted and converted to a tuple. That
    tuple is looked up in ``cont_type_cnt`` and its members are joined with
    ``-`` to form the key of the returned dict.
    """
    labelled_counts = {}
    # sorting makes the lookup independent of the order inside an entry
    for members in map(tuple, map(sorted, contradictions)):
        count = cont_type_cnt[members]
        labelled_counts['-'.join(members)] = count
    return labelled_counts


def get_average_time_worker(worker_dict_list):
    """Return the time per question, averaged over the batches in the input.

    The entries are grouped by their ``filename`` value. For every group the
    ``time_taken_batch`` value of its first entry is divided by the number of
    entries in that group; the mean of these per-group values is returned.
    """
    entries_by_batch = sort_by_key(worker_dict_list, ['filename'])
    # every entry of a batch carries the same time info, so the first one is enough
    per_question_times = [
        float(entries[0]['time_taken_batch']) / len(entries)
        for _, entries in entries_by_batch.items()
    ]
    return sum(per_question_times) / len(per_question_times)


def get_tests_and_checks(worker_dict_list):
    """Collect the descriptions of all failed check and test questions.

    Only entries whose ``quid`` is a known check id or starts with ``test``
    are evaluated:

    * ``check1``, ``check2`` and ``check3`` expect the answer ``'true'``;
    * ``check4`` (I am answering questions at random) expects ``'false'``;
    * ``test...`` questions expect the second ``_``-separated part of the
      entry's ``relation`` value.

    All other entries are skipped. This includes ``check...`` ids without a
    defined expected answer, which would otherwise be compared against the
    expected answer of a previous entry.

    Returns a list with the ``description`` of every evaluated entry whose
    ``answer`` differs from the expected answer, in input order.
    """
    expected_check_answers = {
        'check1': 'true',
        'check2': 'true',
        'check3': 'true',
        'check4': 'false',
    }
    fails = []
    for d in worker_dict_list:
        quid = d['quid']
        if quid in expected_check_answers:
            correct_answer = expected_check_answers[quid]
        elif quid.startswith('test'):
            correct_answer = d['relation'].split('_')[1]
        else:
            # regular question, or a check id with no defined expected answer
            continue
        if d['answer'] != correct_answer:
            fails.append(d['description'])
    return fails

def get_pair_analysis(data_dict_list, name):
    """Summarise contradictions per property-concept pair and save them as csv.

    The annotations are grouped by their ``property`` and ``concept`` values.
    For every pair one row is built holding the number of annotations and
    workers, the contradiction totals and ratios, the mean of the per-worker
    average times and the workers with and without contradictions. The
    entries of the pair's contradiction counter are added to the row as
    extra columns. The rows are sorted by ``contradiction_annotation_ratio``
    (highest first) and written to ``../analyses/pairs/<name>.csv``; the
    directory is created if it does not exist.

    Returns the sorted DataFrame and the path of the csv file.
    """
    annotations_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    known_contradictions = load_contradiction_pairs()

    rows = []
    for pair, pair_annotations in annotations_by_pair.items():
        annotations_by_worker = sort_by_key(pair_annotations, ['workerid'])
        contradiction_counts = Counter()
        worker_times = []
        contradicting = []
        for worker_id, worker_annotations in annotations_by_worker.items():
            # NOTE(review): only this worker's annotations for this pair are
            # passed on, so get_average_time_worker divides the batch time by
            # the size of that subset - confirm this is the intended measure.
            worker_times.append(get_average_time_worker(worker_annotations))
            found = collect_contradictions(
                worker_annotations, known_contradictions, threshold=0
            )
            if len(found) > 0:
                contradicting.append(worker_id)
            contradiction_counts.update(found)

        consistent = [w for w in annotations_by_worker if w not in contradicting]
        total_annotations = len(pair_annotations)
        total_workers = len(annotations_by_worker)
        total_contradictions = sum(contradiction_counts.values())

        # key order determines the column order of the resulting table
        row = {
            'pair': pair,
            'n_annotations': total_annotations,
            'n_workers': total_workers,
            'n_contradictions': total_contradictions,
            'n_workers_contradicting': len(contradicting),
            'ratio_workers_contradicting': len(contradicting) / total_workers,
            'contradiction_annotation_ratio': total_contradictions / total_annotations,
            'average_time_pair': sum(worker_times) / len(worker_times),
            'workers_contradicting': ' '.join(contradicting),
            'workers_not_contradicting': ' '.join(consistent),
        }
        # one extra column per entry of the contradiction counter
        row.update(contradiction_counts)
        rows.append(row)

    # pairs with the highest contradiction to annotation ratio come first
    pair_df = pd.DataFrame(rows).sort_values(
        'contradiction_annotation_ratio', axis=0, ascending=False
    )
    out_dir = '../analyses/pairs/'
    os.makedirs(out_dir, exist_ok=True)
    filepath = f'{out_dir}{name}.csv'
    pair_df.to_csv(filepath)
    return pair_df, filepath


def main(run='3', batch='16', n_q='*', group='experiment1'):
    """Run the pair analysis for one data selection and print the csv path.

    ``run``, ``group``, ``n_q`` and ``batch`` are passed on to
    ``load_experiment_data`` (with ``remove_not_val=True``). The defaults
    reproduce the selection that used to be hard-coded here, so calling
    ``main()`` without arguments behaves as before.

    Returns the DataFrame and the csv path produced by ``get_pair_analysis``.
    """
    data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val=True)
    # a '*' in run, group or batch is spelled out as '-all-' in the file name
    name = f'run{run}-group_{group}-batch{batch}'.replace('*', '-all-')
    df, filepath = get_pair_analysis(data_dict_list, name)
    print(f'analysis can be found at: {filepath}')
    return df, filepath


if __name__ == '__main__':
    main()


analysis can be found at: ../analyses/pairs/run3-group_experiment1-batch16.csv


In [34]:
# Notebook cell: run the pair analysis at top level so that `df` stays
# available for inspection.
# Selection values handed to load_experiment_data below.
run = '3'
batch = '16'
n_q = '*'
group = 'experiment1'

data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
# a '*' in run, group or batch is spelled out as '-all-' in the file name
name = f'run{run}-group_{group}-batch{batch}'.replace('*', '-all-')
df, filepath = get_pair_analysis(data_dict_list, name)
print(f'analysis can be found at: {filepath}')
# Head: first 20 rows; get_pair_analysis sorts by contradiction_annotation_ratio,
# highest first
df[:20]
# Tail: uncomment to look at the last 20 rows (lowest ratio) instead
#df[-20:]

analysis can be found at: ../analyses/pairs/run3-group_experiment1-batch16.csv


Unnamed: 0,pair,n_annotations,n_workers,n_contradictions,n_workers_contradicting,ratio_workers_contradicting,contradiction_annotation_ratio,average_time_pair,workers_contradicting,workers_not_contradicting,"(implied_category, unusual)","(implied_category, rare)","(affording_activity, unusual)","(affording_activity, impossible)","(implied_category, impossible)","(impossible, typical_of_concept)","(impossible, typical_of_property)","(typical_of_property, unusual)"
3,round-can,100,10,10,3,0.3,0.1,62.30588,5c688d8e9b80050001708a14 5d51e1e3d255270001a28...,5b222aff59f9620001c109cb 5c62a91524035400014b6...,2.0,,3.0,1.0,1.0,,1.0,2.0
2,round-seedpod,100,10,4,1,0.1,0.04,62.30588,5e4b00acecbca8011a7319c1,5b222aff59f9620001c109cb 5c62a91524035400014b6...,,,,1.0,1.0,1.0,1.0,
5,red-daisy,100,10,3,2,0.2,0.03,62.30588,5e4b00acecbca8011a7319c1 5b479eb95a02af0001a79012,5b222aff59f9620001c109cb 5c62a91524035400014b6...,1.0,1.0,,,1.0,,,
7,red-slug,100,10,3,1,0.1,0.03,62.30588,5e4b00acecbca8011a7319c1,5b222aff59f9620001c109cb 5c62a91524035400014b6...,1.0,1.0,,,1.0,,,
0,red-elm,100,10,2,1,0.1,0.02,62.30588,5e4b00acecbca8011a7319c1,5b222aff59f9620001c109cb 5c62a91524035400014b6...,1.0,1.0,,,,,,
1,red-candy,100,10,1,1,0.1,0.01,62.30588,5e4b00acecbca8011a7319c1,5b222aff59f9620001c109cb 5c62a91524035400014b6...,,,1.0,,,,,
4,round-mayonnaise,100,10,1,1,0.1,0.01,62.30588,5d51e1e3d255270001a28fbb,5b222aff59f9620001c109cb 5c62a91524035400014b6...,,,,,,,1.0,
6,red-strawberry,60,10,0,0,0.0,0.0,103.843133,,5b222aff59f9620001c109cb 5c62a91524035400014b6...,,,,,,,,
8,roll-sprocket,10,10,0,0,0.0,0.0,623.0588,,5b222aff59f9620001c109cb 5c62a91524035400014b6...,,,,,,,,
9,_check1-_check1,10,10,0,0,0.0,0.0,623.0588,,5b222aff59f9620001c109cb 5c62a91524035400014b6...,,,,,,,,
