# Worker evaluation and filtering

* Get a ranking of all workers based on contradictions and test questions.
* Check the test-question counting code again.
* See whether there is a selection of top workers, and test whether they have higher agreement than the others.
* Perhaps consider putting them on a list of selected top workers who receive higher pay.


**Note for later:** Test what happens to agreement if we collapse all annotations into a single group.


# Further steps:

* Add pair and relation evaluation, and set up something similar to CrowdTruth.


## Pair evaluation: 

* pairs with many annotators contradicting themselves are probably difficult (likely because of polysemy)


## Relation evaluation:

* relations with the most contradictions are probably difficult

In [191]:
from load_data import load_experiment_data

from collections import defaultdict
from collections import Counter
import pandas as pd
import os 

In [192]:
def sort_by_key(data_dict_list, keys):
    """Group dictionaries by the value(s) they hold under ``keys``.

    Despite the name, nothing is ordered: the result maps each group key to
    the list of dictionaries that share it, in their input order.

    When ``keys`` has exactly one element, the group key is the raw value
    found under that key. Otherwise the values found under all ``keys`` are
    joined with ``'-'`` into one string, so those values must be strings.

    Returns a ``defaultdict(list)``.
    """
    groups = defaultdict(list)
    for entry in data_dict_list:
        if len(keys) == 1:
            # Single key: keep the value as-is (it may be any hashable).
            group_key = entry[keys[0]]
        else:
            # Several keys: build a composite 'value1-value2-...' label.
            group_key = '-'.join([entry[k] for k in keys])
        groups[group_key].append(entry)
    return groups
    

def get_relation_cnt(pair_dicts):
    """Count, per relation, how many entries carry the answer ``'true'``.

    Every relation that occurs in ``pair_dicts`` gets an entry in the
    returned ``Counter``; relations that never received a ``'true'`` answer
    are present with a count of 0.
    """
    true_answers = Counter()
    for entry in pair_dicts:
        is_true = entry['answer'] == 'true'
        # Adding 0 still registers the relation as a key of the counter.
        true_answers[entry['relation']] += 1 if is_true else 0
    return true_answers

def get_relation_pairs(pair_dicts, threshold = 0):
    """Return every unordered pair of relations answered 'true' often enough.

    A relation qualifies when its number of ``'true'`` answers (as counted
    by ``get_relation_cnt``) is strictly greater than ``threshold``.

    Returns a list of two-element sets, one per distinct combination of
    qualifying relations.
    """
    true_counts = get_relation_cnt(pair_dicts)
    qualifying = [rel for rel, n_true in true_counts.items() if n_true > threshold]

    unordered_pairs = []
    for position, first in enumerate(qualifying):
        # Only pair with relations further down the list: this skips
        # self-pairs and the mirrored duplicate of every combination.
        for second in qualifying[position + 1:]:
            unordered_pairs.append({first, second})
    return unordered_pairs
            
    
def collect_contradictions(pair_dicts, contradictions, threshold = 0):
    """List the contradictory relation pairs present in ``pair_dicts``.

    The candidate pairs come from ``get_relation_pairs``; a candidate is kept
    when it is contained in ``contradictions`` (a collection of sets).

    Returns a list of alphabetically sorted tuples, one per contradiction.
    """
    found = []
    for candidate in get_relation_pairs(pair_dicts, threshold = threshold):
        if candidate in contradictions:
            # Sets are unhashable; a sorted tuple can serve as a Counter key.
            found.append(tuple(sorted(candidate)))
    return found
  

def load_contradiction_pairs():
    """Read the contradiction scheme from ``../scheme/contradictions.csv``.

    Each line of the file is split on commas and turned into a set, so the
    result is a list with one set of field values per line.
    """
    with open('../scheme/contradictions.csv') as infile:
        return [set(row.strip('\n').split(',')) for row in infile]
    
def get_cont_type_dicts(contradictions, cont_type_cnt):
    """Map every contradiction type to its count, keyed by a readable label.

    ``contradictions`` is an iterable of collections of relation names.
    ``cont_type_cnt`` is indexed by the sorted tuple of those names.

    Returns a dict whose keys are the sorted names joined with ``'-'`` and
    whose values are the counts looked up in ``cont_type_cnt``.
    """
    counts_by_label = {}
    for ordered_names in map(sorted, contradictions):
        lookup_key = tuple(ordered_names)
        count = cont_type_cnt[lookup_key]
        counts_by_label['-'.join(lookup_key)] = count
    return counts_by_label


def get_average_time_worker(worker_dict_list):
    """Return the average time a worker spent per question.

    The mean of the ``'time_taken_batch'`` values in ``worker_dict_list`` is
    divided by the number of entries in the list.

    The previous version ignored its argument and read the notebook-global
    ``data_dict_list``. It also returned only the value of the last
    ``workerid`` group and failed on empty input. This version works solely
    on the list it is given.

    NOTE(review): dividing the mean batch time by the number of entries only
    yields a per-question time if every entry belongs to the same batch and
    repeats that batch's total time -- TODO confirm against the data loader.

    Parameters:
        worker_dict_list: annotation dicts of one worker, each holding a
            ``'time_taken_batch'`` value convertible to ``float``.

    Returns:
        The average time per question as a float, or ``0.0`` when the list
        is empty.
    """
    times_spent = [float(d['time_taken_batch']) for d in worker_dict_list]
    if not times_spent:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    av_time_batch = sum(times_spent) / len(times_spent)
    return av_time_batch / len(times_spent)

In [189]:
# Build a per-worker contradiction overview (one row per worker) and turn it
# into a DataFrame.

# Selection of the experiment data to load; the meaning of each argument is
# defined by load_experiment_data (not visible here).
run = 3
batch = 13
n_q = 70
group = 'experiment1'
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)

# Label for this selection; any '*' is replaced by '-all-'
# (presumably '*' is a wildcard value for run/group/batch -- TODO confirm).
name = f'run{run}-group_{group}-batch{batch}'.replace('*', '-all-')

worker_data_dicts = []
data_by_worker = sort_by_key(data_dict_list, ['workerid']) 
contradictions = load_contradiction_pairs()


# worker -> total number of contradictions
worker_cont_cnt = Counter()
# worker -> Counter of contradiction type (sorted relation tuple) -> count
worker_cont_type_cnt = dict()
# worker -> number of annotations
worker_annotations_cnt = Counter()

for worker, dl_worker in data_by_worker.items():
    worker_annotations_cnt[worker] = len(dl_worker)
    cont_cnt = Counter()
    # Contradictions are looked for within one property-concept pair at a time.
    data_by_pair = sort_by_key(dl_worker, ['property', 'concept']) 
    for pair, dl_pair in data_by_pair.items():
        # threshold = 0: a relation takes part as soon as it has one 'true' answer.
        pair_contradictions = collect_contradictions(dl_pair, contradictions, threshold = 0)
        # '+=' also registers workers without contradictions (count 0), so
        # they still show up in most_common() below.
        worker_cont_cnt[worker] += len(pair_contradictions)
        cont_cnt.update(pair_contradictions)
    worker_cont_type_cnt[worker] = cont_cnt

# One row per worker, ordered from most to fewest contradictions.
for worker, cnt in worker_cont_cnt.most_common():
    cont_type_cnt = worker_cont_type_cnt[worker]
    cont_dict = get_cont_type_dicts(contradictions, cont_type_cnt)
    n_annotations = worker_annotations_cnt[worker]
    # NOTE(review): original_d is not used in the code visible here.
    original_d = data_by_worker[worker]
    d = dict()
    d['workerid'] = worker
    d['contradiction_cnt'] = cnt
    d['n_annotations'] = n_annotations
    d['ratio_contradictions'] = cnt / n_annotations
    # Adds one column per contradiction type with this worker's count.
    d.update(cont_dict)
    worker_data_dicts.append(d)
    
worker_df = pd.DataFrame(worker_data_dicts)


Unnamed: 0,workerid,contradiction_cnt,n_annotations,ratio_contradictions,afforded_usual-rare,afforded_usual-impossible,afforded_usual-unusual,rare-typical_of_property,impossible-typical_of_property,typical_of_property-unusual,...,typical_of_concept-unusual,afforded_unusual-rare,afforded_unusual-impossible,afforded_unusual-unusual,implied_category-rare,implied_category-impossible,implied_category-unusual,affording_activity-rare,affording_activity-impossible,affording_activity-unusual
0,5c8d3f8bf756d40016683868,25,76,0.328947,0,0,0,2,1,3,...,4,0,0,0,3,2,6,0,1,1
1,5e6a3ff6a21e120a40d76a84,17,76,0.223684,0,0,0,2,2,1,...,1,0,0,0,1,1,1,3,2,1
2,5bfe6f8c6e6cd80001bea615,14,76,0.184211,0,0,0,2,0,2,...,0,0,0,0,1,1,1,2,1,1
3,5bae4929254aa9000189a644,4,76,0.052632,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,1
4,5bf4d30498496900016ffd14,2,76,0.026316,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
5,5b222aff59f9620001c109cb,0,76,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5c7017ac1e20530001e7ce63,0,76,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,5dcc8459bfb31a920381996e,0,76,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,5e6a16f0d2e99d06ad80e2a8,0,76,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,5e7156645b050e07277c6682,0,76,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
