In [2]:
import csv
from collections import defaultdict
from collections import Counter

from utils import load_experiment_data
from utils import get_pair_dict
from utils import load_contradiction_pairs
from utils import get_relation_counts
from utils import consistency_check

In [3]:
def get_worker_dict(dict_list_out):
    """Group annotation records by the worker who produced them.

    Parameters
    ----------
    dict_list_out : iterable of dict
        Annotation records; each one must have a 'workerid' key.

    Returns
    -------
    collections.defaultdict
        Maps each 'workerid' value to the list of records carrying it,
        in the order they appear in `dict_list_out`.
    """
    records_by_worker = defaultdict(list)
    for record in dict_list_out:
        records_by_worker[record['workerid']].append(record)
    return records_by_worker

In [5]:
# Selection of the annotation data to load (passed straight to load_experiment_data).
run = 3
batch = '*'
n_q = 70
group = 'experiment1'

contradiction_pairs = load_contradiction_pairs()
dict_list_out = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
worker_dict = get_worker_dict(dict_list_out)

# One pair dict per worker, built only from that worker's own annotations.
worker_pair_dict = {
    worker: get_pair_dict(worker_dict_list)
    for worker, worker_dict_list in worker_dict.items()
}

# Sort every (worker, pair) combination into "keep" (no contradiction reported)
# or "discard" (consistency_check returned at least one contradiction).
worker_pairs_to_keep = set()
worker_pairs_to_discard = set()
for worker, pair_dict in worker_pair_dict.items():
    print(f'checking worker: {worker}')
    for pair, relation_vec in pair_dict.items():
        relation_counts = get_relation_counts(relation_vec, normalize = True)
        contradictions = consistency_check(contradiction_pairs, relation_counts, thresh = 0.0)
        worker_pair = (worker, pair)
        if len(contradictions) == 0:
            worker_pairs_to_keep.add(worker_pair)
        else:
            print('contradition_count', worker, pair)
            worker_pairs_to_discard.add(worker_pair)
    # Blank line separates the log output of consecutive workers.
    print()

checking worker: 56ae9348322d76000c990b51

checking worker: 5b06e71668eff50001d1b795
contradition_count 5b06e71668eff50001d1b795 ('round', 'candy')
contradition_count 5b06e71668eff50001d1b795 ('red', 'cake')
contradition_count 5b06e71668eff50001d1b795 ('red', 'blackberry')
contradition_count 5b06e71668eff50001d1b795 ('red', 'pomegranate')
contradition_count 5b06e71668eff50001d1b795 ('round', 'spoon')

checking worker: 5c41047672b87000013a5f0a
contradition_count 5c41047672b87000013a5f0a ('red', 'clarinet')
contradition_count 5c41047672b87000013a5f0a ('red', 'cake')
contradition_count 5c41047672b87000013a5f0a ('red', 'blackberry')
contradition_count 5c41047672b87000013a5f0a ('round', 'spoon')

checking worker: 5c94176f0791df0014366f00
contradition_count 5c94176f0791df0014366f00 ('round', 'candy')
contradition_count 5c94176f0791df0014366f00 ('red', 'cake')
contradition_count 5c94176f0791df0014366f00 ('round', 'spoon')

checking worker: 5e2a160c77d2c70762f0e32c
contradition_count 5e2a160c7

In [6]:
# List every (worker, pair) combination that was flagged for removal.
for discarded_worker_pair in worker_pairs_to_discard:
    print(discarded_worker_pair)

('5c98f0f728cfb0001667d035', ('round', 'pomade'))
('5c41047672b87000013a5f0a', ('red', 'clarinet'))
('5e6f6b7b7a27a35206e5c30c', ('red', 'oven'))
('5e695bb0254c4a2103ed09da', ('round', 'melon'))
('5e385a484af60703465f207d', ('red', 'rhagoletis'))
('5dbb81ae6d82ff16f84bb066', ('red', 'rhagoletis'))
('5dc72b0bb98b1252dec22f61', ('red', 'coffee'))
('5e673f3fb47a554b79b02c5a', ('red', 'coffee'))
('5e67cfb2a3821007f17744c8', ('round', 'candy'))
('5b06e71668eff50001d1b795', ('red', 'cake'))
('5e6f98ded5d246555d98c6e0', ('round', 'scone'))
('no workerld', ('round', 'melon'))
('5b06e71668eff50001d1b795', ('round', 'candy'))
('5e6f8c15fe7a000009575a52', ('red', 'onion'))
('5dbb81ae6d82ff16f84bb066', ('red', 'grapefruit'))
('5e2a160c77d2c70762f0e32c', ('red', 'pomegranate'))
('5e6f6b7b7a27a35206e5c30c', ('red', 'onion'))
('unemployed', ('round', 'sundae'))
('5e673f3fb47a554b79b02c5a', ('red', 'rhagoletis'))
('5e695bb0254c4a2103ed09da', ('red', 'onion'))
('5dc72b0bb98b1252dec22f61', ('red', 'rhag

In [7]:
def get_selected_annotations(dict_list_out, target_worker_pairs):
    """Return the annotation records whose (worker, pair) key is in a target set.

    The key for a record is ``(workerid, (second, third))`` where ``second``
    and ``third`` are the second and third fields of the record's 'triple'
    string after splitting it on '-'.
    NOTE(review): the pair presumably is (property, concept) -- confirm
    against the code that writes 'triple'.

    Parameters
    ----------
    dict_list_out : iterable of dict
        Annotation records with 'workerid' and 'triple' keys.
    target_worker_pairs : container
        Collection of ``(workerid, (second, third))`` keys to select.

    Returns
    -------
    list of dict
        The matching records, in their original order.

    Raises
    ------
    IndexError
        If a 'triple' value has fewer than three '-'-separated fields.
    """
    def _worker_pair_of(annotation):
        # Build the lookup key in the same shape as the entries of the target set.
        worker = annotation['workerid']
        fields = annotation['triple'].split('-')
        return (worker, (fields[1], fields[2]))

    return [
        annotation
        for annotation in dict_list_out
        if _worker_pair_of(annotation) in target_worker_pairs
    ]
            

In [8]:
# Partition the loaded annotations by the keep/discard decision made per (worker, pair).
dict_list_clean = get_selected_annotations(dict_list_out, worker_pairs_to_keep)
dict_list_discard = get_selected_annotations(dict_list_out, worker_pairs_to_discard)

# Report sizes: all annotations, kept annotations, discarded annotations.
n_total = len(dict_list_out)
n_clean = len(dict_list_clean)
n_discard = len(dict_list_discard)
print(n_total, n_clean, n_discard)

3013 2328 685


In [9]:
# Analysis questions:
#   - which workers contradict themselves most often?
#   - which pairs lead to the most contradictions?
#   - which contradictions occur most often?

# Number of discarded pairs per worker.
workers_contradicting = Counter(worker for worker, _ in worker_pairs_to_discard)
# Number of workers whose annotations of a pair were discarded.
pairs_contridicting = Counter(pair for _, pair in worker_pairs_to_discard)
# Created here but not filled in this cell.
relation_contradictions = Counter()
    

In [12]:
# Workers ranked by how many of their pairs were discarded.
print('workers contradicting themselves')
for worker_id, n_pairs in workers_contradicting.most_common():
    print(worker_id, n_pairs)

# Pairs ranked by how many workers contradicted themselves on them.
print('pairs with many contradictions')
for pair_key, n_workers in pairs_contridicting.most_common():
    print(pair_key, n_workers)

workers contradicting themselves
no workerld 5
5b06e71668eff50001d1b795 5
5e67cfb2a3821007f17744c8 5
5e673f3fb47a554b79b02c5a 4
5c41047672b87000013a5f0a 4
5e6f8c15fe7a000009575a52 3
unemployed 3
5dc72b0bb98b1252dec22f61 3
5e385a484af60703465f207d 3
5d7a42caf0a2960016536deb 3
5c94176f0791df0014366f00 3
5e695bb0254c4a2103ed09da 3
5c98f0f728cfb0001667d035 3
5e6f88ac12fd1f000cdf2403 2
5dbb81ae6d82ff16f84bb066 2
5e6f6b7b7a27a35206e5c30c 2
5e6f98ded5d246555d98c6e0 2
5d3337693d0a510019bffca0 2
5e5d84d591b5f0094e38ec2a 2
5e6b7cde2691b2126162ee61 1
5bdc4a3b0f86a60001be614a 1
5e6f8d129cfd4d53fc092d1b 1
5e67d9ed33e6a60542f4dfd7 1
5e6a63a7d6d433011c20639b 1
5cb72d295a63a200170af08f 1
5e6f89993f2dc053a390dad6 1
5e6b9983d4948e1491e34553 1
5e2a160c77d2c70762f0e32c 1
5a19d62dab721b0001ef91e6 1
pairs with many contradictions
('red', 'grapefruit') 6
('red', 'onion') 5
('round', 'melon') 5
('round', 'spoon') 5
('red', 'rhagoletis') 4
('round', 'scone') 4
('red', 'pomegranate') 4
('red', 'blackberry') 4
(