# Exlcude confusing pairs prior to annotation

## 1 Prepare annotations

In [9]:
import glob 
import os 
from collections import defaultdict

from utils import read_group, read_csv, to_csv

def load_pairs(exp_group):
    
    all_pairs = []
    collections = ['perceptual', 'activities', 'complex', 'parts']
    prop_coll_dict = read_group(exp_group)
    
    for collection in collections:
        f = f'../data/{collection}.csv'
        dict_list = read_csv(f)
        for d in dict_list:
            prop = d['property']
            if prop in prop_coll_dict:
                all_pairs.append(d)
    return all_pairs

def shrink_dict(d, keys):
    current_keys = list(d.keys())
    for k in current_keys:
        if k not in keys:
            d.pop(k)
    return d
            

def to_file(all_pairs, exp_group):
    header = ['property', 'lemma']
    all_pairs_clean = [shrink_dict(d, header) for d in all_pairs]
    f = f'../pair_filtering/{exp_group}.csv'
    to_csv(f, all_pairs_clean)
    
    
def divide_to_file(exp_group, annotators):
    all_pairs = load_pairs(exp_group)
 
    
    exp_dir = f'../data_pair_filtering/to_annotate/{exp_group}/'
    if not os.path.isdir(exp_dir):
        os.mkdir(exp_dir)
    
    prop_dict = defaultdict(list)
    for d in all_pairs:
        header = ['property', 'lemma']
        d_clean = shrink_dict(d, header) 
        prop = d_clean['property']
        prop_dict[prop].append(d_clean)

    for prop, dict_list in prop_dict.items():
        for annotator in annotators:
            f = f'{exp_dir}/{prop}-{annotator}.csv'
            to_csv(f, dict_list)
        

In [10]:
# Generate annotation sheets - already done 
#exp_group = 'experiment3'   
#annotators = ['pia', 'antske']
#divide_to_file(exp_group, annotators)


## 2 Annotate via Google sheets

Annotation instructions, links and status overview are [here](https://docs.google.com/document/d/1tupla_Jhr1hXt0CEle0Rji355zZpBZeGXugPMcKRjLM/edit?usp=sharing). 

Once annotationes are down, download the sheets as .csv files and store in `../pair_filtering/annotated/exp_name/property.csv`

## 3 Analyze agreement 

In [8]:
import os
from utils import read_group, read_csv, to_csv


def load_annotations_file(f):
    print(f)
    dict_list = read_csv(f)
    return dict_list


def analyze_annotations(exp_group, prop, name1, name2):
    
    dir_annotated = f'../data_pair_filtering/annotated/{exp_group}'
  
    f = f'{dir_annotated}/{prop}-{name1}.csv'
    dict_list_1 = load_annotations_file(f)
    f = f'{dir_annotated}/{prop}-{name2}.csv'
    dict_list_2 = load_annotations_file(f)
    print(dict_list_2[0].keys())
    data_annotated = []
    
    for d_1, d_2 in zip(dict_list_1, dict_list_2):
        new_d = dict()
        concept = d_1['lemma']
        if concept != d_2['lemma']:
            print('problem: data do not match')
        l_1 = d_1['decision']
        l_2 = d_2['decision']
        new_d['lemma'] = concept
        new_d[name1] = l_1
        new_d[name2] = l_2
        if l_1 != l_2:
            decision = 'exclude1'
            #contradictions.append(new_d)
        elif l_1 == l_2 == 'exclude2':
            decision = 'exclude'
        elif l_1 == l_2 == 'include':
            decision = 'include'
        new_d['decision'] = decision
        data_annotated.append(new_d)
            
    include = [d for d in data_annotated if d['decision'] == 'include'] 
    exclude1 = [d for d in data_annotated if d['decision'] == 'exclude1']
    exclude_agree = [d for d in data_annotated if d['decision'] == 'exclude2'] 
    print(f'Total number of concepts: {len(data_annotated)}')
    print(f'Total number included by both: {len(include)}')
    print(f'Total number excluded by one person: {len(exclude1)}')
    print(f'Total number of agreements on exlcude: {len(exclude_agree)}')
    if exclude_agree:
        print('Agreed on excluding:')
        for d in exclude_agree:
            print(d['lemma'])
    return data_annotated


def aggregated_to_file(prop, data_annotated, exp_group):
    
    dir_path = f'../pair_filtering/aggregated/{exp_group}'
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    
    file_path = f'{dir_path}/{prop}.csv'
    to_csv(file_path, data_annotated)

In [9]:
 
exp_group = 'experiment3'
name1 = 'antske'
name2 = 'pia'
prop = 'blue'
data_annotated = analyze_annotations(exp_group, prop, name1, name2)
print()
aggregated_to_file(prop, data_annotated, exp_group)

../data_pair_filtering/annotated/experiment3/blue-antske.csv
../data_pair_filtering/annotated/experiment3/blue-pia.csv
include include
include include
include include
include include
include include
include include
include include
include include
include exclude
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
include exclude
include include
include include
include include
include include
include include
include include
include include
include include
include include
include include
i

FileNotFoundError: [Errno 2] No such file or directory: '../pair_filtering/aggregated/experiment3'

In [21]:
name1 = 'antske'
name2 = 'pia'
prop = 'used_in_cooking'
data_annotated = analyze_annotations(exp_group, prop, name1, name2)
aggregated_to_file(prop, data_annotated, exp_group)

../pair_filtering/annotated/experiment3/used_in_cooking-antske.csv
../pair_filtering/annotated/experiment3/used_in_cooking-pia.csv
Total number of concepts: 179
Total number included by both: 157
Total number excluded by one person: 22
Total number of agreements on exlcude: 0
