# Exclude confusing pairs prior to annotation

## 1 Prepare annotations

In [7]:
import glob 
import os 
from collections import defaultdict

from utils import read_group, read_csv, to_csv

def load_pairs(exp_group):
    """Collect all rows whose property belongs to the given experiment group.

    Reads ``../data/<collection>.csv`` for each of the four fixed collections
    and keeps every row whose ``'property'`` value is contained in the object
    returned by ``read_group(exp_group)``.

    :param exp_group: group identifier handed to ``read_group``
    :return: list of the matching rows, in collection order and file order
    """
    group_props = read_group(exp_group)
    selected = []

    for name in ('perceptual', 'activities', 'complex', 'parts'):
        rows = read_csv(f'../data/{name}.csv')
        selected.extend(row for row in rows if row['property'] in group_props)

    return selected

def shrink_dict(d, keys):
    """Strip every entry from ``d`` whose key is not listed in ``keys``.

    The dictionary is modified in place and the very same object is returned.

    :param d: dictionary to reduce
    :param keys: container of the keys that should survive
    :return: ``d`` itself, now holding only the requested keys
    """
    # Determine the victims first so the dict is not resized while iterating.
    unwanted = [key for key in d if key not in keys]
    for key in unwanted:
        del d[key]
    return d
            

def to_file(all_pairs, exp_group):
    """Write the property/lemma columns of all pairs to one csv file.

    Every dict in ``all_pairs`` is reduced in place (via ``shrink_dict``) to
    the keys ``'property'`` and ``'lemma'`` before being passed to ``to_csv``.

    NOTE(review): the target is ``../pair_filtering/`` while the other
    functions in this file use ``../data_pair_filtering/`` - confirm this is
    intended.

    :param all_pairs: list of pair dicts
    :param exp_group: experiment group name, used as the file name
    """
    wanted = ['property', 'lemma']
    reduced = []
    for pair in all_pairs:
        reduced.append(shrink_dict(pair, wanted))
    to_csv(f'../pair_filtering/{exp_group}.csv', reduced)
    
    
def divide_to_file(exp_group, annotators):
    """Write one annotation sheet per property and annotator.

    Loads all pairs of the experiment group, reduces each pair (in place) to
    the keys ``'property'`` and ``'lemma'``, groups the pairs by property and
    writes each group to
    ``../data_pair_filtering/to_annotate/<exp_group>/<property>-<annotator>.csv``
    once for every annotator.

    :param exp_group: experiment group name (passed to ``load_pairs``)
    :param annotators: iterable of annotator names used in the file names
    """
    all_pairs = load_pairs(exp_group)

    exp_dir = f'../data_pair_filtering/to_annotate/{exp_group}/'
    # makedirs creates missing parent directories too and does not fail if
    # the directory already exists (os.mkdir would raise in both situations).
    os.makedirs(exp_dir, exist_ok=True)

    header = ['property', 'lemma']
    prop_dict = defaultdict(list)
    for d in all_pairs:
        d_clean = shrink_dict(d, header)
        prop_dict[d_clean['property']].append(d_clean)

    for prop, dict_list in prop_dict.items():
        for annotator in annotators:
            # join avoids the doubled slash caused by exp_dir's trailing '/'
            f = os.path.join(exp_dir, f'{prop}-{annotator}.csv')
            to_csv(f, dict_list)
        

In [10]:
# Generate annotation sheets - already done 
#exp_group = 'experiment3'   
#annotators = ['pia', 'antske']
#divide_to_file(exp_group, annotators)


## 2 Annotate via Google sheets

Annotation instructions, links and status overview are [here](https://docs.google.com/document/d/1tupla_Jhr1hXt0CEle0Rji355zZpBZeGXugPMcKRjLM/edit?usp=sharing). 

Once the annotations are done, download the sheets as .csv files and store them as `../data_pair_filtering/annotated/<exp_group>/<property>-<annotator>.csv` (the location the analysis code below reads from).

## 3 Analyze agreement 

In [8]:
import os
from utils import read_group, read_csv, to_csv


def load_annotations_file(f):
    """Print the path ``f`` and return what ``read_csv`` yields for it."""
    print(f)
    return read_csv(f)


def analyze_annotations(exp_group, prop, name1, name2):
    """Compare two annotators' sheets for one property and aggregate them.

    Reads ``../data_pair_filtering/annotated/<exp_group>/<prop>-<name>.csv``
    for both annotators and pairs the rows up by position. For every pair a
    new dict is built with the keys ``'lemma'``, ``name1``, ``name2`` and
    ``'decision'``, where ``'decision'`` is

    * ``'exclude1'`` - the two labels differ,
    * ``'exclude2'`` - both labels are ``'exclude'``,
    * ``'include'``  - both labels are ``'include'``,
    * ``'unknown'``  - both annotators gave the same label, but it is neither
      ``'include'`` nor ``'exclude'`` (a warning is printed).

    A summary is printed to stdout.

    :param exp_group: experiment group name (directory name)
    :param prop: property name (first part of the file names)
    :param name1: name of the first annotator
    :param name2: name of the second annotator
    :return: list of the aggregated dicts, one per compared row
    """
    dir_annotated = f'../data_pair_filtering/annotated/{exp_group}'

    f = f'{dir_annotated}/{prop}-{name1}.csv'
    dict_list_1 = list(load_annotations_file(f))
    f = f'{dir_annotated}/{prop}-{name2}.csv'
    dict_list_2 = list(load_annotations_file(f))
    data_annotated = []

    # zip() stops at the shorter sheet; make a silent truncation visible.
    if len(dict_list_1) != len(dict_list_2):
        print(f'problem: sheets differ in length '
              f'({len(dict_list_1)} vs {len(dict_list_2)} rows)')

    for d_1, d_2 in zip(dict_list_1, dict_list_2):
        new_d = dict()
        concept = d_1['lemma']
        if concept != d_2['lemma']:
            print('problem: data do not match')
        l_1 = d_1['decision']
        l_2 = d_2['decision']
        new_d['lemma'] = concept
        new_d[name1] = l_1
        new_d[name2] = l_2
        if l_1 != l_2:
            decision = 'exclude1'
        elif l_1 == 'exclude':
            decision = 'exclude2'
        elif l_1 == 'include':
            decision = 'include'
        else:
            # Without this branch `decision` would be unbound on the first
            # row or silently carry over the previous row's value.
            print(f'problem: unknown label {l_1!r} for {concept}')
            decision = 'unknown'
        new_d['decision'] = decision
        data_annotated.append(new_d)

    include = [d for d in data_annotated if d['decision'] == 'include']
    exclude1 = [d for d in data_annotated if d['decision'] == 'exclude1']
    exclude_agree = [d for d in data_annotated if d['decision'] == 'exclude2']
    print(f'Total number of concepts: {len(data_annotated)}')
    print(f'Total number included by both: {len(include)}')
    print(f'Total number excluded by one person: {len(exclude1)}')
    print(f'Total number of agreements on exlcude: {len(exclude_agree)}')
    if exclude_agree:
        print('Agreed on excluding:')
        for d in exclude_agree:
            print(d['lemma'])
    return data_annotated


def aggregated_to_file(prop, data_annotated, exp_group):
    """Write the aggregated annotations of one property to a csv file.

    The target is ``../data_pair_filtering/aggregated/<exp_group>/<prop>.csv``;
    the directory is created if it does not exist yet.

    :param prop: property name, used as the file name
    :param data_annotated: list of dicts handed to ``to_csv``
    :param exp_group: experiment group name, used as the directory name
    """
    dir_path = f'../data_pair_filtering/aggregated/{exp_group}'
    # makedirs creates missing parent directories too and does not fail if
    # the directory already exists (os.mkdir would raise in both situations).
    os.makedirs(dir_path, exist_ok=True)

    file_path = f'{dir_path}/{prop}.csv'
    to_csv(file_path, data_annotated)
    
    
def replace_concept(data_annotated, include, replace):
    """Force the decisions for 'potato' and its variants (juicy set only).

    Rows whose lemma is 'tater' or 'spud' get the decision 'exclude', the row
    for 'potato' gets 'include'. Each changed row is printed. The rows are
    modified in place and nothing is returned.

    NOTE(review): the parameters ``include`` and ``replace`` are not used;
    the lemmas are hard-coded.

    :param data_annotated: list of dicts with the keys 'lemma' and 'decision'
    :param include: unused
    :param replace: unused
    """
    keep = 'potato'
    variants = ('tater', 'spud')

    lemmas = [row['lemma'] for row in data_annotated]
    if keep in lemmas:
        print('found', keep)

    for row in data_annotated:
        lemma = row['lemma']
        if lemma in variants:
            forced = 'exclude'
        elif lemma == keep:
            forced = 'include'
        else:
            continue
        row['decision'] = forced
        print(row)
            
            
def add_concepts(data_annotated, concepts_to_include, name1, name2):
    """Append manually selected concepts to the aggregated annotations.

    For every lemma in ``concepts_to_include`` that does not yet occur in
    ``data_annotated`` a new row is appended with both annotator columns set
    to ``'add'`` and the decision ``'added'``. Lemmas that are already present
    (or listed twice) are reported on stdout and skipped. ``data_annotated``
    is modified in place; nothing is returned.

    :param data_annotated: list of dicts with at least the key ``'lemma'``
    :param concepts_to_include: iterable of lemma strings to add
    :param name1: column name of the first annotator
    :param name2: column name of the second annotator
    """
    # Membership must be tested against the lemmas that are already
    # annotated, not against the list of concepts that should be added.
    existing = {d['lemma'] for d in data_annotated}
    for c in concepts_to_include:
        if c not in existing:
            d = dict()
            d['lemma'] = c
            d[name1] = 'add'
            d[name2] = 'add'
            d['decision'] = 'added'
            data_annotated.append(d)
            existing.add(c)
        else:
            print('Already in included concepts:', c)
        

# Aggregation and replacements

### juicy (special case)

- replaced different variants of potato: potato should stay, tater and spud should be excluded

### green

- aggregated and stored

### swim

- aggregated and stored


In [12]:
# Code

# Settings for this run: experiment group, the two annotator names and the
# property whose sheets are compared.
exp_group = 'experiment3'
name1 = 'antske'
name2 = 'pia'
prop = 'green'

# No manually added concepts for this property.
concepts_to_include = []

# Compare both annotators' sheets and print the agreement summary.
data_annotated = analyze_annotations(exp_group, prop, name1, name2)

#add_concepts(data_annotated, concepts_to_include, name1, name2)
# Store the aggregated decisions under aggregated/<exp_group>/<prop>.csv.
aggregated_to_file(prop, data_annotated, exp_group)

../data_pair_filtering/annotated/experiment3/green-antske.csv
../data_pair_filtering/annotated/experiment3/green-pia.csv
Total number of concepts: 175
Total number included by both: 158
Total number excluded by one person: 10
Total number of agreements on exlcude: 7
Agreed on excluding:
plumeria
lewisia
palaquium
manilkara
platanthera
colocasia
teucrium


### cold
-  discuss before aggregating

In [6]:
# code cold

# cold
# Settings for this run: experiment group, the two annotator names and the
# property whose sheets are compared.
exp_group = 'experiment3'
name1 = 'antske'
name2 = 'pia'
prop = 'cold'

# No manually added concepts for this property.
concepts_to_include = []

# Compare both annotators' sheets and print the agreement summary.
data_annotated = analyze_annotations(exp_group, prop, name1, name2)

#add_concepts(data_annotated, concepts_to_include, name1, name2)

# Writing the aggregated file is disabled here: this property is to be
# discussed before aggregating.
#aggregated_to_file(prop, data_annotated, exp_group)

../data_pair_filtering/annotated/experiment3/cold-antske.csv
../data_pair_filtering/annotated/experiment3/cold-pia.csv
Total number of concepts: 121
Total number included by both: 105
Total number excluded by one person: 12
Total number of agreements on exlcude: 4
Agreed on excluding:
iliamna
mentzelia
crack
perithecia


### wings


- Comment Antske: Done:
I think there are too many cars or car-like things in the corpus and more 'borderline' or 'odd' vehicles would be nice.
I also think we want to include Penguin and Platypus, possibly other animals. Insects are probably more interesting than birds.

* aggregate
* add words 
* make sure these words can be used by the sampling algorithm
* Suggestion for added words below - we can add more

In [11]:
# Wings
# Settings for this run: experiment group, the two annotator names and the
# property whose sheets are compared.
exp_group = 'experiment3'
name1 = 'antske'
name2 = 'pia'
prop = 'wings'

# check car brands - possibly exclude
# different words for airplane (possibly add after annotation)
# Lemmas to add by hand on top of the annotated ones.
concepts_to_include = ['kite', 'helicopter', 'airplane', 'zeppelin', 
                       'balloon', 'parachute', 'hovercraft',
                   'penguin', 'ostrich', 'chicken', 'emu', 'kiwi',
                   'platypus', 
                   'bee', 'dragonfly', 'ant', 'ladybird', 'butterfly']
# Show the manual additions, surrounded by blank lines.
print()
print(' '.join(concepts_to_include))
print()
# Compare both annotators' sheets and print the agreement summary.
data_annotated = analyze_annotations(exp_group, prop, name1, name2)

# Extend the aggregated rows (in place) with the manual additions.
add_concepts(data_annotated, concepts_to_include, name1, name2)

# Store the result under aggregated/<exp_group>/<prop>.csv.
aggregated_to_file(prop, data_annotated, exp_group)


kite helicopter airplane zeppelin balloon parachute hovercraft penguin ostrich chicken emu kiwi platypus bee dragonfly ant ladybird butterfly

../data_pair_filtering/annotated/experiment3/wings-antske.csv
../data_pair_filtering/annotated/experiment3/wings-pia.csv
Total number of concepts: 147
Total number included by both: 101
Total number excluded by one person: 27
Total number of agreements on exlcude: 19
Agreed on excluding:
procellariidae
strigidae
cotingidae
timaliidae
alaudidae
apodidae
laridae
fringillidae
muscicapa
caprimulgus
phalacrocorax
puffinus
calidris
motacilla
emberizidae
haliaeetus
charadrius
paridae
icteridae


## Get lexical data for additional concepts

(1) add concepts to feature_data/data/manually_included_after_centroid/properties_selected_run5.csv

(2) In feature_data: cd scripts_process_raw_data, run add_preselected_concepts_after_centroid_step.py [collection]

(3) copy feature_data/data/concepts_additional_info_manual_run5_pilot to SPT_annotation/data_all_candidates/. 



## Merge new sets with already created question sets

In SPT_annotation: 

(1) run replace_excluded_concepts.py [property] [collection] [run] 

(2) run update_dataset.py [run] [prop] [run_new_label]  (use new label if you want to keep runs separate)

(3) run create_questions.py [run_new_label] (you will have to copy the template file from the previous run and use the new run label)

(4) Merge old question file with new question file (make backups of both before)

In [13]:
# merge the question files

import csv

In [14]:
# Question file that will receive the merged data, and the new file whose
# rows are appended to it.
file_original = '../questions/run5_pilot-all-restricted_True.csv'
file_new = '../questions/run5_part2-all-restricted_True.csv'

# write original data to a new file:
# NOTE: a later cell overwrites file_original with the merged rows, so
# re-running this cell after the merge replaces the backup with merged data.
file_original_backup = '../questions/run5_part1-all-restricted_True.csv'
with open(file_original) as infile:
    original = infile.read()
    
with open(file_original_backup, 'w') as outfile:
    outfile.write(original)

In [20]:
# load original data as dict_list
# (files handed to the csv module are opened with newline='' so that the
# module does its own newline handling, e.g. inside quoted fields)

with open(file_original, newline='') as infile:
    dict_list_original = list(csv.DictReader(infile, delimiter = '\t'))
    
# load new file

with open(file_new, newline='') as infile:
    dict_list_new = list(csv.DictReader(infile, delimiter = '\t'))
    
print(len(dict_list_original), len(dict_list_new))

# add new to original
# NOTE: re-running this cell after the merged file has been written appends
# the new rows a second time.
dict_list_original.extend(dict_list_new)
print(len(dict_list_original))

6666 5138
11804


In [21]:
# write the extended data back to the original path

# Column order is taken from the first row; the new file must have the same
# columns, otherwise DictWriter raises a ValueError for the unexpected keys.
header = dict_list_original[0].keys()
# newline='' prevents the csv writer's '\r\n' line endings from being
# translated again (which yields '\r\r\n' on Windows).
with open(file_original, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames = header, delimiter = '\t')
    writer.writeheader()
    for d in dict_list_original:
        writer.writerow(d)