# Create crowd input workflow


1.) Create all questions of a run

- this should not be changed (only created once)
- if the question formulations change, we create a new run
- identifiers should not change
- create unique ids here
- we draw from this selection for creating batches 


2.) Create batch

- draw from all questions of a run
- only select properties in the experiment group
- always make sure that a batch contains the full set of questions of a pair 
- add prolific url 
- check prolific_input for already posed pairs
- write to file without header


In [None]:
# match question ids - done
# to clean up the data, change the ids in the questions file to the input ids - done


# then check how much has been annotated



In [2]:
import os
#os.listdir('../task_input/prolific_input/run3-group_experiment1/')

In [82]:
from utils import read_csv, to_csv
from utils import sort_by_key
from utils import read_group

from random import shuffle, choice
import os

def read_input(run, experiment_name):
    all_input_dicts = []
    dir_path = f'../task_input/prolific_input/run{run}-group_{experiment_name}/'
    header_path = f'{dir_path}header.txt'
    
    with open(header_path) as infile:
        header = infile.read().split(',')
    
    filepaths = os.listdir(dir_path)
    for f in filepaths:
        full_path = f'{dir_path}{f}'
        input_dicts = read_csv(full_path, header = header)
        all_input_dicts.extend(input_dicts)
    return all_input_dicts

def collect_not_annotated(input_dicts, question_dicts):
    
    questions_not_annotated = []
    input_by_quid = sort_by_key(input_dicts, ['quid'])
    for d in question_dicts:
        quid = d['quid']
        if quid not in input_by_quid:
            questions_not_annotated.append(d)
    return questions_not_annotated
        
    
def get_annotated_questions(input_dicts, question_dicts):
    
    input_by_quid = sort_by_key(input_dicts, ['quid'])
    questions_by_quid = sort_by_key(question_dicts, ['quid'])
    questions_annotated = []
    for quid in input_by_quid:
        if quid in questions_by_quid:
            question = questions_by_quid[quid][0]
            questions_annotated.append(question)
    return questions_annotated
    

def collect_invalid(input_dicts, question_dicts):
    
    questions_by_pair = sort_by_key(question_dicts, ['property', 'concept'])
    questions_annotated = get_annotated_questions(input_dicts, question_dicts)
    questions_anntotated_by_pair = sort_by_key(questions_annotated, ['property', 'concept'])
    invalid_annotations = []
    
    for pair, questions_annotated in questions_anntotated_by_pair.items():
        questions = questions_by_pair[pair]
        if len(questions) != len(questions_annotated):
            #print('missing annotations for pair:', pair, len(questions), len(questions_annotated))
            invalid_annotations.extend(questions)
    return invalid_annotations

def get_available_questions(input_dicts, question_dicts):
    
    questions_for_annotation = []
    questions_not_annotated = collect_not_annotated(input_dicts, question_dicts)
    print('not annotated yet:', len(questions_not_annotated))
    invalid_annotations = collect_invalid(input_dicts, question_dicts)
    print('not valid', len(invalid_annotations))
    
    not_annotated_pair = sort_by_key(questions_not_annotated, ['property', 'concept'])
    invalid_pair = sort_by_key(invalid_annotations, ['property', 'concept'])
    
    for pair, questions in not_annotated_pair.items():
        if pair in invalid_pair:
            questions_for_annotation.extend(invalid_pair[pair])
        else:
            questions_for_annotation.extend(questions)
            
    wrong_n_questions = []
    available_by_pair = sort_by_key(available_questions, ['property', 'concept'])
    for pair, questions in available_by_pair.items():
        if len(questions) > 10 or len(questions) < 3:
            wrong_n_questions.append((n, pair))
    assert len(wrong_n_questions) == 0, 'Number of questions per pair not correct.'
    return questions_for_annotation
    

def get_check_and_test():
    checks = read_csv('../questions/checks.csv')
    tests = read_csv('../questions/tests.csv')
    
    rand_check = choice(checks)
    rand_test = choice(tests)
    tests_checks = [rand_check, rand_test]
    for d in tests_checks:
        if '' in d:
            d.pop('')
    return tests_checks

def create_new_batch(questions_to_annotate, n_qu = 70):
    batch = []
    properties = set()
    # shuffle questions:
    shuffle(questions_to_annotate)
    questions_by_pair = sort_by_key(questions_to_annotate, ['property', 'concept'])
    available_properties = set([p.split('-')[0] for p in questions_by_pair.keys()])

    for pair, questions in questions_by_pair.items():
        prop = pair.split('-')[0]
        if len(batch) < n_qu:
            if prop not in properties:
                print('found a new one:', prop, len(batch))
                batch.extend(questions)
                properties.add(prop)
            else:
                props_not_used = available_properties.difference(properties)
                print('properties not used:', len(props_not_used), len(batch))
                if len(props_not_used) > 0:
                    continue
                else:
                    batch.extend(questions)
                    properties.add(prop)
                    print('no more properties, adding quetions:', len(questions))
        else:
            print('found enough questions', len(batch))
            break

    return batch  


def batch_to_file(batch, url, run):
    
    header = ['quid', 'question', 'example_pos', 'example_neg']
    header_new = ['quid', 'description', 'exampleTrue', 'exampleFalse', 'triple', 'url']
    new_dics = []
    for d in batch:
        triple = f"{d['relation']}-{d['property']}-{d['concept']}"
        new_d = dict()
        for h in header:
            hew_d[h] = d[h]
        new_d['triple'] = triple
        new_d['url'] = url
        new_dicts.append(new_d)
    pass
    
    
    



if __name__ == '__main__':
    run = 3
    experiment_name = 'experiment1'
    input_dicts = read_input(run, experiment_name)
    question_path = f'../questions/run{run}-all-restricted_True.csv'
    question_dicts = read_csv(question_path)
    selected_properties = read_group(experiment_name)
    test_check_questions = get_check_and_test()

    available_questions = get_available_questions(input_dicts, question_dicts)

    questions_in_selection = [d for d in available_questions \
                              if d['property'] in selected_properties]


    print('questions available for annotation:', len(questions_to_annotate))
    print('questions for current experiment:', len(questions_in_selection))
    new_batch = create_new_batch(questions_in_selection, n_qu = 70)
    print(f'created new batch with {len(new_batch)} questions')
    new_batch.extend(test_check_questions)
    print(f'added test and check, {len(new_batch)} questions in total.')
    print(new_batch[0].keys())
    to_csv('test_batch.csv', new_batch)

not annotated yet: 29064
not valid 640
questions available for annotation: 29704
questions for current experiment: 2384
found a new one: round 0
found a new one: roll 10
properties not used: 1 20
properties not used: 1 20
found a new one: red 20
properties not used: 0 30
no more properties, adding quetions: 10
properties not used: 0 40
no more properties, adding quetions: 10
properties not used: 0 50
no more properties, adding quetions: 10
properties not used: 0 60
no more properties, adding quetions: 10
found enough questions 70
created new batch with 70 questions
added test and check, 72 questions in total.
odict_keys(['property', 'label', 'certainty', 'concept', 'collection', 'sources', 'quid', 'relation', 'question', 'prop_pos', 'concept_pos', 'prop_neg', 'concept_neg', 'example_pos', 'example_neg'])
