# Create crowd input workflow


1.) Create all questions of a run

- this should not be changed (only created once)
- if the question formulations change, we create a new run
- identifiers should not change
- create unique ids here
- we draw from this selection for creating batches 


2.) Create batch

- draw from all questions of a run
- only select properties in the experiment group
- always make sure that a batch contains the full set of questions of a pair 
- add prolific url 
- check prolific_input for already posed pairs
- write to file without header


In [189]:
import csv
from collections import defaultdict
import random
import uuid
import os

In [154]:


#os.listdir('../examples/')

In [313]:
# utils 

import csv
from collections import defaultdict
import random
import os

def read_csv(filepath):

    # check for separator
    with open(filepath) as infile:
        lines = infile.read().split('\n')
    if '\t' in lines[0]:
        sep = '\t'
    else:
        sep = ','
    with open(filepath) as infile:
        dict_list = list(csv.DictReader(infile, delimiter = sep))
    return dict_list

def sort_by_key(data_dict_list, keys):

    sorted_dict = defaultdict(list)
    for d in data_dict_list:
        if len(keys) == 1:
            key = keys[0]
            sortkey = d[key].strip()
        else:
            sortkeys = []
            for key in keys:
                sortkey = d[key].strip()
                sortkeys.append(sortkey)
            sortkey = '-'.join(sortkeys)
        sorted_dict[sortkey].append(d)
    return sorted_dict

def read_examples(relations):
    relation_examples_dict = dict()
    for rel in relations:
        filepath = f'../examples/{rel}-pairs.csv'
        dict_list = read_csv(filepath)
        sorted_by_collection = sort_by_key(dict_list, ['collection'])
        relation_examples_dict[rel] = sorted_by_collection
    return relation_examples_dict


def read_pairs(collection):
    filepath = f'../data/{collection}.csv'
    dict_list = read_csv(filepath)
    dict_list_by_prop = sort_by_key(dict_list, ['property'])
    return dict_list_by_prop

    def read_property_info():
    filepath = f'../data/property_info.csv'
    dict_list = read_csv(filepath)
    prop_info_dict = sort_by_key(dict_list, ['property'])
    return prop_info_dict



def read_template(run):
    filepath = f'../templates/template-run{run}.csv'
    dict_list = read_csv(filepath)
    collections = ['perceptual', 'perceptual_scale', 'complex',\
           'complex_scale','parts', 'activities']

    collection_relation_question_dict = dict()
    level_relation_dict = defaultdict(set)
    for d in dict_list:
        target_collections = [c for c in collections if d[c].strip("'") == '1']
        relation = d['relation']
        level = d['level']
        if level != '':
            level = int(level)
            level_relation_dict[level].add(relation)
        for c in target_collections:
            collection_relation_question_dict[(c, relation)] = d['question']
    assert len(collection_relation_question_dict) == 60, 'not enough relations'
    assert len(level_relation_dict[1]) == 6, 'wrong mapping level 1'
    assert len(level_relation_dict[2]) == 2, 'wrong mapping level 2'
    assert len(level_relation_dict[3]) == 4, 'wrong mapping level 3'
    return collection_relation_question_dict, level_relation_dict

def capitalize(word):
    if len(word) > 1:
        cap = word[0].upper()+word[1:]
    else:
        cap = word
    return cap

def verb_agreement(prop):

    prop_s = f'{prop}s'
    prop_ing = f'{prop}ing'

    if prop == 'lay_eggs':
        prop_s = 'lays eggs'
        prop_ing = 'laying eggs'
    elif prop == 'fly':
        prop_s = 'flies'
    elif prop == 'swim':
        prop_ing = 'swimming'
    return prop_s, prop_ing

def create_question(prop, concept, question_temp, category):

    prop_s, prop_ing = verb_agreement(prop)
    prop_cap = capitalize(prop)
    concept_cap = capitalize(concept)
    if prop.startswith('made_of'):
        link_sg = 'is'
        link_inf = 'be'
        link = 'are'
        link_ing = 'being'
        clink_ing = 'Being'
        part = 'materials'
        lneg = 'is not'
    else:
        link_sg = 'has'
        link_inf = 'have'
        link = 'have'
        link_ing = 'having'
        clink_ing = 'Having'
        part = 'parts or components'
        lneg = 'does not have'

    replacements = [
                    ('[X]', prop),
                    ('[CX]', prop_cap),
                    ('[Xs]', prop_s),
                    ('[Xing]', prop_ing),
                    ('[Y]', concept),
                    ('[CY]', concept_cap),
                    ('[L]', link),
                    ('[Linf]', link_inf),
                    ('[Lsg]', link_sg),
                    ('[Ling]', link_ing),
                    ('[PART]', part),
                    ('[Lneg]', lneg),
                    ('[CLing]', clink_ing),
                    ('[category]', category),
                    ('[link]', link)
                    ]

    for char, replace_form in replacements:
        question_temp = question_temp.replace(char, replace_form)
        question_temp = question_temp.replace('???', '"')
    question = question_temp
    return question

def get_levels(label, certainty, restrict):
    if restrict == True:
        if certainty == 'not_certain':
            levels = [1, 2, 3]
        else:
            if label == 'pos':
                levels = [1, 2]
            else:
                levels = [3]
    else:
        levels = [1, 2, 3]
    return levels

def create_example(examples, question_temp, prop_info_dict):
    rand_index = random.randint(0, len(examples)-1)
    rand_example = examples[rand_index]
    if 'collection' in rand_example:
        rand_example.pop('collection')
    labels = ['pos', 'neg']
    for l in labels:
        prop = rand_example[f'prop_{l}']
        concept = rand_example[f'concept_{l}']
        category = prop_info_dict[prop][0]['category']
        example_qu = create_question_string(prop, concept, question_temp, category)
        rand_example[f'example_{l}'] = example_qu
    return rand_example

def create_example_creative(examples, question_temp, prop_info_dict):
    
    creative_examples = []
    creative_ex_d = defaultdict(list)
    labels = ['pos', 'neg']
    for l in labels:
        for ex in examples:
            creative_ex_d['proper']
        

def read_group(exp_name):
    collections = ['perceptual', 'activities', 'parts', 'complex']
    prop_coll_dict = dict()
    for c in collections:
        filepath = f'../experiment_groups/{c}-group{exp_name}.txt'
        if os.path.isfile(filepath):
            with open(filepath) as infile:
                properties = infile.read().strip().split('\n')
            for prop in properties:
                prop_coll_dict[prop] = c
    return prop_coll_dict


    

In [316]:
class Pairs:

    def __init__(self, run, restrict = True):
        self.run = run
        self.collections = ['perceptual', 'activities', 'complex', 'parts']
        self.data_dicts = self.get_data_dicts()
        self.question_templates, self.level_dict = read_template(run)
        self.relations = [rel for rel_list in self.level_dict.values() for rel in rel_list] 
        self.prop_info = prop_info_dict = read_property_info()
        self.relation_examples_dict = read_examples(relations)
        self.restrict = restrict 
        self.questions = self.get_questions()
        
    def get_data_dicts(self):
        data_dicts = []
        for coll in self.collections:
            prop_data_dicts = read_pairs(coll)
            for prop, dicts in prop_data_dicts.items():
                for d in dicts:
                    d['concept'] = d['lemma']
                    d.pop('lemma')
                    d['collection'] = coll
                    d['sources'] = d['sources_str']
                    d.pop('sources_str')
                data_dicts.extend(dicts)
        return data_dicts
    
    def get_questions(self):
        questions = []
        for d in self.data_dicts:
            certainty = d['certainty']
            label = d['label']
            levels = get_levels(label, certainty, self.restrict)
            for l in levels:
                relations = self.level_dict[l]
                for rel in relations:
                    coll = d['collection']
                    coll_rel = (coll, rel)
                    if coll_rel in self.question_templates:
                        q_d = dict()
                        q_d.update(d)
                        prop = d['property']
                        qu_temp = self.question_templates[coll_rel]
                        cat = self.prop_info[prop][0]['category']
                        q_d['quid']  = uuid.uuid4()
                        q_d['question'] = create_question(prop, d['concept'], qu_temp, cat)
                        examples = relation_examples_dict[rel][coll]
                        example_dict = create_example(examples, qu_temp, self.prop_info)
                        q_d.update(example_dict)
                        questions.append(q_d)
        return questions
    
    def to_file(self):
        #questions/run_3-all-restrict_True.csv 
        filepath = f'../questions/run{self.run}-all-restricted_{self.restrict}.csv'
        fieldnames = self.questions[0].keys()
        with open(filepath, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames = fieldnames)
            writer.writeheader()
            for d in self.questions:
                writer.writerow(d)
            


run = 3
restrict = True
pairs =  Pairs(run,  restrict = True)
pairs.to_file()
print(len(pairs.questions))

29834


In [297]:
prop = 'swim'
concept = 'penguin'
scale = 'F'
collection = 'activities'
relation = 'typical_of_concept'

if scale == 'T':
    collection = f'{collection}_scale'

run = 3

prop_info_dict = read_property_info()

collection_relation_question_dict, level_relation_dict = read_template(run)
collection_relation_question_dict, level_relation_dict = read_template(run)
relations = []
[relations.extend(rels) for rels in level_relation_dict.values()]
relation_examples_dict = read_examples(relations)
examples = relation_examples_dict[relation][collection]


question_temp = collection_relation_question_dict[(collection, relation)]
category = prop_info_dict[prop][0]['category']
print(category)
question = create_question_string(prop, concept, question_temp, category)
print(question_temp)
example = create_example(examples, question_temp, prop_info_dict)
print(example['example_pos'])
print(example['example_neg'])
print(example.keys())

movements
``[CX]'' is one for the first things which come to mind when I hear ``[Y]''.
property paint
1
activities
property move
1
activities
``Paint'' is one for the first things which come to mind when I hear ``brush''.
``Move'' is one for the first things which come to mind when I hear ``apple''.
odict_keys(['prop_pos', 'concept_pos', 'prop_neg', 'concept_neg', 'example_pos', 'example_neg'])


In [235]:

run = 3
collection = 'activities'
prop = 'swim'



def get_target_dicts(d, level_relation_dict, restricted, collection):
    for d in data_dicts:
        concept = d['lemma']
        certainty = d['certainty']
        label = d['label']
        levels = get_levels(label, certainty, restricted)
        target_dicts = []
        for l in levels:
            relations = level_relations_dict[l]
            for rel in relations:
                q_d = dict()
                q_d['relation'] = relation
                q_d['property'] = d['property']
                q_d['concept'] = d['concept']
                q_d['certainty'] = certainty
                q_d['label'] = label
                q_d['collection'] = collection
                self.questions.extend(q_d)
     
   
    return target_dicts

dict_list_by_prop = read_pairs(collection)
dicts = dict_list_by_prop[prop]
collection_relation_question_dict, level_relation_dict = read_template(run)
prop_info_dict = read_property_info()
relation_examples_dict = read_examples(relations)
restricted = True

questions = []
for d in dicts[:10]:
    target_dicts = get_target_dicts(d, level_relation_dict, restricted, collection)
    questions.extend(target_dicts)
for d in questions:
    collection_relation = (d['collection'], d['relation'])
    if collection_relation in collection_relation_question_dict:
        d['quid']  = uuid.uuid4()
        question_temp = collection_relation_question_dict[collection_relation]
        question = create_question_string(d['property'], concept, question_temp, category)
        qu_d['question_template'] = question_temp
        qu_d['question'] = question
        examples = relation_examples_dict[relation][collection]
        example_dict = create_example(examples, question_temp, prop_info_dict)
        qu_d.update(example_dict)
        qu_d['restricted'] = restricted
        qu_d['label'] = label
        qu_d['source'] = d['sources_str']
        questions.append(qu_d)
            
for q in questions[:3]:
    print(q['question'])

NameError: name 'level_relations_dict' is not defined

In [236]:
exp_name = '_experiment1'
group_dict = group = read_group(exp_name)

for prop, coll in group_dict.items():
    print(prop, coll)

round perceptual
red perceptual
roll activities


In [None]:

            



def get_questions(self):
            for coll, props in self.group.items():
                for prop in props:
                    prop_dicts = read_pairs(collection)
                    for d in prop_dicts:
                        pass 
# create questions 

# read template

# read property info

# read examples 

# read pairs 


# for each pair:

# load relations
# for each relation:
# create question string 
# pick example 
# create example string 
# create question id 

