In [1]:
import json
from collections import defaultdict
import csv
import pickle

### 1. Set up the job IDs and load the results JSONL files

In [2]:
# Identifiers of the jobs whose result files (job_<id>.json) are loaded below.
job_ids = ['1285337', '1286025']
# Directory that holds the job result files and receives the exported files.
data_loc = 'crowd_data'

In [3]:
# Read every job's result file (one JSON document per line) into a single list.
results = []
for jid in job_ids:
    filename = '%s/job_%s.json' % (data_loc, jid)
    # JSON text is UTF-8; state it explicitly instead of relying on the
    # platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line), which json.loads
            # would reject.
            if line.strip():
                results.append(json.loads(line))

In [4]:
# Number of result records loaded across all job files.
len(results)

331

### 2. Get basic stats

In [5]:
# Tally how many result records are in each 'state'.
states_count = defaultdict(int)
for state in (record['state'] for record in results):
    states_count[state] += 1
states_count

defaultdict(int,
            {'finalized': 259,
             'golden': 23,
             'hidden_gold': 10,
             'judgable': 39})

### 3. Parse and format test questions

In [6]:
def get_judgements_distribution(judgements, prop):
    """Return the share of judgements that picked each answer for `prop`.

    Args:
        judgements: list of judgement dicts; each is read as
            ``judgement['data'][prop]``.
        prop: name of the answer field to summarise.

    Returns:
        A plain dict mapping each answer value to
        ``count / len(judgements)``, rounded to two decimals. An empty
        `judgements` list yields an empty dict.

    Raises:
        KeyError: if a judgement has no ``'data'`` entry or no `prop` entry.
    """
    counts = defaultdict(int)
    for judgement in judgements:
        answer = judgement['data'][prop]
        # A lone string is a single answer; iterating it directly would count
        # its individual characters instead of the answer itself.
        values = [answer] if isinstance(answer, str) else answer
        for value in values:
            counts[value] += 1
    total = len(judgements)
    # `counts` is empty whenever `total` is 0, so this never divides by zero.
    return {value: round(count / total, 2) for value, count in counts.items()}

In [7]:
def print_test_question(data, judgements, qid):
    """Print a readable summary of one test (gold) question.

    Prints the given properties, the attribute that had to be filled in, the
    gold answer with its reason, and the distribution of the crowd answers.

    Args:
        data: dict of the unit's fields. Empty values, 'NOFILL' values and the
            '_golden' key are ignored. A key ending in '_gold' names the gold
            question and holds the gold answer; a key ending in '_gold_reason'
            holds the reason. Every other field is printed as a given property.
        judgements: list of judgement dicts for this unit.
        qid: numeric id of the unit (formatted with %d).

    Raises:
        ValueError: if `data` contains no non-empty '*_gold' field, so there
            is no gold question to report on.
    """
    print("#### QID: %d, judgments: %d ####" % (qid, len(judgements)))
    print()
    print('Given properties:')
    # Start unset so that a missing gold field is reported explicitly instead
    # of surfacing later as an UnboundLocalError.
    gold_question = None
    gold_answer = None
    gold_reason = None
    for k, v in data.items():
        if not v or v == 'NOFILL' or k == '_golden':
            continue
        if k.endswith('_gold'):
            gold_question = k[:-5]  # drop the '_gold' suffix
            gold_answer = v
        elif k.endswith('_gold_reason'):
            gold_reason = v
        else:
            print(k,'\t', v)
    print()

    if gold_question is None:
        raise ValueError('Test question %s has no non-empty "_gold" field' % qid)

    answer_dist = get_judgements_distribution(judgements, gold_question)

    # The first three characters of the question name are dropped for display
    # (answer fields are addressed as 'sel<property>' elsewhere in this notebook).
    print('Attribute to fill: %s' % gold_question[3:])
    print('Gold answer: %s, reason: %s' % (gold_answer, gold_reason))
    print('Answer distribution')
    print(answer_dist)

    print()
    print()

In [8]:
# Print every test question, i.e. every result whose state is 'golden'.
for result in results:
    if result['state'] != 'golden':
        continue
    print_test_question(result['data'],
                        result['results']['judgments'],
                        result['id'])

#### QID: 1822205280, judgments: 44 ####

Given properties:
century 	 19
religion 	 Christianity
deathplace 	 Washington D.C.
lifedur 	 61-70
birthplace 	 Philadelphia (PA)
occupation 	 politician
politicalparty 	 Republican Party

Attribute to fill: gender
Gold answer: Male, reason: Politicians in the 19th century were male.
Answer distribution
{'Male': 0.93, 'I can not decide': 0.05, 'Female': 0.02}


#### QID: 1822205281, judgments: 42 ####

Given properties:
century 	 20
gender 	 Female
deathplace 	 Santa Monica (CA)
lifedur 	 71-80
birthplace 	 Los Angeles (CA)
educatedat 	 University of California Berkeley

Attribute to fill: politicalparty
Gold answer: Democratic Party, reason: The state of California is dominantly democratic in the 20th/21st century.
Answer distribution
{'Democratic Party': 0.88, 'Republican Party': 0.1, 'I can not decide': 0.02}


#### QID: 1822205283, judgments: 45 ####

Given properties:
century 	 18
religion 	 Judaism
gender 	 Female
deathplace 	 New York C

### 4. Parse and format 'real' data

In [9]:
def print_real_question(ps, data, judgements, qid, only_top_result=False):
    """Build, print and return one output row for a non-test unit.

    The row has exactly one cell per property in `ps`, in that order, so it
    lines up with a header built from the same property list.

    Args:
        ps: ordered list of property names.
        data: dict of the unit's fields; ``data[k]`` is read for every k in `ps`.
        judgements: list of judgement dicts for this unit.
        qid: id of the unit; accepted for interface compatibility but not used.
        only_top_result: if True, a crowd-filled cell holds only the most
            frequent answer formatted as 'answer (share)'; otherwise it holds
            the whole answer distribution dict.

    Returns:
        The list of cells: '*value*' for a given property, the crowd answer(s)
        for an empty one, and '' for a 'NOFILL' property.

    Raises:
        KeyError: if a property in `ps` is missing from `data`.
    """
    a_row = []
    for k in ps:
        v = data[k]
        if not v:
            # Empty value: the crowd filled this property in.
            dist = get_judgements_distribution(judgements, 'sel%s' % k)
            if not only_top_result:
                a_row.append(dist)
            elif dist:
                top_answer, top_share = max(dist.items(), key=lambda item: item[1])
                a_row.append('%s (%.2f)' % (top_answer, top_share))
            else:
                # No answers at all: max() would raise on an empty distribution.
                a_row.append('')
        elif v != 'NOFILL':
            a_row.append('*%s*' % v)
        else:
            # Keep a placeholder so later cells stay under the right column.
            a_row.append('')
    print(a_row)
    return a_row

In [10]:
# Column order of the exported CSV files (also reused by later cells).
props = ['century', 'religion', 'gender', 'deathplace', 'lifedur', 'birthplace',
         'worklocation', 'occupation', 'educatedat', 'politicalparty']

# Export the non-test units twice: once with only the top crowd answer per
# property, once with the full answer distribution per property.
for only_top_result in [True, False]:
    if only_top_result:
        fn = '%s/results_top_value.csv' % data_loc
    else:
        fn = '%s/results_all_values.csv' % data_loc

    # newline='' leaves line endings to the csv module, as its documentation
    # requires; without it some platforms get an extra blank line after each row.
    with open(fn, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(['#judgements'] + props)
        for result in results:
            if result['state'] in ['finalized', 'judgable']:
                this_data = result['data']
                these_judgements = result['results']['judgments']
                this_id = result['id']
                a_row = print_real_question(props, this_data, these_judgements,
                                            this_id, only_top_result=only_top_result)
                # First column is the number of judgements collected for the unit.
                spamwriter.writerow([len(these_judgements)] + a_row)

['20 (0.60)', 'Christianity (0.60)', 'Male (0.67)', 'I can not decide (0.53)', '71-80 (0.53)', 'I can not decide (0.47)', 'I can not decide (0.53)', '*lawyer*', '*Harvard University*', '*Democratic Party*']
['20 (0.73)', 'Christianity (0.47)', 'Male (0.67)', 'I can not decide (0.47)', '71-80 (0.40)', 'I can not decide (0.40)', '*Washington D.C.*', '*lawyer*', '*Harvard University*', '*Democratic Party*']
['20 (0.73)', 'Christianity (0.80)', 'Male (0.87)', 'Washington D.C. (0.40)', '71-80 (0.40)', '*Boston (MA)*', '*Washington D.C.*', '*lawyer*', '*Harvard University*', '*Democratic Party*']
['20 (0.73)', 'Christianity (0.80)', 'Male (0.93)', 'Washington D.C. (0.53)', '*71-80*', '*Boston (MA)*', '*Washington D.C.*', '*lawyer*', '*Harvard University*', '*Democratic Party*']
['20 (0.47)', 'Christianity (0.87)', 'Male (0.80)', 'I can not decide (0.47)', '71-80 (0.40)', 'I can not decide (0.40)', 'Washington D.C. (0.60)', '*politician*', '*Harvard University*', '*Republican Party*']
['20 (0

['*19*', '*Christianity*', '*Male*', '*Philadelphia (PA)*', '71-80 (0.27)', 'Philadelphia (PA) (0.73)', 'Washington D.C. (0.27)', 'I can not decide (0.47)', 'I can not decide (0.47)', 'Republican Party (0.47)']
['*19*', '*Christianity*', '*Male*', '*Philadelphia (PA)*', '*81-90*', 'Philadelphia (PA) (0.80)', 'Washington D.C. (0.33)', 'politician (0.40)', 'I can not decide (0.47)', 'Democratic Party (0.47)']
['*19*', '*Christianity*', '*Male*', '*Philadelphia (PA)*', '*81-90*', '*Philadelphia (PA)*', 'Harrisburg (PA) (0.47)', 'I can not decide (0.33)', 'I can not decide (0.53)', 'Democratic Party (0.60)']
['*20*', '*Christianity*', '*Male*', '*New York City (NY)*', '71-80 (0.53)', 'New York City (NY) (0.60)', 'New York City (NY) (0.73)', 'I can not decide (0.40)', 'I can not decide (0.40)', 'Democratic Party (0.73)']
['*20*', '*Christianity*', '*Male*', '*New York City (NY)*', '*81-90*', 'New York City (NY) (0.53)', 'New York City (NY) (0.67)', 'I can not decide (0.40)', 'I can not deci

### 5. Prepare it for the profiler

In [11]:
# Read mappings.tsv: three tab-separated fields per line (uri, label, newlabel).
#   newlabel_to_uri: newlabel -> set of URIs that were given that newlabel
#   uri_to_label:    uri -> its label
# NOTE(review): the key listing shown further down includes 'Merged label',
# which looks like the file's header row being read as data - confirm whether
# that line should be skipped.
newlabel_to_uri = defaultdict(set)
uri_to_label = defaultdict(str)
# Explicit encoding: the labels contain non-ASCII characters (e.g. the dash in
# 'University of Wisconsin–Madison'), so the platform default must not decide.
# NOTE(review): assumes the file is UTF-8 - confirm.
with open('mappings.tsv', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line cannot be split into the three expected fields.
            continue
        uri, label, newlabel = line.split('\t')
        label = label.strip()
        newlabel = newlabel.strip()
        newlabel_to_uri[newlabel].add(uri)
        uri_to_label[uri] = label

In [12]:
# Pick one URI per newlabel. With a single candidate, take it; with several,
# take the candidate whose own label equals the newlabel. Candidates inspected
# before a match is found are printed, and a newlabel with no matching
# candidate gets no entry here.
inv_mappings = {}
for newlbl, uris in newlabel_to_uri.items():
    if len(uris) == 1:
        # Read the only element without removing it: set.pop() would empty the
        # set inside newlabel_to_uri, so re-running this cell would silently
        # drop the label.
        inv_mappings[newlbl] = next(iter(uris))
    else:
        for uri in uris:
            if uri_to_label[uri] == newlbl:
                inv_mappings[newlbl] = uri
                break
            print(newlbl, uri_to_label[uri], uri)

New York City (NY) Manhattan http://www.wikidata.org/entity/Q11299
New York City (NY) Brooklyn http://www.wikidata.org/entity/Q18419
Los Angeles (CA) Hollywood http://www.wikidata.org/entity/Q34006
Christianity Catholicism http://www.wikidata.org/entity/Q1841
Christianity Anglicanism http://www.wikidata.org/entity/Q6423963
Christianity Episcopal Church http://www.wikidata.org/entity/Q682443
Christianity Presbyterianism http://www.wikidata.org/entity/Q178169
Christianity The Church of Jesus Christ of Latter-day Saints http://www.wikidata.org/entity/Q42504


In [13]:
# 'Christianity' has several candidate URIs and none of their labels equals it
# (see the candidates printed by the previous cell), so one is chosen by hand.
# Q1841 is the candidate that was printed with the label 'Catholicism'.
inv_mappings['Christianity']='http://www.wikidata.org/entity/Q1841'

In [14]:
# Show which labels now have a URI assigned.
inv_mappings.keys()

dict_keys(['Merged label', 'Male', 'Female', '20', '19', '18', '17', '21', 'Democratic Party', 'Republican Party', '71-80', '81-90', '61-70', '51-60', '91-100', '41-50', '31-40', '21-30', '101-110', '11-20', 'politician', 'actor', 'lawyer', 'baseball player', 'American football player', 'singer', 'writer', 'basketball player', 'judge', 'New York City (NY)', 'Chicago (IL)', 'Los Angeles (CA)', 'Philadelphia (PA)', 'Boston (MA)', 'Washington D.C.', 'San Francisco (CA)', 'Detroit (MI)', 'Santa Monica (CA)', 'Harvard University', 'Columbia University', 'Yale University', 'University of Michigan', 'Stanford University', 'Princeton University', 'University of Wisconsin–Madison', 'University of California, Berkeley', 'Cornell University', 'Harrisburg (PA)', 'Sacramento (CA)', 'Austin (TX)', 'Springfield (IL)', 'Tallahassee (FL)', 'Baton Rouge (LA)', 'Montpelier (VT)', 'Phoenix (AZ)', 'Christianity', 'atheism', 'Judaism', 'Islam'])

In [15]:
# Property order of the columns in the profiler input rows.
profiler_ordered_props = [
    'lifedur', 'century', 'gender', 'occupation',
    'worklocation', 'educatedat', 'religion',
    'politicalparty', 'deathplace', 'birthplace',
]

# One row per non-test unit: a given value is replaced by its mapped URI, an
# empty value becomes an empty cell.
rows = []
for result in results:
    if result['state'] not in ['finalized', 'judgable']:
        continue
    this_data = result['data']
    rows.append([
        inv_mappings[this_data[p]] if this_data[p] else ''
        for p in profiler_ordered_props
    ])

In [16]:
# Number of rows prepared for the profiler (one per 'finalized'/'judgable' result).
len(rows)

298

In [17]:
# Write the profiler input: one line per row, cells separated by tabs.
with open('profiler_input.tsv', 'w') as profiler_file:
    profiler_file.writelines('%s\n' % '\t'.join(cells) for cells in rows)

### 6. Store for evaluation

In [18]:
# Build two parallel lists over the non-test units:
#   all_given[i]      - property -> value, for properties that came with a value
#   all_judgements[i] - property -> crowd answer distribution, for the rest
all_given = []
all_judgements = []
for result in results:
    if result['state'] not in ['finalized', 'judgable']:
        continue
    this_data = result['data']
    these_judgements = result['results']['judgments']
    givens_row, judgments_row = {}, {}
    for p in props:
        given_value = this_data[p]
        if given_value:
            givens_row[p] = given_value
        else:
            judgments_row[p] = get_judgements_distribution(these_judgements, 'sel%s' % p)
    all_given.append(givens_row)
    all_judgements.append(judgments_row)

In [19]:
# Number of units with recorded given values.
len(all_given)

298

In [20]:
# Number of units with recorded answer distributions; the loop above appends to
# both lists together, so this matches len(all_given).
len(all_judgements)

298

In [21]:
# Show each unit's given values followed by its crowd answer distributions.
for row_given, row_judged in zip(all_given, all_judgements):
    print(row_given)
    print(row_judged)

{'occupation': 'lawyer', 'educatedat': 'Harvard University', 'politicalparty': 'Democratic Party'}
{'century': {'20': 0.6, '18': 0.07, 'I can not decide': 0.27, '19': 0.07}, 'religion': {'Christianity': 0.6, 'I can not decide': 0.27, 'Judaism': 0.07, 'atheism': 0.07}, 'gender': {'Male': 0.67, 'I can not decide': 0.13, 'Female': 0.2}, 'deathplace': {'I can not decide': 0.53, 'Boston (MA)': 0.07, 'Washington D.C.': 0.13, 'New York City (NY)': 0.13, 'None of the above': 0.07, 'Los Angeles (CA)': 0.07}, 'lifedur': {'71-80': 0.53, '61-70': 0.2, 'I can not decide': 0.13, 'None of the above': 0.07, '51-60': 0.07}, 'birthplace': {'New York City (NY)': 0.07, 'Boston (MA)': 0.33, 'I can not decide': 0.47, 'Philadelphia (PA)': 0.07, 'Los Angeles (CA)': 0.07}, 'worklocation': {'New York City (NY)': 0.2, 'Washington D.C.': 0.27, 'I can not decide': 0.53}}
{'worklocation': 'Washington D.C.', 'occupation': 'lawyer', 'educatedat': 'Harvard University', 'politicalparty': 'Democratic Party'}
{'century':

{'lifedur': '61-70', 'birthplace': 'Chicago (IL)', 'worklocation': 'Springfield (IL)', 'occupation': 'lawyer', 'educatedat': 'Harvard University', 'politicalparty': 'Republican Party'}
{'century': {'18': 0.07, '19': 0.2, '20': 0.6, 'I can not decide': 0.13}, 'religion': {'I can not decide': 0.27, 'Christianity': 0.73}, 'gender': {'I can not decide': 0.27, 'Male': 0.73}, 'deathplace': {'Chicago (IL)': 0.6, 'I can not decide': 0.13, 'None of the above': 0.13, 'Santa Monica (CA)': 0.07, 'New York City (NY)': 0.07}}
{'worklocation': 'Washington D.C.', 'occupation': 'lawyer', 'educatedat': 'Harvard University', 'politicalparty': 'Republican Party'}
{'century': {'20': 0.73, 'I can not decide': 0.13, '19': 0.13}, 'religion': {'Christianity': 0.93, 'I can not decide': 0.07}, 'gender': {'Male': 1.0}, 'deathplace': {'I can not decide': 0.33, 'Washington D.C.': 0.6, 'San Francisco (CA)': 0.07}, 'lifedur': {'I can not decide': 0.27, '71-80': 0.4, '81-90': 0.07, '61-70': 0.27}, 'birthplace': {'I ca

{'birthplace': {'Los Angeles (CA)': 0.73, 'I can not decide': 0.13, 'Chicago (IL)': 0.07, 'New York City (NY)': 0.07}, 'worklocation': {'Sacramento (CA)': 0.6, 'Baton Rouge (LA)': 0.07, 'None of the above': 0.13, 'I can not decide': 0.13, 'New York City (NY)': 0.07}, 'occupation': {'actor': 0.33, 'judge': 0.13, 'singer': 0.07, 'politician': 0.27, 'I can not decide': 0.13, 'writer': 0.07}, 'educatedat': {'University of California Berkeley': 0.4, 'Harvard University': 0.07, 'I can not decide': 0.2, 'Stanford University': 0.07, 'Columbia University': 0.07, 'None of the above': 0.13, 'Yale University': 0.07}, 'politicalparty': {'Democratic Party': 0.53, 'I can not decide': 0.13, 'Republican Party': 0.33}}
{'century': '19', 'religion': 'Christianity', 'gender': 'Male', 'deathplace': 'Los Angeles (CA)', 'lifedur': '91-100', 'birthplace': 'New York City (NY)'}
{'worklocation': {'Sacramento (CA)': 0.33, 'New York City (NY)': 0.4, 'None of the above': 0.13, 'Harrisburg (PA)': 0.07, 'I can not d

In [22]:
# Persist both lists next to the job data for the evaluation step.
pickle_targets = [
    ('%s/given.pkl' % data_loc, all_given),
    ('%s/predicted.pkl' % data_loc, all_judgements),
]
for target_path, payload in pickle_targets:
    with open(target_path, 'wb') as out_file:
        pickle.dump(payload, out_file)