In [1]:
import pickle
import json
from collections import defaultdict

In [2]:
crowd_data_loc="crowd_data"
profiler_data_loc="profiler_data"
givens_file="given.pkl"
predicted_file="predicted.pkl"

In [3]:
with open('mappings.tsv', 'r') as f:
    mapping_file=f.readlines()
mappings={}
for line in mapping_file:
    uri, old_label, new_label = line.split('\t')
    mappings[uri]=new_label.strip()

In [4]:
domains={'century':['20', '19', '18', '17', '21'], 
         'lifedur': ['71-80', '81-90', '61-70', '51-60', '91-100', '41-50', 
                      '31-40', '21-30', '101-110', '11-20'],
         'gender': ['Male', 'Female'],
         'politicalparty': ['Democratic Party', 'Republican Party'],
         'occupation': ['politician', 'actor', 'lawyer', 'baseball player', 
                        'American football player', 'singer', 'writer', 
                        'basketball player', 'judge'],
         'birthplace': ['New York City (NY)', 'Chicago (IL)', 'Los Angeles (CA)', 
                        'Philadelphia (PA)', 'Boston (MA)', 'Washington D.C.', 
                        'San Francisco (CA)', 'Detroit (MI)'],
         'deathplace': ['New York City (NY)', 'Chicago (IL)', 'Los Angeles (CA)', 
                        'Philadelphia (PA)', 'Boston (MA)', 'Washington D.C.', 
                        'San Francisco (CA)', 'Santa Monica (CA)'],
         'educatedat': ['Harvard University', 'Columbia University', 'Yale University', 
                        'University of Michigan', 'Stanford University', 
                        'Princeton University', 'University of Wisconsin–Madison', 
                        'University of California, Berkeley', 'Cornell University'],
         'worklocation': ['Washington D.C.', 'New York City (NY)', 
                          'Harrisburg (PA)', 'Sacramento (CA)', 'Austin (TX)', 
                          'Springfield (IL)', 'Tallahassee (FL)', 'Baton Rouge (LA)', 
                          'Montpelier (VT)', 'Phoenix (AZ)'],
         'religion': ['Christianity', 'atheism', 'Judaism', 'Islam']
        }

In [5]:
properties_mapping={'member of political party': 'politicalparty',
                    'educated at': 'educatedat',
                    'occupation': 'occupation',
                    'work location': 'worklocation',
                    'place of birth': 'birthplace',
                    'place of death': 'deathplace',
                    'lifespan': 'lifedur',
                    'sex or gender': 'gender',
                    'religion': 'religion',
                    'century': 'century'
                   }

### 1. Prepare profiler data

#### 1.1. Map URIs to labels

In [6]:
with open('%s/%s' % (profiler_data_loc, givens_file), 'rb') as f:
    old_prof_givens=pickle.load(f)
with open('%s/%s' % (profiler_data_loc, predicted_file), 'rb') as f:
    old_prof_preds=pickle.load(f,encoding='latin1')

In [7]:
def map_givens(old_prof_givens):    
    new_prof_givens=[]
    for row in old_prof_givens:
        new_row={}
        for k,v in row.items():
            new_k=properties_mapping[k]
            new_v = mappings[v]
            new_row[new_k]=new_v
        new_prof_givens.append(new_row)
    return new_prof_givens

In [8]:
new_prof_givens=map_givens(old_prof_givens)

In [9]:
new_prof_preds=defaultdict(list)
for prop, data in old_prof_preds.items():
    prop=properties_mapping[prop]
    for dist in data:
        new_dist={}
        for k,v in dist:
            if k in mappings.keys() and mappings[k] in domains[prop]:
                new_k=mappings[k]
            else:
                new_k='None of the above'
            if new_k not in new_dist.keys():
                new_dist[new_k]=v
            else:
                new_dist[new_k]+=v
        new_prof_preds[prop].append(new_dist)


In [10]:
test_data=old_prof_preds['religion']
for test_dist in test_data:
    print(test_dist)
    new_dist={}
    for k,v in test_dist:
        print(k,v)
        if k in mappings.keys():
            if mappings[k] in  domains['religion']:
                print(mappings[k])
                new_k=mappings[k]
                if new_k in new_dist.keys():
                    new_dist[new_k]+=v
                else:
                    new_dist[new_k]=v
    print(new_dist)
    break


[('http://www.wikidata.org/entity/Q9592', 0.30576660668162087), ('http://www.wikidata.org/entity/Q682443', 0.24842106527264676), ('http://www.wikidata.org/entity/Q9268', 0.18849640508632778), ('http://www.wikidata.org/entity/Q7066', 0.05239944198522553), ('http://www.wikidata.org/entity/Q1841', 0.04013294670585205), ('http://www.wikidata.org/entity/Q59778', 0.030666341026719893), ('http://www.wikidata.org/entity/Q1062789', 0.014665244299124485), ('http://www.wikidata.org/entity/Q7970362', 0.01423274472253384), ('http://www.wikidata.org/entity/Q178169', 0.013676829931846293), ('http://www.wikidata.org/entity/Q426316', 0.012780642871559455)]
http://www.wikidata.org/entity/Q9592 0.30576660668162087
Christianity
http://www.wikidata.org/entity/Q682443 0.24842106527264676
Christianity
http://www.wikidata.org/entity/Q9268 0.18849640508632778
Judaism
http://www.wikidata.org/entity/Q7066 0.05239944198522553
atheism
http://www.wikidata.org/entity/Q1841 0.04013294670585205
Christianity
http://www

In [11]:
transposed_prof_predictions=[]
for x in range(len(new_prof_givens)):
    given_props=new_prof_givens[x].keys()
    needed_props=domains.keys()-given_props
    pred_row={}
    for prop in needed_props:
        pred_row[prop]=new_prof_preds[prop][x]
        pred_row[prop]['None of the above']=max(0,1-sum(pred_row[prop].values()))
    transposed_prof_predictions.append(pred_row)

In [12]:
transposed_prof_predictions

[{'birthplace': {'Boston (MA)': 0.06900091642937108,
   'Chicago (IL)': 0.026149034999901136,
   'New York City (NY)': 0.2225399841963115,
   'None of the above': 0.40137572058746707,
   'Philadelphia (PA)': 0.02744406509113528,
   'Washington D.C.': 0.042454994097572224},
  'century': {'17': 0.00016531192369236185,
   '18': 0.07589992275375045,
   '19': 0.3167326276002765,
   '20': 0.6071912290994848,
   '21': 1.6148425626221606e-08,
   'None of the above': 1.1688428003253648e-12},
  'deathplace': {'Boston (MA)': 0.09906384075691642,
   'Chicago (IL)': 0.02341661613337656,
   'New York City (NY)': 0.1325649526531716,
   'None of the above': 0.3876464458247968,
   'Philadelphia (PA)': 0.013235125918098245,
   'Washington D.C.': 0.13664478048813092},
  'gender': {'Female': 0.052905706547618135,
   'Male': 0.9470942755556055,
   'None of the above': 9.992007221626409e-16},
  'lifedur': {'101-110': 0.0014463231003208588,
   '11-20': 1.1135489764120896e-06,
   '21-30': 0.011736199205875666

### 2. Prepare crowd data

In [13]:
with open('%s/%s' % (crowd_data_loc, givens_file), 'rb') as f:
    old_crowd_givens=pickle.load(f)
    new_crowd_givens=old_crowd_givens
with open('%s/%s' % (crowd_data_loc, predicted_file), 'rb') as f:
    old_crowd_preds=pickle.load(f)

In [14]:
undecided='I can not decide'
def transform_undecided(prop, dist):
    if undecided in dist.keys():
        new_dist={}
        undecided_part=dist[undecided]
        split_among=len(domains[prop]) + 1
        add_to_each=undecided_part/split_among
        for value in ['None of the above'] + domains[prop]:
            if value in dist.keys():
                new_dist[value]=dist[value]+add_to_each
            else:
                new_dist[value]=add_to_each
        return new_dist
    else:
        return dist

In [15]:
new_crowd_preds=[]
for row in old_crowd_preds:
    new_row={}
    for prop, dist in row.items():
        new_dist=transform_undecided(prop, dist)
        new_row[prop]=new_dist
    new_crowd_preds.append(new_row)

In [16]:
new_crowd_preds

[{'birthplace': {'Boston (MA)': 0.38222222222222224,
   'Chicago (IL)': 0.05222222222222222,
   'Detroit (MI)': 0.05222222222222222,
   'Los Angeles (CA)': 0.12222222222222223,
   'New York City (NY)': 0.12222222222222223,
   'None of the above': 0.05222222222222222,
   'Philadelphia (PA)': 0.12222222222222223,
   'San Francisco (CA)': 0.05222222222222222,
   'Washington D.C.': 0.05222222222222222},
  'century': {'17': 0.045000000000000005,
   '18': 0.11500000000000002,
   '19': 0.11500000000000002,
   '20': 0.645,
   '21': 0.045000000000000005,
   'None of the above': 0.045000000000000005},
  'deathplace': {'Boston (MA)': 0.1288888888888889,
   'Chicago (IL)': 0.05888888888888889,
   'Los Angeles (CA)': 0.1288888888888889,
   'New York City (NY)': 0.18888888888888888,
   'None of the above': 0.1288888888888889,
   'Philadelphia (PA)': 0.05888888888888889,
   'San Francisco (CA)': 0.05888888888888889,
   'Santa Monica (CA)': 0.05888888888888889,
   'Washington D.C.': 0.1888888888888888

### 3. Prepare for evaluation

In [17]:
# Check if the order is the same in both data pieces
for x in range(len(new_crowd_givens)):
    print(new_crowd_givens[x]==new_prof_givens[x])
print(len(new_crowd_givens))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [18]:
def prob_to_list(data, prop_domains):
    size=len(prop_domains)+1
    probs=[0]*size
    
    
    for value, probability in data.items():
        if value=='University of California Berkeley':
            value='University of California, Berkeley'
        if value=='None of the above':
            index=-1
        else:
            index = prop_domains.index(value)
        probs[index]=probability
    return probs

In [19]:
props = list(properties_mapping.values())[::-1]
props

['century',
 'religion',
 'gender',
 'lifedur',
 'deathplace',
 'birthplace',
 'worklocation',
 'occupation',
 'educatedat',
 'politicalparty']

In [47]:
import evaluation
import math
from sklearn.metrics import mutual_info_score
from scipy.spatial.distance import cosine

def compute_divergencies(system, gold, givens, props, metric='js_div'):
    divergencies = defaultdict(list)
    
    divergencies_on_known={}
    
    uncertainties={}
    
    for p in props:
        divergencies_on_known[p]=defaultdict(list)
        uncertainties[p]=defaultdict(list)
    
    for x in range(len(givens)):
        profiler_predictions=system[x]
        crowd_predictions=gold[x]
        
        for prop in crowd_predictions.keys():
            crowd_prop_predictions=crowd_predictions[prop]
            prof_prop_predictions=profiler_predictions[prop]
            crowd_probs=prob_to_list(crowd_prop_predictions, domains[prop])
            prof_probs=prob_to_list(prof_prop_predictions, domains[prop])

            if metric=='js_div':
                div=evaluation.js(crowd_probs, prof_probs)
            elif metric=='js_dist':
                tmp_div=evaluation.js(crowd_probs, prof_probs)
                div=math.sqrt(tmp_div)
            elif metric=='kl_div':
                div=evaluation.kl(crowd_probs, prof_probs)
            elif metric=='kl_max':
                div=evaluation.kl_max(crowd_probs, prof_probs)
            elif metric=='kl_avg':
                div=evaluation.kl_avg(crowd_probs, prof_probs)
            elif metric=='cos':
                div=cosine(crowd_probs, prof_probs)
            divergencies[prop].append(div)
            
            known_props=len(givens[x])
            divergencies_on_known[prop][known_props].append(div)
            
            unc=0.0
            if 'None of the above' in crowd_predictions[prop]:
                unc=crowd_predictions[prop]['None of the above']
            uncertainties[prop][known_props].append(unc)

    return divergencies, divergencies_on_known, uncertainties

In [48]:
def compute_mean_per_prop(data):
    prop_divs={}
    for prop, divs in data.items():
        avg_div=round(sum(divs)/len(divs),2)
#        print(prop, '\t', avg_div)
        prop_divs[prop]=avg_div
    return prop_divs

In [49]:
metrics=['js_div', 'js_dist', 'kl_div', 'kl_max', 'kl_avg', 'cos']
div_per_metric={}
div_array_per_metric={}
for metric in metrics:
    divergencies, div_known, uncertainties = compute_divergencies(transposed_prof_predictions, 
                                                                new_crowd_preds, 
                                                                new_crowd_givens, 
                                                                props, metric=metric)
    mean_values=compute_mean_per_prop(divergencies)
    div_per_metric[metric]=mean_values
    div_array_per_metric[metric]=list(mean_values.values())

In [50]:
from scipy.stats import spearmanr, pearsonr

pearson_correlations = [[' '] + metrics]
spearman_correlations = [[' '] + metrics]

for metric1, dist1 in div_array_per_metric.items():
    pearson_row=[metric1]
    spearman_row=[metric1]
    for metric2, dist2 in div_array_per_metric.items():
        sp_corr=spearmanr(dist1, dist2)
        pe_corr=pearsonr(dist1, dist2)
        spearman_row.append(round(sp_corr.correlation, 4))
        pearson_row.append(round(pe_corr[0], 4))
        
    pearson_correlations.append(pearson_row)
    spearman_correlations.append(spearman_row)

In [51]:
def print_matrix(A):
    print('\n'.join(['\t'.join(['{:7}'.format(item) for item in row]) 
      for row in A]))

In [52]:
print_matrix(pearson_correlations)

       	js_div 	js_dist	kl_div 	kl_max 	kl_avg 	cos    
js_div 	    1.0	 0.9862	 0.8558	 0.9094	 0.9444	 0.9977
js_dist	 0.9862	    1.0	 0.8438	 0.8941	 0.9262	 0.9888
kl_div 	 0.8558	 0.8438	    1.0	 0.9915	 0.9741	 0.8674
kl_max 	 0.9094	 0.8941	 0.9915	    1.0	 0.9948	 0.9157
kl_avg 	 0.9444	 0.9262	 0.9741	 0.9948	    1.0	 0.9483
cos    	 0.9977	 0.9888	 0.8674	 0.9157	 0.9483	    1.0


In [53]:
print_matrix(spearman_correlations)

       	js_div 	js_dist	kl_div 	kl_max 	kl_avg 	cos    
js_div 	    1.0	  0.997	 0.8936	 0.9119	 0.9483	  0.997
js_dist	  0.997	    1.0	  0.903	 0.9152	 0.9515	    1.0
kl_div 	 0.8936	  0.903	    1.0	 0.9758	 0.9758	  0.903
kl_max 	 0.9119	 0.9152	 0.9758	    1.0	 0.9636	 0.9152
kl_avg 	 0.9483	 0.9515	 0.9758	 0.9636	    1.0	 0.9515
cos    	  0.997	    1.0	  0.903	 0.9152	 0.9515	    1.0


In [55]:
for prop, data in div_known.items():
    for known_props, dists in data.items():
        uncs=uncertainties[prop][known_props]
        print(prop, '\t', known_props, '\t', round(sum(dists)/len(dists),2), '\t', 
              round(sum(uncs)/len(uncs), 4))

century 	 3 	 0.04 	 0.0299
century 	 4 	 0.08 	 0.016
century 	 5 	 0.11 	 0.0251
century 	 6 	 0.23 	 0.0047
century 	 7 	 0.5 	 0.003
century 	 8 	 0.4 	 0.0046
century 	 9 	 1.0 	 0.0
religion 	 3 	 0.04 	 0.0296
religion 	 4 	 0.06 	 0.0219
religion 	 5 	 0.08 	 0.0197
religion 	 6 	 0.07 	 0.0131
religion 	 7 	 0.02 	 0.0136
religion 	 8 	 0.02 	 0.01
gender 	 3 	 0.01 	 0.0304
gender 	 4 	 0.01 	 0.0243
gender 	 5 	 0.01 	 0.0316
gender 	 6 	 0.0 	 0.0118
gender 	 7 	 0.0 	 0.0042
lifedur 	 3 	 0.16 	 0.0268
lifedur 	 4 	 0.17 	 0.0175
lifedur 	 5 	 0.14 	 0.0227
deathplace 	 3 	 0.49 	 0.1083
deathplace 	 4 	 0.24 	 0.1269
deathplace 	 5 	 0.15 	 0.0888
deathplace 	 6 	 0.18 	 0.0694
birthplace 	 3 	 0.51 	 0.1004
birthplace 	 4 	 0.43 	 0.0519
birthplace 	 5 	 0.37 	 0.0263
worklocation 	 3 	 0.35 	 0.0823
worklocation 	 4 	 0.46 	 0.1694
worklocation 	 5 	 0.54 	 0.1156
worklocation 	 6 	 0.48 	 0.1228
occupation 	 3 	 0.44 	 0.143
occupation 	 4 	 0.53 	 0.0979
occupation 	 

### 4. MFV baseline

#### 4.1 Prepare

In [56]:
mfv_data_loc = 'mfv_data'
with open('%s/%s' % (mfv_data_loc, givens_file), 'rb') as f:
    mfv_givens=pickle.load(f)
with open('%s/%s' % (mfv_data_loc, predicted_file), 'rb') as f:
    mfv_preds = pickle.load(f)

In [57]:
new_mfv_givens=map_givens(mfv_givens)

In [58]:
new_mfv_preds=[]
for a_row in mfv_preds:
    new_row={}
    for prop, vals in a_row.items():
        prop=properties_mapping[prop]
        for k,v in vals.items():
            new_k=mappings[k]
            new_row[prop]={new_k: 1.0}
    new_mfv_preds.append(new_row)

#### 4.2 Evaluate

In [62]:
for metric in ['js_div','js_dist']:
    mfv_divs, div_known_mfv, uncertainties = \
        compute_divergencies(new_mfv_preds, new_crowd_preds, mfv_givens, props, metric=metric)
    mean_values=compute_mean_per_prop(mfv_divs)
    print(metric, mean_values)

js_div {'century': 0.14, 'religion': 0.05, 'gender': 0.04, 'deathplace': 0.52, 'lifedur': 0.29, 'birthplace': 0.39, 'worklocation': 0.49, 'occupation': 0.38, 'educatedat': 0.4, 'politicalparty': 0.16}
js_dist {'century': 0.36, 'religion': 0.19, 'gender': 0.16, 'deathplace': 0.71, 'lifedur': 0.53, 'birthplace': 0.59, 'worklocation': 0.68, 'occupation': 0.6, 'educatedat': 0.62, 'politicalparty': 0.38}


### 5. NB baseline

#### 5.1 Prepare

In [64]:
nb_data_loc = 'nb_data'
with open('%s/%s' % (nb_data_loc, predicted_file), 'rb') as f:
    nb_preds = pickle.load(f)

In [65]:
nb_givens=new_mfv_givens

In [66]:
new_nb_preds=[]
for x in range(len(new_crowd_givens)):
    new_row={}
    these_givens=new_crowd_givens[x]
    for prop in nb_preds.keys():
        new_prop=properties_mapping[prop]
        if new_prop not in these_givens.keys():
            pred=nb_preds[prop][x]
            if pred in mappings.keys():
                new_k=mappings[pred]
            else:
                new_k='None of the above'
            new_row[new_prop]={new_k: 1.0}
    new_nb_preds.append(new_row)

#### 5.2 Evaluate

In [68]:
for metric in ['js_div','js_dist']:
    nb_divs, div_known_nb, uncertainties = \
        compute_divergencies(new_nb_preds, new_crowd_preds, nb_givens, props, metric=metric)
    mean_values=compute_mean_per_prop(nb_divs)
    print(metric, mean_values)

js_div {'century': 0.36, 'religion': 0.05, 'gender': 0.04, 'deathplace': 0.47, 'lifedur': 0.36, 'birthplace': 0.46, 'worklocation': 0.49, 'occupation': 0.48, 'educatedat': 0.4, 'politicalparty': 0.17}
js_dist {'century': 0.57, 'religion': 0.19, 'gender': 0.16, 'deathplace': 0.67, 'lifedur': 0.58, 'birthplace': 0.65, 'worklocation': 0.68, 'occupation': 0.68, 'educatedat': 0.62, 'politicalparty': 0.39}


### 6. Compute correlation to entropy

In [71]:
entropy=[0.922, 1.256, 0.701, 2.403, 2.676, 2.477, 2.802, 2.903, 2.907, 0.998]
norm_entropy=[0.397, 0.628, 0.701, 0.801, 0.806, 0.826, 0.844, 0.916, 0.917, 0.998]
js_dist_ae=[0.32, 0.27, 0.13, 0.44, 0.31, 0.5, 0.5, 0.59, 0.5, 0.23]
js_dist_mfv=[0.36, 0.19, 0.16, 0.71, 0.53, 0.59, 0.68, 0.6, 0.62, 0.38]
js_dist_nb=[0.57, 0.19, 0.16, 0.67, 0.58, 0.65, 0.68, 0.68, 0.62, 0.39]

for metric in [spearmanr, pearsonr]:
    print('entropy<->mfv', metric(entropy, js_dist_mfv))
    print('norm_entropy<->mfv', metric(norm_entropy, js_dist_mfv))

    print('entropy<->nb', metric(entropy, js_dist_nb))
    print('norm_entropy<->nb', metric(norm_entropy, js_dist_nb))

    print('entropy<->ae', metric(entropy, js_dist_ae))
    print('norm_entropy<->ae', metric(norm_entropy, js_dist_ae))

entropy<->mfv SpearmanrResult(correlation=0.7333333333333332, pvalue=0.01580059625057158)
norm_entropy<->mfv SpearmanrResult(correlation=0.4787878787878787, pvalue=0.1615229280174558)
entropy<->nb SpearmanrResult(correlation=0.7538028741632801, pvalue=0.011794786289983676)
norm_entropy<->nb SpearmanrResult(correlation=0.41337576970244394, pvalue=0.23506197663269415)
entropy<->ae SpearmanrResult(correlation=0.8160123484559988, pvalue=0.003989144395287169)
norm_entropy<->ae SpearmanrResult(correlation=0.3865321650581048, pvalue=0.26986858257602425)
entropy<->mfv (0.8810520967513815, 0.0007568813931480064)
norm_entropy<->mfv (0.4976644015184615, 0.14327876547472712)
entropy<->nb (0.7967720458198583, 0.005792592867841871)
norm_entropy<->nb (0.2809775498453526, 0.43163095547696295)
entropy<->ae (0.8632830977819409, 0.0012917441642871187)
norm_entropy<->ae (0.36787375952426543, 0.2956223768519687)
