In [1]:
# Run configuration: the partition and extractor names are interpolated into
# the data paths below; `baselines` lists the baseline systems to load.
partition = 'full'
extractor = 'auto'
baselines = {'exact', 'noclash'}

In [2]:
import glob
import json
import os
import pickle
from collections import defaultdict, Counter
from itertools import combinations

In [3]:
# Input directories with participant properties: gold annotations vs.
# automatically extracted ones.
gold_prop_dir = '../data/input/%s/annotation' % partition
auto_prop_dir = '../systems/extracted_data/%s' % partition

# Gold file for the chosen partition.
gold_file = '../data/gold/%s/participants.json' % partition

# System output directories (baselines and profiler) for the chosen extractor.
baseline_dir = '../data/system/%s/%s/' % (extractor, partition)
profiler_dir = '../data/system/%s_profiling/%s' % (extractor, partition)

In [4]:
# Read participant properties from the gold annotations only when the gold
# extractor is selected; otherwise use the automatically extracted data.
data_dir = gold_prop_dir if extractor == 'gold' else auto_prop_dir

### 1. Count the property combinations that are unique or shared by N participants

In [5]:
# Property names used to build a participant's comparison key. The order is
# significant: create_key() emits one tuple slot per entry, in this order.
keys = [
    'Residence',
    'Ethnicity',
    'EducationLevel',
    'MedicalCondition',
    'BirthPlace',
    'Gender',
    'Age',
    'Religion',
    'PastConviction',
    'CauseOfDeath',
    'DeathPlace',
    'DeathDate',
    'Name',
]

In [6]:
def create_key(props, keys):
    """Build a tuple holding the stripped value of each property in `keys`.

    A property that is absent from `props` contributes an empty string, so the
    result always has exactly one slot per entry of `keys`, in the same order.
    """
    return tuple(
        props[name].strip() if name in props else ''
        for name in keys
    )

In [7]:
def get_distribution(data):
    """Return a histogram mapping group size -> number of groups of that size.

    `data` maps a key to a sized collection; only the collection lengths are
    used. The result is a defaultdict(int).
    """
    size_histogram = defaultdict(int)
    for members in data.values():
        size_histogram[len(members)] += 1
    return size_histogram

In [8]:
# NOTE(security): pickle.load can execute arbitrary code; only run this on
# trusted input files.
# glob returns matches in arbitrary order, so sort them to keep the printed
# results reproducible across runs and machines.
for input_file in sorted(glob.glob('%s/*.*' % data_dir)):
    with open(input_file, 'rb') as f:
        data = pickle.load(f)

    # Group participant ids by their full property tuple. The nested loops
    # treat `data` as a mapping whose values map participant ids to property dicts.
    grouped_by_props = defaultdict(list)
    for doc_data in data.values():
        for part_id, part_props in doc_data.items():
            k = create_key(part_props, keys)
            grouped_by_props[k].append(part_id)

    print(input_file)
    print('Number of unique combinations:', len(grouped_by_props))
    print('Distribution', json.dumps(get_distribution(grouped_by_props), sort_keys=True))

../systems/extracted_data/full/participants_input.p
Number of unique combinations: 8064
Distribution {"1": 5718, "2": 1253, "3": 512, "4": 192, "5": 111, "6": 71, "7": 40, "8": 26, "9": 28, "10": 102, "11": 3, "12": 1, "13": 1, "16": 2, "17": 2, "21": 1, "41": 1}
../systems/extracted_data/full/participants_samefirstname.p
Number of unique combinations: 6848
Distribution {"1": 4894, "2": 932, "3": 391, "4": 177, "5": 110, "6": 72, "7": 50, "8": 34, "9": 20, "10": 79, "11": 14, "12": 7, "13": 12, "14": 6, "15": 8, "16": 2, "17": 8, "18": 2, "19": 1, "20": 1, "21": 5, "22": 2, "23": 3, "24": 1, "25": 1, "27": 1, "31": 1, "32": 2, "33": 1, "38": 2, "39": 1, "54": 1, "63": 1, "67": 1, "75": 1, "76": 1, "84": 1, "90": 1, "355": 1}
../systems/extracted_data/full/participants_samename.p
Number of unique combinations: 1600
Distribution {"1": 1149, "2": 199, "3": 64, "4": 37, "5": 16, "6": 13, "7": 14, "8": 12, "9": 7, "10": 10, "11": 9, "12": 6, "13": 8, "14": 4, "15": 4, "16": 4, "17": 1, "18"

### 2. Distinguishability
If we group all data per name, how many comparisons are there to be made?
How many comparisons are useless because of the property sets being exactly the same (`indistinguishable`)?
How many comparisons have some distinguishing properties (`distinguishable`)? Out of these, how often is the information in these properties directly comparable and sufficient to discriminate (`sufficient`)?

In [9]:
def analyze_grouped_data(grouped_by_name):
    """Classify every pair of participants that share a name.

    Parameters
    ----------
    grouped_by_name : mapping
        name -> {participant id: property tuple}. The tuples of participants
        under one name are compared slot by slot, so they are expected to have
        the same length.

    Returns
    -------
    dict
        Three sets of (larger_id, smaller_id) pairs:
        - 'set_indistinguishable': the two property tuples are equal;
        - 'set_distinguishable': the two property tuples differ;
        - 'set_sufficient': subset of the distinguishable pairs where at least
          one slot is non-empty on both sides and has different values.

    A textual summary of the counts is printed as a side effect. When there
    are no pairs at all, the reported percentage is 0 instead of raising
    ZeroDivisionError.
    """
    total_comparisons = 0
    indistinguishable = 0
    distinguishable = 0
    sufficient_to_distinguish = 0

    set_indistinguishable = set()
    set_distinguishable = set()
    set_sufficient = set()

    for parts in grouped_by_name.values():
        # combinations() visits each unordered pair exactly once, instead of
        # walking the full n x n grid and discarding half of it.
        for (id_a, data_a), (id_b, data_b) in combinations(parts.items(), 2):
            # Keep the established pair orientation: the larger id comes first.
            pair = (id_a, id_b) if id_a > id_b else (id_b, id_a)
            total_comparisons += 1

            if data_a == data_b:
                indistinguishable += 1
                set_indistinguishable.add(pair)
                continue

            distinguishable += 1
            set_distinguishable.add(pair)
            # A slot only discriminates when both values are known and differ;
            # an empty string means the property is missing for that participant.
            if any(v1 != v2 and v1 != '' and v2 != ''
                   for v1, v2 in zip(data_a, data_b)):
                sufficient_to_distinguish += 1
                set_sufficient.add(pair)

    profiler_role = distinguishable - sufficient_to_distinguish
    # Guard against an empty input (no name shared by two participants).
    if total_comparisons:
        profiler_pct = round(profiler_role * 100 / total_comparisons)
    else:
        profiler_pct = 0

    print('Total pairwise comparisons for a same name', total_comparisons)
    print('Out of these, %d are not distinguishable' % indistinguishable)
    print('%d are distinguishable. From these, for %d there is already enough information in the properties to distinguish.' % 
          (distinguishable, sufficient_to_distinguish))
    print('Hence, the profiler can only help in %d (%d %%) cases.' % (profiler_role, profiler_pct))

    sets = {'set_indistinguishable': set_indistinguishable,
            'set_distinguishable': set_distinguishable,
            'set_sufficient': set_sufficient}
    return sets

In [10]:

# With the automatic extractor every property in `keys` is compared; otherwise
# only this fixed subset of properties is used.
chosen_keys = (
    keys
    if extractor == 'auto'
    else ['Ethnicity', 'Gender', 'Age', 'Religion', 'CauseOfDeath', 'Occupation']
)

In [11]:
def get_filename(f):
    """Return the base name of path `f`, cut at its first dot.

    Uses os.path.basename so the platform's path separator is honoured
    instead of assuming '/'. Everything from the first dot of the base name
    onwards is dropped, e.g. 'dir/archive.tar.gz' -> 'archive'.
    """
    return os.path.basename(f).split('.')[0]

In [12]:
# NOTE(security): pickle.load can execute arbitrary code; only run this on
# trusted input files.
input_sets = {}
# glob returns matches in arbitrary order, so sort them to keep the printed
# results (and the order of `input_sets`) reproducible.
for input_file in sorted(glob.glob('%s/*.*' % data_dir)):
    with open(input_file, 'rb') as f:
        data = pickle.load(f)

    # name -> {participant id: property tuple}. Participants whose name is
    # missing or blank are skipped.
    grouped_by_name = defaultdict(dict)
    for doc_data in data.values():
        for part_id, part_props in doc_data.items():
            name = part_props.get('Name', '').strip()
            if not name:
                continue
            k = create_key(part_props, chosen_keys)
            grouped_by_name[name][part_id] = k

    print(input_file)
    filename = get_filename(input_file)
    input_sets[filename] = analyze_grouped_data(grouped_by_name)

../systems/extracted_data/full/participants_input.p
Total pairwise comparisons for a same name 23927
Out of these, 15133 are not distinguishable
8794 are distinguishable. From these, for 563 there is already enough information in the properties to distinguish.
Hence, the profiler can only help in 8231 (34 %) cases.
../systems/extracted_data/full/participants_samefirstname.p
Total pairwise comparisons for a same name 190774
Out of these, 106049 are not distinguishable
84725 are distinguishable. From these, for 9245 there is already enough information in the properties to distinguish.
Hence, the profiler can only help in 75480 (40 %) cases.
../systems/extracted_data/full/participants_samename.p
Total pairwise comparisons for a same name 90511785
Out of these, 40533195 are not distinguishable
49978590 are distinguishable. From these, for 6247221 there is already enough information in the properties to distinguish.
Hence, the profiler can only help in 43731369 (48 %) cases.
../systems/extr

### 3. Next steps
How many of the open cases are same vs not the same?

How does this relate to the exact baseline?

How does this relate to the noclash baseline?

How does this relate to the profiler behavior?

Is there any correlation between the distance between property sets and the usefulness of the profiler? (This might be tricky to quantify.)




#### 3.1 Load gold and system data

In [13]:
def load_system_datasets(the_dir):
    """Load every file matching `<the_dir>/*.*` as JSON.

    Returns a dict keyed by the file's base name cut at its first dot
    (e.g. 'participants_input.json' -> 'participants_input'). The directory
    and each loaded path are printed as a progress trace.

    Raises whatever json.load raises if a matched file is not valid JSON.
    """
    the_data = {}
    print(the_dir)
    for path in glob.glob('%s/*.*' % the_dir):
        with open(path, 'rb') as handle:
            print(path)
            # os.path.basename honours the platform's path separator instead
            # of assuming '/'.
            fname = os.path.basename(path).split('.')[0]
            the_data[fname] = json.load(handle)
    return the_data

In [14]:
# load gold file
# Gold data for the chosen partition.
with open(gold_file, 'rb') as gf:
    gold_data = json.load(gf)

# Profiler output, plus the output of each requested baseline (read from the
# baseline's 'p10' subdirectory).
profiler_data = load_system_datasets(profiler_dir)
baseline_data = {}
for b in baselines:
    baseline_data[b] = load_system_datasets('%s%s/p10' % (baseline_dir, b))

../data/system/auto_profiling/full
../data/system/auto_profiling/full/participants_samelastname.json
../data/system/auto_profiling/full/participants_samefirstname.json
../data/system/auto_profiling/full/participants_samename.json
../data/system/auto_profiling/full/extracted_data.json
../data/system/auto_profiling/full/participants_input.json
../data/system/auto/full/noclash/p10
../data/system/auto/full/noclash/p10/participants_samelastname.json
../data/system/auto/full/noclash/p10/participants_samefirstname.json
../data/system/auto/full/noclash/p10/participants_samename.json
../data/system/auto/full/noclash/p10/extracted_data.json
../data/system/auto/full/noclash/p10/participants_input.json
../data/system/auto/full/exact/p10
../data/system/auto/full/exact/p10/participants_samelastname.json
../data/system/auto/full/exact/p10/participants_samefirstname.json
../data/system/auto/full/exact/p10/participants_samename.json
../data/system/auto/full/exact/p10/extracted_data.json
../data/system/

#### 3.2 Analyze how many of the (in)distinguishable pairs are the same

In [15]:
def analyze_performance_on_pairs(pairs):
    """Count how many id pairs map to the same vs. a different gold value.

    Looks both ids of each pair up in the module-level `gold_data` and returns
    the tuple (same, different).
    """
    same = different = 0
    for first_id, second_id in pairs:
        first_cluster = gold_data[first_id]
        second_cluster = gold_data[second_id]
        if first_cluster != second_cluster:
            different += 1
        else:
            same += 1
    return same, different

In [16]:
# For every input file and every pair category, report how many pairs are the
# same vs. different according to the gold data.
separator = '*' * 70
for filename, sets in input_sets.items():
    for set_name, the_set in sets.items():
        same, different = analyze_performance_on_pairs(the_set)
        print(separator)
        print(filename, set_name)
        print('Same: %d \nDifferent: %d' % (same, different))

**********************************************************************
participants_input set_indistinguishable
Same: 13949 
Different: 1184
**********************************************************************
participants_input set_distinguishable
Same: 6987 
Different: 1807
**********************************************************************
participants_input set_sufficient
Same: 398 
Different: 165
**********************************************************************
participants_samefirstname set_indistinguishable
Same: 13949 
Different: 92100
**********************************************************************
participants_samefirstname set_distinguishable
Same: 6987 
Different: 77738
**********************************************************************
participants_samefirstname set_sufficient
Same: 398 
Different: 8847
**********************************************************************
participants_samename set_indistinguishable
Same: 13949 
Different: 40519246
*****

#### 3.3 Analyze the system performance on the (in)distinguishable ones

In [17]:
def get_decision(id1, id2, data):
    """Return True when `data` maps `id1` and `id2` to equal values."""
    return data[id1] == data[id2]

In [18]:
def scores_vs_identity(gold_data, sys_data, pairs):
    """Tally system/gold agreement over `pairs`, split by the gold decision.

    Returns (acc_counts, total_counts), both defaultdict(int):
    - total_counts[gold_same] counts the pairs per gold decision;
    - acc_counts[(decision, decision)] counts the pairs on which the system
      decision equals the gold decision.
    """
    acc_counts = defaultdict(int)
    total_counts = defaultdict(int)

    for first_id, second_id in pairs:
        sys_same = get_decision(first_id, second_id, sys_data)
        gold_same = get_decision(first_id, second_id, gold_data)

        total_counts[gold_same] += 1
        if sys_same == gold_same:
            # System and gold made the same same/different call.
            acc_counts[(sys_same, gold_same)] += 1

    return acc_counts, total_counts

In [19]:
def compute_acc(acc, total, gold_same=False):
    """Return the accuracy (in percent) on pairs with the given gold decision.

    Parameters
    ----------
    acc : mapping
        (decision, decision) -> number of pairs the system got right.
    total : mapping
        decision -> number of pairs with that gold decision.
    gold_same : bool
        Which gold decision to score.

    Returns
    -------
    float or int
        Percentage of correct pairs, or -1 when there is no pair with the
        requested gold decision.

    Lookups use .get so that plain dicts are accepted and defaultdict
    counters are not modified by merely reading them.
    """
    n_pairs = total.get(gold_same, 0)
    if n_pairs > 0:
        return acc.get((gold_same, gold_same), 0) * 100.0 / n_pairs
    return -1

In [20]:
# System output to evaluate below. Swap in a baseline to score it instead:
# sys_data = baseline_data['exact']
sys_data = profiler_data

In [21]:
# Per input file and pair category, print the system's accuracy separately for
# pairs that are the same and pairs that are different in the gold data.
for filename, sets in input_sets.items():
    print('#' * 70)
    # Input files without a matching system output are skipped (the separator
    # line above is still printed for them).
    if filename in sys_data.keys():
        sys_predictions = sys_data[filename]
        for set_name, the_set in sets.items():
            print(filename, set_name)
            acc_counts, total_counts = scores_vs_identity(gold_data, sys_predictions, the_set)
            gold_same_acc = compute_acc(acc_counts, total_counts, gold_same=True)
            gold_diff_acc = compute_acc(acc_counts, total_counts, gold_same=False)
            print('Gold same acc: %.2f%%\nGold diff acc: %.2f%%' % (gold_same_acc, gold_diff_acc))
            print('*' * 50)

######################################################################
participants_input set_indistinguishable
Gold same acc: 100.00%
Gold diff acc: 0.00%
**************************************************
participants_input set_distinguishable
Gold same acc: 78.96%
Gold diff acc: 59.60%
**************************************************
participants_input set_sufficient
Gold same acc: 51.26%
Gold diff acc: 74.55%
**************************************************
######################################################################
participants_samefirstname set_indistinguishable
Gold same acc: 100.00%
Gold diff acc: 0.00%
**************************************************
participants_samefirstname set_distinguishable
Gold same acc: 64.41%
Gold diff acc: 62.46%
**************************************************
participants_samefirstname set_sufficient
Gold same acc: 44.47%
Gold diff acc: 70.44%
**************************************************
####################################