In [148]:
# Dataset partition to analyze; it is interpolated into the data paths below.
partition='full'
# Source of the participant properties: 'gold' selects the annotated input,
# any other value selects the automatically extracted data.
extractor='gold'
# Baseline identifiers. Not referenced in the cells visible here —
# presumably consumed by the baseline analysis; TODO confirm.
baselines={'exact', 'noclash'}

In [149]:
import glob
import json
import pickle
from collections import defaultdict, Counter
from itertools import combinations

In [150]:
# Input directory holding the annotated ("gold") participant properties for the partition.
gold_prop_dir='../data/input/%s/annotation' % partition
# Input directory holding the automatically extracted participant properties.
auto_prop_dir='../systems/extracted_data/%s' % partition
# NOTE(review): the %s placeholder is left unfilled here — presumably it is
# formatted with the partition later; confirm, it is not used in the visible cells.
gold_file='../data/gold/%s/participants.json'
# Presumably the output locations of the baselines and of the profiler for the
# chosen extractor/partition — TODO confirm; neither is used in the visible cells.
baseline_dir='../data/system/%s/%s/' % (extractor, partition)
profiler_dir='../data/system/%s_profiling/%s' % (extractor, partition)

In [151]:
# Read participant properties from the gold annotations when the gold
# extractor is selected; otherwise use the automatically extracted ones.
data_dir = gold_prop_dir if extractor == 'gold' else auto_prop_dir

# Questions we answer here
* Number of pairwise comparisons
* Number of non-distinguishable pairwise comparisons
    * How many of these are actually the same entity?
* From the ones that are distinguishable
    * How many are resolved by the baselines?
    * How many are resolved by the profiler?
    * Is there any correlation between the distance and the usefulness of the profiler?


### 1. Count the property combinations that are unique or shared by N participants

In [152]:
# Property names that together form a participant's full profile.
# The order matters: create_key emits the values in exactly this order.
keys=['Residence', 'Ethnicity', 'EducationLevel', 'MedicalCondition', 'BirthPlace', 'Gender', 
      'Age', 'Religion', 'PastConviction', 'CauseOfDeath', 'DeathPlace', 'DeathDate', 'Name']

In [153]:
def create_key(props, keys):
    """Build a hashable profile key from selected properties of one participant.

    For every name in ``keys`` (in order) the whitespace-stripped value found
    in ``props`` is used; a property that is absent contributes ``''``.

    Returns:
        A tuple with exactly one entry per name in ``keys``.
    """
    return tuple(
        props[prop_name].strip() if prop_name in props else ''
        for prop_name in keys
    )

In [154]:
def get_distribution(data):
    """Count how many groups exist for each group size.

    Args:
        data: mapping whose values are sized collections (the groups).

    Returns:
        A ``defaultdict(int)`` mapping group size -> number of groups of that size.
    """
    size_histogram = defaultdict(int)
    for members in data.values():
        size_histogram[len(members)] += 1
    return size_histogram

In [155]:
# For every input file, group the participants by their full property profile
# and report how many distinct profiles exist and how many participants share each.
# sorted(): glob returns paths in arbitrary order; sorting keeps the report reproducible.
for input_file in sorted(glob.glob('%s/*.*' % data_dir)):
    # NOTE(review): pickle.load can execute arbitrary code — only run this on trusted files.
    with open(input_file, 'rb') as f:
        data = pickle.load(f)
    # The file is closed here; everything below works on the in-memory data.
    grouped_by_props = defaultdict(list)
    for doc_data in data.values():
        for part_id, part_props in doc_data.items():
            grouped_by_props[create_key(part_props, keys)].append(part_id)
    print(input_file)
    print('Number of unique combinations:', len(grouped_by_props))
    print('Distribution', json.dumps(get_distribution(grouped_by_props), sort_keys=True))

../data/input/full/annotation/participants_input.p
Number of unique combinations: 5358
Distribution {"1": 1911, "2": 1643, "3": 870, "4": 358, "5": 188, "6": 114, "7": 61, "8": 37, "9": 16, "10": 135, "11": 9, "12": 5, "17": 9, "28": 1, "35": 1}
../data/input/full/annotation/participants_samefirstname.p
Number of unique combinations: 5204
Distribution {"1": 1829, "2": 1604, "3": 844, "4": 356, "5": 188, "6": 119, "7": 67, "8": 40, "9": 18, "10": 108, "11": 10, "12": 5, "13": 2, "15": 2, "17": 9, "24": 1, "77": 1, "297": 1}
../data/input/full/annotation/participants_samename.p
Number of unique combinations: 2230
Distribution {"1": 649, "2": 632, "3": 337, "4": 175, "5": 107, "6": 71, "7": 39, "8": 28, "9": 20, "10": 40, "11": 18, "12": 8, "13": 9, "14": 8, "15": 5, "16": 8, "17": 5, "18": 3, "19": 3, "20": 4, "21": 3, "22": 6, "24": 3, "26": 1, "27": 2, "29": 1, "30": 4, "31": 3, "34": 1, "37": 1, "38": 2, "40": 1, "43": 1, "45": 1, "47": 1, "51": 2, "52": 1, "55": 1, "60": 1, "67": 1, 

### 2. Distinguishability

In [156]:
def analyze_grouped_data(grouped_by_name):
    """Report how distinguishable same-name participants are by their properties.

    Args:
        grouped_by_name: mapping name -> {participant id -> property sequence}.
            The property sequences of one name are expected to share length
            and order (as produced by ``create_key``); ``''`` marks a missing
            value.

    Every unordered pair of participants sharing a name is counted once and
    classified as:
      * indistinguishable - both property sequences are equal;
      * distinguishable - they differ. Such a pair is additionally counted as
        "sufficient to distinguish" when at least one position holds two
        different non-empty values.

    Prints the totals and returns None. When there is no pair to compare the
    percentage is reported as 0 instead of dividing by zero.
    """
    total_comparisons = 0
    indistinguishable = 0
    distinguishable = 0
    sufficient_to_distinguish = 0

    for parts in grouped_by_name.values():
        # Participants with identical profiles behave identically in every
        # comparison, so pairs are counted per distinct profile instead of
        # visiting every participant pair (which is quadratic in group size).
        profile_counts = Counter(tuple(profile) for profile in parts.values())

        group_size = len(parts)
        total_comparisons += group_size * (group_size - 1) // 2
        # Pairs drawn from the same profile are indistinguishable.
        indistinguishable += sum(c * (c - 1) // 2 for c in profile_counts.values())

        for (profile1, count1), (profile2, count2) in combinations(profile_counts.items(), 2):
            pairs = count1 * count2
            distinguishable += pairs
            # A clash needs two *known* values that differ; a missing value
            # ('') on either side is not evidence of a different person.
            if any(e1 != e2 and e1 != '' and e2 != '' for e1, e2 in zip(profile1, profile2)):
                sufficient_to_distinguish += pairs

    profiler_role = distinguishable - sufficient_to_distinguish
    # Guard against an empty comparison set (no name shared by two participants).
    percentage = round(profiler_role * 100 / total_comparisons) if total_comparisons else 0
    print('Total pairwise comparisons for a same name', total_comparisons)
    print('Out of these, %d are not distinguishable' % indistinguishable)
    print('%d are distinguishable. From these, for %d there is already enough information in the properties to distinguish.' % 
          (distinguishable, sufficient_to_distinguish))
    print('Hence, the profiler can only help in %d (%d %%) cases.' % (profiler_role, percentage))

In [157]:
# Properties compared in the distinguishability analysis.
# Alternative kept for reference: compare on the full `keys` list instead.
#chosen_keys=keys
# NOTE(review): 'Occupation' is not part of the `keys` list defined earlier —
# confirm that this property actually occurs in the data.
chosen_keys=['Ethnicity', 'Gender', 'Age', 'Religion', 'CauseOfDeath', 'Occupation']

In [159]:
# For every input file, group the participants by their (stripped) name and
# compare the chosen properties of all same-name participants pairwise.
# sorted(): glob returns paths in arbitrary order; sorting keeps the report reproducible.
for input_file in sorted(glob.glob('%s/*.*' % data_dir)):
    # NOTE(review): pickle.load can execute arbitrary code — only run this on trusted files.
    with open(input_file, 'rb') as f:
        data = pickle.load(f)
    # The file is closed here; everything below works on the in-memory data.
    grouped_by_name = defaultdict(dict)
    for doc_data in data.values():
        for part_id, part_props in doc_data.items():
            # Participants without a non-blank name cannot be grouped; skip them.
            name = part_props.get('Name', '').strip()
            if not name:
                continue
            grouped_by_name[name][part_id] = create_key(part_props, chosen_keys)
    print(input_file)
    analyze_grouped_data(grouped_by_name)

../data/input/full/annotation/participants_input.p
Total pairwise comparisons for a same name 23927
Out of these, 21994 are not distinguishable
1933 are distinguishable. From these, for 134 there is already enough information in the properties to distinguish.
Hence, the profiler can only help in 1799 (8 %) cases.
../data/input/full/annotation/participants_samefirstname.p
Total pairwise comparisons for a same name 190774
Out of these, 68790 are not distinguishable
121984 are distinguishable. From these, for 72984 there is already enough information in the properties to distinguish.
Hence, the profiler can only help in 49000 (26 %) cases.
../data/input/full/annotation/participants_samename.p


KeyboardInterrupt: 

### 3. Next steps
* How many of the open cases are the same entity, and how many are not?
* How does this relate to the exact baseline?
* How does this relate to the noclash baseline?
* How does this relate to the profiler behavior?
* Is there any correlation between the property distance and the usefulness of the profiler? (This might be tricky to quantify.)


#### Analyze the performance of the baselines on the distinguishable ones

#### Analyze the performance of the profiler on the distinguishable ones