In [1]:
import pickle
import glob
import json
import sys
from collections import defaultdict
import itertools
import numpy as np
from scipy.stats import entropy

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import DBSCAN

In [2]:
import cluster_utils as cu

In [3]:
# Run configuration; both values are interpolated into the input/output paths
# and decide whether get_property_tuple normalizes the extracted values.
which_extractor='gold' # attribute source: 'gold' or 'auto'
which_partition='partial' # data partition: 'partial' or 'full'

In [4]:
# Pickles with the profiler's given rows and predictions for this extractor/partition.
profiled_dir='profiler_output'
givens_path='%s/%s_%s_given.pkl' % (profiled_dir, which_extractor, which_partition)
predicted_path='%s/%s_%s_predicted.pkl' % (profiled_dir, which_extractor, which_partition)

# Where participant pickles are read from and where cluster JSON is written to.
# Any extractor value other than 'gold' falls through to the "auto" directories.
if which_extractor=='gold':
    input_dir='../data/input/%s/annotation' % which_partition
    output_dir='../data/system/gold_profiling/%s' % which_partition
else:
    input_dir='extracted_data/%s' % which_partition
    output_dir='../data/system/auto_profiling/%s' % which_partition

In [5]:
# Ordered list of profiled properties. The order is significant:
# get_property_tuple writes into positions 1, 2, 3, 4, 6 and 7 of this list by index.
properties=['native language' , 'ethnic group', 'cause of death', 'sex or gender', 'religion', 'member of political party', 'occupation', 'age group']

In [6]:
# Load the per-property value mappings from JSON.
value_mapping_file='../resources/gv_mappings.json'
with open(value_mapping_file, 'r') as f:
    wikidata_to_labels=json.load(f)

# prop_vals: property -> list of its distinct labels plus '' (the "no value" label).
# compute_js_divergences passes these lists to map_values as the value domain,
# so a label's position in the list is its index in the probability vectors.
prop_vals={}
for prop, vals in wikidata_to_labels.items():
    prop_vals[prop]=list(set(vals.values())) + ['']

In [7]:
# Set the 'age group' domain by hand; the labels are the ones group_age returns, plus ''.
prop_vals['age group']=['child 0-11', 'teen 12-17', 'adult 18-64', 'senior 65+', '']
# Add the extra ethnicity label. Guarded so that re-running this cell does not
# keep appending duplicates to the same list.
if 'Hispanic/Latin' not in prop_vals['ethnic group']:
    prop_vals['ethnic group'].append('Hispanic/Latin')

### 1. Parse profiling result

#### 1a. Load the profiling predictions

In [8]:
def load_profiler_predictions(givens_file, predicted_file, properties):
    """Index the profiler's predictions by the tuple of values that were given.

    Args:
        givens_file: path to a pickle holding a sequence of dicts (one per row)
            with the property values that were supplied to the profiler.
        predicted_file: path to a pickle holding, per property, a sequence of
            predictions indexed by row number.
        properties: ordered property names; defines the layout of the keys.

    Returns:
        dict mapping a tuple (one slot per property, '' where the row has no
        value) to a dict {property: predicted[property][row]} covering only the
        properties the row did not supply. A later row with the same key
        replaces an earlier one.
    """
    # NOTE: pickle.load can execute code embedded in the file; use trusted files only.
    with open(givens_file, 'rb') as givens_handle:
        givens=pickle.load(givens_handle, encoding='latin1')
    with open(predicted_file, 'rb') as predicted_handle:
        predicted=pickle.load(predicted_handle, encoding='latin1')

    data={}
    for row_number, known in enumerate(givens):
        key=tuple(known[prop] if prop in known else '' for prop in properties)
        # predictions are only kept for the properties that were not given
        inferred={prop: predicted[prop][row_number] for prop in properties if prop not in known}
        data[key]=inferred
    return data

In [9]:
# Key: tuple of given values (one slot per entry of `properties`);
# value: predictions for the properties that were not given.
profiled_data=load_profiler_predictions(givens_path, predicted_path, properties)

In [10]:
# Show the given-value tuples for which predictions are available.
profiled_data.keys()

dict_keys([('', '', '', 'male', '', '', '', 'senior 65+'), ('', '', '', 'male', '', '', '', 'adult 18-64'), ('', '', 'Intentional', 'male', '', '', '', 'teen 12-17'), ('', '', '', 'male', '', '', '', 'teen 12-17'), ('', '', 'Accidental', 'male', '', '', '', 'teen 12-17'), ('', '', 'Accidental', 'male', '', '', '', 'child 0-11'), ('', '', 'Suicide', 'male', 'Christianity', '', '', 'adult 18-64'), ('', '', 'Intentional', 'female', 'Christianity', '', '', 'child 0-11'), ('', '', 'Intentional', 'female', 'Christianity', '', '', 'teen 12-17'), ('', '', 'Intentional', 'female', 'Christianity', '', '', 'adult 18-64'), ('', '', '', 'male', 'Christianity', '', '', 'child 0-11'), ('', '', '', 'male', '', '', '', 'child 0-11'), ('', '', 'Accidental', 'male', 'Christianity', '', '', 'teen 12-17'), ('', '', '', 'female', '', '', '', 'child 0-11'), ('', '', 'Accidental', 'female', '', '', '', 'teen 12-17'), ('', '', '', 'female', '', '', '', 'adult 18-64'), ('', '', 'Suicide', 'female', '', '', '', 

#### 1b. Merge with existing data to prepare for clustering

In [11]:
def get_property_tuple(properties, part_data):
    """Build the tuple of known values for one participant record.

    The tuple has one slot per entry of `properties`. Slots are filled from the
    participant fields at fixed positions: 'Ethnicity' -> 1, 'CauseOfDeath' -> 2,
    'Gender' -> 3, 'Religion' -> 4, 'Occupation' -> 6, 'Age' -> 7; every other
    slot stays ''. Unless the module-level settings are which_extractor=='gold'
    and which_partition=='partial', the slots are passed through
    normalize_values before being frozen into a tuple.

    Args:
        properties: ordered property names (only its length is used here).
        part_data: dict of raw string fields for one participant.

    Returns:
        tuple of strings, one per property.
    """
    # lower-cased raw spellings -> canonical label (keys reproduce the raw data spellings)
    ethnicity_aliases={
        'african american': 'African American/Black',
        'white/caucascian': 'White/Caucasian',
        'white': 'White/Caucasian',
        'hispanic/latin': 'Hispanic/Latin',
    }

    slots=['' for _ in properties]

    if 'Ethnicity' in part_data:
        ethnicity=part_data['Ethnicity'].strip()
        slots[1]=ethnicity_aliases.get(ethnicity.lower(), ethnicity)
    if 'CauseOfDeath' in part_data:
        slots[2]=part_data['CauseOfDeath'].strip()
    if 'Gender' in part_data:
        slots[3]=part_data['Gender'].strip().lower()
    if 'Religion' in part_data:
        religion=part_data['Religion'].strip()
        slots[4]='Christianity' if religion.lower()=='christian' else religion
    if 'Occupation' in part_data:
        slots[6]=part_data['Occupation'].strip()
    if 'Age' in part_data:
        slots[7]=part_data['Age'].strip().lower()

    # gold/partial input is used as-is; everything else is normalized first
    needs_normalizing=not (which_extractor=='gold' and which_partition=='partial')
    if needs_normalizing:
        slots=normalize_values(slots)

    return tuple(slots)

In [12]:
def group_age(a):
    """Map a numeric age to its age-group label.

    Ages below 12 are 'child 0-11', below 18 'teen 12-17', below 65
    'adult 18-64'; everything else is 'senior 65+'.
    """
    # (exclusive upper bound, label), checked in ascending order
    brackets=((12, 'child 0-11'), (18, 'teen 12-17'), (65, 'adult 18-64'))
    for upper_bound, label in brackets:
        if a<upper_bound:
            return label
    return 'senior 65+'

In [13]:
def map_occupations(o):
    """Translate a raw occupation string to its profiler label.

    Only the sports-related spellings listed below are recognised (all map to
    'sports player'); any other input yields ''.
    """
    known_occupations={
        'basketball': 'sports player',
        'rugby': 'sports player',
        'football player': 'sports player',
        'sports': 'sports player',
    }
    return known_occupations.get(o, '')

In [14]:
def normalize_values(row, debug=False):
    """Return a normalized copy of a property row.

    Only three positions are touched, and only when they are non-empty:
      * index 2 (cause of death): capitalized; 'Negligent' becomes 'Accidental';
        anything other than 'Intentional', 'Accidental' or 'Suicide' becomes ''.
      * index 7 (age): converted with int() and replaced by its group_age label.
      * index 6 (occupation): replaced by map_occupations(occupation).

    The input sequence is left untouched; a new list is returned. (Previously the
    function aliased `row` and silently rewrote the caller's list.)

    Args:
        row: sequence of strings with at least 8 positions.
        debug: unused; kept for interface compatibility.

    Returns:
        list with the normalized values.

    Raises:
        ValueError: if the age at index 7 is non-empty and int() cannot parse it.
    """
    new_row=list(row)  # work on a copy so the caller's row is not mutated

    cause_of_death=new_row[2]
    if cause_of_death:
        cause=cause_of_death.capitalize()
        if cause=='Negligent':
            cause='Accidental'
        elif cause not in {'Intentional', 'Accidental', 'Suicide'}:
            cause=''
        new_row[2]=cause

    age=new_row[7]
    if age:
        new_row[7]=group_age(int(age))

    occupation=new_row[6]
    if occupation:
        new_row[6]=map_occupations(occupation)

    return new_row

In [15]:
def prepare_profiler_data(input_file, properties, profiled_data, debug=False):
    """Attach profiler predictions to every named participant of a pickle file.

    For each participant with a non-blank 'Name', the tuple of known values is
    built with get_property_tuple and used to look up the predictions in
    `profiled_data`. Known values are then added as a single-entry distribution
    [(value, 1.0)] under their property name.

    Each participant receives its own copy of the prediction dict, so
    `profiled_data` is not modified and participants that share a key do not
    share a dict object.

    Args:
        input_file: path to a pickle of {doc_id: {part_id: part_data}}.
        properties: ordered property names, aligned with the tuple positions.
        profiled_data: dict from given-value tuple to {property: predictions}.
        debug: unused; kept for interface compatibility.

    Returns:
        defaultdict(dict) mapping name -> {part_id: {property: [(value, prob), ...]}}.

    Raises:
        KeyError: if a participant's tuple of known values is not in `profiled_data`.
    """
    # NOTE: pickle.load can execute code embedded in the file; use trusted files only.
    with open(input_file, 'rb') as f:
        participants=pickle.load(f)

    parts_per_name=defaultdict(dict)
    for doc_id, doc_data in participants.items():
        for part_id, part_data in doc_data.items():
            # participants without a usable name cannot be grouped; skip them
            name=part_data.get('Name', '').strip()
            if not name:
                continue

            tuple_input=get_property_tuple(properties, part_data)

            # copy before adding the known values: the looked-up dict is shared
            values=dict(profiled_data[tuple_input])
            for index, t in enumerate(tuple_input):
                if t!='':
                    values[properties[index]]=[(t, 1.0)]

            parts_per_name[name][part_id]=values
    return parts_per_name

### 2. Compute JS divergence

In [16]:
def js(p, q):
    """Jensen-Shannon divergence between two non-negative weight vectors.

    Both inputs are converted to float arrays and normalized to sum to 1, then
    the mean of the two entropy() terms against the midpoint distribution is
    returned (natural logarithm, the scipy default). The inputs themselves are
    not modified because astype() returns a copy.

    Args:
        p, q: array-likes of equal length holding non-negative weights.

    Returns:
        float divergence; 0 for identical distributions. An all-zero input
        divides by a zero sum and yields nan.
    """
    # builtin float instead of the np.float alias, which NumPy 1.24 removed
    p = np.asarray(p).astype(float)
    q = np.asarray(q).astype(float)
    # normalize
    p /= p.sum()
    q /= q.sum()
    m = (p + q) / 2
    return (entropy(p, m) + entropy(q, m)) / 2

In [17]:
def map_values(v1, v2, domain):
    """Spread two sparse (label, weight) lists over a common label domain.

    Args:
        v1, v2: iterables of (label, weight) pairs.
        domain: list of all admissible labels; a label's position in this list
            is its position in the returned vectors.

    Returns:
        Two lists of len(domain); positions without a pair hold 0.

    Raises:
        ValueError: from domain.index() when a label is not in `domain`; a
            diagnostic line is printed just before.
    """
    def _spread(pairs, tag):
        # one dense vector; `tag` identifies the offending argument in the diagnostic
        dense=[0]*len(domain)
        for label, weight in pairs:
            if label not in domain:
                print(tag, label, domain)
            dense[domain.index(label)]=weight
        return dense

    first=_spread(v1, 'key1')
    second=_spread(v2, 'key2')
    return first, second

In [18]:
def compute_js_divergences(c1, c2, properties):
    """Per-property JS divergence between two candidates.

    Args:
        c1, c2: dicts mapping property -> list of (label, weight) pairs.
        properties: properties to compare; each needs a domain in the
            module-level `prop_vals`.

    Returns:
        list of divergences, in the order of `properties`.
    """
    return [
        js(*map_values(c1[prop], c2[prop], prop_vals[prop]))
        for prop in properties
    ]

In [19]:
# DEPRECATED
def cluster_matrix_with_features(matrix, algorithm='ward', max_d=0.3, criterion='distance'):
    """Hierarchical clustering of feature vectors (deprecated).

    Args:
        matrix: observations passed to scipy's linkage().
        algorithm: linkage method name. It used to be ignored in favour of a
            hard-coded 'ward'; it is now honoured, and the default is still 'ward'.
        max_d: threshold handed to fcluster().
        criterion: fcluster() criterion.

    Returns:
        The flat cluster assignment returned by fcluster().
    """
    merges = linkage(matrix, algorithm)
    clusters = fcluster(merges, max_d, criterion=criterion)
    return clusters

In [20]:
def cluster_matrix(distances, eps=0.1, min_samples=1):
    """Run DBSCAN on a precomputed distance matrix.

    Args:
        distances: square matrix of pairwise distances.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN core-point threshold.

    Returns:
        (labels, n_clusters, n_noise): the labels as a list, the number of
        distinct labels not counting the noise label -1, and how many points
        carry the label -1.
    """
    model=DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    assignments=list(model.fit_predict(distances))

    noise_count=assignments.count(-1)
    distinct_labels=len(set(assignments))
    # the noise label is not a cluster of its own
    cluster_count=distinct_labels-1 if noise_count else distinct_labels

    return assignments, cluster_count, noise_count

In [21]:
def compute_clusters(candidates, properties, start_id, eps, agg='avg'):
    """Cluster the candidates that share one name.

    A symmetric matrix of pairwise distances is filled with the aggregated
    per-property JS divergences and handed to cluster_matrix.

    Args:
        candidates: dict part_id -> {property: [(label, weight), ...]}.
        properties: properties to compare.
        start_id: offset added to every cluster label.
        eps: neighbourhood radius forwarded to cluster_matrix.
        agg: 'max' takes the largest per-property divergence; any other value
            takes the mean.

    Returns:
        (clusters_json, new_start_id): dict part_id -> start_id + label, and
        start_id advanced by the number of clusters found.
    """
    part_ids=list(candidates.keys())
    size=len(part_ids)
    # initialize an empty distance matrix; the diagonal stays 0
    dist_matrix=np.zeros(shape=(size, size))

    # visit every unordered pair once and mirror the value
    for (row, left_id), (col, right_id) in itertools.combinations(enumerate(part_ids), 2):
        divs=compute_js_divergences(candidates[left_id], candidates[right_id], properties)
        if agg=='max':
            pair_distance=max(divs)
        else:  # agg=avg
            pair_distance=sum(divs)/len(divs)
        dist_matrix[row, col]=pair_distance
        dist_matrix[col, row]=pair_distance

    # run clustering
    labels, n_clusters, _ = cluster_matrix(dist_matrix, eps=eps)
    clusters_json={
        part_id: start_id+int(label)
        for part_id, label in zip(part_ids, labels)
    }

    return clusters_json, start_id+n_clusters

In [22]:
def perform_clustering(data, properties, eps):
    """Cluster the candidates of every name and merge the assignments.

    Cluster ids start at 1 and keep counting across names, so ids from
    different names do not overlap. The next unused id is printed at the end.

    Args:
        data: dict name -> {part_id: {property: [(label, weight), ...]}}.
        properties: properties to compare.
        eps: neighbourhood radius forwarded to compute_clusters.

    Returns:
        dict part_id -> cluster id.
    """
    next_id=1
    merged={}
    for name_candidates in data.values():
        assigned, next_id=compute_clusters(name_candidates, properties, next_id, eps)
        merged.update(assigned)
    print(next_id)
    return merged

### 3. Prepare to run the reasoning baselines

In [85]:
def harden_values(data, t):
    """Replace each value distribution by a single value that reaches threshold t.

    Args:
        data: dict name -> {part_id: {property: [(value, prob), ...]}}.
        t: minimum probability (inclusive) for a value to be kept.

    Returns:
        dict name -> {part_id: {property: value}}. Properties with no value at
        or above `t` are left out; if several values qualify, the last one in
        list order is the one that remains.
    """
    hardened={}
    for name, participants in data.items():
        per_name={}
        hardened[name]=per_name
        for part_id, distributions in participants.items():
            chosen={}
            per_name[part_id]=chosen
            for prop, candidates in distributions.items():
                for value, prob in candidates:
                    if float(prob)>=t:
                        chosen[prop]=value
    return hardened

In [86]:
def aggregate_per_value_tuple(data, props):
    """Group participant ids by (name, value of each property).

    Args:
        data: dict name -> {part_id: {property: value}}.
        props: ordered properties that make up the grouping key. The function
            used to read the module-level `properties` instead of this
            parameter; it now uses the argument it is given.

    Returns:
        defaultdict(set) mapping (name, value_1, ..., value_n) -> set of part
        ids, with '' standing in for a property the participant lacks. The
        number of groups is printed before returning.
    """
    group_by_name_plus=defaultdict(set)
    for name, name_data in data.items():
        for part, participant in name_data.items():
            tuple_key=(name,)+tuple(participant.get(p, '') for p in props)
            group_by_name_plus[tuple_key].add(part)
    print(len(group_by_name_plus))
    return group_by_name_plus

In [87]:
def run_baseline(merger, agg_data):
    """Turn aggregated groups into the system's JSON cluster mapping.

    Args:
        merger: 'exact' exports the groups as they are; any other value
            ('noclash') first passes them through cu.perform_merging.
        agg_data: groups as produced by aggregate_per_value_tuple.

    Returns:
        Whatever cu.transform_to_json returns for the (possibly merged) groups.
    """
    groups=agg_data if merger=='exact' else cu.perform_merging(agg_data)
    return cu.transform_to_json(groups)

### 4. Putting it all together

In [88]:
# True: cluster on the full value distributions (perform_clustering).
# False: harden each distribution to a single value with probability >= tau
# and run the baseline selected by `merger`.
soft_aggregation=False # if false, then we get the max value as long as it is above the threshold tau

In [101]:
# Baseline passed to run_baseline: 'exact' skips merging, anything else merges via cu.perform_merging.
merger='noclash'
# Probability threshold passed to harden_values (values with prob >= tau are kept).
tau=0.99

In [102]:
# DBSCAN eps used when soft_aggregation is True.
clustering_eps=0.1 # 0.05 or 0.1

In [103]:
def store_clusters(clusters, output_file):
    """Serialize the cluster mapping as JSON into `output_file` (overwriting it)."""
    serialized=json.dumps(clusters)
    with open(output_file, 'w') as handle:
        handle.write(serialized)

In [104]:
def inspect(data):
    """Print the total number of participants across all names.

    Args:
        data: dict name -> {part_id: ...}.
    """
    print(sum(len(name_data.keys()) for name_data in data.values()))

In [105]:
# Process every participants pickle (*.p) in input_dir and write one JSON file
# per input into output_dir. Variables assigned here (e.g. `data`) stay in the
# kernel namespace after the loop and are read by the lookup cell further down.
for f in glob.glob('%s/*.p' % input_dir):
    # base name: text after the last '/' and before the first '.'
    filename=(f.split('/')[-1]).split('.')[0]
    #if filename=='participants_samename': continue
    print(filename)
    output_file='%s/%s.json' % (output_dir, filename)
    data=prepare_profiler_data(f, properties, profiled_data, debug=True)
    # prints the total number of participants that were loaded
    inspect(data)
    if soft_aggregation:
        # soft path: DBSCAN over JS divergences between the value distributions
        clusters=perform_clustering(data, properties, clustering_eps)
    else:
        # hard path: keep values with probability >= tau, group by name + values,
        # then apply the baseline selected by `merger`
        rounded_data=harden_values(data, tau)
        #less_props=['ethnic group', 'cause of death', 'sex or gender', 'religion', 'age group']
        agg_data=aggregate_per_value_tuple(rounded_data, properties)
        clusters=run_baseline(merger, agg_data)
    
    store_clusters(clusters, output_file)
    # largest value and number of keys in the resulting mapping
    print(max(clusters.values()), len(clusters.keys()))


participants_input
762
504
459 762
participants_samefirstname
762
476
419 762
participants_samename
762
47
26 762
participants_samelastname
762
491
426 762


In [79]:
# Debug lookup: print the stored property distributions, followed by the name,
# for one hard-coded participant id. `data` is the global assigned in the main
# processing loop.
for name, nd in data.items():
    if '94412534e0b2172a6f7338f43290d772' in nd.keys():
        print(nd['94412534e0b2172a6f7338f43290d772'], name)

{'native language': [('English', 0.9999257963166346), ('French', 7.366188608406273e-05), ('German', 2.8576194336101364e-07)], 'ethnic group': [('White/Caucasian', 0.882382790768442), ('African American/Black', 0.1176172092315581), ('', 1.6218019968179536e-19)], 'religion': [('Christianity', 0.9991272093063862), ('atheism', 0.0007933334295516191), ('Judaism', 7.938017073685977e-05)], 'member of political party': [('Democratic Party', 0.6124247470056865), ('Republican Party', 0.3875752529943136), ('', 7.575389760492456e-22)], 'occupation': [('actor', 0.9097758303083633), ('journalist', 0.06358057310957367), ('singer', 0.01377762208040168)], 'cause of death': [('Suicide', 1.0)], 'sex or gender': [('male', 1.0)], 'age group': [('teen 12-17', 1.0)]} Marsavious Smith
