In [1]:
import pickle
import glob
import json
from collections import defaultdict
import itertools
import numpy as np
from scipy.stats import entropy

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import DBSCAN

In [2]:
profiled_dir='profiler_output'
givens_path='%s/given.pkl' % profiled_dir
predicted_path='%s/predicted.pkl' % profiled_dir

input_dir='../data/input/partial/annotation'

output_dir='../data/system/auto_profiling/partial'

In [3]:
properties=['native language' , 'ethnic group', 'cause of death', 'sex or gender', 'religion', 'member of political party', 'occupation', 'age group']

In [4]:
value_mapping_file='../resources/gv_mappings.json'
with open(value_mapping_file, 'r') as f:
    wikidata_to_labels=json.load(f)

prop_vals={}
for prop, vals in wikidata_to_labels.items():
    prop_vals[prop]=list(set(vals.values())) + ['']

In [5]:
prop_vals['age group']=['child 0-11', 'teen 12-17', 'adult 18-64', 'senior 65+', '']
prop_vals['ethnic group'].append('Hispanic/Latin')

### 1. Parse profiling result

#### 1a. Load the profiling predictions

In [6]:
def load_profiler_predictions(givens_file, predicted_file, properties):
    with open(givens_file, 'rb',) as f:
        givens=pickle.load(f, encoding='latin1')
    with(open(predicted_file, 'rb')) as f:
        predicted=pickle.load(f, encoding='latin1')
    
    data={}
    for index, givens_row in enumerate(givens):
        ready_key=[]
        ready_value={}
        for p in properties:
            if p in givens_row.keys():
                ready_key.append(givens_row[p])
            else:
                ready_key.append('')
                ready_value[p]=predicted[p][index]
        data[tuple(ready_key)]=ready_value
    return data

In [7]:
profiled_data=load_profiler_predictions(givens_path, predicted_path, properties)

#### 1b. Merge with existing data to prepare for clustering

In [8]:
def get_property_tuple(properties, part_data):
    part_for_profiler=['']*len(properties)
    if 'Ethnicity' in part_data.keys():
        v=part_data['Ethnicity'].strip()
        if v=='African American':
            v='African American/Black'
        if v=='White/Caucascian':
            v='White/Caucasian'
        part_for_profiler[1]=v
    if 'CauseOfDeath' in part_data.keys():
        part_for_profiler[2]=part_data['CauseOfDeath'].strip()
    if 'Gender' in part_data.keys():
        part_for_profiler[3]=part_data['Gender'].strip().lower()
    if 'Religion' in part_data.keys():
        v=part_data['Religion'].strip()
        if v=='Christian':
            v='Christianity'
        part_for_profiler[4]=v
    if 'Occupation' in part_data.keys():
        part_for_profiler[6]=part_data['Occupation'].strip()
    if 'Age' in part_data.keys():
        part_for_profiler[7]=part_data['Age'].strip().lower()
    tuple_input=tuple(part_for_profiler)

    return tuple_input

In [9]:
def prepare_profiler_data(input_file, properties, profiled_data):
    with open(input_file, 'rb') as f:
        participants=pickle.load(f)

    parts_per_name=defaultdict(dict)
    
    for doc_id, doc_data in participants.items():
        for part_id, part_data in doc_data.items():
            name=''
            if 'Name' in part_data.keys():
                name=part_data['Name']

            tuple_input=get_property_tuple(properties, part_data)
            values=profiled_data[tuple_input]
            
            for index, t in enumerate(tuple_input):
                if t!='':
                    values[properties[index]]=[tuple([t, '1.0'])]
            
            parts_per_name[name][part_id]=values
                
    return parts_per_name

### 2. Compute similarity with JS entropy

In [10]:
def js(p, q):
    p = np.asarray(p).astype(np.float)
    q = np.asarray(q).astype(np.float)
   # normalize
    p /= p.sum()
    q /= q.sum()
    m = (p + q) / 2
    return (entropy(p, m) + entropy(q, m)) / 2

In [11]:
def map_values(v1, v2, domain):
    l=len(domain)
    mapped1=[0]*l
    mapped2=[0]*l
    
    for k,v in v1:
        index=domain.index(k)
        mapped1[index]=v
    for k,v in v2:
        index=domain.index(k)
        mapped2[index]=v
    return mapped1, mapped2    

In [12]:
def compute_js_divergences(c1, c2, properties):
    divs=[]
    for p in properties:
        mapped1, mapped2 = map_values(c1[p], c2[p], prop_vals[p])
        div=js(mapped1, mapped2)
        divs.append(div)
    return divs

In [13]:
# DEPRECATED
def cluster_matrix_with_features(matrix, algorithm='ward', max_d=0.3, criterion='distance'):
    merges = linkage(matrix, 'ward')
    clusters = fcluster(merges, max_d, criterion=criterion)
    return clusters

In [31]:
def cluster_matrix(distances, eps=0.5, min_samples=1):
    labels=DBSCAN(min_samples=min_samples, metric='precomputed').fit_predict(distances)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
        
    return list(labels), n_clusters, n_noise

In [40]:
def compute_clusters(candidates, properties, start_id):

    # intitialize an empty matrix
    num_cands=len(candidates.keys())
    dist_matrix = np.zeros(shape=(num_cands, num_cands)) # Distances matrix
    
    # fill the matrix with similarity values
    the_keys=list(candidates.keys())    
    for index1, p1 in enumerate(the_keys):
        for index2, p2 in enumerate(the_keys):
            if index1<index2:
                c1=candidates[p1]
                c2=candidates[p2]
                divs=compute_js_divergences(c1, c2, properties)
                avg_div=sum(divs)/len(divs)
        
                dist_matrix[index1, index2]=avg_div
                dist_matrix[index2, index1]=avg_div
                
    # run clustering
    clusters, n_clusters, n_noise = cluster_matrix(dist_matrix)
    clusters_json={}
    for index, part_id in enumerate(the_keys):
        cluster_id=start_id+int(clusters[index])
        clusters_json[part_id]=cluster_id
    
    new_start_id=start_id+n_clusters
    
    if num_cands>1:
        print(dist_matrix)
        print(num_cands, '\t', n_clusters)
        print(candidates)
        input('continue')
    
    return clusters_json, new_start_id

In [41]:
def perform_clustering(data, properties):
    
    clusters={}
    start_id=1
    for name, name_candidates in data.items():
        new_clusters, new_start_id=compute_clusters(name_candidates, properties, start_id)
        clusters.update(new_clusters)
        start_id=new_start_id
    print(start_id)
    return clusters

### 3. Putting it all together

In [42]:
def store_clusters(clusters, output_file):
    with open(output_file, 'w') as w:
        json.dump(clusters, w)
    return

In [None]:
for f in glob.glob('%s/*.p' % input_dir):
    print(f)
    output_file='%s/%s.json' % (output_dir, (f.split('/')[-1]).split('.')[0])
    data=prepare_profiler_data(f, properties, profiled_data)
    clusters=perform_clustering(data, properties)
    store_clusters(clusters, output_file)

../data/input/partial/annotation/participants_input.p
[[0. 0.]
 [0. 0.]]
2 	 1
{'197305597b1d8d10976862afb0febb8e': {'native language': [('English', 0.9999190856200348), ('Hebrew', 4.5455976036105126e-05), ('French', 3.259447206300881e-05)], 'ethnic group': [('African American/Black', 0.9899308811646633), ('White/Caucasian', 0.01006911883533657), ('', 4.57341775784103e-19)], 'religion': [('Christianity', 0.8841126381019017), ('Judaism', 0.06947726661063709), ('atheism', 0.04202959994518969)], 'member of political party': [('Democratic Party', 0.6939421213559067), ('Republican Party', 0.3060578786440933), ('', 8.692132813342743e-21)], 'occupation': [('actor', 0.9241068213171112), ('politician', 0.03072418620339429), ('journalist', 0.030028258712548838)], 'cause of death': [('Intentional', '1.0')], 'sex or gender': [('male', '1.0')], 'age group': [('teen 12-17', '1.0')]}, 'd8dcbfa71b9b5cf7e3bf283772df7411': {'native language': [('English', 0.9999190856200348), ('Hebrew', 4.54559760361051

continue 


[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
3 	 1
{'22469ebbde972ab665bec2328e3e8281': {'native language': [('English', 0.9673000984105138), ('French', 0.030595477030940323), ('Hebrew', 0.0010869374568688594)], 'ethnic group': [('African American/Black', 0.7619750780950755), ('White/Caucasian', 0.2380249218554192), ('', 2.1793806147360332e-11)], 'cause of death': [('Suicide', 0.621506711543154), ('Accidental', 0.35665820466390846), ('Intentional', 0.021835083773341108)], 'religion': [('Christianity', 0.4938281664690149), ('atheism', 0.40836998022745835), ('Judaism', 0.07335212620033314)], 'member of political party': [('Democratic Party', 0.5470254216223158), ('Republican Party', 0.4529745783677087), ('', 4.743363094052237e-12)], 'occupation': [('actor', 0.6098231818217773), ('singer', 0.12257291202204187), ('journalist', 0.10486667881131201)], 'sex or gender': [('male', '1.0')], 'age group': [('teen 12-17', '1.0')]}, '695f99ae21f8641cc9a3b7ff648a1473': {'native language': [('English', 0.9673