In [1]:
import pickle
import glob
import json
import sys
from collections import defaultdict
import itertools
import numpy as np
from scipy.stats import entropy

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import DBSCAN

In [2]:
# eps forwarded to the DBSCAN-based clustering in the final loop of this notebook;
# the original note lists 0.05 and 0.1 as alternative settings.
clustering_eps=0.2
# 'gold' selects the annotation input/output paths below; any other value selects
# the extracted-data paths. 'auto' additionally enables value normalization in
# get_property_tuple.
which_extractor='auto'

In [3]:
# Pickled profiler inputs ("given") and outputs ("predicted"), named after the extractor.
profiled_dir='profiler_output'
givens_path='%s/%s_given.pkl' % (profiled_dir, which_extractor)
predicted_path='%s/%s_predicted.pkl' % (profiled_dir, which_extractor)

# Participant pickles are read from input_dir; cluster JSON files are written to output_dir.
if which_extractor=='gold':
    input_dir='../data/input/partial/annotation'
    output_dir='../data/system/gold_profiling/partial'
else:
    input_dir='extracted_data/partial'
    output_dir='../data/system/auto_profiling/partial'

In [4]:
# Order matters: get_property_tuple writes values at fixed positions of this list
# (1=ethnic group, 2=cause of death, 3=sex or gender, 4=religion, 6=occupation, 7=age group).
properties=['native language' , 'ethnic group', 'cause of death', 'sex or gender', 'religion', 'member of political party', 'occupation', 'age group']

In [5]:
# Load the per-property value mappings from JSON.
value_mapping_file='../resources/gv_mappings.json'
with open(value_mapping_file, 'r') as f:
    wikidata_to_labels=json.load(f)

# prop_vals[prop] is the value domain of a property: the distinct mapped labels plus
# the empty string, so that '' is also a valid domain entry.
prop_vals={}
for prop, vals in wikidata_to_labels.items():
    prop_vals[prop]=list(set(vals.values())) + ['']

In [6]:
# 'age group' gets a hand-written domain; its labels match the strings returned by group_age.
prop_vals['age group']=['child 0-11', 'teen 12-17', 'adult 18-64', 'senior 65+', '']
# Extra ethnicity value added by hand - presumably absent from the mapping file; confirm.
# NOTE: re-running this cell appends the value again.
prop_vals['ethnic group'].append('Hispanic/Latin')

### 1. Parse profiling result

#### 1a. Load the profiling predictions

In [7]:
def load_profiler_predictions(givens_file, predicted_file, properties):
    """Build a lookup from given-value tuples to profiler predictions.

    Args:
        givens_file: path to a pickle holding a sequence of rows; each row maps
            a property name to the value that was given for it.
        predicted_file: path to a pickle mapping property name -> sequence of
            predictions, indexed in the same order as the rows of `givens_file`.
        properties: ordered property names; fixes the layout of the key tuples.

    Returns:
        dict mapping a tuple (one slot per property, '' where nothing was
        given) to a dict {property: prediction} that covers only the
        properties that were NOT given in that row. Rows that produce the same
        tuple overwrite each other; the last one wins.
    """
    def _unpickle(path):
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle, encoding='latin1')

    givens = _unpickle(givens_file)
    predicted = _unpickle(predicted_file)

    data = {}
    for row_number, givens_row in enumerate(givens):
        known = givens_row.keys()
        key = tuple(givens_row[p] if p in known else '' for p in properties)
        data[key] = {
            p: predicted[p][row_number]
            for p in properties
            if p not in known
        }
    return data

In [8]:
# Lookup: tuple of given property values -> profiler predictions for the properties not given.
profiled_data=load_profiler_predictions(givens_path, predicted_path, properties)

In [9]:
# Inspect which given-value combinations the profiler output covers (the output can be long).
profiled_data.keys()

dict_keys([('', '', '', '', '', '', '', ''), ('', '', '', '', '', '', '', 'teen 12-17'), ('', '', 'Intentional', 'male', '', '', '', ''), ('', '', 'Accidental', '', '', '', '', 'teen 12-17'), ('', '', 'Accidental', 'male', '', '', '', 'teen 12-17'), ('', '', 'Accidental', '', '', '', '', 'child 0-11'), ('', '', 'Intentional', '', '', '', '', ''), ('', '', '', '', '', '', '', 'adult 18-64'), ('', '', '', 'female', 'Christianity', '', '', 'child 0-11'), ('', '', 'Intentional', 'male', '', '', '', 'teen 12-17'), ('', '', 'Accidental', 'male', '', '', '', ''), ('', '', '', 'male', '', '', '', ''), ('', '', 'Accidental', '', '', '', '', 'adult 18-64'), ('', '', 'Intentional', '', '', '', '', 'teen 12-17'), ('', '', 'Intentional', 'female', '', '', '', ''), ('', '', '', '', '', '', '', 'child 0-11'), ('', '', '', 'female', '', '', '', ''), ('', '', 'Suicide', '', '', '', '', ''), ('', '', 'Suicide', 'male', '', '', '', 'adult 18-64'), ('', '', 'Suicide', 'male', '', '', '', ''), ('', '', 'In

#### 1b. Merge with existing data to prepare for clustering

In [10]:
def get_property_tuple(properties, part_data):
    """Turn one participant record into the tuple used to key profiler output.

    Args:
        properties: ordered property names; only its length is used here. The
            slot positions written below (1, 2, 3, 4, 6, 7) are fixed.
        part_data: mapping that may contain 'Ethnicity', 'CauseOfDeath',
            'Gender', 'Religion', 'Occupation' and 'Age' string values.

    Returns:
        tuple with one entry per property; '' where the record has no value.
        When the module-level `which_extractor` is 'auto', the values are
        passed through `normalize_values` first.
    """
    # Spelling variants seen in the input, mapped to the profiler's labels.
    ethnicity_aliases = {
        'african american': 'African American/Black',
        'white/caucascian': 'White/Caucasian',
    }
    religion_aliases = {'christian': 'Christianity'}

    slots = [''] * len(properties)
    present = part_data.keys()

    if 'Ethnicity' in present:
        ethnicity = part_data['Ethnicity'].strip()
        slots[1] = ethnicity_aliases.get(ethnicity.lower(), ethnicity)
    if 'CauseOfDeath' in present:
        slots[2] = part_data['CauseOfDeath'].strip()
    if 'Gender' in present:
        slots[3] = part_data['Gender'].strip().lower()
    if 'Religion' in present:
        religion = part_data['Religion'].strip()
        slots[4] = religion_aliases.get(religion.lower(), religion)
    if 'Occupation' in present:
        slots[6] = part_data['Occupation'].strip()
    if 'Age' in present:
        slots[7] = part_data['Age'].strip().lower()

    if which_extractor != 'auto':
        return tuple(slots)
    return tuple(normalize_values(slots))

In [11]:
def group_age(a):
    """Map a numeric age to its age-group label.

    Groups: below 12 -> 'child 0-11', below 18 -> 'teen 12-17',
    below 65 -> 'adult 18-64', anything else -> 'senior 65+'.
    """
    upper_bounds = (
        (12, 'child 0-11'),
        (18, 'teen 12-17'),
        (65, 'adult 18-64'),
    )
    for bound, label in upper_bounds:
        if a < bound:
            return label
    return 'senior 65+'

In [12]:
def map_occupations(o):
    """Map a raw occupation string to a known label, or '' if unrecognized.

    Only an exact match on one of the listed sports terms is recognized; all
    of them map to 'sports player'.
    """
    known = {
        'basketball': 'sports player',
        'rugby': 'sports player',
        'football player': 'sports player',
        'sports': 'sports player',
    }
    return known.get(o, '')

In [13]:
def normalize_values(row, debug=False):
    """Return a normalized copy of a property row; `row` itself is not modified.

    Normalization touches three fixed positions:
      * index 2 (cause of death): capitalized; 'Negligent' becomes
        'Accidental'; anything other than 'Intentional', 'Accidental' or
        'Suicide' becomes ''.
      * index 7 (age): converted with int() and replaced by its age-group
        label from `group_age`.
      * index 6 (occupation): replaced by `map_occupations(occupation)`.
    Empty values are left as they are.

    Args:
        row: sequence of property values with at least 8 entries.
        debug: unused; kept for backward compatibility.

    Returns:
        A new list with the normalized values.

    Raises:
        ValueError: if a non-empty age cannot be parsed by int().
    """
    # Work on a copy: the previous `new_row = row` aliased the argument, so the
    # caller's list was silently rewritten in place.
    new_row = list(row)

    cause_of_death = new_row[2]
    if cause_of_death:
        cause = cause_of_death.capitalize()
        if cause == 'Negligent':
            cause = 'Accidental'
        elif cause not in {'Intentional', 'Accidental', 'Suicide'}:
            cause = ''
        new_row[2] = cause

    age = new_row[7]
    if age:
        new_row[7] = group_age(int(age))

    occupation = new_row[6]
    if occupation:
        new_row[6] = map_occupations(occupation)

    return new_row

In [14]:
def prepare_profiler_data(input_file, properties, profiled_data, debug=False):
    """Attach profiler output to every participant, grouped by participant name.

    Args:
        input_file: path to a pickle holding {doc_id: {part_id: part_data}}.
        properties: ordered property names (same order as the profiler keys).
        profiled_data: lookup from a given-value tuple to a dict
            {property: predictions}, as built by `load_profiler_predictions`.
        debug: unused; kept for backward compatibility.

    Returns:
        defaultdict {name: {part_id: values}} where `values` holds the profiler
        predictions for the participant plus, for every property that was
        given, the single entry [(given_value, '1.0')]. Participants without a
        'Name' are grouped under ''.

    Raises:
        KeyError: if a participant's tuple has no entry in `profiled_data`.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
    with open(input_file, 'rb') as f:
        participants = pickle.load(f)

    parts_per_name = defaultdict(dict)

    for doc_data in participants.values():
        for part_id, part_data in doc_data.items():
            name = part_data['Name'].strip() if 'Name' in part_data.keys() else ''

            tuple_input = get_property_tuple(properties, part_data)
            # Copy the entry: writing into profiled_data[tuple_input] directly
            # mutated the shared lookup and handed the very same dict object to
            # every participant with the same tuple.
            values = dict(profiled_data[tuple_input])

            # Given values override predictions and are stored with weight '1.0'.
            for prop, given in zip(properties, tuple_input):
                if given != '':
                    values[prop] = [(given, '1.0')]

            parts_per_name[name][part_id] = values

    return parts_per_name

### 2. Compute similarity with Jensen–Shannon (JS) divergence

In [15]:
def js(p, q):
    """Jensen-Shannon divergence between two non-negative weight vectors.

    Both inputs are converted to float arrays (numeric strings such as '1.0'
    are accepted) and normalized to sum to 1; the inputs themselves are not
    modified. The result uses the natural logarithm, so it lies in [0, ln 2].

    Args:
        p, q: array-likes of equal length.

    Returns:
        float: (KL(p || m) + KL(q || m)) / 2 with m = (p + q) / 2.

    Note:
        A vector that sums to 0 cannot be normalized and yields NaN.
    """
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    p = np.asarray(p).astype(float)
    q = np.asarray(q).astype(float)
    # normalize (astype returned fresh arrays, so in-place division is safe)
    p /= p.sum()
    q /= q.sum()
    m = (p + q) / 2
    return (entropy(p, m) + entropy(q, m)) / 2

In [16]:
def map_values(v1, v2, domain):
    """Spread two (value, weight) lists over a common value domain.

    Args:
        v1, v2: iterables of (value, weight) pairs.
        domain: list of all possible values; a value's position in this list
            is its position in the output vectors.

    Returns:
        Two lists of len(domain): the weight of each listed value at its domain
        position, 0 everywhere else.

    Raises:
        ValueError: if a value does not occur in `domain`.
    """
    def _spread(pairs):
        vector = [0] * len(domain)
        for value, weight in pairs:
            vector[domain.index(value)] = weight
        return vector

    first = _spread(v1)
    second = _spread(v2)
    return first, second

In [17]:
def compute_js_divergences(c1, c2, properties):
    """Compute one JS divergence per property between two candidates.

    Args:
        c1, c2: mappings {property: list of (value, weight) pairs}.
        properties: property names to compare; each must be a key of c1, c2
            and of the module-level `prop_vals`.

    Returns:
        list of divergences, in the order of `properties`.
    """
    return [
        js(*map_values(c1[p], c2[p], prop_vals[p]))
        for p in properties
    ]

In [18]:
# DEPRECATED
def cluster_matrix_with_features(matrix, algorithm='ward', max_d=0.3, criterion='distance'):
    """Hierarchical clustering of feature vectors (deprecated; see cluster_matrix).

    Args:
        matrix: observation matrix (or condensed distance matrix) accepted by
            scipy's `linkage`.
        algorithm: linkage method passed to `linkage` (e.g. 'ward', 'single').
        max_d: threshold passed to `fcluster`.
        criterion: criterion passed to `fcluster`.

    Returns:
        Array with one flat-cluster label per observation.
    """
    # Honor `algorithm`: the method used to be hard-coded to 'ward', so the
    # parameter was silently ignored.
    merges = linkage(matrix, algorithm)
    clusters = fcluster(merges, max_d, criterion=criterion)
    return clusters

In [19]:
def cluster_matrix(distances, eps=0.1, min_samples=1):
    """Run DBSCAN on a precomputed distance matrix.

    Args:
        distances: square matrix of pairwise distances.
        eps: DBSCAN neighborhood radius.
        min_samples: DBSCAN minimum neighborhood size.

    Returns:
        (labels, n_clusters, n_noise): the per-row labels as a list, the
        number of distinct labels excluding the noise label -1, and the number
        of rows labeled -1.
    """
    model = DBSCAN(min_samples=min_samples, eps=eps, metric='precomputed')
    labels = list(model.fit_predict(distances))

    n_noise = labels.count(-1)
    # The noise label is not a cluster, so leave it out of the count.
    n_clusters = len(set(labels)) - (1 if n_noise else 0)

    return labels, n_clusters, n_noise

In [20]:
def compute_clusters(candidates, properties, start_id, eps):
    """Cluster the candidates of one group and assign global cluster ids.

    Args:
        candidates: mapping {part_id: property values} for one group.
        properties: property names compared between candidates.
        start_id: offset added to every local cluster label.
        eps: eps forwarded to `cluster_matrix`.

    Returns:
        (clusters_json, new_start_id): {part_id: start_id + local label} and
        start_id plus the number of clusters found.
    """
    part_ids = list(candidates.keys())
    size = len(part_ids)

    # Symmetric distance matrix; the diagonal stays 0.
    dist_matrix = np.zeros(shape=(size, size))
    for (row, id_a), (col, id_b) in itertools.combinations(enumerate(part_ids), 2):
        divs = compute_js_divergences(candidates[id_a], candidates[id_b], properties)
        # Distance = mean divergence over all properties.
        mean_div = sum(divs) / len(divs)
        dist_matrix[row, col] = mean_div
        dist_matrix[col, row] = mean_div

    labels, n_clusters, _ = cluster_matrix(dist_matrix, eps=eps)
    clusters_json = {
        part_id: start_id + int(label)
        for part_id, label in zip(part_ids, labels)
    }

    return clusters_json, start_id + n_clusters

In [21]:
def perform_clustering(data, properties, eps):
    """Cluster every group of candidates and merge the assignments.

    Args:
        data: mapping {name: {part_id: property values}}.
        properties: property names compared between candidates.
        eps: eps forwarded to `compute_clusters`.

    Returns:
        dict {part_id: cluster_id}. Cluster ids start at 1 and keep increasing
        from one group to the next. The next unused id is printed.
    """
    all_clusters = {}
    next_id = 1
    for name_candidates in data.values():
        group_clusters, next_id = compute_clusters(name_candidates, properties, next_id, eps)
        all_clusters.update(group_clusters)
    print(next_id)
    return all_clusters

### 3. Putting it all together

In [22]:
def store_clusters(clusters, output_file):
    """Write the cluster assignment to `output_file` as JSON.

    Args:
        clusters: JSON-serializable object (here: {part_id: cluster_id}).
        output_file: destination path; an existing file is overwritten.
    """
    with open(output_file, 'w') as sink:
        json.dump(clusters, sink)

In [23]:
# Cluster every pickled participant file (*.p) found in input_dir and write one
# JSON file with the cluster assignment per input file into output_dir.
for f in glob.glob('%s/*.p' % input_dir):
    print(f)
    # Output name: the input's base name up to its first '.', plus '.json'.
    # Splitting on '/' assumes POSIX-style path separators.
    output_file='%s/%s.json' % (output_dir, (f.split('/')[-1]).split('.')[0])
    data=prepare_profiler_data(f, properties, profiled_data, debug=True)
    clusters=perform_clustering(data, properties, clustering_eps)
    store_clusters(clusters, output_file)
    #break

extracted_data/partial/participants_input.p
465
extracted_data/partial/participants_samefirstname.p
333
extracted_data/partial/participants_samename.p
2
extracted_data/partial/participants_samelastname.p
385
extracted_data/partial/extracted_data.p
465
