In [41]:
import pickle
import glob
from collections import defaultdict
import itertools

In [17]:
# Directory holding the profiler's pickled output and the two files read from it.
profiled_dir = 'profiler_output'
givens_path = profiled_dir + '/given.pkl'
predicted_path = profiled_dir + '/predicted.pkl'

# Directory that is scanned for pickled participant annotations.
input_dir = '../data/input/partial/annotation'

In [34]:
# Property names used throughout the notebook. The order matters: it fixes the
# position of each property inside the lookup tuples built from this list.
properties = [
    'native language',
    'ethnic group',
    'cause of death',
    'sex or gender',
    'religion',
    'member of political party',
    'occupation',
    'age group',
]

### 1. Parse profiling result

#### 1a. Load the profiling predictions

In [35]:
def load_profiler_predictions(givens_file, predicted_file, properties):
    """Load the profiler's pickled output and index the predictions by given values.

    Args:
        givens_file: Path to a pickle holding a sequence of rows; each row maps
            a property name to the value that was given for it.
        predicted_file: Path to a pickle that is indexed as
            ``predicted[property][row_index]``.
        properties: Ordered property names; the order defines the key layout.

    Returns:
        A dict keyed by a tuple with one slot per property (the given value, or
        ``''`` when the row has no value for that property). Each value is a
        dict holding the prediction for every property the row did not give.
        Rows that produce the same key overwrite each other; the last one wins.
    """
    with open(givens_file, 'rb') as givens_handle:
        givens = pickle.load(givens_handle, encoding='latin1')
    with open(predicted_file, 'rb') as predicted_handle:
        predicted = pickle.load(predicted_handle, encoding='latin1')

    data = {}
    for row_number, known in enumerate(givens):
        key_parts = []
        missing = {}
        for prop in properties:
            if prop not in known.keys():
                # Nothing was given for this property: leave the key slot empty
                # and keep the profiler's prediction for it instead.
                key_parts.append('')
                missing[prop] = predicted[prop][row_number]
                continue
            key_parts.append(known[prop])
        data[tuple(key_parts)] = missing
    return data

In [36]:
# Index the profiler predictions once; later cells look entries up by property tuple.
profiled_data = load_profiler_predictions(
    givens_file=givens_path,
    predicted_file=predicted_path,
    properties=properties,
)

#### 1b. Merge with existing data to prepare for clustering

In [21]:
def get_property_tuple(properties, part_data):
    """Build the profiler lookup tuple for one annotated participant.

    Each known annotation field is written to the slot of its matching property
    name in ``properties`` (looked up by name, not by a fixed position), so the
    result stays correct if ``properties`` is reordered or shortened.

    Args:
        properties: Ordered property names; the result has one slot per entry.
        part_data: Mapping of annotation fields for one participant. The fields
            read are 'Ethnicity', 'CauseOfDeath', 'Gender', 'Religion',
            'Occupation' and 'Age'; their values must support ``.strip()``.

    Returns:
        A tuple with ``len(properties)`` entries. A slot holds the stripped
        (and, for gender and age, lower-cased) annotation value, or ``''`` when
        the participant has no such field or the property is not listed.
    """
    # Spelling variants found in the annotations, mapped to the profiler's labels.
    ethnicity_aliases = {
        'African American': 'African American/Black',
        'White/Caucascian': 'White/Caucasian',
    }
    # (annotation field, profiler property name, optional normaliser)
    field_map = (
        ('Ethnicity', 'ethnic group', lambda v: ethnicity_aliases.get(v, v)),
        ('CauseOfDeath', 'cause of death', None),
        ('Gender', 'sex or gender', lambda v: v.lower()),
        ('Religion', 'religion', None),
        ('Occupation', 'occupation', None),
        ('Age', 'age group', lambda v: v.lower()),
    )

    slot_of = {prop: position for position, prop in enumerate(properties)}
    slots = [''] * len(properties)
    for field, prop, normalise in field_map:
        if field not in part_data or prop not in slot_of:
            continue
        value = part_data[field].strip()
        if normalise is not None:
            value = normalise(value)
        slots[slot_of[prop]] = value
    return tuple(slots)

In [30]:
def prepare_profiler_data(input_file, properties, profiled_data):
    """Combine annotated participants with the profiler's predictions, grouped by name.

    Args:
        input_file: Path to a pickle holding ``{doc_id: {part_id: part_data}}``.
        properties: Ordered property names, matching the key layout of
            ``profiled_data``.
        profiled_data: Dict mapping a property tuple to a dict of predictions.
            It is only read; its entries are never modified.

    Returns:
        A ``defaultdict`` mapping participant name (``''`` when the participant
        has no 'Name' field) to ``{part_id: values}``. ``values`` holds the
        predictions for the participant's tuple, with every annotated property
        replaced by ``[(annotated_value, '1.0')]``.

    Raises:
        KeyError: If a participant's property tuple is not in ``profiled_data``.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(input_file, 'rb') as f:
        participants = pickle.load(f)

    parts_per_name = defaultdict(dict)

    for doc_data in participants.values():
        for part_id, part_data in doc_data.items():
            name = part_data.get('Name', '')

            tuple_input = get_property_tuple(properties, part_data)
            # Copy the entry: the overrides below must not leak into
            # profiled_data or be shared between participants that happen to
            # have the same tuple.
            values = dict(profiled_data[tuple_input])

            # Annotated values are certain, so they replace any prediction.
            for prop, given in zip(properties, tuple_input):
                if given != '':
                    values[prop] = [(given, '1.0')]

            parts_per_name[name][part_id] = values

    return parts_per_name

### 2. Compute similarity with Jensen-Shannon (JS) divergence

In [47]:
def js_divergence(c1, c2):
    """Placeholder for the divergence between two property distributions.

    Both arguments are currently ignored and the constant 0.5 is returned.
    TODO: replace with a real Jensen-Shannon divergence computation.
    """
    placeholder_divergence = 0.5
    return placeholder_divergence

In [48]:
def compute_js_divergences(c1, c2, properties):
    """Return the per-property divergences between two candidates.

    Args:
        c1, c2: Mappings from property name to that candidate's value for it.
        properties: Property names to compare, in order.

    Returns:
        A list with one ``js_divergence`` result per entry of ``properties``.
    """
    return [js_divergence(c1[prop], c2[prop]) for prop in properties]

In [58]:
def compute_clusters(candidates, properties, threshold=0.45):
    """Compare every pair of candidates and report whether they look alike.

    For each unordered pair, the per-property divergences are averaged and one
    line is printed: both ids, the average, and whether the average is below
    ``threshold``.

    Args:
        candidates: Mapping from candidate id to its per-property values.
        properties: Property names to compare on.
        threshold: Average divergence below which a pair is reported as True.

    Returns:
        ``clusters_json``, which is never populated here, so always an empty dict.
    """
    clusters_json = {}

    for first_id, second_id in itertools.combinations(candidates.keys(), 2):
        divergences = compute_js_divergences(
            candidates[first_id], candidates[second_id], properties
        )
        mean_divergence = sum(divergences) / len(divergences)
        print(first_id, second_id, mean_divergence, mean_divergence < threshold)

    return clusters_json

In [59]:
def perform_clustering(data, properties, pause=True):
    """Cluster the candidates of every name and return the result per name.

    Args:
        data: Mapping from participant name to that name's candidates.
        properties: Property names the candidates are compared on.
        pause: When True (the default), wait for the user to press Enter after
            each name. Pass False to run unattended, e.g. under "Run All".

    Returns:
        A dict mapping each name to the clusters computed for it.
    """
    clusters_per_name = {}
    for name, name_candidates in data.items():
        clusters = compute_clusters(name_candidates, properties)
        clusters_per_name[name] = clusters
        print(name, clusters)
        if pause:
            input('continue?')
    return clusters_per_name

In [None]:
# NOTE(review): `data` is not assigned in any of the cells visible above; it is
# only set by prepare_profiler_data in section 3, so on a fresh kernel this cell
# needs that assignment to have run first.
test = perform_clustering(data=data, properties=properties)

 Nathaniel Jones {}


continue? 


 E.C. Robinson {}


continue? 


 Kendall Reed {}


continue? 


 Will Harris {}


continue? 


 Steven Coleman {}


continue? 


197305597b1d8d10976862afb0febb8e d8dcbfa71b9b5cf7e3bf283772df7411 0.5 False
 Christopher Roupe {}


continue? 


22469ebbde972ab665bec2328e3e8281 695f99ae21f8641cc9a3b7ff648a1473 0.5 False
22469ebbde972ab665bec2328e3e8281 b0a0251c68d927d4f5ae0ae5d2473018 0.5 False
695f99ae21f8641cc9a3b7ff648a1473 b0a0251c68d927d4f5ae0ae5d2473018 0.5 False
 Brady Osborne {}


continue? 


c33d0ab46bf6dbc51b78d6c552d196d8 67290b5ca2d1c930c89a270061cc1061 0.5 False
c33d0ab46bf6dbc51b78d6c552d196d8 ef8ea80ae83aa93f6b0241a4ec6775d6 0.5 False
67290b5ca2d1c930c89a270061cc1061 ef8ea80ae83aa93f6b0241a4ec6775d6 0.5 False
 Matt Anderson {}


continue? 


c34db0836a8c4a062fe76b45da2939fb 161753d585e7a28ab67bde6e67ccb699 0.5 False
 Braison Howard {}


### 3. Putting it all together

In [None]:
# Run the pipeline on the annotation pickles: merge with the profiler output,
# cluster per name, store the clusters. glob returns paths in arbitrary order,
# so they are sorted to make the file picked before `break` deterministic.
for f in sorted(glob.glob('%s/*.p' % input_dir)):
    data=prepare_profiler_data(f, properties, profiled_data)
    # perform_clustering requires the property list as its second argument.
    clusters=perform_clustering(data, properties)
    # NOTE(review): store_clusters is not defined in the cells shown here;
    # confirm it exists before running this cell.
    store_clusters(clusters)
    print(f)
    # Only the first matching file is processed for now.
    break