In [1]:
import pickle
import glob
import pandas as pd
import json

### 1. Prepare gold partial data

In [2]:
# Dataset partition to process. The path setup below special-cases the value
# 'partial'; every other value takes the default branch.
partition='full'

In [3]:
# Pickled gold participants for the selected partition.
gold_input_file = '../data/input/%s/annotation/participants_input.p' % partition

# Pickled system-extracted participants: the file is named differently for
# the 'partial' partition than for any other partition.
sys_input_file = (
    '../baselines/extracted_data/%s/participants_input.p'
    if partition == 'partial'
    else '../baselines/extracted_data/%s/extracted_data.p'
) % partition

# Inputs to convert, system file first. Reduce this to [gold_input_file] to
# export only the gold data.
files = [sys_input_file, gold_input_file]

# Directory the generated .tsv files are written to.
output_dir = 'profiler_input'

In [4]:
# Column layout of a profiler row: a participant's value for a property is
# stored at that property's position in this list.
properties = [
    'native language',            # column 0
    'ethnic group',               # column 1
    'cause of death',             # column 2
    'sex or gender',              # column 3
    'religion',                   # column 4
    'member of political party',  # column 5
    'occupation',                 # column 6
    'age group',                  # column 7
]

In [5]:
def prepare_profiler_data(input_file, properties):
    """Convert a pickled participants mapping into a list of profiler rows.

    The pickle is read as a two-level mapping (document -> participant ->
    attribute dict). Every participant becomes one row of
    ``len(properties)`` slots, all initialised to ``''``. Attributes that are
    present are written to fixed positions:

    * 1 -- ``Ethnicity``: stripped; a few known spellings (compared
      case-insensitively, including the misspelling ``white/caucascian``)
      are replaced by canonical labels.
    * 2 -- ``CauseOfDeath``: stripped.
    * 3 -- ``Gender``: stripped and lower-cased.
    * 4 -- ``Religion``: stripped and capitalised; ``christian`` becomes
      ``Christianity``.
    * 6 -- ``Occupation``: stripped.
    * 7 -- ``Age``: stripped and lower-cased.

    Positions 0 and 5 are never filled here.

    Args:
        input_file: path of the pickle file to read.
        properties: list whose length determines the width of each row.

    Returns:
        A list with one row (list of strings) per participant, in the
        iteration order of the pickled mapping.
    """
    # NOTE: pickle.load can execute arbitrary code -- only open trusted files.
    with open(input_file, 'rb') as handle:
        documents = pickle.load(handle)

    # Lower-cased source spelling -> canonical ethnicity label.
    ethnicity_labels = {
        'african american': 'African American/Black',
        'white/caucascian': 'White/Caucasian',
        'white': 'White/Caucasian',
        'hispanic/latin': 'Hispanic/Latin',
    }
    width = len(properties)

    def build_row(attributes):
        # One empty slot per property; only the attributes present are filled.
        row = [''] * width
        if 'Ethnicity' in attributes:
            ethnicity = attributes['Ethnicity'].strip()
            row[1] = ethnicity_labels.get(ethnicity.lower(), ethnicity)
        if 'CauseOfDeath' in attributes:
            row[2] = attributes['CauseOfDeath'].strip()
        if 'Gender' in attributes:
            row[3] = attributes['Gender'].strip().lower()
        if 'Religion' in attributes:
            religion = attributes['Religion'].strip().capitalize()
            row[4] = 'Christianity' if religion.lower() == 'christian' else religion
        if 'Occupation' in attributes:
            row[6] = attributes['Occupation'].strip()
        if 'Age' in attributes:
            row[7] = attributes['Age'].strip().lower()
        return row

    return [
        build_row(attributes)
        for doc_participants in documents.values()
        for attributes in doc_participants.values()
    ]

In [6]:
# Load the JSON resource gv_mappings.json into the module-level `mappings`.
# NOTE(review): none of the functions visible in this section read this
# variable; presumably it is used further down -- confirm.
with open('../resources/gv_mappings.json', 'r') as r:
    mappings=json.load(r)
    
# Uncomment the next line to inspect the loaded structure.
#mappings

In [7]:
def group_age(a):
    """Return the age-group label for a numeric age.

    Args:
        a: age as a number.

    Returns:
        ``'child 0-11'`` below 12, ``'teen 12-17'`` below 18,
        ``'adult 18-64'`` below 65, otherwise ``'senior 65+'``.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    brackets = (
        (12, 'child 0-11'),
        (18, 'teen 12-17'),
        (65, 'adult 18-64'),
    )
    for upper_bound, label in brackets:
        if a < upper_bound:
            return label
    return 'senior 65+'

In [8]:
def map_occupations(o):
    """Map a raw occupation string to its normalised occupation label.

    The lookup is an exact match (case-sensitive, no stripping).

    Args:
        o: raw occupation value.

    Returns:
        The normalised label for a known occupation, or ``''`` when the
        value is not in the table.
    """
    occupation_labels = {
        'basketball': 'sports player',
        'rugby': 'sports player',
        'football player': 'sports player',
        'sports': 'sports player',
    }
    return occupation_labels.get(o, '')

In [9]:
def normalize_values(raw_data, debug=False):
    """Normalise cause of death, age and occupation in profiler rows.

    For every row:

    * position 2 (cause of death) is capitalised; ``'Negligent'`` becomes
      ``'Accidental'`` and anything other than ``'Intentional'``,
      ``'Accidental'`` or ``'Suicide'`` is blanked;
    * position 7 (age) is converted with ``int()`` and replaced by the label
      returned by ``group_age``;
    * position 6 (occupation) is replaced by the result of
      ``map_occupations``.

    Empty values are left as they are. Each row is copied before it is
    changed, so ``raw_data`` is never modified and the function can safely
    be called again on the same input.

    Args:
        raw_data: iterable of rows, each indexable up to position 7.
        debug: when true, print every normalised row and wait for the user
            to press Enter before continuing.

    Returns:
        A new list containing one new, normalised list per input row.

    Raises:
        ValueError: if a non-empty age cannot be parsed by ``int()``.
    """
    accepted_causes = {'Intentional', 'Accidental', 'Suicide'}

    clean_data = []
    for row in raw_data:
        # Copy first: assigning the row itself would alias it and silently
        # rewrite the caller's data.
        new_row = list(row)

        cause_of_death = row[2]
        if cause_of_death:
            cause = cause_of_death.capitalize()
            if cause == 'Negligent':
                cause = 'Accidental'
            elif cause not in accepted_causes:
                cause = ''
            new_row[2] = cause

        age = row[7]
        if age:
            new_row[7] = group_age(int(age))

        occupation = row[6]
        if occupation:
            new_row[6] = map_occupations(occupation)

        if debug:
            print(new_row)
            input('continue?')

        clean_data.append(new_row)
    return clean_data

In [10]:
# Convert every input pickle into a headerless, tab-separated profiler file.
for f in files:
    data = prepare_profiler_data(f, properties)
    # Gold and system rows go through the same value normalisation.
    data = normalize_values(data, False)

    # The system file is recognised by 'extracted_data' in its path; the
    # prefix keeps the two outputs apart inside output_dir.
    source = 'auto' if 'extracted_data' in f else 'gold'
    filename = '%s_%s_%s' % (source, partition, f.split('/')[-1].replace('.p', '.tsv'))
    output_file = '%s/%s' % (output_dir, filename)

    # to_csv opens, writes and closes output_file itself, so no separate
    # open() on the same path is needed.
    df = pd.DataFrame(data)
    df.to_csv(output_file, sep='\t', header=False, index=False)

### 2. Prepare automatically extracted partial data