In [41]:
from collections import defaultdict, Counter
import json
import pickle

### 1. Extract attribute lists from Emiel's data

In [2]:
attributes=['age', 'ethnicity', 'religion']
maybe=['height', 'weight', 'skin-color', 'skin-other', 'hair-style', 'hair-length', 
       'hair-color', 'eyes','fitness','occupation-or-social-group', 'disability']
attributes+=maybe

In [3]:
data_dirs=['../../LabelingPeople/VisualGenome/resources/Categories',
           '../../LabelingPeople/Flickr30k/resources/Categories']

In [4]:
values=defaultdict(set)
for a in attributes:
    for d in data_dirs:
        file_path='%s/%s.txt' % (d, a)
        try:
            with open(file_path, 'r') as f:
                data=f.read().strip().split('\n')
                values[a] |=set(data)
        except FileNotFoundError:
            continue

In [5]:
values_to_list={}
for a, adata in values.items():
    newdata=[]
    for d in adata:
        if d.strip()!='':
            newdata.append(d)       
    values_to_list[a]=newdata

In [6]:
with open('../resources/emiel.json', 'w') as w:
    json.dump(values_to_list, w)

### 2. Extract vocabulary from our student annotation

In [15]:
annotation_files=['../resources/ann_areum_men.json', '../resources/ann_ngan_men.json']
conll_dir='../data_preparation/auxiliary_data'
conll_main='%s/docs.conll' % conll_dir
conll_rest='%s/new.conll' % conll_dir

In [32]:
incpart2str_file='../resources/participant_annotation.json'
with open(incpart2str_file, 'r') as r:
    part_strdata=json.load(r)

In [8]:
def build_id_to_text_index(conll_files):
    index={}
    for file in conll_files:
        with open(file, 'r') as lines:
            for line in lines:
                if not line.startswith('#begin') and not line.startswith('#end'):
                    line_data=line.split('\t')
                    token_id=line_data[0]
                    form=line_data[1]
                    index[token_id]=form
    return index

In [9]:
index = build_id_to_text_index([conll_main, conll_rest])

In [10]:
def obtain_form_from_tokens(index, tokens):
    form=[]
    for t in tokens:
        if t in index:
            form.append(index[t])
        else:
            print(t)
    return ' '.join(form)

In [39]:
attrs=['CauseOfDeath', 'Residence', 'EducationLevel', 'Ethnicity', 'Religion', 'BirthPlace', 'PastConviction']

annotated_values={}
for a in attrs:
    annotated_values[a]=defaultdict(list)

for f in annotation_files:
    with open(f, 'r') as jfile:
        annotations=json.load(jfile)
        for inc, inc_data in annotations.items():
            
            for a_token, token_annotations in inc_data.items():

                if 'mwu' in token_annotations:
                    tokens=token_annotations['mwu']
                else:
                    tokens=[a_token]
                
                phrase = obtain_form_from_tokens(index, tokens)
                if not phrase: continue
                for pair in token_annotations['pairs']:
                    prop = pair[1]
                    part=pair[0]
                    if prop=='MedicalCondition': continue
                    key='%s#%s' % (inc, part)
                    try:
                        interpretation=part_strdata[key][prop]
                        annotated_values[prop][phrase].append(interpretation)
                    except KeyError:
                        print(prop, 'not in', part_strdata[key])
                    
                #print(tokens, properties)

EducationLevel not in {'CauseOfDeath': 'Accidental', 'Residence': 'Montana', 'Gender': ' Male', 'Age': 'Child 0-11', 'DeathPlace': 'Montana', 'Name': ' Lonato Moran-Allen', 'DeathDate': '2014'}
EducationLevel not in {'CauseOfDeath': 'Accidental', 'Residence': 'Montana', 'Gender': ' Male', 'Age': 'Child 0-11', 'DeathPlace': 'Montana', 'Name': ' Lonato Moran-Allen', 'DeathDate': '2014'}
EducationLevel not in {'CauseOfDeath': 'Accidental', 'Residence': 'Montana', 'Gender': ' Male', 'Age': 'Child 0-11', 'DeathPlace': 'Montana', 'Name': ' Lonato Moran-Allen', 'DeathDate': '2014'}
EducationLevel not in {'CauseOfDeath': 'Accidental', 'Residence': 'Montana', 'Gender': ' Male', 'Age': 'Child 0-11', 'DeathPlace': 'Montana', 'Name': ' Lonato Moran-Allen', 'DeathDate': '2014'}
EducationLevel not in {'CauseOfDeath': 'Accidental', 'Residence': 'Montana', 'Gender': ' Male', 'Age': 'Child 0-11', 'DeathPlace': 'Montana', 'Name': ' Lonato Moran-Allen', 'DeathDate': '2014'}
EducationLevel not in {'CauseO

KeyError: '273773#1'

In [46]:
data_to_store={}
for prop, values in annotated_values.items():
    data_to_store[prop]={}
    for keyword, ints in values.items():
        data_to_store[prop][keyword]=Counter(ints).most_common(1)[0][0]

In [47]:
with open('../resources/students.json', 'w') as w:
    json.dump(data_to_store, w)