In [1]:
# imports
import csv
import regex as re
import json
from collections import defaultdict, Counter
import utils
import en_coref_md
import pickle

In [2]:
import spacy

In [3]:
gvdb_articles_file = '../../gvdb-aggregated-db/Articles-with-extracted-info.tsv'

In [4]:
se_annotation_file='../data/input/partial/annotation/participants_input.p'
se_text_file='../data/input/partial/text/docs.p'

### 1. Load coreference model & resources

In [5]:
nlp = en_coref_md.load()

In [6]:
def get_sentence_offsets(doc):
    sent_offsets=[]
    for s in doc.sents:
        sent_offsets.append(tuple([s.start_char, s.end_char]))
    return sent_offsets

In [7]:
txt ='Hello. I am you. Who are you?'
doc = nlp(txt)
get_sentence_offsets(doc)

[(0, 6), (7, 16), (17, 29)]

In [8]:
emiel_resources_file='../resources/emiel.json'
with open (emiel_resources_file, 'r') as f:
    emiel_resources=json.load(f)

In [9]:
students_resources_file='../resources/students.json'
with open (students_resources_file, 'r') as f:
    students_resources=json.load(f)

In [10]:
students_resources.keys()

dict_keys(['CauseOfDeath', 'Residence', 'EducationLevel', 'Ethnicity', 'Religion', 'BirthPlace', 'PastConviction'])

### 2. Generate keywords

In [11]:
attributes=['age', 'race', 'gender']

In [12]:
"""
def get_most_frequent_words(rdr, attribute):
    frequency_count=defaultdict(int)
    for row in rdr:
        data=json.loads(row[3])
        for s in sections:
            for participant in data[s]:
                if participant[attribute]['value']:
                    value=participant[attribute]['value'].strip()
                    frequency_count[value]+=1
    return Counter(frequency_count).most_common(50)
    
frequency_count={}
with open(gvdb_articles_file, 'r') as csvfile:
    rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
    header=next(rdr)
    frequency_count['race']=get_most_frequent_words(rdr, 'race')
"""

'\ndef get_most_frequent_words(rdr, attribute):\n    frequency_count=defaultdict(int)\n    for row in rdr:\n        data=json.loads(row[3])\n        for s in sections:\n            for participant in data[s]:\n                if participant[attribute][\'value\']:\n                    value=participant[attribute][\'value\'].strip()\n                    frequency_count[value]+=1\n    return Counter(frequency_count).most_common(50)\n    \nfrequency_count={}\nwith open(gvdb_articles_file, \'r\') as csvfile:\n    rdr = csv.reader(csvfile, delimiter=\'\t\', quotechar=\'"\')\n    header=next(rdr)\n    frequency_count[\'race\']=get_most_frequent_words(rdr, \'race\')\n'

In [13]:
genders={'he': 'male', 
         'boy': 'male', 
         'man': 'male', 
         'dude': 'male',
         'guy': 'male',
         'male': 'male',
         'brother': 'male',
         'father': 'male',
         'him': 'male',
         'himself': 'male',
         'she': 'female',
         'girl': 'female',
         'woman': 'female',
         'female': 'female',
         'sister': 'female',
         'mother': 'female',
         'her': 'female',
         'herself': 'female'}

In [14]:
pattern_attrs=['age','ethnicity', 'religion', 'occupation', 'educationlevel', 
               'causeofdeath', 'pastconviction', 'birthplace', 'residence']

patterns={}
pattern_data=defaultdict(dict)

for p in pattern_attrs:
    patterns[p]=None

#tmp_races = set(emiel_resources['ethnicity'])
#tmp_races |= set({'black', 'latino', 'white', 'hispanic', 'asian', 'latina', 
#       'african american', 'filipino', 'african-american', 'latinos',
#      'palestinian', 'chinese-american', 'blacks', 'german-iranian'})
#tmp_races = 
pattern_data['ethnicity']=students_resources['Ethnicity']


#genders={'male': {'he', 'boy', 'man', 'dude', 'guy', 'male'}, 'female': {'girl', 'woman', 'female'}}

pattern_data['occupation']={o:o for o in emiel_resources['occupation-or-social-group']}

#religions={o:o for o in (emiel_resources['religion']+students_resources['Religion'])}
pattern_data['religion']=students_resources['Religion']

pattern_data['educationlevel']=students_resources['EducationLevel']
del pattern_data['educationlevel']['a']

pattern_data['pastconviction']=students_resources['PastConviction']

pattern_data['causeofdeath']=students_resources['CauseOfDeath']

pattern_data['birthplace']=students_resources['BirthPlace']

pattern_data['residence']=students_resources['Residence']

patterns['age']=[r'\d\d?-year-old', r', \d\d?']

### 3. Attribute value extractors

#### 3.1 Proximity based extractors

In [15]:
def attr_values_extractor(text, people_spans, coref_spans, sentence_offsets, patterns, a_dict=None):
    """Generic extractor that operates based on patterns."""
    
    extracted_pairs=defaultdict(list)
    for pattern in patterns:
        r=re.compile(pattern, re.IGNORECASE)
        values=r.finditer(text)
        for val_found in values:
            span=val_found.span()
            value=val_found.group()
            value=value.replace('-year-old', '').replace(',', '').strip()
            if a_dict and value in a_dict:
                value=a_dict[value]
            person, distance=utils.find_closest_person(span, 
                                                       people_spans, 
                                                       coref_spans, 
                                                       sentence_offsets,
                                                       min_dist=1000)
            if person:
                extracted_pairs[person].append(tuple([distance, value]))
    clean_pairs=utils.get_closest_value_per_person(extracted_pairs)
    return clean_pairs

In [16]:
def pattern_extractor(text, people_spans, coref_spans, sentence_offsets, patterns=None, pattern_data=None):
    if patterns: # if there are patterns given, fire the function immediately
        return attr_values_extractor(text, people_spans, coref_spans, sentence_offsets, patterns, pattern_data)
    
    #else create them first
    patterns=set()
    cs=set(pattern_data.keys())
    for o in cs:
        patterns.add(r'\b%s\b' % o)
    return attr_values_extractor(text, people_spans, coref_spans, sentence_offsets, patterns, pattern_data)

#### 3.2 Coreference based extractor

In [17]:
def attr_extractor_coref(clusters, names, values_json, debug=False): #, text, people_spans):
    
    person_data=defaultdict(list)

    if not clusters:
        return person_data
    
    for c in clusters:
        mentions=utils.stringify_cluster_mentions(c.mentions)
        for person_name in names:
            if utils.lookup_person_in_list(person_name, mentions):
                for m in c.mentions:
                    for txt in [m.text, m.lemma_]:
                        if txt.lower() in values_json.keys():
                            person_data[person_name].append(values_json[txt.lower()])
                            
    clean_data={}
    for person_name, gs in person_data.items():
        c=Counter(gs).most_common(1)[0][0]
        clean_data[person_name]=c
    return clean_data

#### 3.3 Run extraction of all properties for a document

In [18]:
def extract_properties(names, full_text, nlp):
    
    doc = nlp(full_text)
    # run coreference
    people_spans, coref_spans, clusters=utils.get_coref_spans(names, full_text, doc)
    
    sentence_offsets=utils.get_sentence_offsets(doc)
    
    all_extracted={}
    for attribute in pattern_attrs:
        all_extracted[attribute]=pattern_extractor(full_text, 
                                               people_spans, 
                                               coref_spans, 
                                               sentence_offsets, 
                                               pattern_data=pattern_data[attribute],
                                               patterns=patterns[attribute])
    
    all_extracted['gender']=attr_extractor_coref(clusters, names, genders)
    
    #combine extractors
    combined=utils.singularize_data(all_extracted)
            
    return combined

In [19]:
# Test stuff

txt='Hello Peter Boer, 26 was shot in church from Houston. The white police guy failed.'
names=['Peter Boer']
extract_properties(names, txt, nlp)

{'Peter Boer': {'age': '26',
  'causeofdeath': 'intentional',
  'ethnicity': 'white/caucascian',
  'occupation': 'police',
  'religion': 'christian',
  'residence': 'texas'}}

### 4. Run GVDB extractors

In [20]:
def process_gvdb_data(the_file, limit=200):

    all_gold_rows=[]
    all_sys_rows=[]

    with open(the_file, 'r') as csvfile:
        rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
        header=next(rdr)
        
        for c, row in enumerate(rdr):
            if c==limit: break

            full_text=row[2]
            data=json.loads(row[3])
            part_info=utils.get_participant_info(data)

            names=set(part_info.keys())
            if not len(names): continue

            system_data = extract_properties(names, full_text, nlp)
            all_sys_rows.append(system_data)
            all_gold_rows.append(part_info)
            
            c+=1
    return all_sys_rows, all_gold_rows

In [21]:
#all_sys_rows, all_gold_rows = process_gvdb_data(gvdb_articles_file, limit=200)

### 5. Run SE extractors

In [22]:
def process_se_data(annotation_file, text_file, limit=200):
    all_gold_rows=[]
    all_sys_rows=[]

    with open(annotation_file, 'rb') as af:
        annotations=pickle.load(af)
    
    with open(text_file, 'rb') as tf:
        all_texts_json=pickle.load(tf)
        
    cnt=0
    for doc_id, part_data in annotations.items():
    
        #if doc_id!='b38b3726bdb8fc28186f88217dfa7c7b': continue
        
        names=utils.get_names(part_data)
        
        if doc_id not in all_texts_json:
            continue
        text_json=all_texts_json[doc_id]
        conll_data=text_json['content']
        text=utils.conll_to_text(conll_data)
        
        properties=extract_properties(names, text, nlp)
        
        all_sys_rows.append(properties)
        
        part_info=utils.transform_part_info(part_data)
        all_gold_rows.append(part_info)
        
        cnt+=1
        if cnt==limit: break
        
    return all_sys_rows, all_gold_rows

In [23]:
se_system, se_gold = process_se_data(se_annotation_file, se_text_file, limit=500)

african american ad74870db8920e6986b76133117e7a82
african american 65d7c38113863b1bc6cee2e0fa38dc6c
hispanic/latin a51f975867abd1f18661d858fabf9c3f
white/caucascian 4026e33b1c2033fb5433740895842102
african american 6bb0bddf01c6230df34d1ce9cd7869a6
hispanic/latin 8695354c9046509b209a5aa92f48c199
african american 6bb255cee652c51bb6d34444b1a76705
african american e95dc411de0c45356810076ce4e50890
african american 42ab147825ee27d8a354517061216a91
african american 2a7e35c5897effac0205abe1f82af96d


In [24]:
count_per_attr=defaultdict(int)
for row in se_gold:
    for part, data in row.items():
        for attr in data.keys():
            count_per_attr[attr]+=1
            if attr=='ethnicity':
                print(part, data[attr], row)
print(count_per_attr)

Devin Anderson african american {'Devin Anderson': {'causeofdeath': 'accidental', 'ethnicity': 'african american', 'residence': 'louisiana', 'deathplace': 'louisiana', 'deathdate': '2016', 'age': 'teen 12-17', 'gender': 'male'}, 'Ahmad Antoine': {'residence': 'louisiana', 'ethnicity': 'african american', 'pastconviction': 'no', 'age': 'teen 12-17', 'gender': 'male'}}
Ahmad Antoine african american {'Devin Anderson': {'causeofdeath': 'accidental', 'ethnicity': 'african american', 'residence': 'louisiana', 'deathplace': 'louisiana', 'deathdate': '2016', 'age': 'teen 12-17', 'gender': 'male'}, 'Ahmad Antoine': {'residence': 'louisiana', 'ethnicity': 'african american', 'pastconviction': 'no', 'age': 'teen 12-17', 'gender': 'male'}}
Alejandro Martinez hispanic/latin {'Alejandro Martinez': {'ethnicity': 'hispanic/latin', 'deathplace': 'texas', 'deathdate': '2016', 'age': 'adult 18-64', 'gender': 'male'}}
Bryant Sanchez white/caucascian {'Bryant Sanchez': {'ethnicity': 'white/caucascian', 'r

### 5. Benchmark extractors

#### 5.1 Benchmark GVDB data

In [25]:
#utils.benchmark_extractors(all_sys_rows, all_gold_rows, attributes)

#### 5.2 Benchmark on SemEval data

In [26]:
se_attributes=['age', 'gender', 'pastconviction', 'educationlevel', 
               'causeofdeath', 'ethnicity', 'religion', 'birthplace', 'residence']

In [27]:
def map_age_to_group(sys_data):
    for doc in sys_data:
        for part, data in doc.items():
            if 'age' in data:
                age=int(data['age'])
                age_group=None
                if age<12:
                    age_group='child 0-11'
                elif age<18:
                    age_group='teen 12-17'
                elif age<65:
                    age_group='adult 18-64'
                else:
                    age_group='senior 65+'
                data['age']=age_group
    return sys_data

if 'age' in se_attributes:
    se_system=map_age_to_group(se_system)

In [28]:
utils.benchmark_extractors(se_system, se_gold, se_attributes, debug='ethnicity')

128 sys african american
129 sys african american
135 gold african american
135 gold african american
176 gold hispanic/latin
239 gold white/caucascian
270 gold african american
294 gold hispanic/latin
305 gold african american
335 gold african american
364 gold african american
403 gold african american
420 sys african american
defaultdict(<class 'int'>, {'age': 192, 'educationlevel': 23, 'gender': 103, 'residence': 96, 'causeofdeath': 87, 'religion': 2, 'pastconviction': 8}) defaultdict(<class 'int'>, {'pastconviction': 76, 'educationlevel': 14, 'causeofdeath': 136, 'residence': 154, 'age': 14, 'birthplace': 1, 'gender': 14, 'ethnicity': 3, 'religion': 2}) defaultdict(<class 'int'>, {'age': 547, 'gender': 653, 'residence': 307, 'causeofdeath': 401, 'educationlevel': 94, 'birthplace': 6, 'pastconviction': 24, 'religion': 15, 'ethnicity': 10})


({'age': 0.9320388349514563,
  'birthplace': 0.0,
  'causeofdeath': 0.3901345291479821,
  'educationlevel': 0.6216216216216216,
  'ethnicity': 0.0,
  'gender': 0.8803418803418803,
  'pastconviction': 0.09523809523809523,
  'religion': 0.5,
  'residence': 0.384},
 {'age': 0.2598105548037889,
  'birthplace': 0.0,
  'causeofdeath': 0.17827868852459017,
  'educationlevel': 0.19658119658119658,
  'ethnicity': 0.0,
  'gender': 0.13624338624338625,
  'pastconviction': 0.25,
  'religion': 0.11764705882352941,
  'residence': 0.23821339950372208},
 {'age': 0.4063492063492063,
  'birthplace': 0.0,
  'causeofdeath': 0.24472573839662448,
  'educationlevel': 0.29870129870129875,
  'ethnicity': 0.0,
  'gender': 0.23596792668957617,
  'pastconviction': 0.13793103448275862,
  'religion': 0.19047619047619047,
  'residence': 0.2940275650842266})

## * Next steps:
* prepare output for extrinsic evaluation

### 6. Prepare SE output to be run by the baselines
Desired format: pickle
{
    doc_id:
    {
        part_id:
        {
            prop: value, 
            prop2: value2,
            ...
        }
    }
}