In [1]:
# imports
import csv
import regex as re
import json
from collections import defaultdict, Counter
import utils
import en_coref_md
import pickle

In [2]:
import spacy

In [3]:
gvdb_articles_file = '../../gvdb-aggregated-db/Articles-with-extracted-info.tsv'

In [4]:
se_annotation_file='../data/input/partial/annotation/participants_input.p'
se_text_file='../data/input/partial/text/docs.p'

### 1. Load coreference model & resources

In [5]:
nlp = en_coref_md.load()

In [6]:
def get_sentence_offsets(doc):
    sent_offsets=[]
    for s in doc.sents:
        sent_offsets.append(tuple([s.start_char, s.end_char]))
    return sent_offsets

In [7]:
txt ='Hello. I am you. Who are you?'
doc = nlp(txt)
get_sentence_offsets(doc)

[(0, 6), (7, 16), (17, 29)]

In [8]:
emiel_resources_file='../resources/emiel.json'
with open (emiel_resources_file, 'r') as f:
    emiel_resources=json.load(f)

In [9]:
students_resources_file='../resources/students.json'
with open (students_resources_file, 'r') as f:
    students_resources=json.load(f)

In [10]:
students_resources.keys()

dict_keys(['CauseOfDeath', 'Residence', 'EducationLevel', 'Ethnicity', 'Religion', 'BirthPlace', 'PastConviction'])

### 2. Generate keywords

In [11]:
attributes=['age', 'race', 'gender']

In [12]:
"""
def get_most_frequent_words(rdr, attribute):
    frequency_count=defaultdict(int)
    for row in rdr:
        data=json.loads(row[3])
        for s in sections:
            for participant in data[s]:
                if participant[attribute]['value']:
                    value=participant[attribute]['value'].strip()
                    frequency_count[value]+=1
    return Counter(frequency_count).most_common(50)
    
frequency_count={}
with open(gvdb_articles_file, 'r') as csvfile:
    rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
    header=next(rdr)
    frequency_count['race']=get_most_frequent_words(rdr, 'race')
"""

'\ndef get_most_frequent_words(rdr, attribute):\n    frequency_count=defaultdict(int)\n    for row in rdr:\n        data=json.loads(row[3])\n        for s in sections:\n            for participant in data[s]:\n                if participant[attribute][\'value\']:\n                    value=participant[attribute][\'value\'].strip()\n                    frequency_count[value]+=1\n    return Counter(frequency_count).most_common(50)\n    \nfrequency_count={}\nwith open(gvdb_articles_file, \'r\') as csvfile:\n    rdr = csv.reader(csvfile, delimiter=\'\t\', quotechar=\'"\')\n    header=next(rdr)\n    frequency_count[\'race\']=get_most_frequent_words(rdr, \'race\')\n'

In [13]:
tmp_races = set(emiel_resources['ethnicity'])
tmp_races |= set({'black', 'latino', 'white', 'hispanic', 'asian', 'latina', 
       'african american', 'filipino', 'african-american', 'latinos',
      'palestinian', 'chinese-american', 'blacks', 'german-iranian'})
tmp_races |= set(students_resources['Ethnicity'])

races={o:o for o in tmp_races}


#genders={'male': {'he', 'boy', 'man', 'dude', 'guy', 'male'}, 'female': {'girl', 'woman', 'female'}}

genders={'he': 'male', 
         'boy': 'male', 
         'man': 'male', 
         'dude': 'male',
         'guy': 'male',
         'male': 'male',
         'brother': 'male',
         'father': 'male',
         'him': 'male',
         'himself': 'male',
         'she': 'female',
         'girl': 'female',
         'woman': 'female',
         'female': 'female',
         'sister': 'female',
         'mother': 'female',
         'her': 'female',
         'herself': 'female'}

occupations={o:o for o in emiel_resources['occupation-or-social-group']}

#religions={o:o for o in (emiel_resources['religion']+students_resources['Religion'])}
religions={o:o for o in students_resources['Religion']}

educations=students_resources['EducationLevel']
del educations['a']

convictions=students_resources['PastConviction']

causes=students_resources['CauseOfDeath']

age_patterns=[r'\d\d?-year-old', r', \d\d?']

### 3. Attribute value extractors

#### 3.1 Proximity based extractors

In [38]:
def attr_values_extractor(text, people_spans, coref_spans, sentence_offsets, patterns, a_dict=None):
    """Generic extractor that operates based on patterns."""
    
    extracted_pairs=defaultdict(list)
    for pattern in patterns:
        r=re.compile(pattern, re.IGNORECASE)
        values=r.finditer(text)
        for val_found in values:
            span=val_found.span()
            value=val_found.group()
            value=value.replace('-year-old', '').replace(',', '').strip()
            if a_dict and value in a_dict:
                value=a_dict[value]
            person, distance=utils.find_closest_person(span, 
                                                       people_spans, 
                                                       coref_spans, 
                                                       sentence_offsets,
                                                       min_dist=1000)
            if person:
                extracted_pairs[person].append(tuple([distance, value]))
    clean_pairs=utils.get_closest_value_per_person(extracted_pairs)
    return clean_pairs

In [39]:
def pattern_extractor(text, people_spans, coref_spans, sentence_offsets, patterns=None, pattern_data=None):
    if patterns: # if there are patterns given, fire the function immediately
        return attr_values_extractor(text, people_spans, coref_spans, sentence_offsets, patterns, pattern_data)
    
    #else create them first
    patterns=set()
    cs=set(pattern_data.keys())
    for o in cs:
        patterns.add(r'\b%s\b' % o)
    return attr_values_extractor(text, people_spans, coref_spans, sentence_offsets, patterns, pattern_data)

#### 3.2 Coreference based extractor

In [40]:
def attr_extractor_coref(clusters, names, values_json, debug=False): #, text, people_spans):
    
    person_data=defaultdict(list)

    if not clusters:
        return person_data
    
    for c in clusters:
        mentions=utils.stringify_cluster_mentions(c.mentions)
        for person_name in names:
            if utils.lookup_person_in_list(person_name, mentions):
                for m in c.mentions:
                    for txt in [m.text, m.lemma_]:
                        if txt.lower() in values_json.keys():
                            person_data[person_name].append(values_json[txt.lower()])
                            
    clean_data={}
    for person_name, gs in person_data.items():
        c=Counter(gs).most_common(1)[0][0]
        clean_data[person_name]=c
    return clean_data

#### 3.3 Run extraction of all properties for a document

In [41]:
def extract_properties(names, full_text, nlp):
    
    doc = nlp(full_text)
    # run coreference
    people_spans, coref_spans, clusters=utils.get_coref_spans(names, full_text, doc)

    sentence_offsets=utils.get_sentence_offsets(doc)
    
    # run individual extractors
    gender_extracted=attr_extractor_coref(clusters, names, genders)
#    gender_extracted=pattern_extractor(full_text,people_spans, coref_spans, pattern_data=genders)
    
    age_extracted=pattern_extractor(full_text, people_spans, coref_spans, sentence_offsets, patterns=age_patterns)
    
    race_extracted=pattern_extractor(full_text,people_spans, coref_spans, sentence_offsets, pattern_data=races)
#    race_extracted=attr_extractor_coref(clusters, names, races)
    
    occupation_extracted=pattern_extractor(full_text, people_spans, coref_spans, sentence_offsets, pattern_data=occupations)
    #attr_extractor_coref(clusters, names, occupations)
    #print(occupation_extracted)
    
    religion_extracted=pattern_extractor(full_text, people_spans, coref_spans, sentence_offsets, pattern_data=religions)
#    religion_extracted=attr_extractor_coref(clusters, names, religions)
    #if religion_extracted:
    #    print(religion_extracted)
    
    education_extracted=pattern_extractor(full_text, people_spans, coref_spans, sentence_offsets, pattern_data=educations)
    #education_extracted=attr_extractor_coref(clusters, names, educations)
    #if education_extracted:
    #    print(education_extracted)
        
    conviction_extracted=pattern_extractor(full_text, people_spans, coref_spans, sentence_offsets, pattern_data=convictions)
    #conviction_extracted=attr_extractor_coref(clusters, names, convictions)
    #if conviction_extracted:
    #    print(conviction_extracted)
    
    causes_extracted=pattern_extractor(full_text, people_spans, coref_spans, sentence_offsets, pattern_data=causes)
    #if causes_extracted:
    #    print(causes_extracted)  
    
    all_extracted={'age': age_extracted, 'ethnicity': race_extracted, 'gender': gender_extracted, 
                  'religion': religion_extracted, 'occupation': occupation_extracted, 
                   'educationlevel': education_extracted, 'causeofdeath': causes_extracted,
                  'pastconviction': conviction_extracted}
    
    #combine extractors
    combined=utils.singularize_data(all_extracted)
            
    return combined

In [42]:
# Test stuff

txt='Hello Peter Boer, 26 was shot. The white police guy failed.'
names=['Peter Boer']
extract_properties(names, txt, nlp)

{'Peter Boer': {'age': '26',
  'causeofdeath': 'intentional',
  'ethnicity': 'white',
  'occupation': 'police'}}

### 4. Run GVDB extractors

In [43]:
def process_gvdb_data(the_file, limit=200):

    all_gold_rows=[]
    all_sys_rows=[]

    with open(the_file, 'r') as csvfile:
        rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
        header=next(rdr)
        
        for c, row in enumerate(rdr):
            if c==limit: break

            full_text=row[2]
            data=json.loads(row[3])
            part_info=utils.get_participant_info(data)

            names=set(part_info.keys())
            if not len(names): continue

            system_data = extract_properties(names, full_text, nlp)
            all_sys_rows.append(system_data)
            all_gold_rows.append(part_info)
            
            c+=1
    return all_sys_rows, all_gold_rows

In [44]:
#all_sys_rows, all_gold_rows = process_gvdb_data(gvdb_articles_file, limit=200)

### 5. Run SE extractors

In [45]:
def conll_to_text(rows):
    tokens=[]
    for row in rows:
        elements=row.split('\t')
#        print(elements, elements[1])
        if elements[1]=='NEWLINE':
            elements[1]='\n'
        tokens.append(elements[1])
    text=' '.join(tokens)
    return text

In [46]:
def get_names(part_data):
    names=[]
    for part_id, part_info in part_data.items():
        if 'Name' in part_info.keys() and part_info['Name'].strip():
            names.append(part_info['Name'].strip())
    return names

In [47]:
def transform_part_info(data):
    new_data={}
    for part_id, part in data.items():
        if 'Name' in part and part['Name'].strip():
            name=part['Name'].strip()
            del part['Name']
            new_part={}
            for k,v in part.items():
                k=k.strip().lower()
                new_part[k]=v.strip().lower()
            new_data[name]=new_part
    return new_data

In [48]:
def process_se_data(annotation_file, text_file, limit=200):
    all_gold_rows=[]
    all_sys_rows=[]

    with open(annotation_file, 'rb') as af:
        annotations=pickle.load(af)
    
    with open(text_file, 'rb') as tf:
        all_texts_json=pickle.load(tf)
        
    cnt=0
    for doc_id, part_data in annotations.items():

        names=get_names(part_data)
        
        if doc_id not in all_texts_json:
            continue
        text_json=all_texts_json[doc_id]
        conll_data=text_json['content']
        text=conll_to_text(conll_data)
        
        properties=extract_properties(names, text, nlp)
        
        all_sys_rows.append(properties)
        
        part_info=transform_part_info(part_data)
        all_gold_rows.append(part_info)
        
        cnt+=1
        if cnt==limit: break
        
    return all_sys_rows, all_gold_rows

In [49]:
se_system, se_gold = process_se_data(se_annotation_file, se_text_file, limit=1000)

In [50]:
len(se_system)

456

In [51]:
count_per_attr=defaultdict(int)
for row in se_gold:
    for part, data in row.items():
        for attr in data.keys():
            count_per_attr[attr]+=1
            if attr=='ethnicity':
                print(part, data[attr])
print(count_per_attr)

Devin Anderson african american
Ahmad Antoine african american
Alejandro Martinez hispanic/latin
Bryant Sanchez white/caucascian
Stanley Greene african american
Luis Canseco hispanic/latin
Lydell McLaurin african american
James Graham african american
Gary Holmes african american
Tyre King african american
defaultdict(<class 'int'>, {'residence': 403, 'age': 739, 'gender': 756, 'educationlevel': 117, 'causeofdeath': 488, 'deathplace': 565, 'deathdate': 565, 'birthplace': 6, 'pastconviction': 32, 'religion': 17, 'medicalcondition': 8, 'ethnicity': 10})


### 5. Benchmark extractors

In [28]:
def benchmark_extractors(system, gold, attributes, debug=False):
    assert len(system)==len(gold)
    tp=defaultdict(int)
    fp=defaultdict(int)
    fn=defaultdict(int)
    
    
    for index, gold_row in enumerate(gold):
        system_row=system[index]
        
        for part, gold_vals in gold_row.items():
            try:
                system_vals=system_row[part]
            except KeyError:
                system_vals={}
            
            for a in attributes:
                gold_val=''
                system_val=''
                if a in gold_vals:
                    gold_val=gold_vals[a].strip()
                if a in system_vals:
                    system_val=system_vals[a].strip()
                if gold_val and system_val:
                    if gold_val==system_val:
                        tp[a]+=1
                    else:
                        fp[a]+=1
                        fn[a]+=1
                elif gold_val:
                    fn[a]+=1
                elif system_val:
                    fp[a]+=1
    
    recall={}
    prec={}
    f1={}
    
    print(tp,fp, fn)
    for a in attributes:
        prec[a]=tp[a]/(tp[a]+fp[a])
        recall[a]=tp[a]/(tp[a]+fn[a])
        f1[a]=2*prec[a]*recall[a]/(prec[a]+recall[a])
    return prec, recall, f1

#### 5.1 Benchmark GVDB data

In [29]:
benchmark_extractors(all_sys_rows, all_gold_rows, attributes)

NameError: name 'all_sys_rows' is not defined

#### 5.2 Benchmark on SemEval data

In [58]:
se_attributes=['age', 'gender', 'pastconviction', 'educationlevel', 
               'causeofdeath', 'ethnicity', 'religion']

In [56]:
def map_age_to_group(sys_data):
    for doc in sys_data:
        for part, data in doc.items():
            if 'age' in data:
                age=int(data['age'])
                age_group=None
                if age<12:
                    age_group='child 0-11'
                elif age<18:
                    age_group='teen 12-17'
                elif age<65:
                    age_group='adult 18-64'
                else:
                    age_group='senior 65+'
                data['age']=age_group
    return sys_data
if 'age' in se_attributes:
    se_system=map_age_to_group(se_system)

ValueError: invalid literal for int() with base 10: 'teen 12-17'

In [59]:
benchmark_extractors(se_system, se_gold, se_attributes, debug=True)

defaultdict(<class 'int'>, {'age': 192, 'educationlevel': 23, 'gender': 103, 'causeofdeath': 87, 'pastconviction': 8}) defaultdict(<class 'int'>, {'pastconviction': 76, 'educationlevel': 14, 'causeofdeath': 136, 'age': 14, 'religion': 4, 'ethnicity': 16, 'gender': 14}) defaultdict(<class 'int'>, {'age': 547, 'gender': 653, 'causeofdeath': 401, 'educationlevel': 94, 'pastconviction': 24, 'religion': 17, 'ethnicity': 10})


ZeroDivisionError: float division by zero

## * Next steps:
* benchmark
* other properties: religion, school, deathplace, residence, birthplace, occupation,
* evaluate extrinsically