In [1]:
# imports
import csv
import regex as re
import json
from collections import defaultdict, Counter
import utils
import en_coref_md

In [2]:
gvdb_articles_file = '../../gvdb-aggregated-db/Articles-with-extracted-info.tsv'

### 1. Load coreference model & resources

In [3]:
nlp = en_coref_md.load()

In [4]:
emiel_resources_file='../resources/emiel.json'
with open (emiel_resources_file, 'r') as f:
    emiel_resources=json.load(f)

In [65]:
students_resources_file='../resources/students.json'
with open (students_resources_file, 'r') as f:
    students_resources=json.load(f)

In [26]:
students_resources.keys()

dict_keys(['CauseOfDeath', 'Residence', 'EducationLevel', 'Ethnicity', 'Religion', 'BirthPlace', 'PastConviction'])

### 2. Generate keywords

In [7]:
attributes=['age', 'race', 'gender']

In [8]:
"""
def get_most_frequent_words(rdr, attribute):
    frequency_count=defaultdict(int)
    for row in rdr:
        data=json.loads(row[3])
        for s in sections:
            for participant in data[s]:
                if participant[attribute]['value']:
                    value=participant[attribute]['value'].strip()
                    frequency_count[value]+=1
    return Counter(frequency_count).most_common(50)
    
frequency_count={}
with open(gvdb_articles_file, 'r') as csvfile:
    rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
    header=next(rdr)
    frequency_count['race']=get_most_frequent_words(rdr, 'race')
"""

'\ndef get_most_frequent_words(rdr, attribute):\n    frequency_count=defaultdict(int)\n    for row in rdr:\n        data=json.loads(row[3])\n        for s in sections:\n            for participant in data[s]:\n                if participant[attribute][\'value\']:\n                    value=participant[attribute][\'value\'].strip()\n                    frequency_count[value]+=1\n    return Counter(frequency_count).most_common(50)\n    \nfrequency_count={}\nwith open(gvdb_articles_file, \'r\') as csvfile:\n    rdr = csv.reader(csvfile, delimiter=\'\t\', quotechar=\'"\')\n    header=next(rdr)\n    frequency_count[\'race\']=get_most_frequent_words(rdr, \'race\')\n'

In [66]:
tmp_races = set(emiel_resources['ethnicity'])
tmp_races |= set({'black', 'latino', 'white', 'hispanic', 'asian', 'latina', 
       'african american', 'filipino', 'african-american', 'latinos',
      'palestinian', 'chinese-american', 'blacks', 'german-iranian'})
tmp_races |= set(students_resources['Ethnicity'])

races={o:o for o in tmp_races}


#genders={'male': {'he', 'boy', 'man', 'dude', 'guy', 'male'}, 'female': {'girl', 'woman', 'female'}}

genders={'he': 'male', 
         'boy': 'male', 
         'man': 'male', 
         'dude': 'male',
         'guy': 'male',
         'male': 'male',
         'brother': 'male',
         'father': 'male',
         'him': 'male',
         'himself': 'male',
         'she': 'female',
         'girl': 'female',
         'woman': 'female',
         'female': 'female',
         'sister': 'female',
         'mother': 'female',
         'her': 'female',
         'herself': 'female'}

occupations={o:o for o in emiel_resources['occupation-or-social-group']}

#religions={o:o for o in (emiel_resources['religion']+students_resources['Religion'])}
religions={o:o for o in students_resources['Religion']}

educations=students_resources['EducationLevel']
del educations['a']

convictions=students_resources['PastConviction']

causes=students_resources['CauseOfDeath']

age_patterns=[r'\d\d?-year-old', r', \d\d?']

### 3. Attribute value extractors

In [41]:
def attr_values_extractor(text, people_spans, patterns, a_dict=None):
    """Generic extractor that operates based on patterns."""
    
    extracted_pairs=defaultdict(list)
    for pattern in patterns:
        r=re.compile(pattern, re.IGNORECASE)
        values=r.finditer(text)
        for val_found in values:
            span=val_found.span()
            value=val_found.group()
            value=value.replace('-year-old', '').replace(',', '').strip()
            if a_dict and value in a_dict:
                value=a_dict[value]
            person, distance=utils.find_closest_person(span, people_spans)
            if person:
                extracted_pairs[person].append(tuple([distance, value]))
    clean_pairs=utils.get_closest_value_per_person(extracted_pairs)
    return clean_pairs

In [30]:
def age_extractor(text, people_spans):
    patterns=[r'\d\d?-year-old', r', \d\d?']
    return attr_values_extractor(text, people_spans, patterns)

In [67]:
def pattern_extractor(text, people_spans, patterns=None, pattern_data=None):
    if patterns: # if there are patterns given, fire the function immediately
        return attr_values_extractor(text, people_spans, patterns, pattern_data)
    #else create them first
    patterns=set()
    cs=set(pattern_data.keys())
    for o in cs:
        patterns.add(r'\b%s\b' % o)
    return attr_values_extractor(text, people_spans, patterns, pattern_data)

In [43]:
def attr_extractor_coref(clusters, names, values_json, debug=False): #, text, people_spans):
    
    person_data=defaultdict(list)

    if not clusters:
        return person_data
    
    for c in clusters:
        mentions=utils.stringify_cluster_mentions(c.mentions)
        for person_name in names:
            if utils.lookup_person_in_list(person_name, mentions):
                for m in c.mentions:
                    for txt in [m.text, m.lemma_]:
                        if txt.lower() in values_json.keys():
                            person_data[person_name].append(values_json[txt.lower()])
                            
    clean_data={}
    for person_name, gs in person_data.items():
        c=Counter(gs).most_common(1)[0][0]
        clean_data[person_name]=c
    return clean_data

In [68]:
def extract_properties(names, full_text, nlp):
    
    # run coreference
    people_spans, clusters=utils.get_coref_spans(names, full_text, nlp)

    # run individual extractors
    gender_extracted=attr_extractor_coref(clusters, names, genders)

    age_extracted=pattern_extractor(full_text,people_spans, patterns=age_patterns)
    race_extracted=pattern_extractor(full_text,people_spans, pattern_data=races)

    occupation_extracted=pattern_extractor(full_text,people_spans, pattern_data=occupations)
    #attr_extractor_coref(clusters, names, occupations)
    #print(occupation_extracted)
    
    religion_extracted=pattern_extractor(full_text, people_spans, pattern_data=religions)
#    religion_extracted=attr_extractor_coref(clusters, names, religions)
    if religion_extracted:
        print(religion_extracted)
    
    education_extracted=pattern_extractor(full_text, people_spans, pattern_data=educations)
    #education_extracted=attr_extractor_coref(clusters, names, educations)
    if education_extracted:
        print(education_extracted)
        
    conviction_extracted=pattern_extractor(full_text, people_spans, pattern_data=convictions)
    #conviction_extracted=attr_extractor_coref(clusters, names, convictions)
    if conviction_extracted:
        print(conviction_extracted)
    
    causes_extracted=pattern_extractor(full_text, people_spans, pattern_data=causes)
    if causes_extracted:
        print(causes_extracted)  
    
    all_extracted={'age': age_extracted, 'race': race_extracted, 'gender': gender_extracted, 
                  'religion': religion_extracted, 'occupation': occupation_extracted, 
                   'education': education_extracted}
    
    #combine extractors
    combined=utils.singularize_data(all_extracted)
            
    return combined

### 4. Run extractors

In [69]:
def process_gvdb_data(the_file, limit=200):

    all_gold_rows=[]
    all_sys_rows=[]

    with open(the_file, 'r') as csvfile:
        rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
        header=next(rdr)
        
        for c, row in enumerate(rdr):
            if c==limit: break

            full_text=row[2]
            data=json.loads(row[3])
            part_info=utils.get_participant_info(data)

            names=set(part_info.keys())
            if not len(names): continue

            system_data = extract_properties(names, full_text, nlp)
            all_sys_rows.append(system_data)
            all_gold_rows.append(part_info)
            
            c+=1
    return all_sys_rows, all_gold_rows

In [70]:
all_sys_rows, all_gold_rows = process_gvdb_data(gvdb_articles_file, limit=200)

{'Sean Bolton': 'Intentional'}
{'Renea Lloyd': 'Accidental'}
{'Michael Habay': 'Intentional'}
{'Matthew Moore': 'Yes'}
{'Chrystol Moore': 'Intentional'}
{'bear': 'Intentional'}
{'Kendrick Armond Brown': 'Intentional'}


### 5. Benchmark extractors

In [71]:
def benchmark_extractors(system, gold, attributes):
    assert len(system)==len(gold)
    tp=defaultdict(int)
    fp=defaultdict(int)
    fn=defaultdict(int)
    
    
    for index, gold_row in enumerate(gold):
        system_row=system[index]
        
        for part, system_vals in system_row.items():
            gold_vals=gold_row[part]
            
            for a in attributes:
                gold_val=''
                system_val=''
                if a in gold_vals:
                    gold_val=gold_vals[a].strip()
                if a in system_vals:
                    system_val=system_vals[a].strip()
                if gold_val and system_val:
                    if gold_val==system_val:
                        tp[a]+=1
                    else:
                        fp[a]+=1
                        fn[a]+=1
                elif gold_val:
                    fn[a]+=1
                elif system_val:
                    fp[a]+=1
    
    recall={}
    prec={}
    f1={}
    
    print(tp,fp, fn)
    for a in attributes:
        prec[a]=tp[a]/(tp[a]+fp[a])
        recall[a]=tp[a]/(tp[a]+fn[a])
        f1[a]=2*prec[a]*recall[a]/(prec[a]+recall[a])
    return prec, recall, f1

In [72]:
benchmark_extractors(all_sys_rows, all_gold_rows, attributes)

defaultdict(<class 'int'>, {'age': 96, 'gender': 71, 'race': 1}) defaultdict(<class 'int'>, {'gender': 19, 'age': 1, 'race': 1}) defaultdict(<class 'int'>, {'age': 8, 'gender': 14, 'race': 4})


({'age': 0.9896907216494846, 'gender': 0.7888888888888889, 'race': 0.5},
 {'age': 0.9230769230769231, 'gender': 0.8352941176470589, 'race': 0.2},
 {'age': 0.955223880597015,
  'gender': 0.8114285714285714,
  'race': 0.28571428571428575})

## * Next steps:
* benchmark
* other properties: religion, school, deathplace, residence, birthplace, occupation,
* evaluate extrinsically

In [None]:
attr_lexica['religion']