In [1]:
# imports
import csv
import regex as re
import json
from collections import defaultdict, Counter

In [2]:
gvdb_articles_file = '../../gvdb-aggregated-db/Articles-with-extracted-info.tsv'

In [3]:
sections=['shooter-section', 'victim-section']

### 0. Load coreference

In [4]:
import en_coref_md
nlp = en_coref_md.load()

### 1. General purpose functions

#### 1a. Generate frequency counts

In [5]:
def get_most_frequent_words(rdr, attribute):
    frequency_count=defaultdict(int)
    for row in rdr:
        data=json.loads(row[3])
        for s in sections:
            for participant in data[s]:
                if participant[attribute]['value']:
                    value=participant[attribute]['value'].strip()
                    frequency_count[value]+=1
    return Counter(frequency_count).most_common(50)

In [6]:
frequency_count={}
with open(gvdb_articles_file, 'r') as csvfile:
    rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
    header=next(rdr)
    frequency_count['race']=get_most_frequent_words(rdr, 'race')

In [70]:
races={'black', 'latino', 'white', 'hispanic', 'asian', 'latina', 
       'african american', 'filipino', 'african-american', 'latinos',
      'palestinian', 'chinese-american', 'blacks', 'german-iranian'}

#genders={'male': {'he', 'boy', 'man', 'dude', 'guy', 'male'}, 'female': {'girl', 'woman', 'female'}}

genders={'he': 'male', 
         'boy': 'male', 
         'man': 'male', 
         'dude': 'male',
         'guy': 'male',
         'male': 'male',
         'brother': 'male',
         'father': 'male',
         'him': 'male',
         'himself': 'male',
         'she': 'female',
         'girl': 'female',
         'woman': 'female',
         'female': 'female',
         'sister': 'female',
         'mother': 'female',
         'her': 'female',
         'herself': 'female'}

In [34]:
def get_participant_info(data):
    part_info={}
    for s in sections:
        for participant in data[s]:
            if participant['name']['value']:
                name=participant['name']['value'].strip()
                if name in part_info:
                    continue
                info={}
                if participant['gender']:
                    info['gender']=participant['gender']
                if participant['age']['value']:
                    info['age']=participant['age']['value']
                if participant['race']['value']:
                    info['race']=participant['race']['value']
                part_info[name]=info
    return part_info

In [35]:
def get_people_spans(names, text):
    spans=defaultdict(list)
    for name in names:
        oc=[[m.start(),m.end()] for m in re.finditer(re.escape(name), text)]
        spans[name]=oc
    return spans

In [36]:
def stringify(mentions):
    stringy=[]
    for m in mentions:
        stringy.append(m.text)
    return stringy

In [42]:
def get_coref_spans(names, text):
    doc = nlp(text)
    
    spans=get_people_spans(names, text)
    
    if not doc._.has_coref:
        return spans, None

    clusters=doc._.coref_clusters
    for c in clusters:
        mentions=stringify(c.mentions)
        for person_name in names:
            if person_name in mentions:
                for m in c.mentions:
                    oc=[m.start_char, m.end_char]
                    if oc not in spans[person_name]:
                        spans[person_name].append(oc)
    return spans, clusters

In [38]:
def find_closest_person(attr_span, people_spans):
    min_dist=10
    closest_person=None
    for name, person_spans in people_spans.items():
        for ps in person_spans:
            # make sure to exclude cases where the attribute and the name overlap
            if (ps[0]<=attr_span[0] and ps[1]>=attr_span[1]) or (attr_span[0]<=ps[0] and attr_span[1]>=ps[1]):
                continue
            if ps[0]<attr_span[0]: # person mentioned before the attribute
                dist=attr_span[0]-ps[1]
            else: # person mentioned after the attribute
                dist=ps[0]-attr_span[1]
            if dist<min_dist:
                min_dist=dist
                closest_person=name
    return closest_person, min_dist

In [39]:
def get_closest_value_per_person(pairs):
    clean_pairs={}
    for person, person_pairs in pairs.items():
        clean_pairs[person]=sorted(person_pairs)[0][1]
    return clean_pairs

In [40]:
def attr_values_extractor(text, people_spans, patterns):
    
    #people_spans=get_people_spans(names, text)
    extracted_pairs=defaultdict(list)
    
    for pattern in patterns:
        r=re.compile(pattern, re.IGNORECASE)
        values=r.finditer(text)
        for val_found in values:
            span=val_found.span()
            value=val_found.group()
            #print(span, value)
            value=value.replace('-year-old', '').replace(',', '').strip()
            person, distance=find_closest_person(span, people_spans)
            if person:
                extracted_pairs[person].append(tuple([distance, value]))
    clean_pairs=get_closest_value_per_person(extracted_pairs)
    return clean_pairs

In [41]:
def age_extractor(text, people_spans):
    patterns=[r'\d\d?-year-old', r', \d\d?']
    return attr_values_extractor(text, people_spans, patterns)

In [16]:
def race_extractor(text, people_spans):
    patterns=set()
    for race in races:
        patterns.add(r'\b%s\b' % race)
    return attr_values_extractor(text, people_spans, patterns)

In [65]:
def lookup_person_in_list(name, ments):
    for m in ments:
        if name in m:
            return True
    return False

In [80]:
def gender_extractor(clusters, names): #, text, people_spans):
    person_genders=defaultdict(list)
    for c in clusters:
        mentions=stringify(c.mentions)
        for person_name in names:
            if lookup_person_in_list(person_name, mentions):
                for m in c.mentions:
                    for txt in [m.text, m.lemma_]:
                        if txt.lower() in genders.keys():
                            person_genders[person_name].append(genders[txt.lower()])
                            
    clean_genders={}
    for person_name, gs in person_genders.items():
        c=Counter(gs).most_common(1)[0][0]
        clean_genders[person_name]=c
    return clean_genders

In [81]:
def singularize_data(extractors_json):
    result=defaultdict(dict)
    for attribute, extracted in extractors_json.items():
        for name, value in extracted.items():
            result[name][attribute]=value
    return dict(result)

In [82]:
with open(gvdb_articles_file, 'r') as csvfile:
    rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
    header=next(rdr)
    for row in rdr:
        data=json.loads(row[3])
        part_info=get_participant_info(data)
        names=set(part_info.keys())
        if not len(names): continue
        full_text=row[2]
        people_spans, clusters=get_coref_spans(names, full_text)
        #print(people_spans)
        age_extracted=age_extractor(full_text,people_spans)
        race_extracted=race_extractor(full_text,people_spans)
        gender_extracted=gender_extractor(clusters, names)
        all_extracted={'age': age_extracted, 'race': race_extracted, 'gender': gender_extracted}
        combined=singularize_data(all_extracted)
#        if len(race_extracted.keys()):
        print(part_info, '\n', combined)
        input('continue?')

{'Joe Henderson': {'gender': 'Male', 'age': '26'}} 
 {'Joe Henderson': {'age': '26', 'gender': 'male'}}
continue?
{'Joe Henderson': {'gender': 'Male', 'age': '26'}} 
 {'Joe Henderson': {'age': '26', 'gender': 'male'}}
continue?
{'Joe Henderson': {'gender': 'Male', 'age': '26'}} 
 {'Joe Henderson': {'age': '26', 'gender': 'male'}}
continue?
{'Eight law enforcement officers': {}, 'Jean Falgout': {'gender': 'Male', 'age': '45-year-old'}} 
 {}
continue?
{'Joe Henderson': {'gender': 'Male', 'age': '26-year-old'}} 
 {'Joe Henderson': {'age': '26', 'gender': 'male'}}
continue?
{'Joe Henderson': {'gender': 'Male', 'age': '26-year-old'}} 
 {'Joe Henderson': {'age': '26', 'gender': 'male'}}
continue?
{'Joe Henderson': {'gender': 'Male', 'age': '26-year-old'}} 
 {'Joe Henderson': {'age': '26', 'gender': 'male'}}
continue?
{'Joe Henderson': {'gender': 'Male', 'age': '26'}} 
 {'Joe Henderson': {'age': '26', 'gender': 'male'}}
continue?
{'Joe Henderson': {'gender': 'Male', 'age': '26'}} 
 {'Joe Hend

TypeError: 'NoneType' object is not iterable

#### Improvements:
* integrate coreference
* make sure that the boundaries don't overlap, or even add PoS tagging
* more extractors (gender, religion, residence, etc.)