### 1.Prepare to run

#### 1.1 Import modules 

In [1]:
# import existing ones
import csv
import regex as re
from collections import defaultdict, Counter
import en_coref_md
import glob
import pickle

In [2]:
# import modules we created
import utils

#### 1.2 Set data directories

In [3]:
# Important: set which partition to work with!
which_partition='full'

In [4]:
# SemEval input files (annotations and text)

# annotation data
se_annotation_dir='../data/input/%s/annotation' % which_partition
se_annotation_file='%s/participants_input.p' % se_annotation_dir

# text documents
se_partial_text_file='../data/input/partial/text/docs.p'
se_all_documents_path='../data/input/full/text'

# SemEval output paths
se_output_dir='../data/tmp/extracted_data/%s' % which_partition
se_output_file='%s/extracted_data.p' % se_output_dir

In [5]:
# GVDB data file
gvdb_articles_file = '../../gvdb-aggregated-db/Articles-with-extracted-info.tsv'

#### 1.3 Load lexicons

In [6]:
emiel_resources_file='../resources/emiel.json'
emiel_resources=utils.load_json(emiel_resources_file)

students_resources_file='../resources/students.json'
students_resources=utils.load_json(students_resources_file)

genders_resources_file='../resources/genders.json'
genders=utils.load_json(genders_resources_file)

#### 1.4 Load coreference model

In [7]:
nlp = en_coref_md.load()

### 2. Generate keywords

#### 2.1 Define mapping of properties (lower-to-capitalletter)

In [8]:
map_properties ={
                    'age': 'Age', 
                    'ethnicity': 'Ethnicity',
                    #'occupation': 'Occupation',
                    'causeofdeath': 'CauseOfDeath',
                    'religion': 'Religion',
                    'educationlevel': 'EducationLevel',
                    'pastconviction': 'PastConviction',
                    'residence': 'Residence',
                    'birthplace': 'BirthPlace',
                    'gender': 'Gender'
                }

In [9]:
se_attributes=list(map_properties.keys())

#### 2.2 Define how to process which attributes

In [10]:
# Process based on patterns and keywords
pattern_attrs=['age','ethnicity', 'religion', 'educationlevel', 
               'causeofdeath', 'pastconviction', 'birthplace', 'residence']

# Process based on coreference
coref_attrs=['gender']

#### 2.3 Prepare the lexicons per attribute

##### 2.3.1 Prepare pattern keywords

In [11]:
pattern_data=defaultdict(dict)
for k,v in students_resources.items():
    new_v=utils.remove_stopwords(v)
    pattern_data[k.lower()]=new_v

#pattern_data['occupation']={o:o for o in emiel_resources['occupation-or-social-group']}

#pattern_data['ethnicity'] |= set(emiel_resources['ethnicity'])
#pattern_data['ethnicity'] |= set({'black', 'latino', 'white', 'hispanic', 'asian', 'latina', 
#       'african american', 'filipino', 'african-american', 'latinos',
#      'palestinian', 'chinese-american', 'blacks', 'german-iranian'})


#religions={o:o for o in (emiel_resources['religion']+students_resources['Religion'])}

##### 2.3.2 Prepare patterns

In [12]:
# Define exact patterns per attribute
patterns={'age': [r'\d\d?-year-old', r', \d\d?']}

In [13]:
for attr, keyword_json in pattern_data.items(): 
    attr_patterns=set()
    keywords=set(keyword_json.keys())
    for keyword in keywords:
        attr_patterns.add(r'\b%s\b' % keyword)
    patterns[attr]=attr_patterns

### 3. Attribute value extractors

#### 3.1 Proximity based extractor

In [14]:
def attr_extractor_proximity(text, people_spans, coref_spans, sentence_offsets, patterns, a_dict=None):
    """Generic extractor that operates based on patterns."""
    
    extracted_pairs=defaultdict(list)
    for pattern in patterns:
        r=re.compile(pattern, re.IGNORECASE)
        values=r.finditer(text)
        for val_found in values:
            span=val_found.span()
            value=val_found.group()
            value=value.replace('-year-old', '').replace(',', '').strip()
            if a_dict and value in a_dict:
                value=a_dict[value]
            person, distance=utils.find_closest_person(span, 
                                                       people_spans, 
                                                       coref_spans, 
                                                       sentence_offsets,
                                                       min_dist=1000)
            if person:
                extracted_pairs[person].append(tuple([distance, value]))
    clean_pairs=utils.get_closest_value_per_person(extracted_pairs)
    return clean_pairs

#### 3.2 Coreference based extractor

In [15]:
def attr_extractor_coref(clusters, names, values_json, debug=False): #, text, people_spans):
    
    person_data=defaultdict(list)

    if not clusters:
        return person_data
    
    for c in clusters:
        mentions=utils.stringify_cluster_mentions(c.mentions)
        for person_name in names:
            if utils.lookup_person_in_list(person_name, mentions):
                for m in c.mentions:
                    for txt in [m.text, m.lemma_]:
                        if txt.lower() in values_json.keys():
                            person_data[person_name].append(values_json[txt.lower()])
                            
    clean_data={}
    for person_name, gs in person_data.items():
        c=Counter(gs).most_common(1)[0][0]
        clean_data[person_name]=c
    return clean_data

#### 3.3 Run extraction of all properties for a document

In [16]:
def extract_properties(names, full_text, nlp):
    
    doc = nlp(full_text)
    # run coreference
    people_spans, coref_spans, clusters=utils.get_coref_spans(names, full_text, doc)
    
    sentence_offsets=utils.get_sentence_offsets(doc)
    
    all_extracted={}
    for attribute in pattern_attrs:
        all_extracted[attribute]=attr_extractor_proximity(full_text, 
                                                           people_spans, 
                                                           coref_spans, 
                                                           sentence_offsets, 
                                                           a_dict=pattern_data[attribute],
                                                           patterns=patterns[attribute])
    
    all_extracted['gender']=attr_extractor_coref(clusters, names, genders)
    
    #combine extractors
    combined=utils.singularize_data(all_extracted)
            
    return combined

In [17]:
# Test stuff

txt='Hello Peter Boer, 26 was shot in church from Houston. The white police guy failed.'
names=['Peter Boer']
extract_properties(names, txt, nlp)

{'Peter Boer': {'age': '26',
  'ethnicity': 'white/caucascian',
  'religion': 'christian',
  'causeofdeath': 'intentional',
  'residence': 'texas'}}

### 4. Run GVDB extractors

In [18]:
def process_gvdb_data(the_file, limit=200):

    all_gold_rows=[]
    all_sys_rows=[]

    with open(the_file, 'r') as csvfile:
        rdr = csv.reader(csvfile, delimiter='\t', quotechar='"')
        header=next(rdr)
        
        for c, row in enumerate(rdr):
            if c==limit: break

            full_text=row[2]
            data=json.loads(row[3])
            part_info=utils.get_participant_info(data)

            names=set(part_info.keys())
            if not len(names): continue

            system_data = extract_properties(names, full_text, nlp)
            all_sys_rows.append(system_data)
            all_gold_rows.append(part_info)
            
            c+=1
    return all_sys_rows, all_gold_rows

In [19]:
#all_sys_rows, all_gold_rows = process_gvdb_data(gvdb_articles_file, limit=200)

### 5. Run SE extractors

In [20]:

annotations=utils.load_pickle(se_annotation_file)
print(annotations['2b10cc753152f0edaacf76314ab6ceec']['6306e4f5f77791a77e9bd9ea3efc9f17'])


{'Name': ' Ryan Morales', 'Age': ' 2', 'Age Group': ' Child 0-11', 'Gender': ' Male', 'Status': ' Killed', 'Type': ' Victim', 'DeathPlace': 'Texas', 'DeathDate': '2017'}


In [21]:
def process_se_data(annotation_file, text_file):

    annotations=utils.load_pickle(annotation_file)
    all_texts_json=utils.load_pickle(text_file)
    #print(annotations['2b10cc753152f0edaacf76314ab6ceec']['6306e4f5f77791a77e9bd9ea3efc9f17'])
    
    all_gold_rows=[]
    all_sys_rows=[]
        
    storable = {}
     
    for doc_id, part_data in annotations.items():
        storable[doc_id]={}
        
        names=utils.get_names(part_data)
        if doc_id not in all_texts_json:
            text=utils.load_text_from_json('%s/%s.json' % (se_all_documents_path, doc_id))
        else:
            text_json=all_texts_json[doc_id]
            conll_data=text_json['content']
            text=utils.conll_to_text(conll_data)
        
        properties=extract_properties(names, text, nlp)
        all_sys_rows.append(properties)
             
        for part_id, a_part_info in part_data.items():
            
            if 'Name' not in a_part_info.keys() or not a_part_info['Name']: 
                print(doc_id, part_id, a_part_info.keys())
                input('continue?')
                continue

            name=a_part_info['Name'].strip()

            these_properties={}
            if name in properties.keys():
                for k,v in properties[name].items():
                    these_properties[map_properties[k]]=v

            these_properties['Name']=name
            storable[doc_id][part_id]=these_properties

        some_info=utils.transform_part_info(part_data)
        all_gold_rows.append(some_info)
        
        
    return all_sys_rows, all_gold_rows, storable

In [22]:
se_system, se_gold, data_to_store = process_se_data(se_annotation_file, 
                                                    se_partial_text_file)

In [23]:
utils.count_per_attribute(se_gold)

defaultdict(int,
            {'age': 11845,
             'age group': 13218,
             'gender': 12958,
             'status': 13308,
             'type': 13382,
             'deathplace': 6026,
             'deathdate': 6026,
             'relationship': 949})

### 5. Benchmark extractors

#### 5.1 Benchmark GVDB data

In [24]:
# gvdb_attributes=['age', 'race', 'gender']
# utils.benchmark_extractors(all_sys_rows, all_gold_rows, gvdb_attributes)

#### 5.2 Benchmark on SemEval data

In [25]:
if 'age' in se_attributes:
    se_system=utils.map_age_to_group(se_system)

In [26]:
utils.benchmark_extractors(se_system, se_gold, se_attributes, debug='ethnicity')

40 sys white
285 sys african american
431 sys white/caucascian
481 sys hispanic/latin
482 sys white/caucascian
544 sys african american
548 sys african american
591 sys african american
727 sys white
727 sys white
950 sys african american
1007 sys white/caucascian
1172 sys african american
1204 sys african american
1205 sys african american
1329 sys african american
1443 sys african american
1500 sys african american
1577 sys white/caucascian
1784 sys white/caucascian
1888 sys african american
1967 sys african american
2209 sys african american
2372 sys african american
2558 sys african american
2559 sys african american
2560 sys african american
2669 sys african american
2671 sys african american
2759 sys african american
2900 sys african american
2929 sys african american
2941 sys african american
2972 sys white
2993 sys white
3155 sys african american
3157 sys african american
3210 sys african american
3263 sys white/caucascian
3289 sys white/caucascian
3410 sys african american
342

ZeroDivisionError: division by zero

### 6. Prepare SE output to be run by the baselines

Desired format: pickle

```
{
    doc_id:
    {
        part_id:
        {
            prop: value, 
            prop2: value2,
            ...
        }
    }
}
```

#### 6.1 Process the original data

In [27]:
utils.dump_pickle(data_to_store, se_output_file)

#### 6.2 Process the altered versions

In [28]:
not_found_data=defaultdict(set)
for f in glob.glob('%s/*.p' % se_annotation_dir):
    #if f.strip()!=se_annotation_file:
    print(f, se_annotation_file)
    altered_data=utils.load_pickle(f)

    not_found=0
    new_data=defaultdict(dict)
    for doc_id, doc_data in altered_data.items():
        doc_property_data=data_to_store[doc_id]

        for part_id, part_data in doc_data.items():
            if part_id not in doc_property_data.keys():

                #print('NOT FOUND IN THE EXTRACTED DATA. document:', doc_id, '; participant:', part_id)
                not_found+=1
                not_found_data[doc_id].add(part_id)
                continue

            new_part_data={}
            if 'Name' in part_data:
                new_part_data['Name'] = part_data['Name']
            for k, v in doc_property_data[part_id].items():
                if k!='Name':
                    new_part_data[k]=v
            new_data[doc_id][part_id]=new_part_data

    print('not found number:', not_found)
    new_file_path='%s/%s' % (se_output_dir, f.split('/')[-1])
    print(new_file_path)

    utils.dump_pickle(new_data, new_file_path)

../data/input/full/annotation/participants_input.p ../data/input/full/annotation/participants_input.p
not found number: 0
extracted_data/full/participants_input.p
../data/input/full/annotation/participants_samefirstname.p ../data/input/full/annotation/participants_input.p
not found number: 0
extracted_data/full/participants_samefirstname.p
../data/input/full/annotation/participants_samename.p ../data/input/full/annotation/participants_input.p
not found number: 0
extracted_data/full/participants_samename.p
../data/input/full/annotation/participants_samelastname.p ../data/input/full/annotation/participants_input.p
not found number: 0
extracted_data/full/participants_samelastname.p


#### Debug missing participants

In [None]:
doc_id='9897ff64ff1c41541dd9c4bdb3e2026b'

In [None]:
data_to_store[doc_id]

In [None]:
doc2inc_file='../data/tmp/auxiliary_data/doc2inc.p'
with open(doc2inc_file, 'rb') as f:
    doc2inc=pickle.load(f)

In [None]:
doc2inc[doc_id]

In [None]:
not_found_data