In [1]:
import glob
import hashlib
import json
import os
import pickle
import sys
from collections import defaultdict

import pandas as pd

import utils

In [2]:
# Locations of the SemEval-2018 auxiliary data; everything below hangs off
# the single root folder so that only one line needs editing to relocate it.
se_data_folder = 'auxiliary_data/semeval18_data'
se_test_data_folder = '%s/test_data' % (se_data_folder,)
se_trial_data_folder = '%s/trial_data' % (se_data_folder,)
se_frames_file = '%s/gva_frames' % (se_data_folder,)

# Output roots for the generated input and gold data.
# NOTE(review): gold_data_folder ends with '/', so paths derived from it with
# a '%s/...' template contain a double slash.
input_data_folder = '../data/input/full'
gold_data_folder = '../data/gold/full/'

In [3]:
# Load the frames table from the pickle stored at se_frames_file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
a_df = pd.read_pickle(se_frames_file)

In [4]:
# Number of rows in the loaded frames table.
len(a_df)

4580

In [5]:
# When True, participants that have no 'Name' key, or whose name is blank
# after stripping whitespace, are skipped when the participant tables are built.
exclude_empty_names=True

In [6]:
# Build the two participant tables from the frames in a_df:
#   participant_data_per_document: doc_id -> {participant_id: participant dict}
#   identical_participants: '<incident_uri>#<n>' -> set of participant ids that
#       denote the same participant across the documents of one incident.
participant_data_per_document = defaultdict(dict)
identical_participants = defaultdict(set)

for _, incident in a_df.iterrows():
    incident_uri = incident['incident_uri']
    incident_doc_ids = list(incident['hashed_ids'].values())
    incident_state = incident['state']
    # Last four characters of the date string (presumably the year).
    incident_year = incident['date'][-4:]

    # 1-based running number; it only advances for participants that are
    # kept, so skipped (nameless) participants do not consume a number.
    participant_no = 1
    for participant in incident['participants']:
        if exclude_empty_names and not (
                'Name' in participant and participant['Name'].strip()):
            continue

        # Killed participants are enriched in place with the incident's state
        # and year, so these fields become part of the hashed payload below.
        if 'Status' in participant and participant['Status'].strip() == 'Killed':
            participant['DeathPlace'] = incident_state
            participant['DeathDate'] = incident_year

        cluster_key = '%s#%s' % (incident_uri, participant_no)
        for doc_id in incident_doc_ids:
            # The id is the MD5 of document id, running number and the JSON
            # dump of the participant, so it differs per document.
            payload = '%s#%d#%s' % (doc_id, participant_no, json.dumps(participant))
            digest = hashlib.md5(payload.encode('utf-8'))
            participant_id = digest.hexdigest()
            participant_data_per_document[doc_id][participant_id] = participant
            identical_participants[cluster_key].add(participant_id)

        participant_no += 1


In [7]:
# Destination files: the pickled per-document participant table (input side),
# the pickled gold clusters, and the JSON rendering of those clusters.
input_participants_file = '%s/annotation/participants_input.p' % (input_data_folder,)
gold_participants_file = '%s/participants_gold.p' % (gold_data_folder,)
out_gold_file = '%s/participants.json' % (gold_data_folder,)

In [8]:
# Persist the per-document participant table as a binary pickle.
with open(input_participants_file, 'wb') as input_pickle:
    pickle.dump(participant_data_per_document, input_pickle)

In [9]:
# Persist the gold clusters (cluster key -> set of participant ids) as a pickle.
with open(gold_participants_file, 'wb') as gold_pickle:
    pickle.dump(identical_participants, gold_pickle)

In [10]:
# Convert the gold clusters with the project helper before writing them as JSON
# (presumably because the set values are not JSON-serialisable — TODO confirm
# against utils.transform_gold_to_json).
json_clusters=utils.transform_gold_to_json(identical_participants)

In [11]:
# Write the converted gold clusters to the JSON gold file.
with open(out_gold_file, 'w') as gold_json:
    json.dump(json_clusters, gold_json)

### 2. Increase ambiguity of data

#### 2a) Increase ambiguity by giving everyone the same first name

In [12]:
# Variant of the input annotations produced by passing one shared first name
# to the project helper.
new_first_name = "John"
gva_samefirstname_annotations = '%s/annotation/participants_samefirstname.p' % input_data_folder

utils.create_ambiguous_data(
    input_participants_file,
    gva_samefirstname_annotations,
    new_firstname=new_first_name,
)

#### 2b) Increase ambiguity by giving everyone the same last name

In [13]:
# Variant of the input annotations produced by passing one shared last name
# to the project helper.
new_last_name = "Smith"
gva_samelastname_annotations = '%s/annotation/participants_samelastname.p' % input_data_folder

utils.create_ambiguous_data(
    input_participants_file,
    gva_samelastname_annotations,
    new_lastname=new_last_name,
)

#### 2c) Increase ambiguity by giving everyone the same name

In [14]:
# Variant of the input annotations produced by passing one shared full name
# to the project helper.
new_name = "John Smith"
gva_samename_annotations = '%s/annotation/participants_samename.p' % input_data_folder

utils.create_ambiguous_data(
    input_participants_file,
    gva_samename_annotations,
    new_name=new_name,
)

### 3. Gather all documents in JSON format from the violent corpus

In [None]:
# Root folder of the source corpus; the export loop globs '<root>/*/*.json'.
# NOTE(review): this relative path climbs four directory levels from the
# working directory, so it only resolves in a matching checkout layout.
corpus_location='../../../../SemEval/LongTailQATask/EventRegistries/GunViolenceArchive/the_violent_corpus'

In [None]:
# Only documents that received at least one participant entry are exported.
documents_to_store=set(participant_data_per_document.keys())

In [None]:
# Export every corpus file whose document id is referenced by the participant
# annotations as a small JSON file (title, content, dct) under
# '<input_data_folder>/text/'. Each document id is exported at most once, even
# if it occurs in several corpus sub-folders.

# Create the target folder up front so a fresh checkout does not fail on the
# first write.
os.makedirs('%s/text' % input_data_folder, exist_ok=True)

found_documents=set()
for fn in glob.glob('%s/*/*.json' % corpus_location):
    # glob returns paths with the platform's separator, so use os.path.basename
    # rather than splitting on '/' (which keeps directories on Windows).
    file_basename=os.path.basename(fn)
    # The document id is everything before the first dot of the file name.
    doc_name=file_basename.split('.')[0]
    if doc_name not in documents_to_store or doc_name in found_documents:
        continue

    # NOTE(security): despite the .json extension these files are read with
    # pickle, which can execute arbitrary code — only run on a trusted corpus.
    with open(fn, 'rb') as pickfile:
        a_news_item=pickle.load(pickfile)
    new_json={'title': a_news_item.title,
              'content': a_news_item.content,
              'dct': a_news_item.dct.strftime("%Y-%m-%d")}
    with open('%s/text/%s' % (input_data_folder, file_basename), 'w') as target_json:
        json.dump(new_json, target_json)

    found_documents.add(doc_name)
    print('copied %s' % fn)

In [None]:
# Number of documents exported; compare with len(documents_to_store) to spot
# documents that were not found in the corpus.
len(found_documents)