In [1]:
import pickle
import utils
import json

In [2]:
# Auxiliary inputs read by this notebook.
aux_path = '../data/tmp/auxiliary_data'
all_docs_file = '{}/docs.conll'.format(aux_path)         # initial documents (CoNLL)
more_docs_file = '{}/new.conll'.format(aux_path)         # added documents (CoNLL)
relevant_docs_file = '{}/final_docs.p'.format(aux_path)  # pickled incident -> document IDs

# Root of the partition's input data, and where the filtered documents are stored.
input_path = '../data/input/partial'
out_pickle = '{}/text/docs.p'.format(input_path)

### 1) Get and store texts to disk

In [3]:
# Read the two CoNLL document collections: the initial one and the additions.
all_docs_data = utils.load_conll_data(all_docs_file)
more_docs_data = utils.load_conll_data(more_docs_file)

# Read the pickled mapping from incidents to their documents.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open(relevant_docs_file, 'rb') as incdocs_in:
    relevant_incdocs = pickle.load(incdocs_in)
    


In [4]:
# Fold the added documents into the initial collection. A document ID that is
# already present keeps its initial data.
for added_id, added_data in more_docs_data.items():
    if added_id in all_docs_data:
        continue
    all_docs_data[added_id] = added_data

In [5]:
# Union of the document IDs listed under every incident.
relevant_docs = set().union(*relevant_incdocs.values())

print(len(relevant_docs))

456


In [6]:
# Restrict the merged collection to the relevant documents.
docs_data = {
    doc_id: doc_data
    for doc_id, doc_data in all_docs_data.items()
    if doc_id in relevant_docs
}

len(docs_data)

456

In [7]:
# Persist the filtered documents as a binary pickle.
with open(out_pickle, 'wb') as docs_out:
    pickle.dump(docs_data, docs_out)

### 2) Increase ambiguity by giving everyone the same first name, last name, or full name

In [8]:
# Participant annotations used as the starting point for the ambiguity variants.
gva_input_annotations_file = '{}/annotation/participants_input.p'.format(input_path)

#### 2a) Increase ambiguity by giving everyone the same first name

In [9]:
# Variant in which every participant gets the same first name.
# NOTE(review): presumably create_ambiguous_data reads the first file and writes
# the modified annotations to the second - confirm in utils.
new_first_name = "John"
gva_samefirstname_annotations = '{}/annotation/participants_samefirstname.p'.format(input_path)

utils.create_ambiguous_data(
    gva_input_annotations_file,
    gva_samefirstname_annotations,
    new_firstname=new_first_name,
)

#### 2b) Increase ambiguity by giving everyone the same last name

In [10]:
# Variant in which every participant gets the same last name.
# NOTE(review): presumably create_ambiguous_data reads the first file and writes
# the modified annotations to the second - confirm in utils.
new_last_name = "Smith"
gva_samelastname_annotations = '{}/annotation/participants_samelastname.p'.format(input_path)

utils.create_ambiguous_data(
    gva_input_annotations_file,
    gva_samelastname_annotations,
    new_lastname=new_last_name,
)

#### 2c) Increase ambiguity by giving everyone the same full name

In [11]:
# Variant in which every participant gets the same full name.
# NOTE(review): presumably create_ambiguous_data reads the first file and writes
# the modified annotations to the second - confirm in utils.
new_name = "John Smith"
gva_samename_annotations = '{}/annotation/participants_samename.p'.format(input_path)

utils.create_ambiguous_data(
    gva_input_annotations_file,
    gva_samename_annotations,
    new_name=new_name,
)

### 3) Transform gold data to JSON with cluster IDs

#### 3a) Collect the participants that have no name

In [12]:
def get_empty_names(gva_input_annotations_file):
    """Collect the IDs of participants that have no usable name.

    The annotations are loaded with ``utils.load_pickle`` and walked as
    document ID -> participant ID -> participant record. A participant counts
    as nameless when its record has no ``'Name'`` key or the stored value is
    falsy. Every nameless participant is printed as ``doc_id part_id``.

    Args:
        gva_input_annotations_file: Path handed to ``utils.load_pickle``.

    Returns:
        set: Participant IDs without a name (document IDs are not retained).
    """
    annotations = utils.load_pickle(gva_input_annotations_file)

    nameless = set()
    for doc_id, participants in annotations.items():
        for part_id, record in participants.items():
            # A missing key and an empty value are treated the same way.
            if record.get('Name'):
                continue
            print(doc_id, part_id)
            nameless.add(part_id)
    return nameless

In [13]:
# IDs of the participants without a name; each hit is printed as "doc_id part_id".
empty_names = get_empty_names(gva_input_annotations_file)

5023fd87fcffa600ce5306da1ae7aa19 b848071b112108badfccd731b49f7a8d
9897ff64ff1c41541dd9c4bdb3e2026b 730dadb6ff8df098d1855c483ae89f00
9897ff64ff1c41541dd9c4bdb3e2026b 0964adc35543d31edbbd3f49fe58b387
9897ff64ff1c41541dd9c4bdb3e2026b 2a26c39d7255166c951029be92833331
9897ff64ff1c41541dd9c4bdb3e2026b 57b7ef5ea3fd474db48a2d2df6f2792a
329f03de6e9aa5176d5ede16b043f84f bf9f9baa899f624090b6ee4f3e458fe3
329f03de6e9aa5176d5ede16b043f84f 3139c57cba423dc96ed5e7bc05c3d421
329f03de6e9aa5176d5ede16b043f84f 2b0f245a7a75812f263d21cab4075462
329f03de6e9aa5176d5ede16b043f84f 1b8b34f4b63938b3ad105979d73b59a0


In [14]:
# Gold annotations of the chosen partition, and the path for their JSON version.
which_partition = 'partial'
gold_file = '../data/gold/{}/participants_gold.p'.format(which_partition)
out_gold_file = '../data/gold/{}/participants.json'.format(which_partition)

# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open(gold_file, 'rb') as gold_in:
    gold_data = pickle.load(gold_in)

In [15]:
# Convert the gold data for JSON output, handing over the nameless participant
# IDs together with skip_empty=True (presumably so those participants are left
# out - confirm in utils.transform_gold_to_json).
gold_json = utils.transform_gold_to_json(
    gold_data,
    skip_empty=True,
    empty_names=empty_names,
)

In [16]:
# Write the converted gold data to disk as JSON.
with open(out_gold_file, 'w') as json_out:
    json.dump(gold_json, json_out)