In [60]:
import json
from pathlib import Path
from collections import defaultdict, Counter

In [65]:
def load_alias(path='../../resources/aliasCache'):
    """Load alias groups from a ``###``-separated cache file.

    Every line of the file is split on ``###``; empty fields are dropped and
    lines that yield fewer than two names are ignored.

    Args:
        path: Location of the alias cache file. Defaults to the path this
            notebook has always read, so ``load_alias()`` keeps working.

    Returns:
        A ``defaultdict(list)`` mapping each name to the concatenation of
        every alias group it appears in. A group includes the name itself and
        no de-duplication is performed.
    """
    alias_dict = defaultdict(list)
    with open(path) as f:
        # Stream the file instead of materialising every line up front.
        for line in f:
            group = [name for name in line.strip().split("###") if name]
            if len(group) > 1:
                for name in group:
                    alias_dict[name].extend(group)
    return alias_dict

In [176]:
def load_annot(fname):
    """Read a JSON-lines annotation file into a dict keyed by document id.

    Args:
        fname: Path of a file holding one JSON object per line. Each object
            must have a ``docId`` key.

    Returns:
        dict mapping each ``docId`` to the rest of its JSON object (the
        ``docId`` key itself is removed from the value). If a ``docId`` occurs
        more than once, the last line wins.

    Raises:
        KeyError: if a record has no ``docId``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    elem_dict = {}
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) would
                # make json.loads fail; they carry no record, so skip them.
                continue
            elem = json.loads(line)
            elem_dict[elem.pop('docId')] = elem
    return elem_dict
def load_doc(dirname, annot_ids):
    """Load ``<dirname>/<annot_id>.json`` for every requested document id.

    Args:
        dirname: Directory holding the per-document JSON files. Works with or
            without a trailing path separator, and may be a ``str`` or a
            ``pathlib.Path``.
        annot_ids: Iterable of document ids (strings). An id may contain path
            separators, in which case it addresses a file in a sub-directory.

    Returns:
        dict mapping each id to the parsed content of its JSON file.

    Raises:
        FileNotFoundError: if a document file does not exist.
    """
    base_dir = Path(dirname)
    doc_dict = {}
    for annot_id in annot_ids:
        # Joining through Path removes the silent requirement that `dirname`
        # ends with a separator (plain string concatenation needed it).
        with (base_dir / (annot_id + ".json")).open() as f:
            doc_dict[annot_id] = json.load(f)
    return doc_dict

In [229]:
def _is_surname_variant(name, other):
    """Return True when both names end in the same token but differ in token count."""
    name_tokens, other_tokens = name.split(), other.split()
    if not name_tokens or not other_tokens:
        # A blank name has no last token to compare; never merge it.
        return False
    return name_tokens[-1] == other_tokens[-1] and len(name_tokens) != len(other_tokens)


def find_occurences(named_entity_list, cluster_json, alias_dict):
    """Collect the mention locations of every named entity in one document.

    Entities are visited from most to least frequent. An entity is folded into
    an already-recorded entity when

    * the two share at least one alias in ``alias_dict``, or
    * (only if the entity shares no alias with any other entity) both have
      majority type ``'PERSON'``, end in the same token and have a different
      number of tokens.

    Otherwise the entity gets its own entry. Afterwards each coreference
    cluster is attached to the first recorded entity that has a location in
    common with one of the cluster's mentions.

    Args:
        named_entity_list: dict mapping an entity name to its occurrences.
            For each occurrence ``occ``, ``occ[1]`` is used as the type label
            and ``(occ[0], occ[2], occ[3])`` as its location.
        cluster_json: iterable of clusters; a cluster is a list of mention
            dicts with ``sent_ind``, ``token_ind`` and ``end_ind`` keys. A
            mention's location is ``(sent_ind, token_ind, end_ind - 1)``,
            compared directly against the entity locations above.
        alias_dict: mapping from a name to a list of its aliases (read with
            ``.get`` only, so it is never mutated).

    Returns:
        ``(ne_total, alter_names)`` where ``ne_total`` is a
        ``defaultdict(list)`` from recorded entity name to all its locations
        (named-entity occurrences followed by coreference mentions), and
        ``alter_names`` is a ``defaultdict(set)`` linking names that were
        merged with each other.

    Note:
        Relies on the module-level ``most_common`` helper being defined
        before this function is called.
    """
    def mention_loc(mention):
        return (mention['sent_ind'], mention['token_ind'], mention['end_ind'] - 1)

    ne_per_locations = defaultdict(list)
    ne_per_clusters = defaultdict(list)
    alter_names = defaultdict(set)

    # Majority type label of each entity.
    ne_type = {ne: most_common([occ[1] for occ in occurences])
               for ne, occurences in named_entity_list.items()}
    # Built once; the alias lists do not change while entities are merged.
    alias_sets = {ne: set(alias_dict.get(ne, [])) for ne in named_entity_list}

    by_frequency = sorted(named_entity_list, key=lambda k: len(named_entity_list[k]), reverse=True)
    for ne in by_frequency:
        locations = [(occ[0], occ[2], occ[3]) for occ in named_entity_list[ne]]

        # Alias sharing takes precedence; surname matching is only tried for
        # PERSON entities that share no alias with any other entity.
        merge_candidates = [other for other in named_entity_list
                            if other != ne and alias_sets[other] & alias_sets[ne]]
        if not merge_candidates and ne_type[ne] == 'PERSON':
            merge_candidates = [other for other in named_entity_list
                                if other != ne
                                and ne_type[other] == 'PERSON'
                                and _is_surname_variant(ne, other)]

        recorded = False
        for other in merge_candidates:
            # Only fold into entities that already own an entry, i.e. ones
            # that were processed earlier (more frequent).
            if other in ne_per_locations:
                alter_names[other].add(ne)
                alter_names[ne].add(other)
                ne_per_locations[other].extend(locations)
                recorded = True
        if not recorded:
            ne_per_locations[ne].extend(locations)

    # Add mentions from coref clusters. A cluster is given to at most one
    # entity: the first recorded entity overlapping one of its mentions.
    cluster_used = []
    for cluster in cluster_json:
        if cluster in cluster_used:
            continue
        cluster_mention_loc = {mention_loc(mention) for mention in cluster}
        for ne, ne_locs in ne_per_locations.items():
            if not cluster_mention_loc.isdisjoint(ne_locs):
                # Accumulate: an entity may be covered by several clusters.
                # Plain assignment kept only the last one while still marking
                # the earlier ones as used, silently dropping their mentions.
                ne_per_clusters[ne].extend(cluster)
                cluster_used.append(cluster)
                break

    ne_total = defaultdict(list)
    for ne, ne_locs in ne_per_locations.items():
        ne_total[ne].extend(ne_locs)
        ne_total[ne].extend(mention_loc(mention) for mention in ne_per_clusters.get(ne, []))
    return ne_total, alter_names

In [253]:
# Alias table first: it does not depend on the annotation or document files.
alias_dict = load_alias()
# Gold annotations keyed by document id, then the parsed documents for
# exactly those ids.
annot_dict = load_annot(fname='../annotation/acl_mpqa_eval.json')
doc_dict = load_doc(dirname='../doc/mpqa_data/', annot_ids=annot_dict.keys())
def most_common(lst):
    """Return the most frequent element of ``lst``.

    Ties are resolved in favour of the element that appears first in ``lst``,
    so the result is reproducible across runs (iterating a ``set`` of strings
    is not, because string hashing is randomised per process).

    Args:
        lst: non-empty sequence of hashable items.

    Returns:
        The item with the highest count.

    Raises:
        ValueError: if ``lst`` is empty.
    """
    if not lst:
        raise ValueError("most_common() arg is an empty sequence")
    # Single counting pass instead of calling lst.count once per distinct item.
    return Counter(lst).most_common(1)[0][0]

In [254]:
def map_sent(sent, strength):
    """Translate an annotated sentiment value into an output label.

    ``'NotNegative'`` becomes ``'Positive'`` and ``'NotPositive'`` becomes
    ``'Negative'``; any other value yields ``None``. ``strength`` is accepted
    but not consulted.
    """
    for annotated, label in (('NotNegative', 'Positive'), ('NotPositive', 'Negative')):
        if sent == annotated:
            return label
    return None
# Build the holder/target pair dataset: for every document, emit one JSON line
# per ordered pair of distinct recorded entities to ./mpqa_new.json.
#
# Running totals across all documents:
#   total_pair    - incremented for every (ent1, ent2) combination, including
#                   ent1 == ent2 (the increment happens before the `continue`).
#   label_count   - number of annotated (holder, target) pairs with holder != target.
#   orig_ctn      - distribution of the mapped labels of those annotated pairs.
#   label_counter - created here but never updated in this block.
total_pair=0
label_count=0
orig_ctn = Counter()
label_counter = Counter()
with open('./mpqa_new.json', 'w') as fout:
    for doc_id, annot_elem in annot_dict.items():
        doc_elem = doc_dict[doc_id]
        output_list = []  # not used in this block
        ne_loc_dict, alter_names = find_occurences(doc_elem['named_entity'], doc_elem['cluster_json'], alias_dict)
        doc_level_ne_loc = {}
        # Gold labels keyed by (holder, target); self-pairs are dropped. If the
        # same pair is annotated more than once, the last annotation wins.
        # map_sent returns None for any sentiment other than
        # 'NotNegative' / 'NotPositive', so such pairs carry a None label.
        sent_dict = {(d['holder'], d['target']): map_sent(d['sentiment'], d['strength']) for d in annot_elem['sentiments'] if not d['holder']==d['target']}
        orig_ctn.update(sent_dict.values())
        label_count += len(sent_dict.keys())
        exist = 0  # not used in this block
        # doc_elem['text'] is iterated as a list of per-sentence token lists.
        total_tokens = [item for sublist in doc_elem['text'] for item in sublist]
        per_sent_token = [len(sublist) for sublist in (doc_elem['text'])]
        # Convert (sentence, start, end) locations into offsets into total_tokens.
        for ne, ne_loc_list in ne_loc_dict.items():
            new_ne_locs = set([])  # set: duplicate locations collapse
            for sent_ind, start_ind, end_ind in ne_loc_list:
                # NOTE(review): the "-1" on sent_ind, start_ind and end_ind
                # presumably converts 1-based indices to 0-based ones - confirm
                # against the document format. With sent_ind == 0 the slice
                # [:-1] would sum every sentence but the last.
                prev_sent_token_sum = sum(per_sent_token[:sent_ind-1])
                new_ne_locs.add((prev_sent_token_sum+start_ind-1, prev_sent_token_sum+end_ind-1))
            doc_level_ne_loc[ne] = list(new_ne_locs)
        for ent1, ent1_loc in ne_loc_dict.items():
            for ent2, ent2_loc in ne_loc_dict.items():
                label = 'Null'  # default when no annotation matches the pair
                total_pair+=1
                if ent1 == ent2:
                    continue
                # Annotations are pop()ed once matched, so each gold pair labels
                # at most one emitted pair; statement order below decides which.
                if (ent1, ent2) in sent_dict:
                    label = sent_dict.pop((ent1, ent2))
                else:
                    # No exact match: try the alternative names recorded by
                    # find_occurences for either side. Several keys may match;
                    # each match is consumed and the last one sets the label.
                    for ent1_alt_name in alter_names.get(ent1, []):
                        if (ent1_alt_name, ent2) in sent_dict:
                            label = sent_dict.pop((ent1_alt_name, ent2))
                        for ent2_alt_name in alter_names.get(ent2, []):
                            if (ent1, ent2_alt_name) in sent_dict:
                                label = sent_dict.pop((ent1, ent2_alt_name))
                            if (ent1_alt_name, ent2_alt_name) in sent_dict:
                                label = sent_dict.pop((ent1_alt_name, ent2_alt_name))
                    # Also covers the case where ent1 has no alternative names,
                    # in which the nested loop above never runs.
                    for ent2_alt_name in alter_names.get(ent2, []):
                        if (ent1, ent2_alt_name) in sent_dict:
                            label = sent_dict.pop((ent1, ent2_alt_name))
                # Sentence indices shared by both entities; computed but not
                # used in this block.
                ent1_sent = [k[0] for k in ent1_loc]
                ent2_sent = [k[0] for k in ent2_loc]
                joint_sent = set(ent1_sent).intersection(set(ent2_sent))
                # One JSON object per line; 'token' repeats the document's full
                # token list in every record.
                single_pair = {'holder':ent1, 'target':ent2, 'docid':doc_id,
                               'label': label, 'token':total_tokens, 
                               'holder_index':doc_level_ne_loc[ent1], 'target_index': doc_level_ne_loc[ent2]}
                fout.write(json.dumps(single_pair) + "\n")