In [1]:
import glob
import csv
from collections import defaultdict, Counter

In [2]:
# PICK A YEAR

year='2015_train'
#year='2015_test'
#year='2016'
#year='2017'

### 1. Analysis of the gold data

In [3]:
edl_data_dir='data'
analysis_dir='analysis'

In [4]:
edl2016_file='%s/edl2016_gold_data/data/tac_kbp_2016_edl_evaluation_gold_standard_entity_mentions.tab' % edl_data_dir
edl2015_test='%s/edl2015_testing_data/data/tac_kbp_2015_tedl_evaluation_gold_standard_entity_mentions.tab' % edl_data_dir
edl2015_train='%s/edl2015_training_data_2.0/data/tac_kbp_2015_tedl_training_gold_standard_entity_mentions.tab' % edl_data_dir
edl2017_file='%s/LDC2017E25_TAC_KBP_2017_Evaluation_Source_Corpus/'

#### 1a. Investigation of variance

In [5]:
def is_nil(link):
    return link.startswith('NIL')

In [6]:
def extract_row_info(row):
    mid=row[1]
    form=row[2]
    
    doc_id=row[3].split(':')[0]
    collection=doc_id[:3]
    
    link=row[4]
    etype=row[5]
    mtype=row[6]
    
    return mid, form, doc_id, collection, link, etype, mtype

In [7]:
def analyze_file(rows):

    cnt_nils=0
    collections=defaultdict(int)
    nom_only=0
    nam_only=0
    
    cluster_nam_nom=defaultdict(list)
    
    nil_clusters=set()
    clusters_with_mentions=defaultdict(list)
    clusters_with_docs=defaultdict(list)
    form_to_meaning=defaultdict(set)

    for row in rows:

        mid, form, doc_id, collection, link, etype, mtype = extract_row_info(row)

        if collection not in {'ENG', 'NYT'}:
            continue
        form_to_meaning[form].add(link)
        collections[collection]+=1
        if is_nil(link) and etype=='PER':
            cnt_nils+=1
            nil_clusters.add(link)
            
            clusters_with_mentions[link].append(form)
            clusters_with_docs[link].append(doc_id)
            
            cluster_nam_nom[link].append(mtype)
            
    print('Count of docs per collection', collections)
    print('# Mentions of NILs = ', cnt_nils)
    print('# NIL clusters', len(nil_clusters))
    
    for link, nam_nom in cluster_nam_nom.items():
        if 'NAM' not in nam_nom:
            nom_only+=1
        elif 'NOM' not in nam_nom:
            nam_only+=1

    print('NOM only', nom_only)
    print('NAM only', nam_only)
        
    return nil_clusters, clusters_with_mentions, clusters_with_docs, form_to_meaning

In [8]:
def aggregate_per_cluster_size(clusters):
    agg_per_cluster_size=defaultdict(int)

    for l, ids in clusters.items():
        agg_per_cluster_size[len(ids)]+=1
    
    return agg_per_cluster_size

In [9]:
def print_largest_clusters(clusters, deduplicate=False, how_many=5):
    counts={}
    for link, ids in clusters.items():
        if deduplicate:
            ids=set(ids)
        counts[link]=len(ids)
    
    print('Deduplicated forms', deduplicate)
    for link, count in Counter(counts).most_common(2):
        print(link, count, set(clusters[link]))
    return

In [10]:
for file_to_open in [edl2015_train, edl2015_test, edl2016_file]:
    print(file_to_open)
    with open(file_to_open, 'r') as csvfile:
        rdrr = csv.reader(csvfile, delimiter='\t')

        nil_clusters, clusters_with_mentions, clusters_with_docs, form_to_meaning = analyze_file(rdrr)

        agg_per_cluster_size_men = aggregate_per_cluster_size(clusters_with_mentions)
        agg_per_cluster_size_doc = aggregate_per_cluster_size(clusters_with_docs)

        for dedup in [True, False]:
            print_largest_clusters(clusters_with_mentions, deduplicate=dedup)
    print()

data/edl2015_training_data_2.0/data/tac_kbp_2015_tedl_training_gold_standard_entity_mentions.tab
Count of docs per collection defaultdict(<class 'int'>, {'ENG': 13545})
# Mentions of NILs =  3574
# NIL clusters 1800
NOM only 140
NAM only 1524
Deduplicated forms True
NIL03593 31 {'shit', 'vet', 'veteran', 'guy', 'victim', 'neighbor', 'killer', 'Eddie Routh', 'Muslim', 'child', 'Sympathizer', 'dude', 'Eddie Ray Routh', 'user', 'performer', 'guard', 'son', 'friend', 'asshole', 'doper', 'Dude', 'creep', 'Killer', 'sufferer', 'POS', 'Eddie', 'cretin', 'Routh', 'individual', 'Schizophrenic', 'man'}
NIL00087 12 {'Gamadhere', 'terrorist', 'leader', 'mastermind', 'Mohamed Mohamud', 'Mohamed Kuno', 'Kuno', 'Dulyadayna', 'Mohamud', 'man', 'teacher', 'Dulyadin Gamadhere'}
Deduplicated forms False
NIL03593 125 {'shit', 'vet', 'veteran', 'guy', 'victim', 'neighbor', 'killer', 'Eddie Routh', 'Muslim', 'child', 'Sympathizer', 'dude', 'Eddie Ray Routh', 'user', 'performer', 'guard', 'son', 'friend', 'a

#### 1b. Investigate ambiguity

In [12]:

for year in ['2015_train', '2015_test', '2016']:

    if year=='2015_train':
        file_to_open=edl2015_train
    elif year=='2015_test':
        file_to_open=edl2015_test
    else: # 2016
        file_to_open=edl2016_file
    
    print(file_to_open)
    with open(file_to_open, 'r') as csvfile:
        rdrr = csv.reader(csvfile, delimiter='\t')

        nil_clusters, clusters_with_mentions, clusters_with_docs, form_to_meaning = analyze_file(rdrr)
    
    
        ambiguous_all_file='%s/%s_ambiguous_all.tsv' % (analysis_dir, year)
        ambiguous_nils_file='%s/%s_ambiguous_nils.tsv' % (analysis_dir, year)
        ambiguous_nonnils_file='%s/%s_ambiguous_nonnils.tsv' % (analysis_dir, year)
        ambiguous_both_file='%s/%s_ambiguous_both.tsv' % (analysis_dir, year)

        c=0
        with open(ambiguous_all_file, 'w') as w:
            with open(ambiguous_nils_file, 'w') as w_n:
                with open(ambiguous_nonnils_file, 'w') as w_nn:
                    with open(ambiguous_both_file, 'w') as w_both:
                        w.write('\t'.join(['FORM','TOTAL', 'NILs', 'non-NILs']))
                        for form, meanings in form_to_meaning.items():
                            if len(meanings)>1:
                                nils=0
                                non_nils=0
                                for m in meanings:
                                    if is_nil(m):
                                        nils+=1
                                    else:
                                        non_nils+=1
                                print(form, len(meanings), 'total', nils, 'NILs', non_nils, 'non-NILs')   
                                c+=1
                                row='%s\t%d\t%d\t%d\n' % (form, len(meanings), nils, non_nils)
                                w.write(row)
                                if nils==0: # only non_nils
                                    w_nn.write(row)
                                elif non_nils==0: # only nils
                                    w_n.write(row)
                                else: # there are both
                                    w_both.write(row)

data/edl2015_training_data_2.0/data/tac_kbp_2015_tedl_training_gold_standard_entity_mentions.tab
Count of docs per collection defaultdict(<class 'int'>, {'ENG': 13545})
# Mentions of NILs =  3574
# NIL clusters 1800
NOM only 140
NAM only 1524
Whole Foods 2 total 1 NILs 1 non-NILs
Natanz 2 total 1 NILs 1 non-NILs
SE 2 total 1 NILs 1 non-NILs
Constantia Kloof 2 total 1 NILs 1 non-NILs
US District Court 2 total 1 NILs 1 non-NILs
correspondent 3 total 3 NILs 0 non-NILs
friend 14 total 11 NILs 3 non-NILs
director 4 total 2 NILs 2 non-NILs
member 5 total 3 NILs 2 non-NILs
student 3 total 3 NILs 0 non-NILs
teacher 2 total 2 NILs 0 non-NILs
one 14 total 9 NILs 5 non-NILs
spokesman 8 total 8 NILs 0 non-NILs
Warren 2 total 1 NILs 1 non-NILs
prosecutor 3 total 3 NILs 0 non-NILs
Weinreb 2 total 2 NILs 0 non-NILs
agent 2 total 2 NILs 0 non-NILs
Miriam Conrad 2 total 2 NILs 0 non-NILs
author 5 total 3 NILs 2 non-NILs
baby 5 total 2 NILs 3 non-NILs
son 11 total 8 NILs 3 non-NILs
child 7 total 4 NILs 

manager 6 total 6 NILs 0 non-NILs
cop 2 total 2 NILs 0 non-NILs
Dylann 2 total 2 NILs 0 non-NILs
boss 4 total 1 NILs 3 non-NILs
employee 4 total 4 NILs 0 non-NILs
socialist 2 total 2 NILs 0 non-NILs
lobbyist 2 total 2 NILs 0 non-NILs
examiner 3 total 3 NILs 0 non-NILs
donor 2 total 2 NILs 0 non-NILs
Georgia 3 total 1 NILs 2 non-NILs
clerk 2 total 2 NILs 0 non-NILs
shit 2 total 2 NILs 0 non-NILs
Sheriff 3 total 3 NILs 0 non-NILs
CEO 4 total 2 NILs 2 non-NILs
eu 2 total 0 NILs 2 non-NILs
Attorney General 5 total 3 NILs 2 non-NILs
vice president 4 total 3 NILs 1 non-NILs
Wall Street 3 total 0 NILs 3 non-NILs
child 2 total 2 NILs 0 non-NILs
Director 4 total 3 NILs 1 non-NILs
sister 5 total 5 NILs 0 non-NILs
mom 3 total 3 NILs 0 non-NILs
mother 13 total 12 NILs 1 non-NILs
Freddie Gray 2 total 1 NILs 1 non-NILs
Mosby 2 total 2 NILs 0 non-NILs
Secretary 2 total 2 NILs 0 non-NILs
prosecutor 2 total 1 NILs 1 non-NILs
Master_Live 2 total 2 NILs 0 non-NILs
RadecSupreme 2 total 2 NILs 0 non-NILs
w

father 4 total 0 NILs 4 non-NILs
side 3 total 1 NILs 2 non-NILs
coast 3 total 3 NILs 0 non-NILs
bank 7 total 3 NILs 4 non-NILs
founder 3 total 0 NILs 3 non-NILs
apple 2 total 0 NILs 2 non-NILs
premises 2 total 2 NILs 0 non-NILs
Terminal F 2 total 2 NILs 0 non-NILs
Zorlu Center 5 total 5 NILs 0 non-NILs
structure 2 total 2 NILs 0 non-NILs
manufacturer 3 total 0 NILs 3 non-NILs
official 14 total 10 NILs 4 non-NILs
terrorist 4 total 0 NILs 4 non-NILs
monitor 2 total 1 NILs 1 non-NILs
National 7 total 0 NILs 7 non-NILs
friend 8 total 7 NILs 1 non-NILs
coastal 2 total 2 NILs 0 non-NILs
Zhao Jiemin 4 total 4 NILs 0 non-NILs
firm 2 total 0 NILs 2 non-NILs
member 13 total 5 NILs 8 non-NILs
Samar 2 total 0 NILs 2 non-NILs
border 5 total 3 NILs 2 non-NILs
dictator 2 total 0 NILs 2 non-NILs
spokesman 9 total 5 NILs 4 non-NILs
head 10 total 3 NILs 7 non-NILs
alset 2 total 2 NILs 0 non-NILs
lawyer 8 total 6 NILs 2 non-NILs
prison 4 total 4 NILs 0 non-NILs
neighbor 2 total 0 NILs 2 non-NILs
judge 3 