In [28]:
import pickle
import glob
from collections import Counter, defaultdict
import networkx as nx
import numpy as np

In [29]:
def inspect(data, with_types=False, graph=None):
    num_mentions=0
    identities=set()
    occurence_types=[]
    instance_types={}
    degrees_per_type=defaultdict(list)
    
    max_degree=0
    max_degree_node=None
    
    print(len(data), 'news documents')
    for news_item_obj in data:
        for m in news_item_obj.sys_entity_mentions:
            identities.add(m.identity)
            if with_types:
                a_type=m.the_type
                occurence_types.append(a_type)
                instance_types[m.identity]=a_type
        num_mentions+=len(news_item_obj.sys_entity_mentions)
        
    for identity in instance_types.keys():
        degree=len(graph.adj[identity])
        degrees_per_type[identity[-3:]].append(degree)
        if degree>max_degree:
            max_degree=degree
            max_degree_node=identity
        
    print('Num mentions', num_mentions)
    print('Num identities', len(identities))
    print('Type distribution of occurrences', Counter(occurence_types))
    print('Type distribution of aggregated instances', Counter(instance_types.values()))
    for k,v in degrees_per_type.items():
        print(k, round(np.mean(v),1), '/', round(np.std(v),1))
    print('Max degree node', max_degree_node, max_degree)

In [31]:
input_dir='bin'
if __name__ == "__main__":
    for file in glob.glob('%s/*.pkl' % input_dir):
        print(file)
        with open(file, 'rb') as f:
            data=pickle.load(f)
        with_types='type' in file
        g=None
        if with_types:
            graph_file=file.replace('.pkl', '.graph')
            g=nx.read_gpickle(graph_file)
        inspect(data, with_types, g)
        print()

bin/mention_docid_type_graph.pkl
3925 news documents
Num mentions 66728
Num identities 49686
Type distribution of occurrences Counter({'LOC': 25890, 'MISC': 16203, 'PER': 15612, 'ORG': 9023})
Type distribution of aggregated instances Counter({'LOC': 18099, 'MISC': 12833, 'PER': 11778, 'ORG': 6976})
LOC 16.2 / 11.5
ISC 16.1 / 11.8
ORG 18.7 / 14.1
PER 17.8 / 12.5
Max degree node http://cltl.nl/entity#Nederland2795LOC 107

bin/mention__graph.pkl
3925 news documents
Num mentions 66728
Num identities 21034
Type distribution of occurrences Counter()
Type distribution of aggregated instances Counter()
Max degree node None 0

bin/documents.pkl
3925 news documents
Num mentions 66728
Num identities 1
Type distribution of occurrences Counter()
Type distribution of aggregated instances Counter()
Max degree node None 0

bin/mention_type_graph.pkl
3925 news documents
Num mentions 66728
Num identities 22641
Type distribution of occurrences Counter({'LOC': 25890, 'MISC': 16203, 'PER': 15612, 'ORG': 90