### Installations, if needed

In [None]:
# Install the right version of spaCy
!pip install spacy==2.0.12 # Above 2.0.12 doesn't seem work with the neuralcoref resolution (at least 2.0.13 and 2.0.16 don't)

In [None]:
# Install the large Neural Coref model
!pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_lg-3.0.0/en_coref_lg-3.0.0.tar.gz # This is the coref language model

In [None]:
# Install dill which we're using for pickling our Named Entity Type (NET) graphs
!pip install dill

### Importing Libraries

In [None]:
import spacy
import en_coref_lg
from spacy import displacy
import pandas as pd
import numpy as np
import networkx as nx
import time
import os
import re
import dill as pickle
from collections import defaultdict

### Skip: Loading and previewing our export from OntoNotes5

In [2]:
JSON_FILENAME = 'ner_output_1.json'
FILEPATH_TO_JSON = "onto_sql_output/"

onto_import = pd.read_json(FILEPATH_TO_JSON + JSON_FILENAME)

In [None]:
onto_import.head()

### Load the Large Neural Coref spaCy Model

In [7]:
%time
nlp = en_coref_lg.load()

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.06 µs


#### Skip: Using it to loop through dependency parsing for a selection of sentences

In [8]:
# Establish which sentence rows we want to work through right now
SENT_MIN = 0
SENT_MAX = 500

# Check the time and start parsing via spaCy
start_time = time.time()
onto_import["spacy_parse"] = onto_import.loc[SENT_MIN:SENT_MAX,:].apply(lambda x: nlp(x["sentence_string"]), axis=1)

# Calculate the duration 
duration = time.time() - start_time
print("Applying the spaCy pipeline took {0:.2f} seconds".format(duration))

Applying the spaCy pipeline took 13.52 seconds


### Skip: Viewing some example text, with highlighted named entities

In [None]:
# Choose an entry integer to see its text and the parse below.
ENTRY = 490

displacy.render(onto_import.loc[ENTRY,"spacy_parse"], jupyter=True, style='ent')

#### Skip: Viewing dependencies

In [None]:
displacy.render(onto_import.loc[ENTRY,"spacy_parse"], jupyter=True, style='dep')

#### Skip: Identifying some useful attributes of the spaCy tokens

In [None]:
# This gives us the dependency type

onto_import.loc[490,"spacy_parse"][13].head

In [None]:
# This gives us the token of its head, from which we can call other attributes of the head.

onto_import.loc[490,"spacy_parse"][13].head.i

In [None]:
# If we just want the string of the head, that's here:

onto_import.loc[0,"spacy_parse"][32].head.text

In [None]:
# If we want the index of the head within the sentence (to find when a multi-word NE
# depends on something outside of that NE phrase)

onto_import.loc[0,"spacy_parse"][32].head.i

### Skip: Creating a Dictionary with a Graph for Each NER Type

#### Skip: Establish dictionary to hold graphs and define a function to graph a row 

In [9]:
graphs_dict = dict()

def graph_row(df_row):
    
    NO_OF_GENERATIONS = 2
    
    root_node = str(df_row["ner_type"])

    # Retrieve our Directed Graph for this NE Type or create a new one
    G = graphs_dict.get(root_node, nx.DiGraph())
    
    # For each row, add a node for the Named Entity's type
    G.add_node(root_node)
    node_weight = G.nodes[root_node].get('weight', 0)
    G.node[root_node]['weight'] = node_weight + 1

    # Let's assume the head is the last word of the phrase
    # and get that Token from the spaCy parse:
    
    head_index = df_row["ner_end_word_index"]
    head_of_phrase = df_row["spacy_parse"][head_index]
    
    nodes_needing_head_branches = [(head_of_phrase, root_node)]
    next_head_nodes = []
    nodes_needing_child_branches = [(head_of_phrase, root_node)]
    next_child_nodes = []
    
    current_gen = 1
    
    while current_gen <= NO_OF_GENERATIONS:
        
        for (node, child_label) in nodes_needing_head_branches:
            try: 
                # Get the explanation of its dependency type in this usage
                relation = spacy.explain(node.dep_)

                # If no explanation, revert to the raw dependency type.
                if relation is None:
                    relation = node.dep_

                # Trying to catch and diagnose some problem cases
                elif relation == "punctuation":
                    print("NE '{1}' marked as punctuation in sentence '{0}'".format(df_row["sentence_string"], df_row["ner_string"]))
                    print(" --- ")
                elif relation == "determiner":
                    print("NE '{1}' marked as determiner in sentence '{0}'".format(df_row["sentence_string"], df_row["ner_string"]))
                    print(" --- ")

                # Object of preposition doesn't do much, so let's see what's on the other side of that.
                elif relation == "object of preposition":
                    relation = "head of prep phrase"
                    # move to the preposition so we get its head later on when adding node
                    node = node.head
                    
                print("relation: " + relation)
                
                if relation == 'ROOT':
                    continue
                
                # differentiating head-focused edges from child-focused edges
                intermediary_node_label = "head g{0} {1}".format(current_gen, relation)
                
                print("intermediary node label:" + intermediary_node_label)

                # Add a node for the relation, and connect that to the main entity.
                # Add the weights to the nodes and edges.
                G.add_node(intermediary_node_label)
                relation_weight = G.node[intermediary_node_label].get('weight', 0)
                G.node[intermediary_node_label]['weight'] = relation_weight + 1
                
                # Add a node from the dependency type to the entity's head and add weights
                norm = node.head.norm_
                G.add_node(norm)
                norm_edge_weight = G.node[norm].get('weight', 0)
                G.node[norm]['weight'] = norm_edge_weight + 1
                
                G.add_edge(child_label, intermediary_node_label, label="head")
                
                G.add_edge(intermediary_node_label, norm)
                
                # Add the next round for the next generation
                if node.head != node:
                    next_head_nodes.append((node.head, norm))
                
            except:
                print("passed in head")
                pass
            
            # Move the next round into the queue and clear it
            nodes_needing_head_branches = next_head_nodes
            next_head_nodes = []

            for (node, parent_label) in nodes_needing_child_branches:
                for child in node.children:

                    # Get the relation of the child node to this one.
                    relation = spacy.explain(child.dep_)

                    # If no explanation, revert to the raw dependency type.
                    if relation is None:
                        relation = child.dep_

                    if relation == 'punctuation':
                        continue

                    # Differentiate these relations from head relations
                    # and add the node and its weights
                    intermediary_node_label = "child g{0} {1}".format(current_gen, relation)
                    G.add_node(intermediary_node_label)
                    relation_weight = G.nodes[intermediary_node_label].get('weight', 0)
                    G.node[intermediary_node_label]['weight'] = relation_weight + 1
                    
                    # Add the child as normed, and add its edge and weights
                    child_norm = child.norm_
                    G.add_node(child_norm)
                    leaf_weight = G.node[child_norm].get('weight', 0)
                    G.node[child_norm]['weight'] = leaf_weight + 1

                    # add edge between the parent node and this relation and weights
                    G.add_edge(parent_label, intermediary_node_label, label="child")
                    
                    G.add_edge(intermediary_node_label, child_norm)

                    # Queue up the children for the next generation
                    for childs_child in child.children:
                        next_child_nodes.append((childs_child, child_norm))      
                
                # Move the children into the queue and clear it.
                nodes_needing_child_branches = next_child_nodes
                next_child_nodes = []
        
        # Increment the generation
        current_gen += 1

    graphs_dict[root_node] = G

#### Skip: Applying that to our selected rows of our dataframe

In [10]:
%time
_ = onto_import.loc[SENT_MIN:SENT_MAX,:].apply(lambda x: graph_row(x), axis=1)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 11 µs
relation: adjectival modifier
intermediary node label:head g1 adjectival modifier
relation: head of prep phrase
intermediary node label:head g2 head of prep phrase
relation: adjectival modifier
intermediary node label:head g1 adjectival modifier
relation: head of prep phrase
intermediary node label:head g2 head of prep phrase
relation: head of prep phrase
intermediary node label:head g1 head of prep phrase
relation: nominal subject
intermediary node label:head g2 nominal subject
relation: head of prep phrase
intermediary node label:head g1 head of prep phrase
relation: head of prep phrase
intermediary node label:head g2 head of prep phrase
relation: head of prep phrase
intermediary node label:head g1 head of prep phrase
relation: head of prep phrase
intermediary node label:head g2 head of prep phrase
relation: noun phrase as adverbial modifier
intermediary node label:head g1 noun phrase as adverbial modifier
relation: co

intermediary node label:head g1 direct object
relation: open clausal complement
intermediary node label:head g2 open clausal complement
relation: head of prep phrase
intermediary node label:head g1 head of prep phrase
relation: conjunct
intermediary node label:head g2 conjunct
relation: nominal subject
intermediary node label:head g1 nominal subject
relation: relcl
intermediary node label:head g2 relcl
relation: nominal subject
intermediary node label:head g1 nominal subject
relation: conjunct
intermediary node label:head g2 conjunct
relation: conjunct
intermediary node label:head g1 conjunct
relation: nominal subject
intermediary node label:head g2 nominal subject
relation: nominal subject
intermediary node label:head g1 nominal subject
relation: ROOT
relation: nominal subject
intermediary node label:head g1 nominal subject
relation: ROOT
relation: adjectival modifier
intermediary node label:head g1 adjectival modifier
relation: head of prep phrase
intermediary node label:head g2 head

relation: conjunct
intermediary node label:head g2 conjunct
relation: attribute
intermediary node label:head g1 attribute
relation: acl
intermediary node label:head g2 acl
relation: nominal subject
intermediary node label:head g1 nominal subject
relation: conjunct
intermediary node label:head g2 conjunct
relation: conjunct
intermediary node label:head g1 conjunct
relation: nominal subject
intermediary node label:head g2 nominal subject
relation: noun phrase as adverbial modifier
intermediary node label:head g1 noun phrase as adverbial modifier
relation: ROOT
relation: nummod
intermediary node label:head g1 nummod
relation: noun phrase as adverbial modifier
intermediary node label:head g2 noun phrase as adverbial modifier
relation: head of prep phrase
intermediary node label:head g1 head of prep phrase
relation: ROOT
relation: nominal subject
intermediary node label:head g1 nominal subject
relation: adverbial clause modifier
intermediary node label:head g2 adverbial clause modifier
rela

#### Skip: Calculate the Log Probability of the Named Entity Types

In [None]:
def normalize_as_logp(successor_nodes, G):
    log_total = np.log(sum([G.node[node]['weight'] for node in successor_nodes]))
    for n in successor_nodes:
        G.node[n]['lp'] = np.log(G.node[n]['weight']) - log_total

def calc_lps(G):
    for node in nx.nodes(G):
        if G.successors(node):
            [normalize_as_logp(succ[1], G) for succ in nx.bfs_successors(G, node)]
    return G

In [None]:
%time

log_graphs_dict = {key:calc_lps(value) for (key,value) in graphs_dict.items()}

#### Skip: Draw the graphs for the Named Entity Types

In [12]:
for key, value in graphs_dict.items():
    
    graph_filepath = 'NER_Type_Graphs/'
    graph_filename = 'G_{0}'.format(key)

    # Write our graph to DOT format to be read and visualized by GraphViz
    nx.drawing.nx_pydot.write_dot(value, "{0}.dot".format(graph_filepath + graph_filename))
    
    # Write the graphs via neato
    os.system("neato -Tpng -Goverlap=false -Gsplines=true -Gsep=0.1 {0}.dot -o {0}.png".format(graph_filepath + graph_filename))

In [13]:
len(graphs_dict)

16

### Graphing a Candidate Named Entity

In [None]:
# Defining helper functions to build the candidate graphs

def reconcile_ents_and_clusters(document_id, doc):
    """"Reconcile the coreference and entities lists into a
        a single dict of graphs to make.
        
        Keys are ent_id strings [document_id]:[start_word_index]:[end_word_index].
        Values are (spaCy.Span, graph_id) tuples."""
    
    # Keys are (start.idx, end.idx) tuples.
    # Values are (spaCy.Span, graph_id) tuples."
    occurence_ind  = {}
    
    if doc._.has_coref:
        cluster_offset = len(doc._.coref_clusters)
        for cluster_idx, cluster in enumerate(doc._.coref_clusters):
            for mention in cluster:
                key = ":".join([document_id,str(mention.start), str(mention.end -1)])
                occurence_ind[key] = (mention, cluster_idx)
                
    # Now let's see if each ent is in there. If not, we'll add it to
    # our cluster list.
    new_cluster_idx = 0
    
    for ent_ind, ent in enumerate(doc.ents):
        key = ":".join([document_id,str(ent.start), str(ent.end -1)])
        try:
            occurence_ind[key]
        except:
            occurence_ind[key] = (ent, cluster_offset + new_cluster_idx)
            new_cluster_idx += 1
    
    print("Length of occurence index for doc {0} is: {1}".format(document_id, len(occurence_ind)))
    
    return occurence_ind

def graph_entity(ent, doc, G, root_node):
    
    NO_OF_GENERATIONS = 2
    
    # Assume the head of the phrase, if it is a phrase, is the last word
    # in the phrase.
    head_of_phrase = ent[-1]
    
    nodes_needing_head_branches = [(head_of_phrase, root_node)]
    next_head_nodes = []
    nodes_needing_child_branches = [(head_of_phrase, root_node)]
    next_child_nodes = []
    
    current_gen = 1
    while current_gen <= NO_OF_GENERATIONS:
        
        for (node, child_label) in nodes_needing_head_branches:
            try: 
                # Get the explanation of its relation arc in this usage
                relation = spacy.explain(node.dep_)
                # If no explanation, revert to the raw dependency type.
                if relation is None:
                    relation = node.dep_
                    
                if relation == 'ROOT':
                    continue

                # Object of preposition doesn't do much, so let's see what's on the other side of that.
                elif relation == "object of preposition":
                    relation = "head of prep phrase"
                    # move to the preposition so we get its head later on when adding node
                    node = node.head
                    
                intermediary_node_label = "head g{0} {1}".format(current_gen, relation)

                # Add a node for the relation, and connect that to the main entity
                G.add_node(intermediary_node_label)
                G.add_edge(child_label, intermediary_node_label, label="head")

                # Add a node from the relation to the entity's head, and connect that
                # to the relation type
                normed_head = node.head.norm_
                #print("adding head node: {0}".format(normed_head))
                G.add_node(normed_head)
                G.add_edge(intermediary_node_label, normed_head)
                
                if node.head != node:
                    next_head_nodes.append((node.head, node.head.text))
            except Exception as ex:
                print("passed in head.\n{0}".format(ex))
                pass
            
        nodes_needing_head_branches = next_head_nodes
        next_head_nodes = []
        
        for (node, parent_label) in nodes_needing_child_branches:
            try:
                for child in node.children:
                    relation = spacy.explain(child.dep_)
                    # If no explanation, revert to the raw dependency type.
                    if relation is None:
                        relation = node.dep_
                        
                    elif relation == 'punctuation':
                        continue
                        
                    intermediary_node_label = "child g{0} {1}".format(current_gen, relation)

                    G.add_node(intermediary_node_label)
                    G.add_edge(parent_label, intermediary_node_label, label="child")
                    
                    normed_child = child.norm_
                    G.add_node(normed_child)
                    #print("adding child node: {0}".format(normed_child))
                    G.add_edge(intermediary_node_label, normed_child)
                    
                    for childs_child in child.children:
                        next_child_nodes.append((childs_child, childs_child.text))
            except Exception as ex:
                print("passed in child.\n{0}".format(ex))
                pass                
        nodes_needing_child_branches = next_child_nodes
        next_child_nodes = []
        
        # Increment the generation
        current_gen += 1
        
    return G
    
def graph_candidates_in_doc(document_id, candidate_text):
    
    doc = nlp(candidate_text)
    
    # Key of clustered_ents is ent_it. Value is (Span, graph_cluter_id)
    clustered_ents = reconcile_ents_and_clusters(document_id, doc)
    
    # Initialize a graph for each clustered_ent
    candidate_graphs = dict()
    
    for ent_id, (ent, graph_idx) in clustered_ents.items():
        
        # Make sure we have our root. No harm done if it already exists.
        # If it's a cluster, we get the Span of the most representative
        # mention in the cluster
        try:
            root_node = doc._.coref_clusters[graph_idx].main.text
        # If it's not, we just use the ent name
        except:
            root_node = ent.text
        
        # Get the cluster's root node and existing graph from previous mentions
        # or create a new tuple of those.
        root_node, G, _ = candidate_graphs.get(graph_idx, (root_node, nx.DiGraph(), graph_idx))
        
        G.add_node(root_node)
        
        # A helper function adds the rest of the graph
        print("\nGraphing entity: {0}".format(ent.text))
        candidate_graphs[graph_idx] = (root_node, graph_entity(ent, doc, G, root_node), graph_idx)
    
    spacy_ent_graphs = {}
    
    # Filter back to just the spaCy ents
    for ent in doc.ents:
        key = ":".join([document_id,str(ent.start), str(ent.end - 1)])
        _, ent_graph_idx = clustered_ents[key]
        spacy_ent_graphs[key] = candidate_graphs[ent_graph_idx]
    
    #graph_entity(ent, doc) for ent in doc.ents]
    return spacy_ent_graphs

#### Skip: Applying the functions on a selected article

In [None]:
TEST_ENTRY = 90
test_candidate_text = onto_import.loc[TEST_ENTRY,"document_text"]
print(test_candidate_text)

candidate_roots_and_graphs = graph_candidates_in_doc("test_doc_id",test_candidate_text)

In [None]:
for ent_id in candidate_roots_and_graphs.values():
    print(ent_id)

#### Skip: Draw the candidate graphs for that article

In [None]:
for key, (root, cand_graph, clust) in candidate_roots_and_graphs.items():
    
    graph_filepath = 'NER_Type_Graphs/'
    graph_filename = 'G_' + str(key)

    # Write our graph to DOT format to be read and visualized by GraphViz
    nx.drawing.nx_pydot.write_dot(cand_graph, "{0}.dot".format(graph_filepath + graph_filename))
    
    os.system("neato -Tsvg -Goverlap=false -Gsplines=true -Gsep=0.1 {0}.dot -o {0}.svg".format(graph_filepath + graph_filename))

## Similarity Between Graphs

In [None]:
def logsumexp(a):
    """Simple re-implementation of scipy.misc.logsumexp."""
    a_max = np.max(a)
    if a_max == -np.inf:
        return -np.inf
    sumexp = np.sum(np.exp(a - a_max))
    return np.log(sumexp) + a_max

def score_relation_and_children(nlp, cand_G, NET_G, cand_parent, net_options, cand_succ_dict, NET_succ_dict, sim_dict, parent_weight):
        
    # If this node has no further children,
    # compare it to its NET options
    if cand_parent not in cand_succ_dict:
        
        sim_scores = list()
                
        cand_token = nlp(cand_parent)[0]
        
        for net_opt in net_options:
            try:
                sim_scores.append(sim_dict[cand_parent][net_opt])
            except:
                score = cand_token.similarity(nlp(net_opt)[0])
                sim_scores.append(score)
                sim_dict[cand_parent][net_opt] = score

        # Get the index of the most similar word
        sim_idx = np.argmax(sim_scores)
                
        # Recover the float value of the winning word's weight
        sim_weight = float(NET_G.node[net_options[sim_idx]]['weight'])
                
        # Score the similarity times the square root of the word's frequency (weight)
        similarity_score = sim_scores[sim_idx] #* (sim_weight / parent_weight) #np.log(sim_scores[sim_idx]) #+ sim_lp
                        
        return similarity_score
    
    # Otherwise let's score the dependency tags and
    # recursively call this on their children
    else:
        # Prepare to hold scores from multiple branches
        accumulated_scores = []
                
        # Iterate over dependency relations from the parent
        for relation in cand_succ_dict[cand_parent]:
                    
            # Proceed if the NET_graph has this relation:
            try:
                # Get the options from the NET graph branching from this relation type
                child_net_options = NET_succ_dict[relation]
                relation_weight = float(NET_G.node[relation]['weight'])
            
                # Iterate over the children of each relation
                for cand_child in cand_succ_dict[relation]:
                    score_from_this_node_to_leaves = score_relation_and_children(nlp, cand_G, NET_G, cand_child, child_net_options, cand_succ_dict, NET_succ_dict, sim_dict, relation_weight)
                    if score_from_this_node_to_leaves is not None:
                        accumulated_scores.append(score_from_this_node_to_leaves) # * (np.log(relation_weight) / np.log(parent_weight)))
                    
            except Exception as ex:
                pass
        
        # If we have more than an empty list
        if accumulated_scores != list():
            return sum(accumulated_scores) # logsumexp(accumulated_scores)
            
def compare_candidate_to_NET(nlp, candidate_G, candidate_root, NET_G, net_root, sim_dict):
    
    # Calculate the breadth-first search of the candidate graph
    cand_succ_dict = {par:child_list for (par,child_list)in nx.bfs_successors(candidate_G, candidate_root)}

    # Calculate the breadth-first search of the NET graph
    # (For a speed boost we should do this externallay and pass it in.)
    NET_succ_dict = {par:child_list for (par,child_list)in nx.bfs_successors(NET_G, net_root)}
    
    #print("\n\nNET: {0}".format(net_root))
    
    # Run the results of runnign the recursive score_relation_and_children function on the initial roots
    return score_relation_and_children(nlp, candidate_G, NET_G, candidate_root, [], cand_succ_dict, NET_succ_dict, sim_dict, NET_G.node[net_root]['weight'])

#### Skip: Calculate similarity between one candidate graph and one NET

In [None]:
CANDIDATE_GRAPH_NO = 0
NET_TO_COMPARE = 'PRODUCT'

score = compare_candidate_to_NET(nlp, candidate_roots_and_graphs[CANDIDATE_GRAPH_NO][1], candidate_roots_and_graphs[CANDIDATE_GRAPH_NO][0], log_graphs_dict[NET_TO_COMPARE], NET_TO_COMPARE)

print("\nThe comparison score between G_{2} and the {0} graph is {1}".format(NET_TO_COMPARE, score, CANDIDATE_GRAPH_NO))

#### Skip: To manually examine the dicts in the comparison above:

In [None]:
NET_succ_dict = {par:child_list for (par,child_list)in nx.bfs_successors(log_graphs_dict[NET_TO_COMPARE], NET_TO_COMPARE)}

NET_succ_dict

In [None]:
cand_succ_dict = cand_succ_dict = {par:child_list for (par,child_list)in nx.bfs_successors(candidate_roots_and_graphs[CANDIDATE_GRAPH_NO][1], candidate_roots_and_graphs[CANDIDATE_GRAPH_NO][0])}

cand_succ_dict

### To loop through all the NETs and predict the Max

In [None]:
def predict_max_from_all_nets(cand_root_node, cand_G, NETs_dict, nlp, sim_dict):
    
    # Initialize dict to hold the scores
    similarity_scores_dict = dict()
    
    # Iterate through the NETs
    for net_string, net_graph in NETs_dict.items():
        
        # Score the candidate against each NET
        sim_score = compare_candidate_to_NET(nlp, cand_G, cand_root_node, net_graph, net_string, sim_dict)
        
        # If there is a score for this NET, store it
        if sim_score is not None:
            similarity_scores_dict[net_string] = sim_score
    print(" *** ")
    print(cand_root_node)
    print(similarity_scores_dict)
    
    types_in_likelihood_order = ['ORG', 'GPE', 'PERSON', 'DATE', 'CARDINAL',
                                 'NORP', 'MONEY', 'PERCENT', 'ORDINAL', 'LOC',
                                 'TIME', 'WORK_OF_ART', 'QUANTITY', 'FAC',
                                 'PRODUCT', 'EVENT', 'LAW', 'LANGUAGE']
    
    high_score = float("-inf")
    prediction_so_far = 'NO_MATCH'
    
    for key,value in similarity_scores_dict.items():
        if value > high_score:
            high_score = value
            prediction_so_far = key
        
    return prediction_so_far

def predict_on_doc(doc_row, NETs_dict, nlp, Y_pred, sim_dict):
    
    cluster_dict = dict()
    
    removed_pw = re.sub(r'%pw', 'unk_date', doc_row["document"])
    removed_hypenation = re.sub(r'([a-z])-([a-z])', '\1\2', removed_pw)
    unified_digits = re.sub(r'[0-9]','D',removed_hypenation)
    filtered_doc = unified_digits
    
    ent_dict = graph_candidates_in_doc(doc_row["document_id"],filtered_doc)
    
    for ent_id, (cand_root_node, cand_G, cluster_id) in ent_dict.items():
        try:
            Y_pred.append([ent_id, cluster_dict[cluster_id]])
        except:
            cluster_dict[cluster_id] = predict_max_from_all_nets(cand_root_node, cand_G, NETs_dict, nlp, sim_dict)
            Y_pred.append([ent_id, cluster_dict[cluster_id]])

### Loop through documents to generate Y_Pred entitites

In [None]:
### Import the documents with trace strings removed
documents = pd.read_csv('../data/document.csv')

In [None]:
### Import the pickled NET graphs and calculate and add the log probabilities 
%time

with open('../data/LP_NET_Graphs.20181205.pkl', 'rb') as file:
  lp_net_graphs = pickle.load(file)

In [None]:
checkpoint_1 = time.time()

DOC_MIN = 0 
DOC_MAX = 1

# Build the graphs of candidates from the documents.
Y_pred = list()
sim_dict = defaultdict(lambda: dict())

# Loading a spaCy model but disabling parsers to speed up similarity measurements
sim_nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner', 'neuralcoref'])

documents.loc[DOC_MIN:DOC_MAX,:].apply(lambda x: predict_on_doc(x, lp_net_graphs, sim_nlp, Y_pred, sim_dict), axis=1)

print("That took: {0} seconds".format(round(time.time() - checkpoint_1, 4)))


#### Confirming the number of entities match for the first two documents

In [None]:
len(Y_pred)

In [None]:
doc0 = nlp(removed_hypenation)
doc1 = nlp(documents.loc[1,"document"])

len(doc0.ents) + len(doc1.ents)

### Comparing Y_Pred and Y_True on the first two documents

In [None]:
ypred_df = pd.DataFrame(Y_pred, columns = ['entity_id', 'prediction']).set_index('entity_id')

In [None]:
#Save ypred to file
ypred_df.to_csv('../data/Y_pred.csv')

In [None]:
with open('../data/Y_true.pkl', 'rb') as file:
  ytrue_df = pickle.load(file)

In [None]:
ytrue_df_filtered = ytrue_df.filter(like="bc/cctv/00/cctv_0000@0000@cctv@bc@en@on",axis=0).sort_values(by=['sentence_index', 'start_word_index'])

ymerged_df = ypred_df.merge(ytrue_df_filtered,how='inner',on='entity_id')
ymerged_df.shape

In [None]:
ymerged_df.head()

In [None]:
matches = ymerged_df.loc[:,["prediction","type"]].apply(lambda x: x["prediction"] == x["type"], axis=1)
accuracy = sum(matches) / ymerged_df.shape[0]
print("Accuracy: {0}".format(accuracy))

In [None]:
baseline = ytrue_df.groupby("type").size().sort_values()[-1] / ytrue_df.groupby("type").size().sum()
print("Baseline if always guessing the most common entity type: {0}".format(baseline))

In [None]:
ytrue_df.groupby("type").size().sort_values()[-1]

### Plotting results for paper

In [None]:
net_names = []
no_nodes = []
densities = []

for net_type, graph in lp_net_graphs.items():
    print()
    print('*** {} ***'.format(net_type))
    print(nx.info(graph))

In [None]:
no_nodes = [1554, 9640, 6078, 8976, 5293, 5109, 11135, 1350, 1743, 3253, 452, 1739, 2312, 1471, 701, 1478, 1348, 2171]
avg_degree = [2.0077, 2.6008, 3.1048, 3.1491, 2.4049, 2.5858, 2.7688, 1.7356, 1.7722, 2.2266, 1.7434,  1.7464, 1.9957, 1.8559, 1.5920, 2.0325, 1.8427, 2.0511]
performance = [10.28, 39.48, 33.36, 28.55, 24.35, 23.95, 27.03, 2.87, 3.02, 35.25, 3.23, 6.16, 5.32, 5.41, 4.80, 7.43, 5.06, 18.44]

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale

In [None]:
plt.figure(1, figsize=(12,3))
x = no_nodes
y = performance
plt.subplot(121)
plt.scatter(no_nodes, performance)
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
plt.title("No. Nodes on Performance")
plt.ylabel("Performance")
plt.xlabel("No. Nodes")

plt.subplot(122)
x = avg_degree
plt.scatter(avg_degree, performance)
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
plt.title("Avg. Degree on Performance")
plt.xlabel("Average Degree")

plt.savefig('performance.svg', bbox_inches='tight')