### Installations, if needed

In [None]:
# Install the pinned spaCy release that the neuralcoref model installed below is used with.
!pip install spacy==2.0.12 # Releases above 2.0.12 don't seem to work with the neuralcoref resolution (at least 2.0.13 and 2.0.16 don't)

In [None]:
# Install the large Neural Coref model; this provides the `en_coref_lg` package
# that is imported and loaded further down.
!pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_lg-3.0.0/en_coref_lg-3.0.0.tar.gz # This is the coref language model

### Importing Libraries

In [None]:
import spacy
import en_coref_lg
from spacy import displacy
import pandas as pd
import numpy as np
import networkx as nx
import time
from graphviz import Source
import os

### Loading and previewing our export from OntoNotes5

In [None]:
# Location of the OntoNotes5 export: directory and file name are kept separate
# so that either can be changed on its own.
JSON_FILENAME = 'ner_output_1.json'
FILEPATH_TO_JSON = "onto_sql_output/"

# Read the export into a DataFrame.
onto_import = pd.read_json(os.path.join(FILEPATH_TO_JSON, JSON_FILENAME))

In [None]:
# Preview the first few rows of the imported export.
onto_import.head()

In [None]:
# Show the sentence text stored in the row labelled 0.
onto_import.loc[0].sentence_string

### Loading the Large Neural Coref spaCy Model

In [None]:
%time
nlp = en_coref_lg.load()

#### Using it to loop through dependency parsing for a selection of sentences

In [None]:
# Establish which sentence rows we want to work through right now.
# NOTE: .loc slices by label and includes both endpoints, so the selection
# below covers labels SENT_MIN through SENT_MAX inclusive.
SENT_MIN = 0
SENT_MAX = 500

# Check the time and start parsing via spaCy.
# Each selected row's "sentence_string" is run through the `nlp` pipeline and the
# result is stored in a new "spacy_parse" column; rows outside the selection are
# not parsed (the assignment aligns on the index, leaving them as missing values).
start_time = time.time()
onto_import["spacy_parse"] = onto_import.loc[SENT_MIN:SENT_MAX,:].apply(lambda x: nlp(x["sentence_string"]), axis=1)

# Calculate the duration
duration = time.time() - start_time
print("Applying the spaCy pipeline took {0:.2f} seconds".format(duration))

### Viewing some example text, with highlighted named entities

In [None]:
# Choose an entry integer to see its text and the parse below.
# The row must be one that received a "spacy_parse" value, i.e. a label within
# SENT_MIN..SENT_MAX.
ENTRY = 490

# Render the parsed sentence inline with its named entities highlighted.
displacy.render(onto_import.loc[ENTRY,"spacy_parse"], jupyter=True, style='ent')

#### Viewing dependencies

In [None]:
# Render the same entry again, this time as a dependency-arc diagram.
displacy.render(onto_import.loc[ENTRY,"spacy_parse"], jupyter=True, style='dep')

#### Identifying some useful attributes of the spaCy tokens

In [None]:
# This gives us the head Token of token 13 in row 490's parse, i.e. the word that
# token depends on. (The dependency type itself is read from a token's `.dep_`.)

onto_import.loc[490,"spacy_parse"][13].head

In [None]:
# This gives us the index (`.i`) of that head token; other attributes of the head
# can be read from the head Token in the same way.

onto_import.loc[490,"spacy_parse"][13].head.i

In [None]:
# If we just want the string of the head, that's here
# (shown for token 32 of the row labelled 0):

onto_import.loc[0,"spacy_parse"][32].head.text

In [None]:
# If we want the index of the head within the sentence (to find when a multi-word NE
# depends on something outside of that NE phrase), `.head.i` gives it
# (again for token 32 of the row labelled 0):

onto_import.loc[0,"spacy_parse"][32].head.i

### Creating a Dictionary with a Graph for Each NER Type

#### Establish dictionary to hold graphs and define a function to graph a row 

In [None]:
# Maps a named-entity type (string) to the DiGraph accumulated for that type.
# graph_row() reads from and writes to this dict.
graphs_dict = dict()


def _add_weighted_node(G, label):
    """Ensure `label` is a node of G and increment its 'weight' attribute by one."""
    G.add_node(label)
    G.nodes[label]['weight'] = G.nodes[label].get('weight', 0) + 1


def graph_row(df_row):
    """Fold one named-entity row into the graph kept for its NE type.

    The row must provide "ner_type", "ner_end_word_index", "spacy_parse",
    "sentence_string" and "ner_string". The token at "ner_end_word_index" in
    the spaCy parse is treated as the head of the entity phrase.
    NOTE(review): this assumes "ner_end_word_index" is a valid token index
    into "spacy_parse" (matching tokenization) -- confirm against the export.

    Starting from that token, the graph is extended for NO_OF_GENERATIONS
    generations in two directions:

    * head side:  label -> "head g<gen> <relation>" -> norm of the head token
    * child side: label -> "child g<gen> <relation>" -> norm of the child token

    Every relation node and word node touched has its 'weight' attribute
    incremented; the root node additionally mirrors its weight in 'xlabel'.

    Side effects: stores the graph in the module-level `graphs_dict` under
    the NE type and prints diagnostic messages. Returns None.
    """

    NO_OF_GENERATIONS = 2

    root_node = str(df_row["ner_type"])

    # Retrieve our Directed Graph for this NE Type or create a new one
    G = graphs_dict.get(root_node)
    if G is None:
        G = nx.DiGraph()

    # For each row, add a node for the Named Entity's type and count the occurrence
    _add_weighted_node(G, root_node)
    G.nodes[root_node]['xlabel'] = G.nodes[root_node]['weight']

    # Let's assume the head is the last word of the phrase
    # and get that Token from the spaCy parse:
    head_index = df_row["ner_end_word_index"]
    head_of_phrase = df_row["spacy_parse"][head_index]

    # Work queues of (token, label of the graph node to attach to)
    nodes_needing_head_branches = [(head_of_phrase, root_node)]
    nodes_needing_child_branches = [(head_of_phrase, root_node)]

    for current_gen in range(1, NO_OF_GENERATIONS + 1):

        # ---- Head side: walk one step up the dependency tree ----
        next_head_nodes = []
        for (node, child_label) in nodes_needing_head_branches:
            try:
                # Get the explanation of its dependency type in this usage
                relation = spacy.explain(node.dep_)

                # If no explanation, revert to the raw dependency type.
                if relation is None:
                    relation = node.dep_

                # Trying to catch and diagnose some problem cases
                elif relation == "punctuation":
                    print("NE '{1}' marked as punctuation in sentence '{0}'".format(df_row["sentence_string"], df_row["ner_string"]))
                    print(" --- ")
                elif relation == "determiner":
                    print("NE '{1}' marked as determiner in sentence '{0}'".format(df_row["sentence_string"], df_row["ner_string"]))
                    print(" --- ")

                # Object of preposition doesn't do much, so let's see what's on the other side of that.
                elif relation == "object of preposition":
                    relation = "head of prep phrase"
                    # move to the preposition so we get its head later on when adding node
                    node = node.head

                print("relation: " + relation)

                # The sentence root has no head to branch to.
                if relation == 'ROOT':
                    continue

                # differentiating head-focused edges from child-focused edges
                intermediary_node_label = "head g{0} {1}".format(current_gen, relation)

                print("intermediary node label:" + intermediary_node_label)

                # Add a node for the relation and one for the (normed) head word,
                # counting each occurrence.
                _add_weighted_node(G, intermediary_node_label)
                norm = node.head.norm_
                _add_weighted_node(G, norm)

                G.add_edge(child_label, intermediary_node_label, label="head")
                G.add_edge(intermediary_node_label, norm)

                # Queue the head for the next generation (a token that is its
                # own head has nowhere further to go).
                if node.head != node:
                    next_head_nodes.append((node.head, norm))

            except Exception as ex:
                # Best effort: report the problem and carry on with the other nodes.
                print("passed in head.\n{0}".format(ex))

        # Swap the queues only once the whole generation has been processed.
        nodes_needing_head_branches = next_head_nodes

        # ---- Child side: walk one step down the dependency tree ----
        # This runs for every generation, independently of the head side.
        next_child_nodes = []
        for (node, parent_label) in nodes_needing_child_branches:
            for child in node.children:

                # Get the relation of the child node to this one.
                relation = spacy.explain(child.dep_)

                # If no explanation, revert to the raw dependency type.
                if relation is None:
                    relation = child.dep_

                if relation == 'punctuation':
                    continue

                # Differentiate these relations from head relations
                # and add the node and its weights
                intermediary_node_label = "child g{0} {1}".format(current_gen, relation)
                _add_weighted_node(G, intermediary_node_label)

                # Add the child as normed, and add its edge and weights
                child_norm = child.norm_
                _add_weighted_node(G, child_norm)

                # add edge between the parent node and this relation and weights
                G.add_edge(parent_label, intermediary_node_label, label="child")
                G.add_edge(intermediary_node_label, child_norm)

                # Queue the child itself so that the next generation attaches
                # the child's own children beneath the child's node.
                next_child_nodes.append((child, child_norm))

        nodes_needing_child_branches = next_child_nodes

    graphs_dict[root_node] = G

#### Applying that to our selected rows of our dataframe

In [None]:
%time
_ = onto_import.loc[SENT_MIN:SENT_MAX,:].apply(lambda x: graph_row(x), axis=1)

#### Calculate the Log Probability of the Named Entity Types

In [None]:
def normalize_as_logp(successor_nodes, G):
    """Store each successor's log share of the group's total weight as 'lp'.

    For every node n in `successor_nodes`, sets
    G.nodes[n]['lp'] = log(weight[n]) - log(sum of the group's weights).

    Args:
        successor_nodes: iterable of node labels present in G, each carrying
            a 'weight' attribute.
        G: graph whose node attributes are read and updated in place.

    Returns:
        None. An empty group is a no-op.
    """
    # Materialize once so that a generator argument survives both passes.
    successor_nodes = list(successor_nodes)
    if not successor_nodes:
        # Nothing to normalize; also avoids taking log(0) of an empty sum.
        return

    log_total = np.log(sum(G.nodes[node]['weight'] for node in successor_nodes))
    for n in successor_nodes:
        G.nodes[n]['lp'] = np.log(G.nodes[n]['weight']) - log_total

def calc_lps(G):
    """Annotate the nodes of G with 'lp' log-probabilities and return G.

    Every node of G is used in turn as the start of a breadth-first
    traversal, and each (parent, successors) group yielded by that traversal
    is normalized with normalize_as_logp. G is modified in place; the same
    object is returned for convenience.

    NOTE(review): a node reachable from several start nodes is normalized
    more than once, so its final 'lp' comes from the last group it appeared
    in -- confirm that this is the intended normalization.
    """
    for node in nx.nodes(G):
        # No explicit "has successors" guard is needed: the traversal simply
        # yields nothing for a node without successors.
        for _parent, successors in nx.bfs_successors(G, node):
            normalize_as_logp(successors, G)
    return G

In [None]:
%time

log_graphs_dict = {key:calc_lps(value) for (key,value) in graphs_dict.items()}

#### Draw the graphs for the Named Entity Types

In [None]:
%time

for key, value in graphs_dict.items():
    
    graph_filepath = 'NER_Type_Graphs/'
    graph_filename = 'G_{0}'.format(key)

    # Write our graph to DOT format to be read and visualized by GraphViz
    nx.drawing.nx_pydot.write_dot(value, "{0}.dot".format(graph_filepath + graph_filename))
    
    # Write the graphs via neato
    os.system("neato -Tsvg -Goverlap=false -Gsplines=true -Gsep=0.1 {0}.dot -o {0}.svg".format(graph_filepath + graph_filename))

### Graphing a Candidate Named Entity

In [None]:
# Defining helper functions to build the candidate graphs

def reconcile_ents_and_clusters(doc):
    """Reconcile the coreference and entities lists into a
    single dict of graphs to make.

    Keys are (start, end) tuples taken from each span's `.start` / `.end`.
    Values are (spaCy.Span, graph_id) tuples.

    Mentions belonging to a coreference cluster share that cluster's index
    as graph_id. Entities that are not already present as a cluster mention
    each get a fresh graph_id, numbered from the cluster count upwards.
    """

    occurence_ind = {}

    # Coreference mentions first: every mention maps to its cluster's index.
    cluster_offset = 0
    if doc._.has_coref:
        cluster_offset = len(doc._.coref_clusters)
        for cluster_idx, cluster in enumerate(doc._.coref_clusters):
            for mention in cluster:
                occurence_ind[(mention.start, mention.end)] = (mention, cluster_idx)

    # Now let's see if each ent is in there. If not, we'll add it
    # with a new graph id of its own.
    new_cluster_idx = 0
    for ent in doc.ents:
        key = (ent.start, ent.end)
        if key not in occurence_ind:
            occurence_ind[key] = (ent, cluster_offset + new_cluster_idx)
            new_cluster_idx += 1

    return occurence_ind

def graph_entity(ent, doc, G, root_node):
    """Add the dependency neighbourhood of one entity mention to graph G.

    The last token of `ent` is treated as the head of the phrase. Starting
    there, the graph is extended for NO_OF_GENERATIONS generations:

    * head side:  label -> "head g<gen> <relation>" -> norm of the head token
    * child side: label -> "child g<gen> <relation>" -> norm of the child token

    Args:
        ent: span-like object; only `ent[-1]` is used.
        doc: the parsed document (not used by this function; kept for the
            existing call signature).
        G: graph that is extended in place.
        root_node: label of the node the first generation attaches to.

    Returns:
        G, the same graph object that was passed in.

    Errors raised while processing a node are printed and skipped so that
    one bad token does not abort the whole graph.
    """

    NO_OF_GENERATIONS = 2

    # Assume the head of the phrase, if it is a phrase, is the last word
    # in the phrase.
    head_of_phrase = ent[-1]

    # Work queues of (token, label of the graph node to attach to)
    nodes_needing_head_branches = [(head_of_phrase, root_node)]
    nodes_needing_child_branches = [(head_of_phrase, root_node)]

    for current_gen in range(1, NO_OF_GENERATIONS + 1):

        # ---- Head side: walk one step up the dependency tree ----
        next_head_nodes = []
        for (node, child_label) in nodes_needing_head_branches:
            try:
                # Get the explanation of its relation arc in this usage
                relation = spacy.explain(node.dep_)
                # If no explanation, revert to the raw dependency type.
                if relation is None:
                    relation = node.dep_

                # The sentence root has no head to branch to.
                if relation == 'ROOT':
                    continue

                # Object of preposition doesn't do much, so let's see what's on the other side of that.
                elif relation == "object of preposition":
                    relation = "head of prep phrase"
                    # move to the preposition so we get its head later on when adding node
                    node = node.head

                intermediary_node_label = "head g{0} {1}".format(current_gen, relation)

                # Add a node for the relation, and connect that to the main entity
                G.add_node(intermediary_node_label)
                G.add_edge(child_label, intermediary_node_label, label="head")

                # Add a node from the relation to the entity's head, and connect that
                # to the relation type
                normed_head = node.head.norm_
                G.add_node(normed_head)
                G.add_edge(intermediary_node_label, normed_head)

                # Queue the head under the same label it was just added with, so
                # the next generation attaches to this existing node.
                if node.head != node:
                    next_head_nodes.append((node.head, normed_head))
            except Exception as ex:
                print("passed in head.\n{0}".format(ex))

        nodes_needing_head_branches = next_head_nodes

        # ---- Child side: walk one step down the dependency tree ----
        next_child_nodes = []
        for (node, parent_label) in nodes_needing_child_branches:
            try:
                for child in node.children:
                    relation = spacy.explain(child.dep_)
                    # If no explanation, revert to the child's raw dependency type.
                    if relation is None:
                        relation = child.dep_

                    elif relation == 'punctuation':
                        continue

                    intermediary_node_label = "child g{0} {1}".format(current_gen, relation)

                    G.add_node(intermediary_node_label)
                    G.add_edge(parent_label, intermediary_node_label, label="child")
                    print("added child edge from {0} to {1}".format(parent_label, intermediary_node_label))

                    normed_child = child.norm_
                    G.add_node(normed_child)
                    print("adding child node: {0}".format(normed_child))
                    G.add_edge(intermediary_node_label, normed_child)

                    # Queue the child itself so that the next generation attaches
                    # the child's own children beneath the child's node.
                    next_child_nodes.append((child, normed_child))
            except Exception as ex:
                print("passed in child.\n{0}".format(ex))

        nodes_needing_child_branches = next_child_nodes

    return G
    
def graph_candidates_in_doc(candidate_text):
    """Parse a text and build one graph per entity / coreference cluster.

    The text is run through the module-level `nlp` pipeline, its entities and
    coreference mentions are reconciled with reconcile_ents_and_clusters, and
    every mention is added to the graph of its graph id via graph_entity.

    Args:
        candidate_text: the raw text to analyse.

    Returns:
        dict mapping graph id -> (root_node_label, graph). For a coreference
        cluster the root label is the text of the cluster's `.main` mention;
        for a stand-alone entity it is the entity's own text.
    """

    doc = nlp(candidate_text)

    clustered_ents = reconcile_ents_and_clusters(doc)

    # Graph ids below the cluster count refer to coreference clusters; anything
    # above belongs to a stand-alone entity.
    clusters = doc._.coref_clusters if doc._.has_coref else []

    # Initialize a graph for each clustered_ent
    candidate_graphs = dict()

    for (ent, graph_idx) in clustered_ents.values():

        if graph_idx in candidate_graphs:
            # Get the cluster's existing graph (and root) from previous mentions
            root_node, G = candidate_graphs[graph_idx]
        else:
            # If it's a cluster, we use the most representative mention
            # in the cluster; if it's not, we just use the ent name.
            if graph_idx < len(clusters):
                root_node = clusters[graph_idx].main.text
            else:
                root_node = ent.text
            G = nx.DiGraph()

        # Make sure we have our root. No harm done if it already exists.
        G.add_node(root_node)

        # A helper function adds the rest of the graph
        print("\nGraphing entity: {0}".format(ent.text))
        candidate_graphs[graph_idx] = (root_node, graph_entity(ent, doc, G, root_node))

    return candidate_graphs

#### Applying the functions on a selected article

In [None]:
# Pick one row and use its "document_text" as the candidate article.
TEST_ENTRY = 90
test_candidate_text = onto_import.loc[TEST_ENTRY,"document_text"]
print(test_candidate_text)

# Build the candidate graphs for that article. The result maps a graph id to a
# (root label, graph) tuple.
candidate_roots_and_graphs = graph_candidates_in_doc(test_candidate_text)

#### Draw the candidate graphs for that article

In [None]:
graph_filepath = 'NER_Type_Graphs/'

# Make sure the output directory exists before anything is written into it.
os.makedirs(graph_filepath, exist_ok=True)

for key, (root, cand_graph) in candidate_roots_and_graphs.items():

    graph_filename = 'G_' + str(key)

    # Write our graph to DOT format to be read and visualized by GraphViz
    nx.drawing.nx_pydot.write_dot(cand_graph, "{0}.dot".format(graph_filepath + graph_filename))

    # Render the DOT file to SVG via neato (requires the GraphViz `neato` binary on the PATH)
    os.system("neato -Tsvg -Goverlap=false -Gsplines=true -Gsep=0.1 {0}.dot -o {0}.svg".format(graph_filepath + graph_filename))

#### Holding area for test to show that the candidate doc ents are all in the reconciled list

In [None]:
# Some tests: parse the same article again so this cell can be run on its own.
TEST_ENTRY = 90
doc = nlp(onto_import.loc[TEST_ENTRY,"document_text"])

# Testing to make sure all the ents are present in the reconciled list:
# every entity's (start, end) key must appear in the reconciled dict.
reconciled = reconcile_ents_and_clusters(doc)
for key in [(ent.start, ent.end) for ent in doc.ents]:
    assert key in reconciled.keys()

### Similarity Between Graphs

In [None]:
def logsumexp(a):
    """Return log(sum(exp(a))), computed in a numerically stable way.

    The largest entry is subtracted before exponentiating and added back
    after the logarithm, which keeps exp() from overflowing. If the largest
    entry is -inf, -inf is returned directly.
    """
    peak = np.max(a)
    if peak == -np.inf:
        return -np.inf
    shifted_exponentials = np.exp(a - peak)
    return np.log(np.sum(shifted_exponentials)) + peak

def score_relation_and_children(nlp, cand_G, NET_G, cand_parent, net_options, cand_succ_dict, NET_succ_dict):
    """Recursively score a candidate-graph node against a NE-type (NET) graph.

    Args:
        nlp: callable turning a string into a sequence of tokens whose first
            element offers `.similarity(other_token)`.
        cand_G: the candidate graph (only passed through to recursive calls).
        NET_G: the NET graph; its nodes must carry an 'lp' attribute.
        cand_parent: label of the candidate node to score.
        net_options: labels of the NET nodes this candidate node may be
            matched with (used only when `cand_parent` is a leaf).
        cand_succ_dict: parent label -> successor labels of the candidate graph.
        NET_succ_dict: parent label -> successor labels of the NET graph.

    Returns:
        For a leaf: the 'lp' of the most similar NET option plus the log of
        that similarity. For an inner node: logsumexp over the scores of all
        children reached through relations that also exist in the NET graph.
        None when nothing could be scored (no options, or no matching branch).
    """

    # If this node has no further children,
    # compare it to its NET options
    if cand_parent not in cand_succ_dict:

        # Without any option there is nothing to compare against.
        if len(net_options) == 0:
            return None

        cand_token = nlp(cand_parent)[0]

        sim_scores = [cand_token.similarity(nlp(net_opt)[0]) for net_opt in net_options]

        # Get the index of the most similar word
        sim_idx = np.argmax(sim_scores)
        best_option = net_options[sim_idx]

        # Recover the float value of the winning word's log probability
        sim_lp = float(NET_G.nodes[best_option]['lp'])

        # Log of the similarity score of the most similar word
        log_similarity = np.log(sim_scores[sim_idx])

        print("log of similarity between {0} and {1} is {2}".format(cand_parent, best_option, log_similarity))

        # The score: log probability of that word under the NET dependency plus
        # the log of its similarity
        return sim_lp + log_similarity

    # Otherwise let's score the dependency tags and
    # recursively call this on their children.
    # Prepare to hold scores from multiple branches
    accumulated_scores = []

    # Iterate over dependency relations from the parent
    for relation in cand_succ_dict[cand_parent]:

        # Proceed if the NET_graph has this relation:
        try:
            # Get the options from the NET graph branching from this relation type
            child_net_options = NET_succ_dict[relation]

            # Iterate over the children of each relation
            for cand_child in cand_succ_dict[relation]:
                child_score = score_relation_and_children(nlp, cand_G, NET_G, cand_child, child_net_options, cand_succ_dict, NET_succ_dict)
                # A branch that could not be scored contributes nothing;
                # passing None on would break logsumexp below.
                if child_score is not None:
                    accumulated_scores.append(child_score)

        except Exception as ex:
            # Best effort: report the branch and continue with the others.
            print("Exception down candidate {0} branch. No match in NET?\n{1}".format(relation,ex))

    # Combine the branches if at least one of them produced a score
    if accumulated_scores:
        return logsumexp(accumulated_scores)
    return None
            
def compare_candidate_to_NET(nlp, candidate_G, candidate_root, NET_G, net_root):
    """Score a candidate graph against a NE-type (NET) graph.

    Both graphs are flattened into parent -> successors dicts by a
    breadth-first traversal from their respective roots, and the recursive
    score_relation_and_children is then started at the candidate root with
    an empty list of NET options. Its result is returned unchanged.
    """

    # Breadth-first successor lists of the candidate graph
    candidate_successors = dict(nx.bfs_successors(candidate_G, candidate_root))

    # Breadth-first successor lists of the NET graph.
    # (For a speed boost this could be computed externally and passed in.)
    net_successors = dict(nx.bfs_successors(NET_G, net_root))

    # Kick off the recursive scoring at the two roots
    return score_relation_and_children(
        nlp,
        candidate_G,
        NET_G,
        candidate_root,
        [],
        candidate_successors,
        net_successors,
    )

In [None]:
# Which candidate graph (a key of candidate_roots_and_graphs) and which NE-type
# graph (a key of log_graphs_dict) to compare.
CANDIDATE_GRAPH_NO = 0
NET_TO_COMPARE = 'PRODUCT'

# Each candidate entry is a (root label, graph) tuple, hence the [1] (graph) and
# [0] (root) lookups. The NE type name doubles as the root label of its graph.
score = compare_candidate_to_NET(nlp, candidate_roots_and_graphs[CANDIDATE_GRAPH_NO][1], candidate_roots_and_graphs[CANDIDATE_GRAPH_NO][0], log_graphs_dict[NET_TO_COMPARE], NET_TO_COMPARE)

print("\nThe comparison score between G_{2} and the {0} graph is {1}".format(NET_TO_COMPARE, score, CANDIDATE_GRAPH_NO))

#### To manually examine the dicts in the comparison above:

In [None]:
# Successor lists of the selected NE-type graph, breadth-first from its root,
# keyed by parent node -- the same structure compare_candidate_to_NET builds.
NET_succ_dict = {par:child_list for (par,child_list)in nx.bfs_successors(log_graphs_dict[NET_TO_COMPARE], NET_TO_COMPARE)}

NET_succ_dict

In [None]:
# Successor lists of the selected candidate graph, breadth-first from its root,
# keyed by parent node -- the same structure compare_candidate_to_NET builds.
cand_succ_dict = {par:child_list for (par,child_list)in nx.bfs_successors(candidate_roots_and_graphs[CANDIDATE_GRAPH_NO][1], candidate_roots_and_graphs[CANDIDATE_GRAPH_NO][0])}

cand_succ_dict