### Installations, if needed

In [None]:
# Install the right version of spaCy
!pip install spacy==2.0.12 # Above 2.0.12 doesn't seem work with the neuralcoref resolution (at least 2.0.13 and 2.0.16 don't)

In [None]:
# Install the large Neural Coref model
!pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_lg-3.0.0/en_coref_lg-3.0.0.tar.gz # This is the coref language model

### Importing Libraries

In [None]:
import spacy
import en_coref_lg
from spacy import displacy
import pandas as pd
import networkx as nx
import time
from graphviz import Source
import matplotlib.pyplot as plt

### Loading and previewing our export from OntoNotes5

In [None]:
# Name and folder of the JSON export to load.
JSON_FILENAME = 'ner_output_1.json'
FILEPATH_TO_JSON = "onto_sql_output/"

# The path is built by plain string concatenation, so FILEPATH_TO_JSON must
# keep its trailing slash.
onto_import = pd.read_json(FILEPATH_TO_JSON + JSON_FILENAME)

In [None]:
# Preview the first rows of the imported frame.
onto_import.head()

In [None]:
# Show the sentence text stored in the row labelled 0.
onto_import.loc[0].sentence_string

### Downloading and loading the large spaCy English pipeline

In [None]:
# Load the en_coref_lg model; `nlp` is the pipeline object that the cells
# below call on raw text.
nlp = en_coref_lg.load()

### Looping through dependency parsing for all sentences

In [None]:
# Range of row labels to run through the pipeline in this pass
# (.loc slicing includes both end points).
SENT_MIN = 0
SENT_MAX = 500

# Time the spaCy parse: every selected sentence string is passed to nlp and
# the resulting parse is stored in the "spacy_parse" column.
start_time = time.time()
onto_import["spacy_parse"] = onto_import.loc[SENT_MIN:SENT_MAX, "sentence_string"].apply(nlp)
duration = time.time() - start_time

# Report how long the parse took
print("Applying the spaCy pipeline took {0:.2f} seconds".format(duration))

### Viewing the text, with highlighted named entities

In [None]:
# Choose an entry integer to see its text and the parse below.
# Only rows SENT_MIN..SENT_MAX were given a "spacy_parse" value above, so
# ENTRY has to fall inside that range.
ENTRY = 490

# style='ent' renders the text with its named entities highlighted.
displacy.render(onto_import.loc[ENTRY,"spacy_parse"], jupyter=True, style='ent')

### Viewing dependencies

In [None]:
# style='dep' renders the dependency parse of the same entry.
displacy.render(onto_import.loc[ENTRY,"spacy_parse"], jupyter=True, style='dep')

### Identifying some useful attributes of the spaCy tokens

In [None]:
# This gives us the dependency type (the label of the arc between the token and its head)

onto_import.loc[490,"spacy_parse"][13].dep_

In [None]:
# This gives us the token of its head, from which we can call other attributes of the head.

onto_import.loc[490,"spacy_parse"][13].head

In [None]:
# If we just want the string of the head, that's here
# (this example looks at token 32 of the parse stored in row 0):

onto_import.loc[0,"spacy_parse"][32].head.text

In [None]:
# If we want the index of the head within the sentence (to find when a multi-word NE
# depends on something outside of that NE phrase).
# Same token as the previous cell: token 32 of the parse stored in row 0.

onto_import.loc[0,"spacy_parse"][32].head.i

### Creating a Dictionary with a Graph for Each NER Type

In [None]:
# Maps each NE type (as a string) to the directed graph built for it;
# filled in by graph_row.
graphs_dict = {}

### Graphing a row

In [None]:
def graph_row(df_row):
    """Fold one data row into the directed graph kept for its NE type.

    Reads the "ner_type", "ner_end_word_index", "spacy_parse",
    "sentence_string" and "ner_string" fields of ``df_row``. The graph for
    the row's NE type is fetched from (or created and stored in) the
    module-level ``graphs_dict`` and extended with the path

        NE type -> dependency explanation -> normalised head word

    Each visit increments a 'weight' counter on the NE-type node, the
    explanation node and both edges. Nothing is returned; the result is the
    updated entry in ``graphs_dict``.
    """

    def bump(attrs):
        # Increment the 'weight' counter held in a node/edge attribute dict.
        attrs['weight'] = attrs.get('weight', 0) + 1

    # Messages printed for dependency explanations that look like problem cases.
    diagnostics = {
        "punctuation": "NE '{1}' marked as punctuation in sentence '{0}'",
        "determiner": "NE '{1}' marked as determiner in sentence '{0}'",
    }

    ner_type = str(df_row["ner_type"])

    # Reuse the graph already built for this NE type, or start a new one.
    G = graphs_dict.get(ner_type, nx.DiGraph())

    # Count this occurrence on the NE type's own node.
    G.add_node(ner_type)
    bump(G.nodes[ner_type])

    # The token at the entity's end index is treated as the head of the phrase.
    anchor = df_row["spacy_parse"][df_row["ner_end_word_index"]]

    # Prefer the human-readable explanation of the dependency label and fall
    # back to the raw label when spaCy has none.
    dep_label = anchor.dep_
    explanation = spacy.explain(dep_label)
    if explanation is None:
        explanation = dep_label
    elif explanation in diagnostics:
        # Trying to catch and diagnose some problem cases.
        print(diagnostics[explanation].format(df_row["sentence_string"], df_row["ner_string"]))
        print(" --- ")
    elif explanation == "object of preposition":
        # Object of preposition doesn't do much, so step up to the
        # preposition; its head is what gets attached below.
        explanation = "head of prep phrase"
        anchor = anchor.head

    # Node for the explanation, linked from the NE type.
    G.add_node(explanation)
    bump(G.nodes[explanation])
    G.add_edge(ner_type, explanation)
    bump(G[ner_type][explanation])

    # Link the explanation to the normalised form of the governing word.
    norm = anchor.head.norm_
    G.add_edge(explanation, norm)
    bump(G[explanation][norm])

    graphs_dict[ner_type] = G

In [None]:
# Graph every selected row. graph_row is called purely for its side effect of
# updating graphs_dict, so an explicit loop is used instead of DataFrame.apply:
# older pandas releases document that apply may call the function twice on the
# first row, which would count that row's weights twice.
# Note that running this cell again adds to the counts already in graphs_dict.
for _, row in onto_import.loc[SENT_MIN:SENT_MAX,:].iterrows():
    graph_row(row)

#### Draw the NET graphs

In [None]:
# Folder the DOT and SVG files are written to (it must already exist).
graph_filepath = 'NER_Type_Graphs/'

for key, value in graphs_dict.items():

    graph_filename = 'G_' + str(key)
    dot_path = graph_filepath + graph_filename

    # Write our graph to DOT format to be read and visualized by GraphViz
    nx.drawing.nx_pydot.write_dot(value, dot_path)

    # Insert the layout attributes right after the first line of the DOT file.
    with open(dot_path, "r") as file:
        graph_dot = file.readlines()
    graph_dot.insert(1,'graph [overlap = scale, layout = neato];\n')

    with open(dot_path, "w") as file:
        file.writelines(graph_dot)

    # Load the DOT file only AFTER it has been patched. Loading it earlier
    # keeps the unpatched text in memory, so the inserted attributes would be
    # missing from the displayed graph and could be overwritten on render.
    graph_visualized = Source.from_file(dot_path, engine='neato')

    # Uncomment the following line to show all graphs.
    #display(graph_visualized)

    # Save it to an svg
    graph_visualized.render(filename=dot_path, format='svg')

# View just one in the notebook (the last graph drawn)
graph_visualized

### Graphing a Candidate Named Entity

In [None]:
# Defining helper functions to build the candidate graphs

def reconcile_ents_and_clusters(doc):
    """Reconcile the coreference mentions and the entities of ``doc`` into a
    single dict of graphs to make.

    Keys are (span.start, span.end) tuples.
    Values are (span, graph_id) tuples. For a coreference mention, graph_id
    is the index of its cluster in ``doc._.coref_clusters``. An entity whose
    (start, end) matches no mention gets its own graph_id, numbered onwards
    from the number of clusters.
    """
    # A missing cluster list (None) is treated like an empty one so that a
    # document without coreference clusters still yields its entities.
    clusters = doc._.coref_clusters or []

    occurence_ind = {}

    # Every mention shares the graph of the cluster it belongs to.
    for cluster_idx, cluster in enumerate(clusters):
        for mention in cluster:
            occurence_ind[(mention.start, mention.end)] = (mention, cluster_idx)

    # Now let's see if each ent is in there. If not, it gets a graph of its
    # own with the next free id.
    next_graph_id = len(clusters)
    for ent in doc.ents:
        key = (ent.start, ent.end)
        if key not in occurence_ind:
            occurence_ind[key] = (ent, next_graph_id)
            next_graph_id += 1

    return occurence_ind

def graph_entity(ent, doc, G, root_node):
    """Attach the dependency context of one mention to graph ``G``.

    Adds the path ``root_node -> relation -> head word`` where ``relation``
    is spaCy's explanation of the dependency label of the mention's last
    token (or the raw label when there is no explanation), and the head word
    is the normalised form of the governing token. ``doc`` is accepted but
    not used here. Returns ``G``.
    """
    # The last token of the span is assumed to be the head of the phrase.
    anchor = ent[-1]

    # Human-readable name of the relation arc, or the raw label as fallback.
    dep_label = anchor.dep_
    relation = spacy.explain(dep_label)
    if relation is None:
        relation = dep_label
    elif relation == "object of preposition":
        # Object of preposition doesn't do much, so step up to the
        # preposition; its head is what gets attached below.
        relation = "head of prep phrase"
        anchor = anchor.head

    # root -> relation
    G.add_node(relation)
    G.add_edge(root_node, relation)

    # relation -> normalised governing word
    G.add_edge(relation, anchor.head.norm_)

    return G
    
def graph_candidates_in_doc(candidate_text):
    """Build one directed graph per entity / coreference cluster in a text.

    The text is parsed with ``nlp``, its mentions and entities are grouped by
    ``reconcile_ents_and_clusters``, and each span is added to the graph of
    its group by ``graph_entity``.

    Returns a dict mapping graph id to the ``nx.DiGraph`` built for it.
    """
    doc = nlp(candidate_text)

    clustered_ents = reconcile_ents_and_clusters(doc)

    # A missing cluster list (None) is treated like an empty one.
    clusters = doc._.coref_clusters or []

    # One graph per graph id
    candidate_graphs = dict()

    for ent, graph_idx in clustered_ents.values():

        # Get the cluster's existing graph from previous mentions
        # or create a new one.
        G = candidate_graphs.get(graph_idx, nx.DiGraph())

        # Make sure we have our root. No harm done if it already exists.
        # Ids below len(clusters) are cluster indices, so the root is the text
        # of the cluster's main mention; any other id belongs to a standalone
        # entity and the root is the entity's own text. The explicit check
        # replaces a bare except, which would also have hidden unrelated errors.
        if graph_idx < len(clusters):
            root_node = clusters[graph_idx].main.text
        else:
            root_node = ent.text
        G.add_node(root_node)

        # A helper function adds the rest of the graph
        candidate_graphs[graph_idx] = graph_entity(ent, doc, G, root_node)

    return candidate_graphs

In [None]:
# Some tests

# Sample passage for the checks (any text with a few named entities and
# pronouns will do; swap in your own). Defined here so that this cell runs on
# a fresh kernel instead of relying on a `doc` left over from an earlier session.
test_candidate_text = (
    "Barack Obama was born in Hawaii. He was elected president of the "
    "United States in 2008, and his wife Michelle moved with him to Washington."
)
doc = nlp(test_candidate_text)

# Testing to make sure all the ents are present in the reconciled list
reconciled = reconcile_ents_and_clusters(doc)
for ent in doc.ents:
    assert (ent.start, ent.end) in reconciled

In [None]:
# Build the candidate graphs for the sample text; the result is a dict of
# graph id -> DiGraph. `test_candidate_text` must already be defined when
# this cell runs.
test_graphs = graph_candidates_in_doc(test_candidate_text)

#### Draw the candidate graphs

In [None]:
# Folder the DOT and SVG files are written to (it must already exist).
graph_filepath = 'NER_Type_Graphs/'

for key, value in test_graphs.items():

    graph_filename = 'G_' + str(key)
    dot_path = graph_filepath + graph_filename

    # Write our graph to DOT format to be read and visualized by GraphViz
    nx.drawing.nx_pydot.write_dot(value, dot_path)

    # Insert the layout attributes right after the first line of the DOT file.
    with open(dot_path, "r") as file:
        graph_dot = file.readlines()
    graph_dot.insert(1,'graph [overlap = scale, layout = neato];\n')

    with open(dot_path, "w") as file:
        file.writelines(graph_dot)

    # Load the DOT file only AFTER it has been patched. Loading it earlier
    # keeps the unpatched text in memory, so the inserted attributes would be
    # missing from the displayed graph and could be overwritten on render.
    graph_visualized = Source.from_file(dot_path, engine='neato')

    # Uncomment the following line to show all graphs.
    #display(graph_visualized)

    # Save it to an svg
    graph_visualized.render(filename=dot_path, format='svg')

# View just one in the notebook (the last graph drawn)
graph_visualized